YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

Source file: /Users/deen/code/yugabyte-db/src/yb/master/catalog_manager.cc

Each row below gives the source line number, its execution count (blank where the line is not instrumented), and the source text.

Line | Count | Source
   1 |       | // Licensed to the Apache Software Foundation (ASF) under one
   2 |       | // or more contributor license agreements.  See the NOTICE file
   3 |       | // distributed with this work for additional information
   4 |       | // regarding copyright ownership.  The ASF licenses this file
   5 |       | // to you under the Apache License, Version 2.0 (the
   6 |       | // "License"); you may not use this file except in compliance
   7 |       | // with the License.  You may obtain a copy of the License at
   8 |       | //
   9 |       | //   http://www.apache.org/licenses/LICENSE-2.0
  10 |       | //
  11 |       | // Unless required by applicable law or agreed to in writing,
  12 |       | // software distributed under the License is distributed on an
  13 |       | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 |       | // KIND, either express or implied.  See the License for the
  15 |       | // specific language governing permissions and limitations
  16 |       | // under the License.
  17 |       | //
  18 |       | // The following only applies to changes made to this file as part of YugaByte development.
  19 |       | //
  20 |       | // Portions Copyright (c) YugaByte, Inc.
  21 |       | //
  22 |       | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
  23 |       | // in compliance with the License.  You may obtain a copy of the License at
  24 |       | //
  25 |       | // http://www.apache.org/licenses/LICENSE-2.0
  26 |       | //
  27 |       | // Unless required by applicable law or agreed to in writing, software distributed under the License
  28 |       | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
  29 |       | // or implied.  See the License for the specific language governing permissions and limitations
  30 |       | // under the License.
  31 |       | //
  32 |       | // ================================================================================================
  33 |       | //
  34 |       | // The catalog manager handles the current list of tables
  35 |       | // and tablets in the cluster, as well as their current locations.
  36 |       | // Since most operations in the master go through these data
  37 |       | // structures, locking is carefully managed here to prevent unnecessary
  38 |       | // contention and deadlocks:
  39 |       | //
  40 |       | // - each structure has an internal spinlock used for operations that
  41 |       | //   are purely in-memory (eg the current status of replicas)
  42 |       | // - data that is persisted on disk is stored in separate PersistentTable(t)Info
  43 |       | //   structs. These are managed using copy-on-write so that writers may block
  44 |       | //   writing them back to disk while not impacting concurrent readers.
  45 |       | //
  46 |       | // Usage rules:
  47 |       | // - You may obtain READ locks in any order. READ locks should never block,
  48 |       | //   since they only conflict with COMMIT which is a purely in-memory operation.
  49 |       | //   Thus they are deadlock-free.
  50 |       | // - If you need a WRITE lock on both a table and one or more of its tablets,
  51 |       | //   acquire the lock on the table first. This strict ordering prevents deadlocks.
  52 |       | //
  53 |       | // ================================================================================================
  54 |       | 
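The usage rules in the header above are the part of this file that later code depends on, so here is a minimal illustrative sketch of the WRITE-lock ordering rule. It is not code from this file: the LockForRead()/LockForWrite()/Commit() shapes are modeled on calls that appear further down in this report, and the helper and its arguments are assumptions.

    // Hedged sketch only; names and signatures are assumptions, not this file's API.
    void UpdateTableAndTablet(const scoped_refptr<TableInfo>& table,
                              const scoped_refptr<TabletInfo>& tablet) {
      auto table_lock = table->LockForWrite();    // WRITE lock the table first...
      auto tablet_lock = tablet->LockForWrite();  // ...then its tablet, never the reverse.
      // Mutate the copy-on-write PersistentTable(t)Info data here; concurrent
      // readers keep seeing the previously committed version until Commit().
      tablet_lock.Commit();
      table_lock.Commit();
    }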
  55 |       | #include "yb/master/catalog_manager.h"
  56 |       | 
  57 |       | #include <stdlib.h>
  58 |       | 
  59 |       | #include <algorithm>
  60 |       | #include <atomic>
  61 |       | #include <bitset>
  62 |       | #include <functional>
  63 |       | #include <memory>
  64 |       | #include <mutex>
  65 |       | #include <set>
  66 |       | #include <string>
  67 |       | #include <unordered_map>
  68 |       | #include <vector>
  69 |       | 
  70 |       | #include <boost/optional.hpp>
  71 |       | #include <glog/logging.h>
  72 |       | 
  73 |       | #include "yb/client/client-internal.h"
  74 |       | #include "yb/client/client.h"
  75 |       | #include "yb/client/schema.h"
  76 |       | #include "yb/client/universe_key_client.h"
  77 |       | 
  78 |       | #include "yb/common/common.pb.h"
  79 |       | #include "yb/common/common_flags.h"
  80 |       | #include "yb/common/key_encoder.h"
  81 |       | #include "yb/common/partial_row.h"
  82 |       | #include "yb/common/partition.h"
  83 |       | #include "yb/common/ql_type.h"
  84 |       | #include "yb/common/roles_permissions.h"
  85 |       | #include "yb/common/schema.h"
  86 |       | #include "yb/common/wire_protocol.h"
  87 |       | 
  88 |       | #include "yb/consensus/consensus.h"
  89 |       | #include "yb/consensus/consensus.pb.h"
  90 |       | #include "yb/consensus/consensus_util.h"
  91 |       | #include "yb/consensus/metadata.pb.h"
  92 |       | #include "yb/consensus/opid_util.h"
  93 |       | #include "yb/consensus/quorum_util.h"
  94 |       | 
  95 |       | #include "yb/docdb/doc_key.h"
  96 |       | 
  97 |       | #include "yb/gutil/atomicops.h"
  98 |       | #include "yb/gutil/bind.h"
  99 |       | #include "yb/gutil/casts.h"
 100 |       | #include "yb/gutil/map-util.h"
 101 |       | #include "yb/gutil/mathlimits.h"
 102 |       | #include "yb/gutil/stl_util.h"
 103 |       | #include "yb/gutil/strings/escaping.h"
 104 |       | #include "yb/gutil/strings/join.h"
 105 |       | #include "yb/gutil/strings/substitute.h"
 106 |       | #include "yb/gutil/sysinfo.h"
 107 |       | #include "yb/gutil/walltime.h"
 108 |       | 
 109 |       | #include "yb/master/master_fwd.h"
 110 |       | #include "yb/master/async_rpc_tasks.h"
 111 |       | #include "yb/master/backfill_index.h"
 112 |       | #include "yb/master/catalog_entity_info.h"
 113 |       | #include "yb/master/catalog_loaders.h"
 114 |       | #include "yb/master/catalog_manager-internal.h"
 115 |       | #include "yb/master/catalog_manager_bg_tasks.h"
 116 |       | #include "yb/master/catalog_manager_util.h"
 117 |       | #include "yb/master/cluster_balance.h"
 118 |       | #include "yb/master/encryption_manager.h"
 119 |       | #include "yb/master/master.h"
 120 |       | #include "yb/master/master_admin.pb.h"
 121 |       | #include "yb/master/master_client.pb.h"
 122 |       | #include "yb/master/master_cluster.proxy.h"
 123 |       | #include "yb/master/master_dcl.pb.h"
 124 |       | #include "yb/master/master_ddl.pb.h"
 125 |       | #include "yb/master/master_encryption.pb.h"
 126 |       | #include "yb/master/master_error.h"
 127 |       | #include "yb/master/master_heartbeat.pb.h"
 128 |       | #include "yb/master/master_replication.pb.h"
 129 |       | #include "yb/master/master_util.h"
 130 |       | #include "yb/master/permissions_manager.h"
 131 |       | #include "yb/master/scoped_leader_shared_lock-internal.h"
 132 |       | #include "yb/master/sys_catalog.h"
 133 |       | #include "yb/master/sys_catalog_constants.h"
 134 |       | #include "yb/master/ts_descriptor.h"
 135 |       | #include "yb/master/yql_aggregates_vtable.h"
 136 |       | #include "yb/master/yql_auth_resource_role_permissions_index.h"
 137 |       | #include "yb/master/yql_auth_role_permissions_vtable.h"
 138 |       | #include "yb/master/yql_auth_roles_vtable.h"
 139 |       | #include "yb/master/yql_columns_vtable.h"
 140 |       | #include "yb/master/yql_empty_vtable.h"
 141 |       | #include "yb/master/yql_functions_vtable.h"
 142 |       | #include "yb/master/yql_indexes_vtable.h"
 143 |       | #include "yb/master/yql_keyspaces_vtable.h"
 144 |       | #include "yb/master/yql_local_vtable.h"
 145 |       | #include "yb/master/yql_partitions_vtable.h"
 146 |       | #include "yb/master/yql_peers_vtable.h"
 147 |       | #include "yb/master/yql_size_estimates_vtable.h"
 148 |       | #include "yb/master/yql_tables_vtable.h"
 149 |       | #include "yb/master/yql_triggers_vtable.h"
 150 |       | #include "yb/master/yql_types_vtable.h"
 151 |       | #include "yb/master/yql_views_vtable.h"
 152 |       | #include "yb/master/ysql_transaction_ddl.h"
 153 |       | 
 154 |       | #include "yb/rpc/messenger.h"
 155 |       | #include "yb/rpc/rpc_controller.h"
 156 |       | 
 157 |       | #include "yb/tablet/operations/change_metadata_operation.h"
 158 |       | #include "yb/tablet/tablet.h"
 159 |       | #include "yb/tablet/tablet_metadata.h"
 160 |       | #include "yb/tablet/tablet_peer.h"
 161 |       | #include "yb/tablet/tablet_retention_policy.h"
 162 |       | 
 163 |       | #include "yb/tserver/remote_bootstrap_client.h"
 164 |       | #include "yb/tserver/ts_tablet_manager.h"
 165 |       | 
 166 |       | #include "yb/util/atomic.h"
 167 |       | #include "yb/util/countdown_latch.h"
 168 |       | #include "yb/util/debug-util.h"
 169 |       | #include "yb/util/debug/trace_event.h"
 170 |       | #include "yb/util/flag_tags.h"
 171 |       | #include "yb/util/format.h"
 172 |       | #include "yb/util/hash_util.h"
 173 |       | #include "yb/util/locks.h"
 174 |       | #include "yb/util/math_util.h"
 175 |       | #include "yb/util/metrics.h"
 176 |       | #include "yb/util/monotime.h"
 177 |       | #include "yb/util/net/net_util.h"
 178 |       | #include "yb/util/oid_generator.h"
 179 |       | #include "yb/util/random_util.h"
 180 |       | #include "yb/util/rw_mutex.h"
 181 |       | #include "yb/util/semaphore.h"
 182 |       | #include "yb/util/shared_lock.h"
 183 |       | #include "yb/util/size_literals.h"
 184 |       | #include "yb/util/status.h"
 185 |       | #include "yb/util/status_format.h"
 186 |       | #include "yb/util/status_log.h"
 187 |       | #include "yb/util/stopwatch.h"
 188 |       | #include "yb/util/string_util.h"
 189 |       | #include "yb/util/sync_point.h"
 190 |       | #include "yb/util/thread.h"
 191 |       | #include "yb/util/threadpool.h"
 192 |       | #include "yb/util/trace.h"
 193 |       | #include "yb/util/tsan_util.h"
 194 |       | #include "yb/util/uuid.h"
 195 |       | 
 196 |       | #include "yb/yql/pgwrapper/pg_wrapper.h"
 197 |       | #include "yb/yql/redis/redisserver/redis_constants.h"
 198 |       | 
 199 |       | using namespace std::literals;
 200 |       | using namespace yb::size_literals;
 201 |       | 
 202 |       | DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000,  // 30 sec
 203 |       |              "Timeout used for the Master->TS async rpc calls.");
 204 |       | TAG_FLAG(master_ts_rpc_timeout_ms, advanced);
 205 |       | 
 206 |       | DEFINE_int32(tablet_creation_timeout_ms, 30 * 1000,  // 30 sec
 207 |       |              "Timeout used by the master when attempting to create tablet "
 208 |       |              "replicas during table creation.");
 209 |       | TAG_FLAG(tablet_creation_timeout_ms, advanced);
 210 |       | 
 211 |       | DEFINE_test_flag(bool, disable_tablet_deletion, false,
 212 |       |                  "Whether catalog manager should disable tablet deletion.");
 213 |       | 
 214 |       | DEFINE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader, true,
 215 |       |             "Whether the catalog manager should wait for a newly created tablet to "
 216 |       |             "elect a leader before considering it successfully created. "
 217 |       |             "This is disabled in some tests where we explicitly manage leader "
 218 |       |             "election.");
 219 |       | TAG_FLAG(catalog_manager_wait_for_new_tablets_to_elect_leader, hidden);
 220 |       | 
 221 |       | DEFINE_int32(catalog_manager_inject_latency_in_delete_table_ms, 0,
 222 |       |              "Number of milliseconds that the master will sleep in DeleteTable.");
 223 |       | TAG_FLAG(catalog_manager_inject_latency_in_delete_table_ms, hidden);
 224 |       | 
 225 |       | DECLARE_int32(catalog_manager_bg_task_wait_ms);
 226 |       | 
 227 |       | DEFINE_int32(replication_factor, 3,
 228 |       |              "Default number of replicas for tables that do not have the num_replicas set.");
 229 |       | TAG_FLAG(replication_factor, advanced);
 230 |       | 
 231 |       | DEFINE_int32(max_create_tablets_per_ts, 50,
 232 |       |              "The number of tablets per TS that can be requested for a new table.");
 233 |       | TAG_FLAG(max_create_tablets_per_ts, advanced);
 234 |       | 
 235 |       | DEFINE_int32(catalog_manager_report_batch_size, 1,
 236 |       |             "The max number of tablets evaluated in the heartbeat as a single SysCatalog update.");
 237 |       | TAG_FLAG(catalog_manager_report_batch_size, advanced);
 238 |       | 
 239 |       | DEFINE_int32(master_failover_catchup_timeout_ms, 30 * 1000 * yb::kTimeMultiplier,  // 30 sec
 240 |       |              "Amount of time to give a newly-elected leader master to load"
 241 |       |              " the previous master's metadata and become active. If this time"
 242 |       |              " is exceeded, the node crashes.");
 243 |       | TAG_FLAG(master_failover_catchup_timeout_ms, advanced);
 244 |       | TAG_FLAG(master_failover_catchup_timeout_ms, experimental);
 245 |       | 
 246 |       | DEFINE_bool(master_tombstone_evicted_tablet_replicas, true,
 247 |       |             "Whether the Master should tombstone (delete) tablet replicas that "
 248 |       |             "are no longer part of the latest reported raft config.");
 249 |       | TAG_FLAG(master_tombstone_evicted_tablet_replicas, hidden);
 250 |       | DECLARE_bool(master_ignore_deleted_on_load);
 251 |       | 
 252 |       | // Temporary.  Can be removed after long-run testing.
 253 |       | DEFINE_bool(master_ignore_stale_cstate, true,
 254 |       |             "Whether Master processes the raft config when the version is lower.");
 255 |       | TAG_FLAG(master_ignore_stale_cstate, hidden);
 256 |       | 
 257 |       | DEFINE_bool(catalog_manager_check_ts_count_for_create_table, true,
 258 |       |             "Whether the master should ensure that there are enough live tablet "
 259 |       |             "servers to satisfy the provided replication count before allowing "
 260 |       |             "a table to be created.");
 261 |       | TAG_FLAG(catalog_manager_check_ts_count_for_create_table, hidden);
 262 |       | 
 263 |       | DEFINE_test_flag(bool, catalog_manager_check_yql_partitions_exist_for_is_create_table_done, true,
 264 |       |                  "Whether the master should ensure that all of a table's tablets are "
 265 |       |                  "in the YQL system.partitions vtable during the IsCreateTableDone check.");
 266 |       | 
 267 |       | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_live,
 268 |       |                            "Number of live tservers in the cluster", yb::MetricUnit::kUnits,
 269 |       |                            "The number of tablet servers that have responded or done a heartbeat "
 270 |       |                            "in the time interval defined by the gflag "
 271 |       |                            "FLAGS_tserver_unresponsive_timeout_ms.");
 272 |       | 
 273 |       | METRIC_DEFINE_gauge_uint32(cluster, num_tablet_servers_dead,
 274 |       |                            "Number of dead tservers in the cluster", yb::MetricUnit::kUnits,
 275 |       |                            "The number of tablet servers that have not responded or done a "
 276 |       |                            "heartbeat in the time interval defined by the gflag "
 277 |       |                            "FLAGS_tserver_unresponsive_timeout_ms.");
 278 |       | 
 279 |       | DEFINE_test_flag(uint64, inject_latency_during_remote_bootstrap_secs, 0,
 280 |       |                  "Number of seconds to sleep during a remote bootstrap.");
 281 |       | 
 282 |       | DEFINE_test_flag(uint64, inject_latency_during_tablet_report_ms, 0,
 283 |       |                  "Number of milliseconds to sleep during the processing of a tablet batch.");
 284 |       | 
 285 |       | DEFINE_test_flag(bool, catalog_manager_simulate_system_table_create_failure, false,
 286 |       |                  "This is only used in tests to simulate a failure where the table information is "
 287 |       |                  "persisted in syscatalog, but the tablet information is not yet persisted and "
 288 |       |                  "there is a failure.");
 289 |       | 
 290 |       | DEFINE_string(cluster_uuid, "", "Cluster UUID to be used by this cluster");
 291 |       | TAG_FLAG(cluster_uuid, hidden);
 292 |       | 
 293 |       | DECLARE_int32(yb_num_shards_per_tserver);
 294 |       | 
 295 |       | DEFINE_int32(transaction_table_num_tablets, 0,
 296 |       |              "Number of tablets to use when creating the transaction status table. "
 297 |       |              "0 to use transaction_table_num_tablets_per_tserver.");
 298 |       | 
 299 |       | DEFINE_int32(transaction_table_num_tablets_per_tserver, kAutoDetectNumShardsPerTServer,
 300 |       |     "The default number of tablets per tablet server for transaction status table. If the value is "
 301 |       |     "-1, the system automatically determines an appropriate value based on number of CPU cores.");
 302 |       | 
 303 |       | DEFINE_bool(auto_create_local_transaction_tables, true,
 304 |       |             "Whether or not to create local transaction status tables automatically on table "
 305 |       |             "creation with a tablespace with placement specified.");
 306 |       | 
 307 |       | DEFINE_test_flag(bool, name_transaction_tables_with_tablespace_id, false,
 308 |       |                  "This is only used in tests to make associating automatically created transaction "
 309 |       |                  "tables with their tablespaces easier, and causes transaction tables created "
 310 |       |                  "automatically for tablespaces to include the tablespace oid in their names.");
 311 |       | 
 312 |       | DEFINE_bool(master_enable_metrics_snapshotter, false, "Should metrics snapshotter be enabled");
 313 |       | 
 314 |       | DEFINE_int32(metrics_snapshots_table_num_tablets, 0,
 315 |       |              "Number of tablets to use when creating the metrics snapshots table. "
 316 |       |              "0 to use the same default num tablets as for regular tables.");
 317 |       | 
 318 |       | DEFINE_bool(disable_index_backfill, false,
 319 |       |     "A kill switch to disable multi-stage backfill for YCQL indexes.");
 320 |       | TAG_FLAG(disable_index_backfill, runtime);
 321 |       | TAG_FLAG(disable_index_backfill, hidden);
 322 |       | 
 323 |       | DEFINE_bool(disable_index_backfill_for_non_txn_tables, true,
 324 |       |     "A kill switch to disable multi-stage backfill for user enforced YCQL indexes. "
 325 |       |     "Note that enabling this feature may cause the create index flow to be slow. "
 326 |       |     "This is needed to ensure the safety of the index backfill process. See also "
 327 |       |     "index_backfill_upperbound_for_user_enforced_txn_duration_ms");
 328 |       | TAG_FLAG(disable_index_backfill_for_non_txn_tables, runtime);
 329 |       | TAG_FLAG(disable_index_backfill_for_non_txn_tables, hidden);
 330 |       | 
 331 |       | DEFINE_bool(enable_transactional_ddl_gc, true,
 332 |       |     "A kill switch for transactional DDL GC. Temporary safety measure.");
 333 |       | TAG_FLAG(enable_transactional_ddl_gc, runtime);
 334 |       | TAG_FLAG(enable_transactional_ddl_gc, hidden);
 335 |       | 
 336 |       | DEFINE_bool(
 337 |       |     hide_pg_catalog_table_creation_logs, false,
 338 |       |     "Whether to hide detailed log messages for PostgreSQL catalog table creation. "
 339 |       |     "This cuts down test logs significantly.");
 340 |       | TAG_FLAG(hide_pg_catalog_table_creation_logs, hidden);
 341 |       | 
 342 |       | DEFINE_test_flag(int32, simulate_slow_table_create_secs, 0,
 343 |       |     "Simulates a slow table creation by sleeping after the table has been added to memory.");
 344 |       | 
 345 |       | DEFINE_test_flag(int32, simulate_slow_system_tablet_bootstrap_secs, 0,
 346 |       |     "Simulates a slow tablet bootstrap by adding a sleep before system tablet init.");
 347 |       | 
 348 |       | DEFINE_test_flag(bool, return_error_if_namespace_not_found, false,
 349 |       |     "Return an error from ListTables if a namespace id is not found in the map");
 350 |       | 
 351 |       | DEFINE_test_flag(bool, hang_on_namespace_transition, false,
 352 |       |     "Used in tests to simulate a lapse between issuing a namespace op and final processing.");
 353 |       | 
 354 |       | DEFINE_test_flag(bool, simulate_crash_after_table_marked_deleting, false,
 355 |       |     "Crash yb-master after table's state is set to DELETING. This skips tablets deletion.");
 356 |       | 
 357 |       | DEFINE_bool(master_drop_table_after_task_response, true,
 358 |       |             "Mark a table as DELETED as soon as we get all the responses from all the TS.");
 359 |       | TAG_FLAG(master_drop_table_after_task_response, advanced);
 360 |       | TAG_FLAG(master_drop_table_after_task_response, runtime);
 361 |       | 
 362 |       | DECLARE_int32(yb_client_admin_operation_timeout_sec);
 363 |       | 
 364 |       | DEFINE_test_flag(bool, tablegroup_master_only, false,
 365 |       |                  "This is only for MasterTest to be able to test tablegroups without the"
 366 |       |                  " transaction status table being created.");
 367 |       | 
 368 |       | DEFINE_bool(enable_register_ts_from_raft, true, "Whether to register a tserver from the consensus "
 369 |       |                                                 "information of a reported tablet.");
 370 |       | 
 371 |       | DECLARE_int32(tserver_unresponsive_timeout_ms);
 372 |       | 
 373 |       | DEFINE_bool(use_create_table_leader_hint, true,
 374 |       |             "Whether the Master should hint which replica for each tablet should "
 375 |       |             "be leader initially on tablet creation.");
 376 |       | TAG_FLAG(use_create_table_leader_hint, runtime);
 377 |       | 
 378 |       | DEFINE_test_flag(bool, create_table_leader_hint_min_lexicographic, false,
 379 |       |                  "Whether the Master should hint replica with smallest lexicographic rank for each "
 380 |       |                  "tablet as leader initially on tablet creation.");
 381 |       | 
 382 |       | DEFINE_double(heartbeat_safe_deadline_ratio, .20,
 383 |       |               "When the heartbeat deadline has this percentage of time remaining, "
 384 |       |               "the master should halt tablet report processing so it can respond in time.");
 385 |       | DECLARE_int32(heartbeat_rpc_timeout_ms);
 386 |       | DECLARE_CAPABILITY(TabletReportLimit);
 387 |       | 
 388 |       | DEFINE_int32(partitions_vtable_cache_refresh_secs, 0,
 389 |       |              "Amount of time to wait before refreshing the system.partitions cached vtable. "
 390 |       |              "If generate_partitions_vtable_on_changes is set, then this background task will "
 391 |       |              "update the cache using the internal map, but won't do any generating of the vtable.");
 392 |       | 
 393 |       | DEFINE_int32(txn_table_wait_min_ts_count, 1,
 394 |       |              "Minimum number of TS to wait for before creating the transaction status table."
 395 |       |              " Default value is 1. We wait for at least --replication_factor if this value"
 396 |       |              " is smaller than that");
 397 |       | TAG_FLAG(txn_table_wait_min_ts_count, advanced);
 398 |       | 
 399 |       | DEFINE_bool(enable_ysql_tablespaces_for_placement, true,
 400 |       |             "If set, tablespaces will be used for placement of YSQL tables.");
 401 |       | TAG_FLAG(enable_ysql_tablespaces_for_placement, runtime);
 402 |       | 
 403 |       | DEFINE_int32(ysql_tablespace_info_refresh_secs, 30,
 404 |       |              "Frequency at which the table to tablespace information will be updated in master "
 405 |       |              "from pg catalog tables. A value of -1 disables the refresh task.");
 406 |       | TAG_FLAG(ysql_tablespace_info_refresh_secs, runtime);
 407 |       | 
 408 |       | DEFINE_int64(tablet_split_size_threshold_bytes, 0,
 409 |       |              "DEPRECATED -- Threshold on tablet size after which tablet should be split. Automated "
 410 |       |              "splitting is disabled if this value is set to 0.");
 411 |       | TAG_FLAG(tablet_split_size_threshold_bytes, hidden);
 412 |       | 
 413 |       | DEFINE_int64(tablet_split_low_phase_shard_count_per_node, 8,
 414 |       |              "The per-node tablet count until which a table is splitting at the phase 1 threshold, "
 415 |       |              "as defined by tablet_split_low_phase_size_threshold_bytes.");
 416 |       | DEFINE_int64(tablet_split_high_phase_shard_count_per_node, 24,
 417 |       |              "The per-node tablet count until which a table is splitting at the phase 2 threshold, "
 418 |       |              "as defined by tablet_split_high_phase_size_threshold_bytes.");
 419 |       | 
 420 |       | DEFINE_int64(tablet_split_low_phase_size_threshold_bytes, 512_MB,
 421 |       |              "The tablet size threshold at which to split tablets in phase 1. "
 422 |       |              "See tablet_split_low_phase_shard_count_per_node.");
 423 |       | DEFINE_int64(tablet_split_high_phase_size_threshold_bytes, 10_GB,
 424 |       |              "The tablet size threshold at which to split tablets in phase 2. "
 425 |       |              "See tablet_split_high_phase_shard_count_per_node.");
 426 |       | DEFINE_int64(tablet_force_split_threshold_bytes, 100_GB,
 427 |       |              "The tablet size threshold at which to split tablets regardless of how many tablets "
 428 |       |              "exist in the table already. This should be configured to prevent runaway whale "
 429 |       |              "tablets from forming in your cluster even if both automatic splitting phases have "
 430 |       |              "been finished.");
 431 |       | 
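Read together, the tablet_split_low_phase_*, tablet_split_high_phase_*, and tablet_force_split_threshold_bytes flags describe a staged policy: low per-node tablet counts split early at 512 MB, mid-range counts split at 10 GB, and beyond that only runaway tablets above 100 GB are split. A hedged sketch of how the thresholds combine, derived only from the flag descriptions above (the helper below is invented for illustration, not the actual split-manager code):

    // Sketch under stated assumptions; EffectiveSplitThresholdBytes is hypothetical.
    int64_t EffectiveSplitThresholdBytes(int64_t tablets_per_node) {
      if (tablets_per_node < FLAGS_tablet_split_low_phase_shard_count_per_node) {
        return FLAGS_tablet_split_low_phase_size_threshold_bytes;   // phase 1: 512 MB
      }
      if (tablets_per_node < FLAGS_tablet_split_high_phase_shard_count_per_node) {
        return FLAGS_tablet_split_high_phase_size_threshold_bytes;  // phase 2: 10 GB
      }
      return FLAGS_tablet_force_split_threshold_bytes;              // force split only: 100 GB
    }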
 432 |       | DEFINE_test_flag(bool, crash_server_on_sys_catalog_leader_affinity_move, false,
 433 |       |                  "When set, crash the master process if it performs a sys catalog leader affinity "
 434 |       |                  "move.");
 435 |       | DEFINE_int32(blacklist_progress_initial_delay_secs, yb::master::kDelayAfterFailoverSecs,
 436 |       |              "When a master leader fails over, the time until which the progress of load movement "
 437 |       |              "off the blacklisted tservers is reported as 0. This initial delay "
 438 |       |              "gives sufficient time for heartbeats so that we don't report"
 439 |       |              " a premature incorrect completion.");
 440 |       | TAG_FLAG(blacklist_progress_initial_delay_secs, runtime);
 441 |       | 
 442 |       | DEFINE_test_flag(bool, validate_all_tablet_candidates, false,
 443 |       |                  "When set to true, consider any tablet a valid candidate for splitting. "
 444 |       |                  "Specifically this flag ensures that ValidateSplitCandidateTable and "
 445 |       |                  "ValidateSplitCandidateTablet always return OK and all tablets are considered "
 446 |       |                  "valid candidates for splitting.");
 447 |       | 
 448 |       | DEFINE_test_flag(bool, skip_placement_validation_createtable_api, false,
 449 |       |                  "When set, it skips checking that all the tablets of a table have enough tservers"
 450 |       |                  " conforming to the table placement policy during CreateTable API call.");
 451 |       | TAG_FLAG(TEST_skip_placement_validation_createtable_api, runtime);
 452 |       | 
 453 |       | DEFINE_test_flag(int32, slowdown_alter_table_rpcs_ms, 0,
 454 |       |                  "Slows down the alter table rpc's send and response handler so that the TServer "
 455 |       |                  "has a heartbeat delay and triggers tablet leader change.");
 456 |       | 
 457 |       | DEFINE_test_flag(bool, reject_delete_not_serving_tablet_rpc, false,
 458 |       |                  "Whether to reject DeleteNotServingTablet RPC.");
 459 |       | 
 460 |       | DEFINE_test_flag(double, crash_after_creating_single_split_tablet, 0.0,
 461 |       |                  "Crash inside CatalogManager::RegisterNewTabletForSplit after calling Upsert");
 462 |       | 
 463 |       | DEFINE_bool(enable_delete_truncate_xcluster_replicated_table, false,
 464 |       |             "When set, enables deleting/truncating tables currently in xCluster replication");
 465 |       | TAG_FLAG(enable_delete_truncate_xcluster_replicated_table, runtime);
 466 |       | 
 467 |       | namespace yb {
 468 |       | namespace master {
 469 |       | 
 470 |       | using std::atomic;
 471 |       | using std::shared_ptr;
 472 |       | using std::string;
 473 |       | using std::unique_ptr;
 474 |       | using std::vector;
 475 |       | 
 476 |       | using namespace std::placeholders;
 477 |       | 
 478 |       | using base::subtle::NoBarrier_Load;
 479 |       | using base::subtle::NoBarrier_CompareAndSwap;
 480 |       | using consensus::kMinimumTerm;
 481 |       | using consensus::CONSENSUS_CONFIG_COMMITTED;
 482 |       | using consensus::CONSENSUS_CONFIG_ACTIVE;
 483 |       | using consensus::COMMITTED_OPID;
 484 |       | using consensus::Consensus;
 485 |       | using consensus::ConsensusMetadata;
 486 |       | using consensus::ConsensusServiceProxy;
 487 |       | using consensus::ConsensusStatePB;
 488 |       | using consensus::GetConsensusRole;
 489 |       | using consensus::PeerMemberType;
 490 |       | using consensus::RaftPeerPB;
 491 |       | using consensus::StartRemoteBootstrapRequestPB;
 492 |       | using rpc::RpcContext;
 493 |       | using server::MonitoredTask;
 494 |       | using strings::Substitute;
 495 |       | using tablet::TABLET_DATA_COPYING;
 496 |       | using tablet::TABLET_DATA_DELETED;
 497 |       | using tablet::TABLET_DATA_READY;
 498 |       | using tablet::TABLET_DATA_TOMBSTONED;
 499 |       | using tablet::TabletDataState;
 500 |       | using tablet::RaftGroupMetadata;
 501 |       | using tablet::RaftGroupMetadataPtr;
 502 |       | using tablet::TabletPeer;
 503 |       | using tablet::RaftGroupStatePB;
 504 |       | using tablet::TabletStatusListener;
 505 |       | using tablet::TabletStatusPB;
 506 |       | using tserver::HandleReplacingStaleTablet;
 507 |       | using tserver::TabletServerErrorPB;
 508 |       | using yb::pgwrapper::PgWrapper;
 509 |       | using yb::server::MasterAddressesToString;
 510 |       | 
 511 |       | using yb::client::YBClient;
 512 |       | using yb::client::YBClientBuilder;
 513 |       | using yb::client::YBColumnSchema;
 514 |       | using yb::client::YBSchema;
 515 |       | using yb::client::YBSchemaBuilder;
 516 |       | using yb::client::YBTable;
 517 |       | using yb::client::YBTableName;
 518 |       | 
 519 |       | namespace {
 520 |       | 
 521 |       | // Macros to access index information in CATALOG.
 522 |       | //
 523 |       | // NOTES from file master.proto for SysTablesEntryPB.
 524 |       | // - For index table: [to be deprecated and replaced by "index_info"]
 525 |       | //     optional bytes indexed_table_id = 13; // Indexed table id of this index.
 526 |       | //     optional bool is_local_index = 14 [ default = false ];  // Whether this is a local index.
 527 |       | //     optional bool is_unique_index = 15 [ default = false ]; // Whether this is a unique index.
 528 |       | // - During transition period, we have to consider both fields and the following macros help
 529 |       | //   avoid duplicate protobuf version checks throughout our code.
 530 |       | 
 531 | 25.1k | const std::string& GetIndexedTableId(const SysTablesEntryPB& pb) {
 532 | 25.1k |   return pb.has_index_info() ? pb.index_info().indexed_table_id() : pb.indexed_table_id();
 533 | 25.1k | }
 534 |       | 
 535 |       | #define PROTO_GET_IS_LOCAL(tabpb) \
 536 | 1.00k |   (tabpb.has_index_info() ? tabpb.index_info().is_local() \
 537 |     0 |                           : tabpb.is_local_index())
 538 |       | 
 539 |       | #define PROTO_GET_IS_UNIQUE(tabpb) \
 540 | 1.00k |   (tabpb.has_index_info() ? tabpb.index_info().is_unique() \
 541 |     0 |                           : tabpb.is_unique_index())
 542 |       | 
 543 |       | template <class PB>
 544 | 25.4k | bool IsIndex(const PB& pb) {
 545 | 25.4k |   return pb.has_index_info() || !pb.indexed_table_id().empty();
 546 | 25.4k | }
Per-instantiation coverage for IsIndex (bodies identical to lines 544-546 above):
  IsIndex<SysTablesEntryPB>: lines 544-546 executed 21.3k times each
  IsIndex<CreateTableRequestPB>: lines 544-546 executed 4.06k times each
 547 |       | 
 548 | 5.09k | bool IsTable(const SysTablesEntryPB& pb) {
 549 | 5.09k |   return !IsIndex(pb);
 550 | 5.09k | }
 551 |       | 
 552 |       | #define PROTO_PTR_IS_INDEX(tabpb) \
 553 |       |   (tabpb->has_index_info() || !tabpb->indexed_table_id().empty())
 554 |       | 
 555 |       | #define PROTO_PTR_IS_TABLE(tabpb) \
 556 | 3.65k |   (!tabpb->has_index_info() && tabpb->indexed_table_id().empty())
 557 |       | 
 558 |       | #if (0)
 559 |       | // Once the deprecated fields are obsolete, the above macros should be defined as the following.
 560 |       | #define GetIndexedTableId(tabpb) (tabpb.index_info().indexed_table_id())
 561 |       | #define PROTO_GET_IS_LOCAL(tabpb) (tabpb.index_info().is_local())
 562 |       | #define PROTO_GET_IS_UNIQUE(tabpb) (tabpb.index_info().is_unique())
 563 |       | #define PROTO_IS_INDEX(tabpb) (tabpb.has_index_info())
 564 |       | #define PROTO_IS_TABLE(tabpb) (!tabpb.has_index_info())
 565 |       | #define PROTO_PTR_IS_INDEX(tabpb) (tabpb->has_index_info())
 566 |       | #define PROTO_PTR_IS_TABLE(tabpb) (!tabpb->has_index_info())
 567 |       | 
 568 |       | #endif
 569 |       | 
 570 |       | class IndexInfoBuilder {
 571 |       |  public:
 572 |    18 |   explicit IndexInfoBuilder(IndexInfoPB* index_info) : index_info_(*index_info) {
 573 |     0 |     DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
 574 |    18 |   }
 575 |       | 
 576 |    18 |   void ApplyProperties(const TableId& indexed_table_id, bool is_local, bool is_unique) {
 577 |    18 |     index_info_.set_indexed_table_id(indexed_table_id);
 578 |    18 |     index_info_.set_version(0);
 579 |    18 |     index_info_.set_is_local(is_local);
 580 |    18 |     index_info_.set_is_unique(is_unique);
 581 |     0 |     DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
 582 |    18 |   }
 583 |       | 
 584 |    18 |   CHECKED_STATUS ApplyColumnMapping(const Schema& indexed_schema, const Schema& index_schema) {
 585 |    72 |     for (size_t i = 0; i < index_schema.num_columns(); i++) {
 586 |    54 |       const auto& col_name = index_schema.column(i).name();
 587 |    54 |       const auto indexed_col_idx = indexed_schema.find_column(col_name);
 588 |    54 |       if (PREDICT_FALSE(indexed_col_idx == Schema::kColumnNotFound)) {
 589 |     0 |         return STATUS(NotFound, "The indexed table column does not exist", col_name);
 590 |     0 |       }
 591 |    54 |       auto* col = index_info_.add_columns();
 592 |    54 |       col->set_column_id(index_schema.column_id(i));
 593 |    54 |       col->set_indexed_column_id(indexed_schema.column_id(indexed_col_idx));
 594 |    54 |     }
 595 |    18 |     index_info_.set_hash_column_count(narrow_cast<uint32_t>(index_schema.num_hash_key_columns()));
 596 |    18 |     index_info_.set_range_column_count(narrow_cast<uint32_t>(index_schema.num_range_key_columns()));
 597 |       | 
 598 |    36 |     for (size_t i = 0; i < indexed_schema.num_hash_key_columns(); i++) {
 599 |    18 |       index_info_.add_indexed_hash_column_ids(indexed_schema.column_id(i));
 600 |    18 |     }
 601 |    18 |     for (size_t i = indexed_schema.num_hash_key_columns(); i < indexed_schema.num_key_columns();
 602 |     0 |         i++) {
 603 |     0 |       index_info_.add_indexed_range_column_ids(indexed_schema.column_id(i));
 604 |     0 |     }
 605 |     0 |     DVLOG(3) << " After " << __PRETTY_FUNCTION__ << " index_info_ is " << yb::ToString(index_info_);
 606 |    18 |     return Status::OK();
 607 |    18 |   }
 608 |       | 
 609 |       |  private:
 610 |       |   IndexInfoPB& index_info_;
 611 |       | };
 612 |       | 
 613 |       | template<class Lock, class RespClass>
 614 |  325k | Status CheckIfTableDeletedOrNotVisibleToClient(const Lock& lock, RespClass* resp) {
 615 |       |   // This covers both in progress and fully deleted objects.
 616 |  325k |   if (lock->started_deleting()) {
 617 |    62 |     Status s = STATUS_SUBSTITUTE(NotFound,
 618 |    62 |         "The object '$0.$1' does not exist", lock->namespace_id(), lock->name());
 619 |    62 |     return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
 620 |    62 |   }
 621 |  325k |   if (!lock->visible_to_client()) {
 622 |     1 |     Status s = STATUS_SUBSTITUTE(ServiceUnavailable,
 623 |     1 |         "The object '$0.$1' is not running", lock->namespace_id(), lock->name());
 624 |     1 |     return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
 625 |     1 |   }
 626 |  325k |   return Status::OK();
 627 |  325k | }
Per-instantiation coverage for CheckIfTableDeletedOrNotVisibleToClient (bodies identical to lines 614-627 above; the count is the number of entries into line 614):
  <CowWriteLock<PersistentTableInfo>, CreateTableResponsePB>: 603
  <CowReadLock<PersistentTableInfo>, CreateTableResponsePB>: 609
  <CowReadLock<PersistentTableInfo>, GetTransactionStatusTabletsResponsePB>: 2.01k
  <CowReadLock<PersistentTableInfo>, IsCreateTableDoneResponsePB>: 20.3k
  <CowReadLock<PersistentTableInfo>, TruncateTableResponsePB>: 6.43k
  <CowReadLock<PersistentTableInfo>, IsTruncateTableDoneResponsePB>: 8.89k
  <CowWriteLock<PersistentTableInfo>, AlterTableResponsePB>: 2.86k
  <CowReadLock<PersistentTableInfo>, IsAlterTableDoneResponsePB>: 649
  <CowReadLock<PersistentTableInfo>, GetTableSchemaResponsePB>: 115k
  <CowReadLock<PersistentTableInfo>, GetColocatedTabletSchemaResponsePB>: unexecuted
  <CowReadLock<PersistentTableInfo>, GetTableLocationsResponsePB>: 167k
Nonzero error paths: the CreateTable (read-lock) instantiation hit started_deleting once; IsCreateTableDone 15 times; GetTableSchema 14 times plus one visible_to_client failure; GetTableLocations 32 times.
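A hedged sketch of the call pattern for this helper, assuming a caller shape like the RPC handlers later in this file (FindTableById and the handler skeleton are assumptions, not quoted code):

    // Sketch only: inside a typical read-path handler returning Status.
    auto table = VERIFY_RESULT(FindTableById(req->table().table_id()));
    auto l = table->LockForRead();
    RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
    // From here on, the table is known to be visible and not being deleted.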
 628 |       | 
 629 | 4.51k | #define VERIFY_NAMESPACE_FOUND(expr, resp) RESULT_CHECKER_HELPER( \
 630 | 4.14k |     expr, \
 631 | 4.14k |     if (!__result.ok()) { \
 632 | 4.14k |         return SetupError((resp)->mutable_error(), __result.status()); \
 633 | 4.14k |     });
 634 |       | 
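For context, a hedged usage sketch of VERIFY_NAMESPACE_FOUND: on a failed lookup the macro fills resp's error and returns early from the handler. The lookup call below is an assumption, not code quoted from this file.

    // Sketch only: FindNamespaceById is an assumed Result<>-returning lookup.
    auto ns = VERIFY_NAMESPACE_FOUND(FindNamespaceById(req->namespace_().id()), resp);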
 635 |     2 | MasterErrorPB_Code NamespaceMasterError(SysNamespaceEntryPB_State state) {
 636 |     2 |   switch (state) {
 637 |     2 |     case SysNamespaceEntryPB::PREPARING: FALLTHROUGH_INTENDED;
 638 |     2 |     case SysNamespaceEntryPB::DELETING:
 639 |     2 |       return MasterErrorPB::IN_TRANSITION_CAN_RETRY;
 640 |     0 |     case SysNamespaceEntryPB::DELETED: FALLTHROUGH_INTENDED;
 641 |     0 |     case SysNamespaceEntryPB::FAILED: FALLTHROUGH_INTENDED;
 642 |     0 |     case SysNamespaceEntryPB::RUNNING:
 643 |     0 |       return MasterErrorPB::INTERNAL_ERROR;
 644 |     0 |     default:
 645 |     0 |       FATAL_INVALID_ENUM_VALUE(SysNamespaceEntryPB_State, state);
 646 |     2 |   }
 647 |     2 | }
 648 |       | 
 649 |  255k | size_t GetNameMapperIndex(YQLDatabase db_type) {
 650 |  255k |   switch (db_type) {
 651 |     0 |     case YQL_DATABASE_UNKNOWN: break;
 652 |  250k |     case YQL_DATABASE_CQL: return 1;
 653 | 2.47k |     case YQL_DATABASE_PGSQL: return 2;
 654 | 2.45k |     case YQL_DATABASE_REDIS: return 3;
 655 |     0 |   }
 656 |     0 |   CHECK(false) << "Unexpected db type " << db_type;
 657 |     0 |   return 0;
 658 |     0 | }
 659 |       | 
 660 | 4.27k | bool IsIndexBackfillEnabled(TableType table_type, bool is_transactional) {
 661 |       |   // Fetch the runtime flag to prevent any issues from updates to the flag while processing.
 662 | 4.27k |   const bool disabled =
 663 | 4.27k |       (table_type == PGSQL_TABLE_TYPE
 664 | 1.59k |           ? GetAtomicFlag(&FLAGS_ysql_disable_index_backfill)
 665 | 2.68k |           : GetAtomicFlag(&FLAGS_disable_index_backfill) ||
 666 | 2.68k |       (!is_transactional && GetAtomicFlag(&FLAGS_disable_index_backfill_for_non_txn_tables)));
 667 | 4.27k |   return !disabled;
 668 | 4.27k | }
 669 |       | 
 670 |       | constexpr auto kDefaultYQLPartitionsRefreshBgTaskSleep = 10s;
 671 |       | 
 672 |       | void FillRetainedBySnapshotSchedules(
 673 |       |       const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map,
 674 |       |       const TableId& table_id,
 675 | 2.80k |       RepeatedBytes* retained_by_snapshot_schedules) {
 676 |     0 |   for (const auto& entry : schedules_to_tables_map) {
 677 |     0 |     if (std::binary_search(entry.second.begin(), entry.second.end(), table_id)) {
 678 |     0 |       retained_by_snapshot_schedules->Add()->assign(
 679 |     0 |           entry.first.AsSlice().cdata(), entry.first.size());
 680 |     0 |     }
 681 |     0 |   }
 682 | 2.80k | }
 683 |       | 
 684 | 5.38k | int GetTransactionTableNumShardsPerTServer() {
 685 | 5.38k |   int value = 8;
 686 | 5.38k |   if (IsTsan()) {
 687 |     0 |     value = 2;
 688 | 5.38k |   } else if (base::NumCPUs() <= 2) {
 689 |     0 |     value = 4;
 690 |     0 |   }
 691 | 5.38k |   return value;
 692 | 5.38k | }
 693 |       | 
 694 | 5.45k | void InitMasterFlags() {
 695 | 5.45k |   yb::InitCommonFlags();
 696 | 5.45k |   if (GetAtomicFlag(&FLAGS_transaction_table_num_tablets_per_tserver) ==
 697 | 5.38k |       kAutoDetectNumShardsPerTServer) {
 698 | 5.38k |     const auto value = GetTransactionTableNumShardsPerTServer();
 699 |     0 |     VLOG(1) << "Auto setting FLAGS_transaction_table_num_tablets_per_tserver to " << value;
 700 | 5.38k |     SetAtomicFlag(value, &FLAGS_transaction_table_num_tablets_per_tserver);
 701 | 5.38k |   }
 702 | 5.45k | }
 703 |       | 
 704 | 5.31k | Result<bool> DoesTableExist(const Result<TableInfoPtr>& result) {
 705 | 5.31k |   if (result.ok()) {
 706 | 4.65k |     return true;
 707 | 4.65k |   }
 708 |   653 |   if (result.status().IsNotFound()
 709 |   653 |       && MasterError(result.status()) == MasterErrorPB::OBJECT_NOT_FOUND) {
 710 |   653 |     return false;
 711 |   653 |   }
 712 |     0 |   return result.status();
 713 |     0 | }
 714 |       | 
 715 |       | }  // anonymous namespace
 716 |       | 
 717 |       | ////////////////////////////////////////////////////////////
 718 |       | // CatalogManager
 719 |       | ////////////////////////////////////////////////////////////
 720 |       | 
 721 |       | CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[](
 722 | 13.5k |     YQLDatabase db_type) {
 723 | 13.5k |   return typed_maps_[GetNameMapperIndex(db_type)];
 724 | 13.5k | }
 725 |       | 
 726 |       | const CatalogManager::NamespaceInfoMap& CatalogManager::NamespaceNameMapper::operator[](
 727 |  241k |     YQLDatabase db_type) const {
 728 |  241k |   return typed_maps_[GetNameMapperIndex(db_type)];
 729 |  241k | }
 730 |       | 
 731 | 2.37k | void CatalogManager::NamespaceNameMapper::clear() {
 732 | 9.48k |   for (auto& m : typed_maps_) {
 733 | 9.48k |     m.clear();
 734 | 9.48k |   }
 735 | 2.37k | }
 736 |       | 
 737 |       | CatalogManager::CatalogManager(Master* master)
 738 |       |     : master_(master),
 739 |       |       tablet_exists_(false),
 740 |       |       state_(kConstructed),
 741 |       |       leader_ready_term_(-1),
 742 |       |       leader_lock_(RWMutex::Priority::PREFER_WRITING),
 743 |       |       load_balance_policy_(std::make_unique<ClusterLoadBalancer>(this)),
 744 |       |       permissions_manager_(std::make_unique<PermissionsManager>(this)),
 745 |       |       tasks_tracker_(new TasksTracker(IsUserInitiated::kFalse)),
 746 |       |       jobs_tracker_(new TasksTracker(IsUserInitiated::kTrue)),
 747 |       |       encryption_manager_(new EncryptionManager()),
 748 |       |       tablespace_manager_(std::make_shared<YsqlTablespaceManager>(nullptr, nullptr)),
 749 |       |       tablespace_bg_task_running_(false),
 750 | 5.45k |       tablet_split_manager_(this, this, this) {
 751 | 5.45k |   InitMasterFlags();
 752 | 5.45k |   CHECK_OK(ThreadPoolBuilder("leader-initialization")
 753 | 5.45k |            .set_max_threads(1)
 754 | 5.45k |            .Build(&leader_initialization_pool_));
 755 | 5.45k |   CHECK_OK(ThreadPoolBuilder("CatalogManagerBGTasks").Build(&background_tasks_thread_pool_));
 756 | 5.45k |   CHECK_OK(ThreadPoolBuilder("async-tasks").Build(&async_task_pool_));
 757 |       | 
 758 | 5.45k |   if (master_) {
 759 | 5.45k |     sys_catalog_.reset(new SysCatalogTable(
 760 | 5.45k |         master_, master_->metric_registry(),
 761 | 5.45k |         Bind(&CatalogManager::ElectedAsLeaderCb, Unretained(this))));
 762 | 5.45k |   }
 763 | 5.45k | }
 764 |       | 
 765 |    92 | CatalogManager::~CatalogManager() {
 766 |    92 |   if (StartShutdown()) {
 767 |     0 |     CompleteShutdown();
 768 |     0 |   }
 769 |    92 | }
 770 |       | 
 771 | 5.42k | Status CatalogManager::Init() {
 772 | 5.42k |   {
 773 | 5.42k |     std::lock_guard<simple_spinlock> l(state_lock_);
 774 | 5.42k |     CHECK_EQ(kConstructed, state_);
 775 | 5.42k |     state_ = kStarting;
 776 | 5.42k |   }
 777 |       | 
 778 | 5.42k |   if (master_) {
 779 | 5.42k |     ysql_transaction_ = std::make_unique<YsqlTransactionDdl>(
 780 | 5.42k |         sys_catalog_.get(), master_->async_client_initializer().get_client_future(),
 781 | 5.42k |         background_tasks_thread_pool_.get());
 782 | 5.42k |   }
 783 |       | 
 784 |       |   // Initialize the metrics emitted by the catalog manager.
 785 | 5.42k |   metric_num_tablet_servers_live_ =
 786 | 5.42k |     METRIC_num_tablet_servers_live.Instantiate(master_->metric_entity_cluster(), 0);
 787 |       | 
 788 | 5.42k |   metric_num_tablet_servers_dead_ =
 789 | 5.42k |     METRIC_num_tablet_servers_dead.Instantiate(master_->metric_entity_cluster(), 0);
 790 |       | 
 791 | 5.42k |   RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(),
 792 | 5.41k |                         "Failed to initialize sys tables async");
 793 |       | 
 794 | 5.41k |   if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs > 0)) {
 795 |     9 |     LOG_WITH_PREFIX(INFO) << "Simulating slow system tablet bootstrap";
 796 |     9 |     SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs));
 797 |     9 |   }
 798 |       | 
 799 |       |   // WaitUntilRunning() must run outside of the lock so as to prevent
 800 |       |   // deadlock. This is safe as WaitUntilRunning waits for another
 801 |       |   // thread to finish its work and doesn't itself depend on any state
 802 |       |   // within CatalogManager. Need not start sys catalog or background tasks
 803 |       |   // when we are started in shell mode.
 804 | 5.41k |   if (!master_->opts().IsShellMode()) {
 805 | 5.31k |     RETURN_NOT_OK_PREPEND(sys_catalog_->WaitUntilRunning(),
 806 | 5.31k |                           "Failed waiting for the catalog tablet to run");
 807 | 5.31k |     std::vector<consensus::RaftPeerPB> masters_raft;
 808 | 5.31k |     RETURN_NOT_OK(master_->ListRaftConfigMasters(&masters_raft));
 809 | 5.31k |     std::vector<HostPort> hps;
 810 | 14.4k |     for (const auto& peer : masters_raft) {
 811 | 14.4k |       if (NodeInstance().permanent_uuid() == peer.permanent_uuid()) {
 812 | 5.31k |         continue;
 813 | 5.31k |       }
 814 | 9.17k |       HostPort hp = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB()));
 815 | 9.17k |       hps.push_back(hp);
 816 | 9.17k |     }
 817 | 5.31k |     universe_key_client_ = std::make_unique<client::UniverseKeyClient>(
 818 | 9.16k |         hps, &master_->proxy_cache(), [&] (const encryption::UniverseKeysPB& universe_keys) {
 819 | 9.16k |           encryption_manager_->PopulateUniverseKeys(universe_keys);
 820 | 9.16k |         });
 821 | 5.31k |     universe_key_client_->GetUniverseKeyRegistryAsync();
 822 | 5.31k |     RETURN_NOT_OK(EnableBgTasks());
 823 | 5.31k |   }
 824 |       | 
 825 |       |   // Cache the server registration even for shell mode masters. See
 826 |       |   // https://github.com/yugabyte/yugabyte-db/issues/8065.
 827 | 5.41k |   RETURN_NOT_OK(GetRegistration(&server_registration_));
 828 |       | 
 829 | 5.41k |   {
 830 | 5.41k |     std::lock_guard<simple_spinlock> l(state_lock_);
 831 | 5.41k |     CHECK_EQ(kStarting, state_);
 832 | 5.41k |     state_ = kRunning;
 833 | 5.41k |   }
 834 |       | 
 835 | 5.41k |   Started();
 836 |       | 
 837 | 5.41k |   return Status::OK();
 838 | 5.41k | }
 839 |       | 
 840 |       | Status CatalogManager::ChangeEncryptionInfo(const ChangeEncryptionInfoRequestPB* req,
 841 |     0 |                                             ChangeEncryptionInfoResponsePB* resp) {
 842 |     0 |   return STATUS(InvalidCommand, "Command only supported in enterprise build.");
 843 |     0 | }
 844 |       | 
 845 | 2.01k | Status CatalogManager::ElectedAsLeaderCb() {
 846 | 2.01k |   time_elected_leader_ = MonoTime::Now();
 847 | 2.01k |   return leader_initialization_pool_->SubmitClosure(
 848 | 2.01k |       Bind(&CatalogManager::LoadSysCatalogDataTask, Unretained(this)));
 849 | 2.01k | }
 850 |       | 
 851 | 2.01k | Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) {
 852 | 2.01k |   string uuid = master_->fs_manager()->uuid();
 853 | 2.01k |   Consensus* consensus = tablet_peer()->consensus();
 854 | 2.01k |   ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE);
 855 | 2.01k |   if (!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid) {
 856 |     0 |     return STATUS_SUBSTITUTE(IllegalState,
 857 |     0 |         "Node $0 not leader. Consensus state: $1", uuid, cstate.ShortDebugString());
 858 |     0 |   }
 859 |       | 
 860 |       |   // Wait for all transactions to be committed.
 861 | 2.01k |   const CoarseTimePoint deadline = CoarseMonoClock::now() + timeout;
 862 | 2.01k |   {
 863 | 2.01k |     tablet::HistoryCutoffPropagationDisabler disabler(tablet_peer()->tablet()->RetentionPolicy());
 864 | 2.01k |     RETURN_NOT_OK(tablet_peer()->operation_tracker()->WaitForAllToFinish(timeout));
 865 | 2.01k |   }
 866 |       | 
 867 | 2.01k |   RETURN_NOT_OK(tablet_peer()->consensus()->WaitForLeaderLeaseImprecise(deadline));
 868 | 2.01k |   return Status::OK();
 869 | 2.01k | }
 870 |       | 
 871 | 2.01k | void CatalogManager::LoadSysCatalogDataTask() {
 872 | 2.01k |   auto consensus = tablet_peer()->shared_consensus();
 873 | 2.01k |   const int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
 874 | 2.01k |   Status s = WaitUntilCaughtUpAsLeader(
 875 | 2.01k |       MonoDelta::FromMilliseconds(FLAGS_master_failover_catchup_timeout_ms));
 876 |       | 
 877 | 2.01k |   int64_t term_after_wait = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
 878 | 2.01k |   if (term_after_wait != term) {
 879 |       |     // If we got elected leader again while waiting to catch up then we will get another callback to
 880 |       |     // update state from sys_catalog, so bail now.
 881 |       |     //
 882 |       |     // If we failed when waiting, i.e. could not acquire a leader lease, this could be due to us
 883 |       |     // becoming a follower. If we're not partitioned away, we'll know about a new term soon.
 884 |     1 |     LOG_WITH_PREFIX(INFO)
 885 |     1 |         << "Term change from " << term << " to " << term_after_wait
 886 |     1 |         << " while waiting for master leader catchup. Not loading sys catalog metadata. "
 887 |     1 |         << "Status of waiting: " << s;
 888 |     1 |     return;
 889 |     1 |   }
 890 |       | 
 891 | 2.01k |   if (!s.ok()) {
 892 |       |     // This could happen e.g. if we are a partitioned-away leader that failed to acquire a leader
 893 |       |     // lease.
 894 |       |     //
 895 |       |     // TODO: handle this cleanly by transitioning to a follower without crashing.
 896 |     0 |     LOG_WITH_PREFIX(WARNING) << "Failed waiting for node to catch up after master election: " << s;
 897 |       | 
 898 |     0 |     if (s.IsTimedOut()) {
 899 |     0 |       LOG_WITH_PREFIX(FATAL) << "Shutting down due to unavailability of other masters after"
 900 |     0 |                              << " election. TODO: Abdicate instead.";
 901 |     0 |     }
 902 |     0 |     return;
 903 |     0 |   }
 904 |       | 
 905 | 2.01k |   LOG_WITH_PREFIX(INFO) << "Loading table and tablet metadata into memory for term " << term;
 906 | 2.01k |   LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + "Loading metadata into memory") {
 907 | 2.00k |     Status status = VisitSysCatalog(term);
 908 | 2.00k |     if (!status.ok()) {
 909 |     1 |       {
 910 |     1 |         std::lock_guard<simple_spinlock> l(state_lock_);
 911 |     1 |         if (state_ == kClosing) {
 912 |     0 |           LOG_WITH_PREFIX(INFO)
 913 |     0 |               << "Error loading sys catalog because shutdown is in progress. term " << term
 914 |     0 |               << " status : " << status;
 915 |     0 |           return;
 916 |     0 |         }
 917 |     1 |       }
 918 |     1 |       auto new_term = consensus->ConsensusState(CONSENSUS_CONFIG_ACTIVE).current_term();
 919 |     1 |       if (new_term != term) {
 920 |     0 |         LOG_WITH_PREFIX(INFO)
 921 |     0 |             << "Error loading sys catalog; but that's OK as term was changed from " << term
 922 |     0 |             << " to " << new_term << ": " << status;
 923 |     0 |         return;
 924 |     0 |       }
 925 |     1 |       LOG_WITH_PREFIX(FATAL) << "Failed to load sys catalog: " << status;
 926 |     1 |     }
 927 | 2.00k |   }
 928 |       | 
 929 | 2.01k |   {
 930 | 2.01k |     std::lock_guard<simple_spinlock> l(state_lock_);
 931 | 2.01k |     leader_ready_term_ = term;
 932 | 2.01k |     LOG_WITH_PREFIX(INFO) << "Completed load of sys catalog in term " << term;
 933 | 2.01k |   }
 934 | 2.01k |   SysCatalogLoaded(term);
 935 |       |   // Once we have loaded the SysCatalog, reset and regenerate the yql partitions table in order to
 936 |       |   // regenerate entries for previous tables.
 937 | 2.01k |   GetYqlPartitionsVtable().ResetAndRegenerateCache();
 938 | 2.01k | }
939
940
429
CHECKED_STATUS CatalogManager::WaitForWorkerPoolTests(const MonoDelta& timeout) const {
941
429
  if (!async_task_pool_->WaitFor(timeout)) {
942
0
    return STATUS(TimedOut, "Worker Pool hasn't finished processing tasks");
943
0
  }
944
429
  return Status::OK();
945
429
}
946
 947 |  2.00k | Status CatalogManager::VisitSysCatalog(int64_t term) {
 948 |        |   // Block new catalog operations, and wait for existing operations to finish.
 949 |  2.00k |   LOG_WITH_PREFIX_AND_FUNC(INFO)
 950 |  2.00k |       << "Wait on leader_lock_ for any existing operations to finish. Term: " << term;
 951 |  2.00k |   auto start = std::chrono::steady_clock::now();
 952 |  2.00k |   std::lock_guard<RWMutex> leader_lock_guard(leader_lock_);
 953 |  2.00k |   auto finish = std::chrono::steady_clock::now();
 954 |        |
 955 |  2.00k |   static const auto kLongLockAcquisitionLimit = RegularBuildVsSanitizers(100ms, 750ms);
 956 |  2.00k |   if (finish > start + kLongLockAcquisitionLimit) {
 957 |      0 |     LOG_WITH_PREFIX(WARNING) << "Long wait on leader_lock_: " << yb::ToString(finish - start);
 958 |      0 |   }
 959 |        |
 960 |  2.00k |   LOG_WITH_PREFIX(INFO)
 961 |  2.00k |       << __func__ << ": Acquire catalog manager lock_ before loading sys catalog.";
 962 |  2.00k |   LockGuard lock(mutex_);
 963 |      0 |   VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
 964 |        |
 965 |        |   // Abort any outstanding tasks. All TableInfos are orphaned below, so
 966 |        |   // it's important to end their tasks now; otherwise Shutdown() will
 967 |        |   // destroy master state used by these tasks.
 968 |  2.00k |   std::vector<scoped_refptr<TableInfo>> tables;
 969 |  2.00k |   AppendValuesFromMap(*table_ids_map_, &tables);
 970 |  2.00k |   AbortAndWaitForAllTasks(tables);
 971 |        |
 972 |        |   // Clear internal maps and run data loaders.
 973 |  2.00k |   RETURN_NOT_OK(RunLoaders(term));
 974 |        |
 975 |        |   // Prepare various default system configurations.
 976 |  2.00k |   RETURN_NOT_OK(PrepareDefaultSysConfig(term));
 977 |        |
 978 |  2.00k |   if ((FLAGS_use_initial_sys_catalog_snapshot || FLAGS_enable_ysql) &&
 979 |    451 |       !FLAGS_initial_sys_catalog_snapshot_path.empty() &&
 980 |    365 |       !FLAGS_create_initial_sys_catalog_snapshot) {
 981 |    365 |     if (!namespace_ids_map_.empty() || !system_tablets_.empty()) {
 982 |      4 |       LOG_WITH_PREFIX(INFO)
 983 |      4 |           << "This is an existing cluster, not initializing from a sys catalog snapshot.";
 984 |    361 |     } else {
 985 |    361 |       Result<bool> dir_exists =
 986 |    361 |           Env::Default()->DoesDirectoryExist(FLAGS_initial_sys_catalog_snapshot_path);
 987 |    361 |       if (dir_exists.ok() && *dir_exists) {
 988 |    361 |         bool initdb_was_already_done = false;
 989 |    361 |         {
 990 |    361 |           auto l = ysql_catalog_config_->LockForRead();
 991 |    361 |           initdb_was_already_done = l->pb.ysql_catalog_config().initdb_done();
 992 |    361 |         }
 993 |    361 |         if (initdb_was_already_done) {
 994 |      0 |           LOG_WITH_PREFIX(INFO)
 995 |      0 |               << "initdb has been run before, no need to restore sys catalog from "
 996 |      0 |               << "the initial snapshot";
 997 |    361 |         } else {
 998 |    361 |           LOG_WITH_PREFIX(INFO) << "Restoring snapshot in sys catalog";
 999 |    361 |           Status restore_status = RestoreInitialSysCatalogSnapshot(
1000 |    361 |               FLAGS_initial_sys_catalog_snapshot_path,
1001 |    361 |               sys_catalog_->tablet_peer().get(),
1002 |    361 |               term);
1003 |    361 |           if (!restore_status.ok()) {
1004 |      0 |             LOG_WITH_PREFIX(ERROR) << "Failed restoring snapshot in sys catalog";
1005 |      0 |             return restore_status;
1006 |      0 |           }
1007 |        |
1008 |    361 |           LOG_WITH_PREFIX(INFO) << "Re-initializing cluster config";
1009 |    361 |           cluster_config_.reset();
1010 |    361 |           RETURN_NOT_OK(PrepareDefaultClusterConfig(term));
1011 |        |
1012 |    361 |           LOG_WITH_PREFIX(INFO) << "Restoring snapshot completed, considering initdb finished";
1013 |    361 |           RETURN_NOT_OK(InitDbFinished(Status::OK(), term));
1014 |    361 |           RETURN_NOT_OK(RunLoaders(term));
1015 |    361 |         }
1016 |      0 |       } else {
1017 |      0 |         LOG_WITH_PREFIX(WARNING)
1018 |      0 |             << "Initial sys catalog snapshot directory does not exist: "
1019 |      0 |             << FLAGS_initial_sys_catalog_snapshot_path
1020 |      0 |             << (dir_exists.ok() ? "" : ", status: " + dir_exists.status().ToString());
1021 |      0 |       }
1022 |    361 |     }
1023 |    365 |   }
1024 |        |
1025 |        |   // Create the system namespaces (created only if they don't already exist).
1026 |  2.00k |   RETURN_NOT_OK(PrepareDefaultNamespaces(term));
1027 |        |
1028 |        |   // Create the system tables (created only if they don't already exist).
1029 |  2.00k |   RETURN_NOT_OK(PrepareSystemTables(term));
1030 |        |
1031 |        |   // Create the default cassandra (created only if they don't already exist).
1032 |  2.00k |   RETURN_NOT_OK(permissions_manager_->PrepareDefaultRoles(term));
1033 |        |
1034 |        |   // If this is the first time we start up, we have no config information as default. We write an
1035 |        |   // empty version 0.
1036 |  2.00k |   RETURN_NOT_OK(PrepareDefaultClusterConfig(term));
1037 |        |
1038 |  2.00k |   permissions_manager_->BuildRecursiveRoles();
1039 |        |
1040 |  2.00k |   if (FLAGS_enable_ysql) {
1041 |        |     // Number of TS to wait for before creating the txn table.
1042 |    450 |     auto wait_ts_count = std::max(FLAGS_txn_table_wait_min_ts_count, FLAGS_replication_factor);
1043 |        |
1044 |    450 |     LOG_WITH_PREFIX(INFO)
1045 |    450 |         << "YSQL is enabled, will create the transaction status table when "
1046 |    450 |         << wait_ts_count << " tablet servers are online";
1047 |    391 |     master_->ts_manager()->SetTSCountCallback(wait_ts_count, [this, wait_ts_count] {
1048 |    391 |       LOG_WITH_PREFIX(INFO)
1049 |    391 |           << wait_ts_count
1050 |    391 |           << " tablet servers registered, creating the transaction status table";
1051 |        |       // Retry table creation until it succeedes. It might fail initially because placement UUID
1052 |        |       // of live replicas is set through an RPC from YugaWare, and we won't be able to calculate
1053 |        |       // the number of primary (non-read-replica) tablet servers until that happens.
1054 |    398 |       while (true) {
1055 |    393 |         const auto s = CreateGlobalTransactionStatusTableIfNeeded(/* rpc */ nullptr);
1056 |    393 |         if (s.ok()) {
1057 |    386 |           break;
1058 |    386 |         }
1059 |      7 |         LOG_WITH_PREFIX(WARNING) << "Failed creating transaction status table, waiting: " << s;
1060 |      7 |         if (s.IsShutdownInProgress()) {
1061 |      0 |           return;
1062 |      0 |         }
1063 |      7 |         auto role = Role();
1064 |      7 |         if (role != PeerRole::LEADER) {
1065 |      0 |           LOG_WITH_PREFIX(WARNING)
1066 |      0 |               << "Cancel creating transaction because of role: " << PeerRole_Name(role);
1067 |      0 |           return;
1068 |      0 |         }
1069 |      7 |         SleepFor(MonoDelta::FromSeconds(1));
1070 |      7 |       }
1071 |    391 |       LOG_WITH_PREFIX(INFO) << "Finished creating transaction status table asynchronously";
1072 |    391 |     });
1073 |    450 |   }
1074 |        |
1075 |  2.00k |   if (!StartRunningInitDbIfNeeded(term)) {
1076 |        |     // If we are not running initdb, this is an existing cluster, and we need to check whether we
1077 |        |     // need to do a one-time migration to make YSQL system catalog tables transactional.
1078 |  2.00k |     RETURN_NOT_OK(MakeYsqlSysCatalogTablesTransactional(
1079 |  2.00k |       table_ids_map_.CheckOut().get_ptr(), sys_catalog_.get(), ysql_catalog_config_.get(), term));
1080 |  2.00k |   }
1081 |        |
1082 |  2.00k |   return Status::OK();
1083 |  2.00k | }
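
The SetTSCountCallback lambda above (lines 1047-1072) retries transaction-status-table creation until it succeeds, but gives up cleanly if the server is shutting down or is no longer the Raft leader. A minimal, self-contained sketch of that shape in standard C++; Role, RetryWhileLeader, and the simulated failure counter are illustrative stand-ins, not YugabyteDB APIs:

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <thread>

    enum class Role { kLeader, kFollower };

    // Retry 'attempt' until it succeeds, but cancel if leadership is lost.
    bool RetryWhileLeader(const std::function<bool()>& attempt,
                          const std::function<Role()>& role) {
      while (true) {
        if (attempt()) return true;                  // succeeded, stop retrying
        if (role() != Role::kLeader) return false;   // abdicated: cancel the work
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
      }
    }

    int main() {
      int failures_left = 3;  // simulate transient failures (e.g. placement UUID
                              // not known until an external RPC arrives)
      bool ok = RetryWhileLeader(
          [&] { return --failures_left < 0; },
          [] { return Role::kLeader; });
      std::cout << (ok ? "created" : "cancelled") << "\n";
    }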
1084 |        |
1085 |        | template <class Loader>
1086 |  18.9k | Status CatalogManager::Load(const std::string& title, const int64_t term) {
1087 |  18.9k |   LOG_WITH_PREFIX(INFO) << __func__ << ": Loading " << title << " into memory.";
1088 |  18.9k |   std::unique_ptr<Loader> loader = std::make_unique<Loader>(this, term);
1089 |  18.9k |   RETURN_NOT_OK_PREPEND(
1090 |  18.9k |       sys_catalog_->Visit(loader.get()),
1091 |  18.9k |       "Failed while visiting " + title + " in sys catalog");
1092 |  18.9k |   return Status::OK();
1093 |  18.9k | }
Per-instantiation coverage for CatalogManager::Load (source lines 1086-1093). The body is identical in each of the eight template instantiations, and every line within an instantiation carries the same count; the counts sum to approximately the 18.9k shown for the combined template above:

  Instantiation (Loader type) | Count
  ----------------------------+-------
  RoleLoader                  | 2.37k
  SysConfigLoader             | 2.37k
  TableLoader                 | 2.37k
  TabletLoader                | 2.37k
  NamespaceLoader             | 2.37k
  UDTypeLoader                | 2.37k
  ClusterConfigLoader         | 2.37k
  RedisConfigLoader           | 2.37k
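
Each loader above is a visitor over one sys-catalog entry type: Load() constructs the loader and hands it to a single SysCatalog::Visit() entry point. A minimal sketch of the pattern, with hypothetical stand-in types (Status, SysCatalog, TableLoader) rather than the real yb classes:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    struct Status {
      bool ok = true;
      std::string message;
    };

    // Stand-in for the sys catalog: it owns raw records and lets a visitor walk them.
    struct SysCatalog {
      std::vector<std::string> records;
      template <class Visitor>
      Status Visit(Visitor* v) {
        for (const auto& r : records) v->OnRecord(r);
        return Status{};
      }
    };

    // One loader per entry type; each rebuilds one in-memory map.
    struct TableLoader {
      void OnRecord(const std::string& r) { std::cout << "table: " << r << "\n"; }
    };

    template <class Loader>
    Status Load(SysCatalog* catalog, const std::string& title) {
      std::cout << "Loading " << title << " into memory.\n";
      auto loader = std::make_unique<Loader>();
      return catalog->Visit(loader.get());
    }

    int main() {
      SysCatalog catalog{{"t1", "t2"}};
      Load<TableLoader>(&catalog, "tables");
    }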
1094 |        |
1095 |  2.37k | Status CatalogManager::RunLoaders(int64_t term) {
1096 |        |   // Clear the table and tablet state.
1097 |  2.37k |   table_names_map_.clear();
1098 |  2.37k |   transaction_table_ids_set_.clear();
1099 |  2.37k |   auto table_ids_map_checkout = table_ids_map_.CheckOut();
1100 |  2.37k |   table_ids_map_checkout->clear();
1101 |        |
1102 |  2.37k |   auto tablet_map_checkout = tablet_map_.CheckOut();
1103 |  2.37k |   tablet_map_checkout->clear();
1104 |        |
1105 |        |   // Clear the namespace mappings.
1106 |  2.37k |   namespace_ids_map_.clear();
1107 |  2.37k |   namespace_names_mapper_.clear();
1108 |        |
1109 |        |   // Clear the type mappings.
1110 |  2.37k |   udtype_ids_map_.clear();
1111 |  2.37k |   udtype_names_map_.clear();
1112 |        |
1113 |        |   // Clear the current cluster config.
1114 |  2.37k |   cluster_config_.reset();
1115 |        |
1116 |        |   // Clear redis config mapping.
1117 |  2.37k |   redis_config_map_.clear();
1118 |        |
1119 |        |   // Clear ysql catalog config.
1120 |  2.37k |   ysql_catalog_config_.reset();
1121 |        |
1122 |        |   // Clear transaction tables config.
1123 |  2.37k |   transaction_tables_config_.reset();
1124 |        |
1125 |        |   // Clear recent tasks.
1126 |  2.37k |   tasks_tracker_->Reset();
1127 |        |
1128 |        |   // Clear recent jobs.
1129 |  2.37k |   jobs_tracker_->Reset();
1130 |        |
1131 |  2.37k |   std::vector<std::shared_ptr<TSDescriptor>> descs;
1132 |  2.37k |   master_->ts_manager()->GetAllDescriptors(&descs);
1133 |      3 |   for (const auto& ts_desc : descs) {
1134 |      3 |     ts_desc->set_has_tablet_report(false);
1135 |      3 |   }
1136 |        |
1137 |  2.37k |   {
1138 |  2.37k |     LockGuard lock(permissions_manager()->mutex());
1139 |        |
1140 |        |     // Clear the roles mapping.
1141 |  2.37k |     permissions_manager()->ClearRolesUnlocked();
1142 |  2.37k |     RETURN_NOT_OK(Load<RoleLoader>("roles", term));
1143 |  2.37k |     RETURN_NOT_OK(Load<SysConfigLoader>("sys config", term));
1144 |  2.37k |   }
1145 |        |   // Clear the hidden tablets vector.
1146 |  2.37k |   hidden_tablets_.clear();
1147 |        |
1148 |  2.37k |   RETURN_NOT_OK(Load<TableLoader>("tables", term));
1149 |  2.37k |   RETURN_NOT_OK(Load<TabletLoader>("tablets", term));
1150 |  2.37k |   RETURN_NOT_OK(Load<NamespaceLoader>("namespaces", term));
1151 |  2.37k |   RETURN_NOT_OK(Load<UDTypeLoader>("user-defined types", term));
1152 |  2.37k |   RETURN_NOT_OK(Load<ClusterConfigLoader>("cluster configuration", term));
1153 |  2.37k |   RETURN_NOT_OK(Load<RedisConfigLoader>("Redis config", term));
1154 |        |
1155 |  2.37k |   if (!transaction_tables_config_) {
1156 |  1.94k |     RETURN_NOT_OK(InitializeTransactionTablesConfig(term));
1157 |  1.94k |   }
1158 |        |
1159 |  2.37k |   return Status::OK();
1160 |  2.37k | }
1161 |        |
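RunLoaders() clears the copy-on-write maps through CheckOut() handles (table_ids_map_, tablet_map_) rather than mutating them in place, so concurrent readers keep a consistent snapshot while the leader rebuilds state. A hedged sketch of that CheckOut idiom, assuming a simplified CowMap that is not the real yb implementation:

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    template <class Map>
    class CowMap {
     public:
      // Readers grab an immutable snapshot; it stays valid as long as they hold it.
      std::shared_ptr<const Map> get() const {
        std::lock_guard<std::mutex> l(mu_);
        return current_;
      }

      class Checkout {
       public:
        explicit Checkout(CowMap* owner)
            : owner_(owner), copy_(std::make_shared<Map>(*owner->get())) {}
        Map* operator->() { return copy_.get(); }
        Map& operator*() { return *copy_; }
        ~Checkout() {  // Publish the mutated copy atomically.
          std::lock_guard<std::mutex> l(owner_->mu_);
          owner_->current_ = copy_;
        }
       private:
        CowMap* owner_;
        std::shared_ptr<Map> copy_;
      };

      Checkout CheckOut() { return Checkout(this); }

     private:
      mutable std::mutex mu_;
      std::shared_ptr<const Map> current_ = std::make_shared<Map>();
    };

    int main() {
      CowMap<std::map<std::string, int>> tables;
      auto snapshot = tables.get();         // reader snapshot
      {
        auto checkout = tables.CheckOut();  // writer works on a private copy
        checkout->clear();                  // like RunLoaders() clearing the maps
        (*checkout)["t1"] = 1;
      }                                     // copy published here
      // 'snapshot' still sees the old (empty) map; new readers see {"t1": 1}.
    }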
1162 |        | Status CatalogManager::CheckResource(
1163 |        |     const GrantRevokePermissionRequestPB* req,
1164 |    721 |     GrantRevokePermissionResponsePB* resp) {
1165 |    721 |   scoped_refptr<TableInfo> table;
1166 |        |
1167 |        |   // Checking if resources exist.
1168 |    721 |   if (req->resource_type() == ResourceType::TABLE ||
1169 |    522 |       req->resource_type() == ResourceType::KEYSPACE) {
1170 |        |     // We can't match Apache Cassandra's error because when a namespace is not provided, the error
1171 |        |     // is detected by the semantic analysis in PTQualifiedName::AnalyzeName.
1172 |    435 |     DCHECK(req->has_namespace_());
1173 |    435 |     const auto& namespace_info = req->namespace_();
1174 |    435 |     auto ns = FindNamespace(namespace_info);
1175 |        |
1176 |    435 |     if (req->resource_type() == ResourceType::KEYSPACE) {
1177 |    236 |       if (!ns.ok()) {
1178 |        |         // Matches Apache Cassandra's error.
1179 |      0 |         Status s = STATUS_SUBSTITUTE(
1180 |      0 |             NotFound, "Resource <keyspace $0> doesn't exist", namespace_info.name());
1181 |      0 |         return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
1182 |      0 |       }
1183 |    199 |     } else {
1184 |    199 |       if (ns.ok()) {
1185 |    199 |         CatalogManager::SharedLock l(mutex_);
1186 |    199 |         table = FindPtrOrNull(table_names_map_, {(**ns).id(), req->resource_name()});
1187 |    199 |       }
1188 |    199 |       if (table == nullptr) {
1189 |        |         // Matches Apache Cassandra's error.
1190 |      0 |         Status s = STATUS_SUBSTITUTE(
1191 |      0 |             NotFound, "Resource <object '$0.$1'> doesn't exist",
1192 |      0 |             namespace_info.name(), req->resource_name());
1193 |      0 |         return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
1194 |      0 |       }
1195 |    721 |     }
1196 |    435 |   }
1197 |    721 |   return Status::OK();
1198 |    721 | }
1199 |        |
1200 |  2.36k | Status CatalogManager::PrepareDefaultClusterConfig(int64_t term) {
1201 |  2.36k |   if (cluster_config_) {
1202 |    421 |     LOG_WITH_PREFIX(INFO)
1203 |    421 |         << "Cluster configuration has already been set up, skipping re-initialization.";
1204 |    421 |     return Status::OK();
1205 |    421 |   }
1206 |        |
1207 |        |   // Create default.
1208 |  1.94k |   SysClusterConfigEntryPB config;
1209 |  1.94k |   config.set_version(0);
1210 |        |
1211 |  1.94k |   std::string cluster_uuid_source;
1212 |  1.94k |   if (!FLAGS_cluster_uuid.empty()) {
1213 |      1 |     RETURN_NOT_OK(Uuid::FromString(FLAGS_cluster_uuid));
1214 |      0 |     config.set_cluster_uuid(FLAGS_cluster_uuid);
1215 |      0 |     cluster_uuid_source = "from the --cluster_uuid flag";
1216 |  1.94k |   } else {
1217 |  1.94k |     auto uuid = Uuid::Generate();
1218 |  1.94k |     config.set_cluster_uuid(uuid.ToString());
1219 |  1.94k |     cluster_uuid_source = "(randomly generated)";
1220 |  1.94k |   }
1221 |  1.94k |   LOG_WITH_PREFIX(INFO)
1222 |  1.94k |       << "Setting cluster UUID to " << config.cluster_uuid() << " " << cluster_uuid_source;
1223 |        |
1224 |        |   // Create in memory object.
1225 |  1.94k |   cluster_config_ = new ClusterConfigInfo();
1226 |        |
1227 |        |   // Prepare write.
1228 |  1.94k |   auto l = cluster_config_->LockForWrite();
1229 |  1.94k |   l.mutable_data()->pb = std::move(config);
1230 |        |
1231 |        |   // Write to sys_catalog and in memory.
1232 |  1.94k |   RETURN_NOT_OK(sys_catalog_->Upsert(term, cluster_config_));
1233 |  1.94k |   l.Commit();
1234 |        |
1235 |  1.94k |   return Status::OK();
1236 |  1.94k | }
1237 |        |
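PrepareDefaultClusterConfig() above, like PrepareDefaultSysConfig() and PrepareNamespace() below, follows one write protocol: lock the in-memory object for write, mutate only its dirty copy, persist via sys_catalog_->Upsert(), and commit the mutation last so readers never observe unpersisted state. A simplified sketch under stand-in types (ConfigInfo and UpsertToSysCatalog are illustrative, not the real API):

    #include <iostream>
    #include <mutex>
    #include <string>

    struct ConfigInfo {
      std::mutex mu;
      std::string committed;  // what readers see
      std::string dirty;      // what the writer is building
    };

    bool UpsertToSysCatalog(const std::string& payload) {
      std::cout << "persisted: " << payload << "\n";  // stand-in for a Raft write
      return true;
    }

    bool WriteConfig(ConfigInfo* info, const std::string& new_value) {
      std::lock_guard<std::mutex> l(info->mu);  // LockForWrite()
      info->dirty = new_value;                  // mutate the dirty copy only
      if (!UpsertToSysCatalog(info->dirty)) {
        info->dirty = info->committed;          // abort: readers unaffected
        return false;
      }
      info->committed = info->dirty;            // Commit(): now visible to readers
      return true;
    }

    int main() {
      ConfigInfo config;
      WriteConfig(&config, "cluster_uuid=1234");
      std::cout << config.committed << "\n";
    }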
1238 |  18.9k | std::vector<std::string> CatalogManager::GetMasterAddresses() {
1239 |  18.9k |   std::vector<std::string> result;
1240 |  18.9k |   consensus::ConsensusStatePB state;
1241 |  18.9k |   auto status = GetCurrentConfig(&state);
1242 |  18.9k |   if (!status.ok()) {
1243 |  11.6k |     LOG(WARNING) << "Failed to get current config: " << status;
1244 |  11.6k |     return result;
1245 |  11.6k |   }
1246 |  19.8k |   for (const auto& peer : state.config().peers()) {
1247 |  19.8k |     std::vector<std::string> peer_addresses;
1248 |  39.7k |     for (const auto& list : {peer.last_known_private_addr(), peer.last_known_broadcast_addr()}) {
1249 |  20.0k |       for (const auto& entry : list) {
1250 |  20.0k |         peer_addresses.push_back(HostPort::FromPB(entry).ToString());
1251 |  20.0k |       }
1252 |  39.7k |     }
1253 |  19.8k |     if (!peer_addresses.empty()) {
1254 |  19.8k |       result.push_back(JoinStrings(peer_addresses, ","));
1255 |  19.8k |     }
1256 |  19.8k |   }
1257 |  7.28k |   return result;
1258 |  7.28k | }
1259 |        |
1260 |  2.00k | Status CatalogManager::PrepareDefaultSysConfig(int64_t term) {
1261 |  2.00k |   {
1262 |  2.00k |     LockGuard lock(permissions_manager()->mutex());
1263 |  2.00k |     RETURN_NOT_OK(permissions_manager()->PrepareDefaultSecurityConfigUnlocked(term));
1264 |  2.00k |   }
1265 |        |
1266 |  2.00k |   if (!ysql_catalog_config_) {
1267 |  1.94k |     SysYSQLCatalogConfigEntryPB ysql_catalog_config;
1268 |  1.94k |     ysql_catalog_config.set_version(0);
1269 |        |
1270 |        |     // Create in memory objects.
1271 |  1.94k |     ysql_catalog_config_ = new SysConfigInfo(kYsqlCatalogConfigType);
1272 |        |
1273 |        |     // Prepare write.
1274 |  1.94k |     auto l = ysql_catalog_config_->LockForWrite();
1275 |  1.94k |     *l.mutable_data()->pb.mutable_ysql_catalog_config() = std::move(ysql_catalog_config);
1276 |        |
1277 |        |     // Write to sys_catalog and in memory.
1278 |  1.94k |     RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_));
1279 |  1.94k |     l.Commit();
1280 |  1.94k |   }
1281 |        |
1282 |  2.00k |   if (!transaction_tables_config_) {
1283 |      0 |     RETURN_NOT_OK(InitializeTransactionTablesConfig(term));
1284 |      0 |   }
1285 |        |
1286 |  2.00k |   return Status::OK();
1287 |  2.00k | }
1288 |        |
1289 |  2.00k | bool CatalogManager::StartRunningInitDbIfNeeded(int64_t term) {
1290 |  2.00k |   if (!ShouldAutoRunInitDb(ysql_catalog_config_.get(), pg_proc_exists_)) {
1291 |  2.00k |     return false;
1292 |  2.00k |   }
1293 |        |
1294 |      0 |   string master_addresses_str = MasterAddressesToString(
1295 |      0 |       *master_->opts().GetMasterAddresses());
1296 |        |
1297 |      0 |   initdb_future_ = std::async(std::launch::async, [this, master_addresses_str, term] {
1298 |      0 |     if (FLAGS_create_initial_sys_catalog_snapshot) {
1299 |      0 |       initial_snapshot_writer_.emplace();
1300 |      0 |     }
1301 |        |
1302 |      0 |     Status status = PgWrapper::InitDbForYSQL(
1303 |      0 |         master_addresses_str, "/tmp", master_->GetSharedMemoryFd());
1304 |        |
1305 |      0 |     if (FLAGS_create_initial_sys_catalog_snapshot && status.ok()) {
1306 |      0 |       Status write_snapshot_status = initial_snapshot_writer_->WriteSnapshot(
1307 |      0 |           sys_catalog_->tablet_peer()->tablet(),
1308 |      0 |           FLAGS_initial_sys_catalog_snapshot_path);
1309 |      0 |       if (!write_snapshot_status.ok()) {
1310 |      0 |         status = write_snapshot_status;
1311 |      0 |       }
1312 |      0 |     }
1313 |      0 |     Status finish_status = InitDbFinished(status, term);
1314 |      0 |     if (!finish_status.ok()) {
1315 |      0 |       if (status.ok()) {
1316 |      0 |         status = finish_status;
1317 |      0 |       }
1318 |      0 |       LOG_WITH_PREFIX(WARNING)
1319 |      0 |           << "Failed to set initdb as finished in sys catalog: " << finish_status;
1320 |      0 |     }
1321 |      0 |     return status;
1322 |      0 |   });
1323 |      0 |   return true;
1324 |      0 | }
1325 |        |
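StartRunningInitDbIfNeeded() launches initdb on a std::future, and CompleteShutdown() later polls it with wait_for(0s) before blocking. Those two pieces are standard C++; a self-contained sketch of the lifecycle (the lambda body is a stand-in for running initdb):

    #include <chrono>
    #include <future>
    #include <iostream>
    #include <optional>
    #include <thread>

    int main() {
      using namespace std::chrono_literals;
      std::optional<std::future<bool>> initdb_future;

      initdb_future = std::async(std::launch::async, [] {
        std::this_thread::sleep_for(100ms);  // stand-in for running initdb
        return true;                         // stand-in for a Status
      });

      // ... later, during shutdown: block only if the task is still running ...
      if (initdb_future && initdb_future->wait_for(0s) != std::future_status::ready) {
        std::cout << "initdb still running, waiting for it to complete\n";
        initdb_future->wait();
      }
      std::cout << "initdb result: " << initdb_future->get() << "\n";
    }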
1326 |  2.00k | Status CatalogManager::PrepareDefaultNamespaces(int64_t term) {
1327 |  2.00k |   RETURN_NOT_OK(PrepareNamespace(
1328 |  2.00k |       YQL_DATABASE_CQL, kSystemNamespaceName, kSystemNamespaceId, term));
1329 |  2.00k |   RETURN_NOT_OK(PrepareNamespace(
1330 |  2.00k |       YQL_DATABASE_CQL, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term));
1331 |  2.00k |   RETURN_NOT_OK(PrepareNamespace(
1332 |  2.00k |       YQL_DATABASE_CQL, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term));
1333 |  2.00k |   return Status::OK();
1334 |  2.00k | }
1335 |        |
1336 |  2.00k | Status CatalogManager::PrepareSystemTables(int64_t term) {
1337 |        |   // Prepare sys catalog table.
1338 |  2.00k |   RETURN_NOT_OK(PrepareSysCatalogTable(term));
1339 |        |
1340 |        |   // Create the required system tables here.
1341 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<PeersVTable>(
1342 |  2.00k |       kSystemPeersTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1343 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<LocalVTable>(
1344 |  2.00k |       kSystemLocalTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1345 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLKeyspacesVTable>(
1346 |  2.00k |       kSystemSchemaKeyspacesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1347 |  2.00k |       term)));
1348 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTablesVTable>(
1349 |  2.00k |       kSystemSchemaTablesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1350 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLColumnsVTable>(
1351 |  2.00k |       kSystemSchemaColumnsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1352 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLSizeEstimatesVTable>(
1353 |  2.00k |       kSystemSizeEstimatesTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1354 |        |
1355 |        |   // Empty tables.
1356 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAggregatesVTable>(
1357 |  2.00k |       kSystemSchemaAggregatesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1358 |  2.00k |       term)));
1359 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLFunctionsVTable>(
1360 |  2.00k |       kSystemSchemaFunctionsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId,
1361 |  2.00k |       term)));
1362 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLIndexesVTable>(
1363 |  2.00k |       kSystemSchemaIndexesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1364 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLTriggersVTable>(
1365 |  2.00k |       kSystemSchemaTriggersTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1366 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLViewsVTable>(
1367 |  2.00k |       kSystemSchemaViewsTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1368 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<QLTypesVTable>(
1369 |  2.00k |       kSystemSchemaTypesTableName, kSystemSchemaNamespaceName, kSystemSchemaNamespaceId, term)));
1370 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLPartitionsVTable>(
1371 |  2.00k |       kSystemPartitionsTableName, kSystemNamespaceName, kSystemNamespaceId, term)));
1372 |        |
1373 |        |   // System auth tables.
1374 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolesVTable>(
1375 |  2.00k |       kSystemAuthRolesTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId, term)));
1376 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthRolePermissionsVTable>(
1377 |  2.00k |       kSystemAuthRolePermissionsTableName, kSystemAuthNamespaceName, kSystemAuthNamespaceId,
1378 |  2.00k |       term)));
1379 |  2.00k |   RETURN_NOT_OK((PrepareSystemTableTemplate<YQLAuthResourceRolePermissionsIndexVTable>(
1380 |  2.00k |       kSystemAuthResourceRolePermissionsIndexTableName, kSystemAuthNamespaceName,
1381 |  2.00k |       kSystemAuthNamespaceId, term)));
1382 |        |
1383 |        |   // Ensure kNumSystemTables is in-sync with the system tables created.
1384 |      1 |   LOG_IF(DFATAL, system_tablets_.size() != kNumSystemTables)
1385 |      1 |       << "kNumSystemTables is " << kNumSystemTables << " but " << system_tablets_.size()
1386 |      1 |       << " tables were created";
1387 |        |
1388 |        |   // Cache the system.partitions tablet so we can access it in RebuildYQLSystemPartitions.
1389 |  2.00k |   RETURN_NOT_OK(GetYQLPartitionsVTable(&system_partitions_tablet_));
1390 |        |
1391 |  2.00k |   return Status::OK();
1392 |  2.00k | }
1393 |        |
1394 |  2.00k | Status CatalogManager::PrepareSysCatalogTable(int64_t term) {
1395 |        |   // Prepare sys catalog table info.
1396 |  2.00k |   auto sys_catalog_table_iter = table_ids_map_->find(kSysCatalogTableId);
1397 |  2.00k |   if (sys_catalog_table_iter == table_ids_map_->end()) {
1398 |  1.58k |     scoped_refptr<TableInfo> table = NewTableInfo(kSysCatalogTableId);
1399 |  1.58k |     table->mutable_metadata()->StartMutation();
1400 |  1.58k |     SysTablesEntryPB& metadata = table->mutable_metadata()->mutable_dirty()->pb;
1401 |  1.58k |     metadata.set_state(SysTablesEntryPB::RUNNING);
1402 |  1.58k |     metadata.set_namespace_id(kSystemSchemaNamespaceId);
1403 |  1.58k |     metadata.set_name(kSysCatalogTableName);
1404 |  1.58k |     metadata.set_table_type(TableType::YQL_TABLE_TYPE);
1405 |  1.58k |     SchemaToPB(*sys_catalog_->schema_, metadata.mutable_schema());
1406 |  1.58k |     metadata.set_version(0);
1407 |        |
1408 |  1.58k |     auto table_ids_map_checkout = table_ids_map_.CheckOut();
1409 |  1.58k |     sys_catalog_table_iter = table_ids_map_checkout->emplace(table->id(), table).first;
1410 |  1.58k |     table_names_map_[{kSystemSchemaNamespaceId, kSysCatalogTableName}] = table;
1411 |  1.58k |     table->set_is_system();
1412 |        |
1413 |  1.58k |     RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1414 |  1.58k |     table->mutable_metadata()->CommitMutation();
1415 |  1.58k |   }
1416 |        |
1417 |        |   // Prepare sys catalog tablet info.
1418 |  2.00k |   if (tablet_map_->count(kSysCatalogTabletId) == 0) {
1419 |  1.58k |     scoped_refptr<TableInfo> table = sys_catalog_table_iter->second;
1420 |  1.58k |     scoped_refptr<TabletInfo> tablet(new TabletInfo(table, kSysCatalogTabletId));
1421 |  1.58k |     tablet->mutable_metadata()->StartMutation();
1422 |  1.58k |     SysTabletsEntryPB& metadata = tablet->mutable_metadata()->mutable_dirty()->pb;
1423 |  1.58k |     metadata.set_state(SysTabletsEntryPB::RUNNING);
1424 |        |
1425 |  1.58k |     auto l = table->LockForRead();
1426 |  1.58k |     PartitionSchema partition_schema;
1427 |  1.58k |     RETURN_NOT_OK(PartitionSchema::FromPB(l->pb.partition_schema(),
1428 |  1.58k |                                           *sys_catalog_->schema_,
1429 |  1.58k |                                           &partition_schema));
1430 |  1.58k |     vector<Partition> partitions;
1431 |  1.58k |     RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
1432 |  1.58k |     partitions[0].ToPB(metadata.mutable_partition());
1433 |  1.58k |     metadata.set_table_id(table->id());
1434 |  1.58k |     metadata.add_table_ids(table->id());
1435 |        |
1436 |  1.58k |     table->set_is_system();
1437 |  1.58k |     table->AddTablet(tablet.get());
1438 |        |
1439 |  1.58k |     auto tablet_map_checkout = tablet_map_.CheckOut();
1440 |  1.58k |     (*tablet_map_checkout)[tablet->tablet_id()] = tablet;
1441 |        |
1442 |  1.58k |     RETURN_NOT_OK(sys_catalog_->Upsert(term, tablet));
1443 |  1.58k |     tablet->mutable_metadata()->CommitMutation();
1444 |  1.58k |   }
1445 |        |
1446 |  2.00k |   system_tablets_[kSysCatalogTabletId] = sys_catalog_->tablet_peer_->shared_tablet();
1447 |        |
1448 |  2.00k |   return Status::OK();
1449 |  2.00k | }
1450 |        |
1451 |        | template <class T>
1452 |        | Status CatalogManager::PrepareSystemTableTemplate(const TableName& table_name,
1453 |        |                                                   const NamespaceName& namespace_name,
1454 |        |                                                   const NamespaceId& namespace_id,
1455 |  32.1k |                                                   int64_t term) {
1456 |  32.1k |   YQLVirtualTable* vtable = new T(table_name, namespace_name, master_);
1457 |  32.1k |   return PrepareSystemTable(
1458 |  32.1k |       table_name, namespace_name, namespace_id, vtable->schema(), term, vtable);
1459 |  32.1k | }
Per-instantiation coverage for CatalogManager::PrepareSystemTableTemplate (source lines 1455-1459). The body is identical in each of the sixteen template instantiations, and every line within an instantiation carries the same count; the counts sum to approximately the 32.1k shown for the combined template above:

  Instantiation (vtable type)                | Count
  -------------------------------------------+-------
  PeersVTable                                | 2.00k
  LocalVTable                                | 2.00k
  YQLKeyspacesVTable                         | 2.00k
  YQLTablesVTable                            | 2.00k
  YQLColumnsVTable                           | 2.00k
  YQLSizeEstimatesVTable                     | 2.00k
  YQLAggregatesVTable                        | 2.00k
  YQLFunctionsVTable                         | 2.00k
  YQLIndexesVTable                           | 2.00k
  YQLTriggersVTable                          | 2.00k
  YQLViewsVTable                             | 2.00k
  QLTypesVTable                              | 2.00k
  YQLPartitionsVTable                        | 2.00k
  YQLAuthRolesVTable                         | 2.00k
  YQLAuthRolePermissionsVTable               | 2.00k
  YQLAuthResourceRolePermissionsIndexVTable  | 2.00k
1460 |        |
1461 |        | Status CatalogManager::PrepareSystemTable(const TableName& table_name,
1462 |        |                                           const NamespaceName& namespace_name,
1463 |        |                                           const NamespaceId& namespace_id,
1464 |        |                                           const Schema& schema,
1465 |        |                                           int64_t term,
1466 |  32.1k |                                           YQLVirtualTable* vtable) {
1467 |  32.1k |   std::unique_ptr<YQLVirtualTable> yql_storage(vtable);
1468 |        |
1469 |  32.1k |   scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_,
1470 |  32.1k |                                                  std::make_pair(namespace_id, table_name));
1471 |  32.1k |   bool create_table = true;
1472 |  32.1k |   if (table != nullptr) {
1473 |  6.73k |     LOG_WITH_PREFIX(INFO) << "Table " << namespace_name << "." << table_name << " already created";
1474 |        |
1475 |        |     // Mark the table as a system table.
1476 |  6.73k |     table->set_is_system();
1477 |        |
1478 |  6.73k |     Schema persisted_schema;
1479 |  6.73k |     RETURN_NOT_OK(table->GetSchema(&persisted_schema));
1480 |  6.73k |     if (!persisted_schema.Equals(schema)) {
1481 |      0 |       LOG_WITH_PREFIX(INFO)
1482 |      0 |           << "Updating schema of " << namespace_name << "." << table_name << " ...";
1483 |      0 |       auto l = table->LockForWrite();
1484 |      0 |       SchemaToPB(schema, l.mutable_data()->pb.mutable_schema());
1485 |      0 |       l.mutable_data()->pb.set_version(l->pb.version() + 1);
1486 |      0 |       l.mutable_data()->pb.set_updates_only_index_permissions(false);
1487 |        |
1488 |        |       // Update sys-catalog with the new table schema.
1489 |      0 |       RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1490 |      0 |       l.Commit();
1491 |      0 |     }
1492 |        |
1493 |        |     // There might have been a failure after writing the table but before writing the tablets. As
1494 |        |     // a result, if we don't find any tablets, we try to create the tablets only again.
1495 |  6.73k |     auto tablets = table->GetTablets();
1496 |  6.73k |     if (!tablets.empty()) {
1497 |        |       // Initialize the appropriate system tablet.
1498 |  6.73k |       DCHECK_EQ(1, tablets.size());
1499 |  6.73k |       auto tablet = tablets[0];
1500 |  6.73k |       system_tablets_[tablet->tablet_id()] =
1501 |  6.73k |           std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id());
1502 |  6.73k |       return Status::OK();
1503 |      1 |     } else {
1504 |        |       // Table is already created, only need to create tablets now.
1505 |      1 |       LOG_WITH_PREFIX(INFO)
1506 |      1 |           << "Creating tablets for " << namespace_name << "." << table_name << " ...";
1507 |      1 |       create_table = false;
1508 |      1 |     }
1509 |  6.73k |   }
1510 |        |
1511 |        |   // Create partitions.
1512 |  25.3k |   vector<Partition> partitions;
1513 |  25.3k |   PartitionSchemaPB partition_schema_pb;
1514 |  25.3k |   partition_schema_pb.set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA);
1515 |  25.3k |   PartitionSchema partition_schema;
1516 |  25.3k |   RETURN_NOT_OK(PartitionSchema::FromPB(partition_schema_pb, schema, &partition_schema));
1517 |  25.3k |   RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
1518 |        |
1519 |  25.3k |   TabletInfos tablets;
1520 |        |
1521 |  25.3k |   if (create_table) {
1522 |        |     // Fill in details for the system table.
1523 |  25.3k |     CreateTableRequestPB req;
1524 |  25.3k |     req.set_name(table_name);
1525 |  25.3k |     req.set_table_type(TableType::YQL_TABLE_TYPE);
1526 |        |
1527 |  25.3k |     RETURN_NOT_OK(CreateTableInMemory(
1528 |  25.3k |         req, schema, partition_schema, namespace_id, namespace_name,
1529 |  25.3k |         partitions, nullptr, &tablets, nullptr, &table));
1530 |        |     // Mark the table as a system table.
1531 |  25.3k |     LOG_WITH_PREFIX(INFO) << "Inserted new " << namespace_name << "." << table_name
1532 |  25.3k |                           << " table info into CatalogManager maps";
1533 |        |     // Update the on-disk table state to "running".
1534 |  25.3k |     table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
1535 |  25.3k |     RETURN_NOT_OK(sys_catalog_->Upsert(term, table));
1536 |  25.3k |     LOG_WITH_PREFIX(INFO) << "Wrote table to system catalog: " << ToString(table) << ", tablets: "
1537 |  25.3k |                           << ToString(tablets);
1538 |      1 |   } else {
1539 |        |     // Still need to create the tablets.
1540 |      1 |     tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, table));
1541 |      1 |   }
1542 |        |
1543 |  25.3k |   DCHECK_EQ(1, tablets.size());
1544 |        |   // We use LOG_ASSERT here since this is expected to crash in some unit tests.
1545 |  25.3k |   LOG_ASSERT(!FLAGS_TEST_catalog_manager_simulate_system_table_create_failure);
1546 |        |
1547 |        |   // Write Tablets to sys-tablets (in "running" state since we don't want the loadbalancer to
1548 |        |   // assign these tablets since this table is virtual).
1549 |  25.3k |   for (const auto& tablet : tablets) {
1550 |  25.3k |     tablet->mutable_metadata()->mutable_dirty()->pb.set_state(SysTabletsEntryPB::RUNNING);
1551 |  25.3k |   }
1552 |  25.3k |   RETURN_NOT_OK(sys_catalog_->Upsert(term, tablets));
1553 |  25.3k |   LOG_WITH_PREFIX(INFO) << "Wrote tablets to system catalog: " << ToString(tablets);
1554 |        |
1555 |        |   // Commit the in-memory state.
1556 |  25.3k |   if (create_table) {
1557 |  25.3k |     table->mutable_metadata()->CommitMutation();
1558 |  25.3k |   }
1559 |        |
1560 |  25.3k |   for (const auto& tablet : tablets) {
1561 |  25.3k |     tablet->mutable_metadata()->CommitMutation();
1562 |  25.3k |   }
1563 |        |   // Mark the table as a system table.
1564 |  25.3k |   table->set_is_system();
1565 |        |
1566 |        |   // Finally create the appropriate tablet object.
1567 |  25.3k |   auto tablet = tablets[0];
1568 |  25.3k |   system_tablets_[tablet->tablet_id()] =
1569 |  25.3k |       std::make_shared<SystemTablet>(schema, std::move(yql_storage), tablet->tablet_id());
1570 |  25.3k |   return Status::OK();
1571 |  25.3k | }
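
PrepareSystemTable() is deliberately idempotent: an existing table gets its schema reconciled, a table whose tablets were never written (a crash between the two sys-catalog writes) gets tablets only, and everything else is created fresh. A minimal sketch of that shape with stand-in types (Table and g_tables are hypothetical, not the real yb structures):

    #include <iostream>
    #include <map>
    #include <string>

    struct Table { std::string schema; bool has_tablet = false; };
    std::map<std::string, Table> g_tables;  // stand-in for table_names_map_

    void PrepareSystemTable(const std::string& name, const std::string& schema) {
      auto it = g_tables.find(name);
      if (it != g_tables.end()) {
        if (it->second.schema != schema) it->second.schema = schema;  // upgrade schema
        if (it->second.has_tablet) return;   // fully prepared: nothing to do
        it->second.has_tablet = true;        // crashed mid-way: finish tablets only
        return;
      }
      g_tables[name] = Table{schema, true};  // fresh create: table + tablet
    }

    int main() {
      PrepareSystemTable("system.peers", "v1");
      PrepareSystemTable("system.peers", "v2");  // re-run only reconciles schema
      std::cout << g_tables["system.peers"].schema << "\n";  // prints v2
    }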
1572 |        |
1573 |  56.0k | bool IsYcqlNamespace(const NamespaceInfo& ns) {
1574 |  56.0k |   return ns.database_type() == YQLDatabase::YQL_DATABASE_CQL;
1575 |  56.0k | }
1576 |        |
1577 |   935k | bool IsYcqlTable(const TableInfo& table) {
1578 |   935k |   return table.GetTableType() == TableType::YQL_TABLE_TYPE && table.id() != kSysCatalogTableId;
1579 |   935k | }
1580 |        |
1581 |        | Status CatalogManager::PrepareNamespace(
1582 |  6.02k |     YQLDatabase db_type, const NamespaceName& name, const NamespaceId& id, int64_t term) {
1583 |        |
1584 |  6.02k |   scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id);
1585 |  6.02k |   if (ns != nullptr) {
1586 |  1.26k |     LOG_WITH_PREFIX(INFO)
1587 |  1.26k |         << "Keyspace " << ns->ToString() << " already created, skipping initialization";
1588 |  1.26k |     return Status::OK();
1589 |  1.26k |   }
1590 |        |
1591 |        |   // Create entry.
1592 |  4.76k |   SysNamespaceEntryPB ns_entry;
1593 |  4.76k |   ns_entry.set_name(name);
1594 |  4.76k |   ns_entry.set_database_type(db_type);
1595 |  4.76k |   ns_entry.set_state(SysNamespaceEntryPB::RUNNING);
1596 |        |
1597 |        |   // Create in memory object.
1598 |  4.76k |   ns = new NamespaceInfo(id);
1599 |        |
1600 |        |   // Prepare write.
1601 |  4.76k |   auto l = ns->LockForWrite();
1602 |  4.76k |   l.mutable_data()->pb = std::move(ns_entry);
1603 |        |
1604 |  4.76k |   namespace_ids_map_[id] = ns;
1605 |  4.76k |   namespace_names_mapper_[db_type][l.mutable_data()->pb.name()] = ns;
1606 |        |
1607 |        |   // Write to sys_catalog and in memory.
1608 |  4.76k |   RETURN_NOT_OK(sys_catalog_->Upsert(term, ns));
1609 |  4.76k |   l.Commit();
1610 |        |
1611 |  4.76k |   LOG_WITH_PREFIX(INFO) << "Created default keyspace: " << ns->ToString();
1612 |  4.76k |   return Status::OK();
1613 |  4.76k | }
1614 |        |
1615 |  5.35k | Status CatalogManager::CheckLocalHostInMasterAddresses() {
1616 |  5.35k |   auto local_hostport = master_->first_rpc_address();
1617 |  5.35k |   std::vector<IpAddress> local_addrs;
1618 |        |
1619 |  5.35k |   if (local_hostport.address().is_unspecified()) {
1620 |      0 |     auto status = GetLocalAddresses(&local_addrs, AddressFilter::ANY);
1621 |      0 |     if (!status.ok() || local_addrs.empty()) {
1622 |      0 |       LOG(WARNING) << "Could not enumerate network interfaces due to " << status << ", found "
1623 |      0 |                    << local_addrs.size() << " local addresses.";
1624 |      0 |       return Status::OK();
1625 |      0 |     }
1626 |  5.35k |   } else {
1627 |  5.35k |     for (auto const &addr : master_->rpc_addresses()) {
1628 |  5.35k |       local_addrs.push_back(addr.address());
1629 |  5.35k |     }
1630 |  5.35k |   }
1631 |        |
1632 |  5.35k |   auto resolved_addresses = VERIFY_RESULT(server::ResolveMasterAddresses(
1633 |  5.35k |       *master_->opts().GetMasterAddresses()));
1634 |        |
1635 |  9.88k |   for (auto const &addr : resolved_addresses) {
1636 |  9.88k |     if (addr.address().is_unspecified() ||
1637 |  9.88k |         std::find(local_addrs.begin(), local_addrs.end(), addr.address()) !=
1638 |  5.35k |             local_addrs.end()) {
1639 |  5.35k |       return Status::OK();
1640 |  5.35k |     }
1641 |  9.88k |   }
1642 |      0 |   return STATUS_SUBSTITUTE(IllegalState,
1643 |  5.35k |       "None of the local addresses are present in master_addresses $0.",
1644 |  5.35k |       master_->opts().master_addresses_flag);
1645 |  5.35k | }
1646 |        |
1647 |  5.42k | Status CatalogManager::InitSysCatalogAsync() {
1648 |  5.42k |   LockGuard lock(mutex_);
1649 |        |
1650 |        |   // Optimistically try to load data from disk.
1651 |  5.42k |   Status s = sys_catalog_->Load(master_->fs_manager());
1652 |        |
1653 |  5.42k |   if (!s.ok() && s.IsNotFound()) {
1654 |        |     // We have yet to intialize the syscatalog metadata, need to create the metadata file.
1655 |  5.40k |     LOG(INFO) << "Did not find previous SysCatalogTable data on disk. " << s;
1656 |        |
1657 |  5.40k |     if (!master_->opts().AreMasterAddressesProvided()) {
1658 |     41 |       master_->SetShellMode(true);
1659 |     41 |       LOG(INFO) << "Starting master in shell mode.";
1660 |     41 |       return Status::OK();
1661 |     41 |     }
1662 |        |
1663 |  5.35k |     RETURN_NOT_OK(CheckLocalHostInMasterAddresses());
1664 |  5.35k |     RETURN_NOT_OK_PREPEND(sys_catalog_->CreateNew(master_->fs_manager()),
1665 |  5.35k |         Substitute("Encountered errors during system catalog initialization:"
1666 |  5.34k |                    "\n\tError on Load: $0\n\tError on CreateNew: ", s.ToString()));
1667 |        |
1668 |  5.34k |     return Status::OK();
1669 |     23 |   }
1670 |        |
1671 |     23 |   return s;
1672 |     23 | }
1673 |        |
1674 |  5.06M | bool CatalogManager::IsInitialized() const {
1675 |  5.06M |   std::lock_guard<simple_spinlock> l(state_lock_);
1676 |  5.06M |   return state_ == kRunning;
1677 |  5.06M | }
1678 |        |
1679 |        | // TODO - delete this API after HandleReportedTablet() usage is removed.
1680 |   260k | Status CatalogManager::CheckIsLeaderAndReady() const {
1681 |   260k |   std::lock_guard<simple_spinlock> l(state_lock_);
1682 |   260k |   if (PREDICT_FALSE(state_ != kRunning)) {
1683 |     27 |     return STATUS_SUBSTITUTE(ServiceUnavailable,
1684 |     27 |         "Catalog manager is shutting down. State: $0", state_);
1685 |     27 |   }
1686 |   260k |   string uuid = master_->fs_manager()->uuid();
1687 |   260k |   if (master_->opts().IsShellMode()) {
1688 |        |     // Consensus and other internal fields should not be checked when is shell mode.
1689 |      0 |     return STATUS_SUBSTITUTE(IllegalState,
1690 |      0 |         "Catalog manager of $0 is in shell mode, not the leader", uuid);
1691 |      0 |   }
1692 |   260k |   Consensus* consensus = tablet_peer()->consensus();
1693 |   260k |   if (consensus == nullptr) {
1694 |      0 |     return STATUS(IllegalState, "Consensus has not been initialized yet");
1695 |      0 |   }
1696 |   260k |   ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED);
1697 |   260k |   if (PREDICT_FALSE(!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid)) {
1698 |      0 |     return STATUS_SUBSTITUTE(IllegalState,
1699 |      0 |         "Not the leader. Local UUID: $0, Consensus state: $1", uuid, cstate.ShortDebugString());
1700 |      0 |   }
1701 |   260k |   if (PREDICT_FALSE(leader_ready_term_ != cstate.current_term())) {
1702 |      0 |     return STATUS_SUBSTITUTE(ServiceUnavailable,
1703 |      0 |         "Leader not yet ready to serve requests: ready term $0 vs cstate term $1",
1704 |      0 |         leader_ready_term_, cstate.current_term());
1705 |      0 |   }
1706 |   260k |   return Status::OK();
1707 |   260k | }
1708 |        |
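CheckIsLeaderAndReady() encodes a two-step gate: the master must hold Raft leadership and must also have finished loading the sys catalog for the current term (leader_ready_term_, set at line 931 above). A simplified restatement with stand-in types, not the real consensus API:

    #include <cstdint>
    #include <iostream>
    #include <string>

    struct ConsensusState { std::string leader_uuid; int64_t current_term; };

    bool CheckIsLeaderAndReady(const std::string& my_uuid,
                               const ConsensusState& cstate,
                               int64_t leader_ready_term) {
      if (cstate.leader_uuid != my_uuid) return false;  // not the leader at all
      if (leader_ready_term != cstate.current_term) {
        return false;  // won the election, but the catalog for this term is still loading
      }
      return true;
    }

    int main() {
      ConsensusState cstate{"master-1", 7};
      std::cout << CheckIsLeaderAndReady("master-1", cstate, 6) << "\n";  // 0: loading
      std::cout << CheckIsLeaderAndReady("master-1", cstate, 7) << "\n";  // 1: ready
    }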
1709 |  6.31M | std::shared_ptr<tablet::TabletPeer> CatalogManager::tablet_peer() const {
1710 |  6.31M |   return sys_catalog_->tablet_peer();
1711 |  6.31M | }
1712 |        |
1713 |  3.90M | PeerRole CatalogManager::Role() const {
1714 |  3.90M |   if (!IsInitialized() || master_->opts().IsShellMode()) {
1715 |    196 |     return PeerRole::NON_PARTICIPANT;
1716 |    196 |   }
1717 |        |
1718 |  3.90M |   return tablet_peer()->consensus()->role();
1719 |  3.90M | }
1720 |        |
1721 |    271 | bool CatalogManager::StartShutdown() {
1722 |    271 |   {
1723 |    271 |     std::lock_guard<simple_spinlock> l(state_lock_);
1724 |    271 |     if (state_ == kClosing) {
1725 |      0 |       VLOG(2) << "CatalogManager already shut down";
1726 |    177 |       return false;
1727 |    177 |     }
1728 |     94 |     state_ = kClosing;
1729 |     94 |   }
1730 |        |
1731 |     94 |   refresh_yql_partitions_task_.StartShutdown();
1732 |        |
1733 |     94 |   refresh_ysql_tablespace_info_task_.StartShutdown();
1734 |        |
1735 |     94 |   if (sys_catalog_) {
1736 |     94 |     sys_catalog_->StartShutdown();
1737 |     94 |   }
1738 |        |
1739 |     94 |   return true;
1740 |     94 | }
1741 |        |
1742 |     92 | void CatalogManager::CompleteShutdown() {
1743 |        |   // Shutdown the Catalog Manager background thread (load balancing).
1744 |     92 |   refresh_yql_partitions_task_.CompleteShutdown();
1745 |     92 |   refresh_ysql_tablespace_info_task_.CompleteShutdown();
1746 |        |
1747 |     92 |   if (background_tasks_) {
1748 |     75 |     background_tasks_->Shutdown();
1749 |     75 |   }
1750 |     92 |   if (background_tasks_thread_pool_) {
1751 |     92 |     background_tasks_thread_pool_->Shutdown();
1752 |     92 |   }
1753 |     92 |   if (leader_initialization_pool_) {
1754 |     92 |     leader_initialization_pool_->Shutdown();
1755 |     92 |   }
1756 |     92 |   if (async_task_pool_) {
1757 |     92 |     async_task_pool_->Shutdown();
1758 |     92 |   }
1759 |        |
1760 |        |   // Mark all outstanding table tasks as aborted and wait for them to fail.
1761 |        |   //
1762 |        |   // There may be an outstanding table visitor thread modifying the table map,
1763 |        |   // so we must make a copy of it before we iterate. It's OK if the visitor
1764 |        |   // adds more entries to the map even after we finish; it won't start any new
1765 |        |   // tasks for those entries.
1766 |     92 |   vector<scoped_refptr<TableInfo>> copy;
1767 |     92 |   {
1768 |     92 |     SharedLock lock(mutex_);
1769 |     92 |     AppendValuesFromMap(*table_ids_map_, &copy);
1770 |     92 |   }
1771 |     92 |   AbortAndWaitForAllTasks(copy);
1772 |        |
1773 |        |   // Shut down the underlying storage for tables and tablets.
1774 |     92 |   if (sys_catalog_) {
1775 |     92 |     sys_catalog_->CompleteShutdown();
1776 |     92 |   }
1777 |        |
1778 |        |   // Reset the jobs/tasks tracker.
1779 |     92 |   tasks_tracker_->Reset();
1780 |     92 |   jobs_tracker_->Reset();
1781 |        |
1782 |     92 |   if (initdb_future_ && initdb_future_->wait_for(0s) != std::future_status::ready) {
1783 |      0 |     LOG(WARNING) << "initdb is still running, waiting for it to complete.";
1784 |      0 |     initdb_future_->wait();
1785 |      0 |     LOG(INFO) << "Finished running initdb, proceeding with catalog manager shutdown.";
1786 |      0 |   }
1787 |     92 | }
1788 |        |
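CompleteShutdown() copies the table map under a short lock and then aborts tasks with the lock released, per the comment at lines 1760-1765. The idiom in isolation, with a stand-in TableInfo and std::mutex instead of the real locking types:

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <vector>

    struct TableInfo { void AbortTasksAndWait() { /* slow work */ } };

    std::mutex g_mu;
    std::map<std::string, std::shared_ptr<TableInfo>> g_tables;

    void AbortAllTasks() {
      std::vector<std::shared_ptr<TableInfo>> copy;
      {
        std::lock_guard<std::mutex> l(g_mu);           // short critical section
        for (auto& [id, t] : g_tables) copy.push_back(t);
      }
      for (auto& t : copy) t->AbortTasksAndWait();     // slow work, lock released
    }

    int main() {
      g_tables["t1"] = std::make_shared<TableInfo>();
      AbortAllTasks();
    }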
1789 |        | Status CatalogManager::AbortTableCreation(TableInfo* table,
1790 |        |                                           const TabletInfos& tablets,
1791 |        |                                           const Status& s,
1792 |      4 |                                           CreateTableResponsePB* resp) {
1793 |      4 |   LOG(WARNING) << s;
1794 |        |
1795 |      4 |   const TableId table_id = table->id();
1796 |      4 |   const TableName table_name = table->mutable_metadata()->mutable_dirty()->pb.name();
1797 |      4 |   const NamespaceId table_namespace_id =
1798 |      4 |       table->mutable_metadata()->mutable_dirty()->pb.namespace_id();
1799 |      4 |   vector<string> tablet_ids_to_erase;
1800 |      7 |   for (const auto& tablet : tablets) {
1801 |      7 |     tablet_ids_to_erase.push_back(tablet->tablet_id());
1802 |      7 |   }
1803 |        |
1804 |      4 |   LOG(INFO) << "Aborting creation of table '" << table_name << "', erasing table and tablets (" <<
1805 |      4 |       JoinStrings(tablet_ids_to_erase, ",") << ") from in-memory state.";
1806 |        |
1807 |        |   // Since this is a failed creation attempt, it's safe to just abort
1808 |        |   // all tasks, as (by definition) no tasks may be pending against a
1809 |        |   // table that has failed to successfully create.
1810 |      4 |   table->AbortTasksAndClose();
1811 |      4 |   table->WaitTasksCompletion();
1812 |        |
1813 |      4 |   LockGuard lock(mutex_);
1814 |        |
1815 |        |   // Call AbortMutation() manually, as otherwise the lock won't be released.
1816 |      7 |   for (const auto& tablet : tablets) {
1817 |      7 |     tablet->mutable_metadata()->AbortMutation();
1818 |      7 |   }
1819 |      4 |   table->mutable_metadata()->AbortMutation();
1820 |      4 |   auto tablet_map_checkout = tablet_map_.CheckOut();
1821 |      7 |   for (const TabletId& tablet_id_to_erase : tablet_ids_to_erase) {
1822 |      0 |     CHECK_EQ(tablet_map_checkout->erase(tablet_id_to_erase), 1)
1823 |      0 |         << "Unable to erase tablet " << tablet_id_to_erase << " from tablet map.";
1824 |      7 |   }
1825 |        |
1826 |      4 |   auto table_ids_map_checkout = table_ids_map_.CheckOut();
1827 |      4 |   table_names_map_.erase({table_namespace_id, table_name}); // Not present if PGSQL table.
1828 |      0 |   CHECK_EQ(table_ids_map_checkout->erase(table_id), 1)
1829 |      0 |       << "Unable to erase table with id " << table_id << " from table ids map.";
1830 |        |
1831 |      4 |   if (IsYcqlTable(*table)) {
1832 |      4 |     GetYqlPartitionsVtable().RemoveFromCache(table->id());
1833 |      4 |   }
1834 |      4 |   return CheckIfNoLongerLeaderAndSetupError(s, resp);
1835 |      4 | }
1836 |        |
1837 |        | Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo(
1838 |        |   const ReplicationInfoPB& table_replication_info,
1839 |  32.3k |   const TablespaceId& tablespace_id) {
1840 |        |
1841 |  32.3k |   if (IsReplicationInfoSet(table_replication_info)) {
1842 |        |     // The table has custom replication info set for it, return it if valid.
1843 |      5 |     RETURN_NOT_OK(ValidateTableReplicationInfo(table_replication_info));
1844 |      5 |     return table_replication_info;
1845 |  32.3k |   }
1846 |        |   // Table level replication info not set. Check whether the table is
1847 |        |   // associated with a tablespace and if so, return the tablespace
1848 |        |   // replication info.
1849 |  32.3k |   if (GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
1850 |  32.3k |     boost::optional<ReplicationInfoPB> tablespace_pb =
1851 |  32.3k |       VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id));
1852 |  32.3k |     if (tablespace_pb) {
1853 |        |       // Return the tablespace placement.
1854 |      0 |       return tablespace_pb.value();
1855 |      0 |     }
1856 |  32.3k |   }
1857 |        |
1858 |        |   // Neither table nor tablespace info set. Return cluster level replication info.
1859 |  32.3k |   auto l = cluster_config_->LockForRead();
1860 |  32.3k |   return l->pb.replication_info();
1861 |  32.3k | }
1862 |        |
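GetTableReplicationInfo() resolves placement with a fixed precedence: table-level info, then the table's tablespace, then the cluster config. The chain in miniature, using std::optional as a stand-in for the protobuf presence checks:

    #include <iostream>
    #include <optional>
    #include <string>

    std::string GetReplicationInfo(const std::optional<std::string>& table_info,
                                   const std::optional<std::string>& tablespace_info,
                                   const std::string& cluster_default) {
      if (table_info) return *table_info;            // custom per-table placement
      if (tablespace_info) return *tablespace_info;  // inherited from tablespace
      return cluster_default;                        // cluster-wide fallback
    }

    int main() {
      std::cout << GetReplicationInfo(std::nullopt, "us-east", "rf3") << "\n";     // us-east
      std::cout << GetReplicationInfo(std::nullopt, std::nullopt, "rf3") << "\n";  // rf3
    }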
1863 |   100k | std::shared_ptr<YsqlTablespaceManager> CatalogManager::GetTablespaceManager() const {
1864 |   100k |   SharedLock lock(tablespace_mutex_);
1865 |   100k |   return tablespace_manager_;
1866 |   100k | }
1867 |        |
1868 |        | Result<boost::optional<TablespaceId>> CatalogManager::GetTablespaceForTable(
1869 |      1 |     const scoped_refptr<TableInfo>& table) {
1870 |        |
1871 |      1 |   auto tablespace_manager = GetTablespaceManager();
1872 |      1 |   return tablespace_manager->GetTablespaceForTable(table);
1873 |      1 | }
1874 |        |
1875 |        | Result<boost::optional<ReplicationInfoPB>> CatalogManager::GetTablespaceReplicationInfoWithRetry(
1876 |  32.3k |   const TablespaceId& tablespace_id) {
1877 |        |
1878 |  32.3k |   auto tablespace_manager = GetTablespaceManager();
1879 |  32.3k |   auto replication_info_result = tablespace_manager->GetTablespaceReplicationInfo(tablespace_id);
1880 |        |
1881 |  32.3k |   if (replication_info_result) {
1882 |  32.3k |     return replication_info_result;
1883 |  32.3k |   }
1884 |        |
1885 |        |   // We failed to find the tablespace placement policy. Refresh the tablespace info and try again.
1886 |      0 |   auto tablespace_map = VERIFY_RESULT(GetYsqlTablespaceInfo());
1887 |        |
1888 |        |   // We clone the tablespace_manager and update the clone with the new tablespace_map that we
1889 |        |   // fetched above. We do this instead of updating the tablespace_manager object in-place because
1890 |        |   // other clients may have a shared_ptr to it through 'GetTablespaceManager()'.
1891 |      0 |   tablespace_manager = tablespace_manager->CreateCloneWithTablespaceMap(tablespace_map);
1892 |      0 |   {
1893 |      0 |     LockGuard lock(tablespace_mutex_);
1894 |      0 |     tablespace_manager_ = tablespace_manager;
1895 |      0 |   }
1896 |        |
1897 |      0 |   return tablespace_manager->GetTablespaceReplicationInfo(tablespace_id);
1898 |      0 | }
1899 |        |
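GetTablespaceReplicationInfoWithRetry() never mutates tablespace_manager_ in place, because callers of GetTablespaceManager() may still hold the shared_ptr; it clones, updates the clone, and swaps the pointer under the lock. A sketch of that clone-and-swap refresh with simplified stand-in types:

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    struct TablespaceManager {
      std::map<std::string, std::string> placement;  // tablespace id -> policy
      std::shared_ptr<TablespaceManager> CloneWith(
          std::map<std::string, std::string> fresh) const {
        auto clone = std::make_shared<TablespaceManager>(*this);
        clone->placement = std::move(fresh);
        return clone;
      }
    };

    std::mutex g_mutex;
    std::shared_ptr<TablespaceManager> g_manager = std::make_shared<TablespaceManager>();

    std::shared_ptr<TablespaceManager> GetTablespaceManager() {
      std::lock_guard<std::mutex> l(g_mutex);
      return g_manager;  // readers share the current immutable snapshot
    }

    void RefreshTablespaceMap(std::map<std::string, std::string> fresh) {
      auto updated = GetTablespaceManager()->CloneWith(std::move(fresh));
      std::lock_guard<std::mutex> l(g_mutex);
      g_manager = std::move(updated);  // swap; old snapshot stays valid for holders
    }

    int main() { RefreshTablespaceMap({{"ts1", "cloud1.region1"}}); }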
1900 |   153k | bool CatalogManager::IsReplicationInfoSet(const ReplicationInfoPB& replication_info) {
1901 |   153k |   const auto& live_placement_info = replication_info.live_replicas();
1902 |   153k |   if (!(live_placement_info.placement_blocks().empty() &&
1903 |   153k |         live_placement_info.num_replicas() <= 0 &&
1904 |   153k |         live_placement_info.placement_uuid().empty()) ||
1905 |   153k |       !replication_info.read_replicas().empty() ||
1906 |   153k |       !replication_info.affinitized_leaders().empty()) {
1907 |        |
1908 |     17 |       return true;
1909 |     17 |   }
1910 |   153k |   return false;
1911 |   153k | }
1912 |        |
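IsReplicationInfoSet() reads more easily after applying De Morgan's law: replication info is "set" when any live-placement field is populated, or when read replicas or affinitized leaders are present. A restatement with plain structs standing in for the protobufs:

    #include <iostream>
    #include <string>
    #include <vector>

    struct PlacementInfo {
      std::vector<std::string> placement_blocks;
      int num_replicas = 0;
      std::string placement_uuid;
    };
    struct ReplicationInfo {
      PlacementInfo live_replicas;
      std::vector<PlacementInfo> read_replicas;
      std::vector<std::string> affinitized_leaders;
    };

    bool IsReplicationInfoSet(const ReplicationInfo& r) {
      const auto& live = r.live_replicas;
      return !live.placement_blocks.empty() || live.num_replicas > 0 ||
             !live.placement_uuid.empty() || !r.read_replicas.empty() ||
             !r.affinitized_leaders.empty();
    }

    int main() {
      ReplicationInfo r;
      std::cout << IsReplicationInfoSet(r) << "\n";  // 0: nothing populated
      r.live_replicas.num_replicas = 3;
      std::cout << IsReplicationInfoSet(r) << "\n";  // 1
    }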
1913
9
Status CatalogManager::ValidateTableReplicationInfo(const ReplicationInfoPB& replication_info) {
1914
9
  if (!IsReplicationInfoSet(replication_info)) {
1915
0
    return STATUS(InvalidArgument, "No replication info set.");
1916
0
  }
1917
  // We don't support setting any fields other than live replica placements for now.
1918
9
  if (!replication_info.read_replicas().empty() ||
1919
9
      !replication_info.affinitized_leaders().empty()) {
1920
1921
0
      return STATUS(InvalidArgument, "Only live placement info can be set for table "
1922
0
          "level replication info.");
1923
0
  }
1924
  // Today we support setting table level replication info only in clusters where read replica
1925
  // placements are not set. Return an error if the cluster has read replica placements set.
1926
9
  auto l = cluster_config_->LockForRead();
1927
9
  const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info();
1928
  // TODO(bogdan): figure this out when we expand on geopartition support.
1929
  // if (!cluster_replication_info.read_replicas().empty() ||
1930
  //     !cluster_replication_info.affinitized_leaders().empty()) {
1931
1932
  //     return STATUS(InvalidArgument, "Setting table level replication info is not supported "
1933
  //         "for clusters with read replica placements");
1934
  // }
1935
  // If the replication info has placement_uuid set, verify that it matches the cluster
1936
  // placement_uuid.
1937
9
  if (replication_info.live_replicas().placement_uuid().empty()) {
1938
7
    return Status::OK();
1939
7
  }
1940
2
  if (replication_info.live_replicas().placement_uuid() !=
1941
0
      cluster_replication_info.live_replicas().placement_uuid()) {
1942
1943
0
      return STATUS(InvalidArgument, "Placement uuid for table level replication info "
1944
0
          "must match that of the cluster's live placement info.");
1945
0
  }
1946
2
  return Status::OK();
1947
2
}
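
// Per the checks above, table-level replication info passes validation only
// when it sets live replicas and nothing else, and any placement_uuid it
// carries must match the cluster's. A hedged sketch of a conforming message,
// using the same accessors the validator reads; the helper name is
// illustrative, not part of this codebase:
ReplicationInfoPB MakeTableLevelReplicationInfo(int num_replicas) {
  ReplicationInfoPB info;
  info.mutable_live_replicas()->set_num_replicas(num_replicas);
  // read_replicas and affinitized_leaders are deliberately left empty;
  // setting either would be rejected. placement_uuid may stay empty, or must
  // equal the cluster's live placement_uuid.
  return info;
}
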
1948
1949
533
Result<shared_ptr<TablespaceIdToReplicationInfoMap>> CatalogManager::GetYsqlTablespaceInfo() {
1950
533
  auto table_info = GetTableInfo(kPgTablespaceTableId);
1951
533
  if (table_info == nullptr) {
1952
50
    return STATUS(InternalError, "pg_tablespace table info not found");
1953
50
  }
1954
1955
483
  auto tablespace_map = VERIFY_RESULT(sys_catalog_->ReadPgTablespaceInfo());
1956
1957
  // The tablespace options do not usually contain the placement uuid.
1958
  // Populate the current cluster placement uuid into the placement information for
1959
  // each tablespace.
1960
483
  string placement_uuid;
1961
483
  {
1962
483
    auto l = cluster_config_->LockForRead();
1963
    // TODO(deepthi.srinivasan): Read-replica placements are not supported as
1964
    // of now.
1965
483
    placement_uuid = l->pb.replication_info().live_replicas().placement_uuid();
1966
483
  }
1967
483
  if (!placement_uuid.empty()) {
1968
2
    for (auto& iter : *tablespace_map) {
1969
2
      if (iter.second) {
1970
0
        iter.second.value().mutable_live_replicas()->set_placement_uuid(placement_uuid);
1971
0
      }
1972
2
    }
1973
1
  }
1974
1975
  // Before updating the tablespace placement map, validate the
1976
  // placement policies.
1977
966
  for (auto& iter : *tablespace_map) {
1978
966
    if (iter.second) {
1979
0
      RETURN_NOT_OK(ValidateTableReplicationInfo(iter.second.value()));
1980
0
    }
1981
966
  }
1982
1983
483
  return tablespace_map;
1984
483
}
1985
1986
boost::optional<TablespaceId> CatalogManager::GetTransactionStatusTableTablespace(
1987
2.13k
    const scoped_refptr<TableInfo>& table) {
1988
2.13k
  auto lock = table->LockForRead();
1989
2.13k
  if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) {
1990
0
    return boost::none;
1991
0
  }
1992
1993
2.13k
  if (!lock->pb.has_transaction_table_tablespace_id()) {
1994
2.13k
    return boost::none;
1995
2.13k
  }
1996
1997
0
  return lock->pb.transaction_table_tablespace_id();
1998
0
}
1999
2000
0
void CatalogManager::ClearTransactionStatusTableTablespace(const scoped_refptr<TableInfo>& table) {
2001
0
  auto lock = table->LockForWrite();
2002
0
  if (lock->pb.table_type() != TRANSACTION_STATUS_TABLE_TYPE) {
2003
0
    return;
2004
0
  }
2005
2006
0
  lock.mutable_data()->pb.clear_transaction_table_tablespace_id();
2007
0
  lock.mutable_data()->pb.set_version(lock.mutable_data()->pb.version() + 1);
2008
0
  lock.Commit();
2009
0
}
2010
2011
bool CatalogManager::CheckTransactionStatusTablesWithMissingTablespaces(
2012
483
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2013
483
  SharedLock lock(mutex_);
2014
122
  for (const auto& table_id : transaction_table_ids_set_) {
2015
122
    auto table = table_ids_map_->find(table_id);
2016
122
    if (table == table_ids_map_->end()) {
2017
0
      LOG(DFATAL) << "Table uuid " << table_id
2018
0
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
2019
0
      continue;
2020
0
    }
2021
122
    auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2022
122
    if (tablespace_id) {
2023
0
      if (!tablespace_info.count(*tablespace_id)) {
2024
0
        return true;
2025
0
      }
2026
0
    }
2027
122
  }
2028
483
  return false;
2029
483
}
2030
2031
Status CatalogManager::UpdateTransactionStatusTableTablespaces(
2032
483
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2033
483
  if (CheckTransactionStatusTablesWithMissingTablespaces(tablespace_info)) {
2034
0
    {
2035
0
      LockGuard lock(mutex_);
2036
0
      for (const auto& table_id : transaction_table_ids_set_) {
2037
0
        auto table = table_ids_map_->find(table_id);
2038
0
        if (table == table_ids_map_->end()) {
2039
0
          LOG(DFATAL) << "Table uuid " << table_id
2040
0
                      << " in transaction_table_ids_set_ but not in table_ids_map_";
2041
0
          continue;
2042
0
        }
2043
0
        auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2044
0
        if (tablespace_id) {
2045
0
          if (!tablespace_info.count(*tablespace_id)) {
2046
            // TODO: We should also delete the transaction table, see #11123.
2047
0
            LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id
2048
0
                      << " which doesn't exist, clearing tablespace id";
2049
0
            ClearTransactionStatusTableTablespace(table->second);
2050
0
          }
2051
0
        }
2052
0
      }
2053
0
    }
2054
2055
    // A tablespace id has been cleared, meaning a transaction table's placement has changed,
2056
    // and thus the transaction tables version needs to be incremented.
2057
0
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
2058
0
  }
2059
2060
483
  return Status::OK();
2061
483
}
2062
2063
Result<shared_ptr<TableToTablespaceIdMap>> CatalogManager::GetYsqlTableToTablespaceMap(
2064
0
    const TablespaceIdToReplicationInfoMap& tablespace_info) {
2065
0
  auto table_to_tablespace_map = std::make_shared<TableToTablespaceIdMap>();
2066
2067
  // First fetch all namespaces. This is because the table_to_tablespace information is only
2068
  // found in the pg_class catalog table. There exists a separate pg_class table in each
2069
  // namespace. To build in-memory state for all tables, process pg_class table for each
2070
  // namespace.
2071
0
  vector<NamespaceId> namespace_id_vec;
2072
0
  set<NamespaceId> colocated_namespaces;
2073
0
  {
2074
0
    SharedLock lock(mutex_);
2075
0
    for (const auto& ns : namespace_ids_map_) {
2076
0
      if (ns.second->database_type() != YQL_DATABASE_PGSQL) {
2077
0
        continue;
2078
0
      }
2079
2080
0
      if (ns.first == kPgSequencesDataNamespaceId) {
2081
        // Skip the database created for sequences system table.
2082
0
        continue;
2083
0
      }
2084
2085
0
      if (ns.second->colocated()) {
2086
0
        colocated_namespaces.insert(ns.first);
2087
0
      }
2088
2089
      // TODO (Deepthi): Investigate whether it is safe to skip template0 and template1 as well.
2090
0
      namespace_id_vec.emplace_back(ns.first);
2091
0
    }
2092
2093
    // Add local transaction tables corresponding to tablespaces.
2094
0
    for (const auto& table_id : transaction_table_ids_set_) {
2095
0
      auto table = table_ids_map_->find(table_id);
2096
0
      if (table == table_ids_map_->end()) {
2097
0
        LOG(DFATAL) << "Table uuid " << table_id
2098
0
                    << " in transaction_table_ids_set_ but not in table_ids_map_";
2099
0
        continue;
2100
0
      }
2101
0
      auto tablespace_id = GetTransactionStatusTableTablespace(table->second);
2102
0
      if (tablespace_id) {
2103
0
        if (tablespace_info.count(*tablespace_id)) {
2104
0
          (*table_to_tablespace_map)[table_id] = *tablespace_id;
2105
0
        } else {
2106
          // It's possible that a new tablespace had its transaction table created then deleted
2107
          // between when we checked tablespace ids and now; we ignore it here, and it will be
2108
          // caught and cleared in the next tablespace update.
2109
0
          LOG(INFO) << "Found transaction status table for tablespace id " << *tablespace_id
2110
0
                    << " which doesn't exist, ignoring";
2111
0
        }
2112
0
      }
2113
0
    }
2114
0
  }
2115
2116
  // For each namespace, fetch the table->tablespace information by reading pg_class
2117
  // table for each namespace.
2118
0
  for (const NamespaceId& nsid : namespace_id_vec) {
2119
0
    VLOG(1) << "Refreshing placement information for namespace " << nsid;
2120
0
    const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(nsid));
2121
0
    const bool is_colocated_database = colocated_namespaces.count(nsid) > 0;
2122
0
    Status table_tablespace_status = sys_catalog_->ReadPgClassInfo(database_oid,
2123
0
                                                                   is_colocated_database,
2124
0
                                                                   table_to_tablespace_map.get());
2125
0
    if (!table_tablespace_status.ok()) {
2126
0
      LOG(WARNING) << "Refreshing table->tablespace info failed for namespace "
2127
0
                   << nsid << " with error: " << table_tablespace_status.ToString();
2128
0
    }
2129
2130
0
    const bool pg_yb_tablegroup_exists = VERIFY_RESULT(DoesTableExist(FindTableById(
2131
0
      GetPgsqlTableId(database_oid, kPgYbTablegroupTableOid))));
2132
2133
    // No pg_yb_tablegroup means we only need to check pg_class.
2134
0
    if (table_tablespace_status.ok() && !pg_yb_tablegroup_exists) {
2135
0
      VLOG(5) << "Successfully refreshed placement information for namespace " << nsid
2136
0
              << " from pg_class";
2137
0
      continue;
2138
0
    }
2139
2140
0
    Status tablegroup_tablespace_status = sys_catalog_->ReadTablespaceInfoFromPgYbTablegroup(
2141
0
      database_oid,
2142
0
      table_to_tablespace_map.get());
2143
0
    if (!tablegroup_tablespace_status.ok()) {
2144
0
      LOG(WARNING) << "Refreshing tablegroup->tablespace info failed for namespace "
2145
0
                  << nsid << " with error: " << tablegroup_tablespace_status.ToString();
2146
0
    }
2147
0
    if (table_tablespace_status.ok() && tablegroup_tablespace_status.ok()) {
2148
0
      VLOG(5) << "Successfully refreshed placement information for namespace " << nsid
2149
0
              << " from pg_class and pg_yb_tablegroup";
2150
0
    }
2151
0
  }
2152
2153
0
  return table_to_tablespace_map;
2154
0
}
2155
2156
Status CatalogManager::CreateTransactionStatusTablesForTablespaces(
2157
    const TablespaceIdToReplicationInfoMap& tablespace_info,
2158
0
    const TableToTablespaceIdMap& table_to_tablespace_map) {
2159
0
  if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement) ||
2160
0
      !GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) {
2161
0
    return Status::OK();
2162
0
  }
2163
2164
0
  std::unordered_set<TablespaceId> valid_tablespaces;
2165
0
  for (const auto& entry : table_to_tablespace_map) {
2166
0
    if (entry.second) {
2167
0
      valid_tablespaces.insert(*entry.second);
2168
0
    }
2169
0
  }
2170
0
  for (const auto& entry : tablespace_info) {
2171
0
    if (!entry.second) {
2172
0
      valid_tablespaces.erase(entry.first);
2173
0
    }
2174
0
  }
2175
2176
0
  for (const auto& tablespace_id : valid_tablespaces) {
2177
0
    RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(nullptr /* rpc */, tablespace_id));
2178
0
  }
2179
2180
0
  return Status::OK();
2181
0
}
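
// The two loops above compute a set difference: tablespaces referenced by at
// least one table, minus tablespaces that have no placement attached. A
// standalone equivalent with illustrative types, mirroring the optional-valued
// maps in the same spirit (names are not from this codebase):
#include <map>
#include <optional>
#include <string>
#include <unordered_set>

std::unordered_set<std::string> ComputeValidTablespaces(
    const std::map<std::string, std::optional<std::string>>& table_to_space,
    const std::map<std::string, std::optional<int>>& space_to_placement) {
  std::unordered_set<std::string> valid;
  for (const auto& [table, space] : table_to_space) {
    if (space) valid.insert(*space);  // Referenced by some table.
  }
  for (const auto& [space, placement] : space_to_placement) {
    if (!placement) valid.erase(space);  // No placement: skip creation.
  }
  return valid;
}
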
2182
2183
17.2k
void CatalogManager::StartTablespaceBgTaskIfStopped() {
2184
17.2k
  if (GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs) <= 0 ||
2185
17.2k
      !GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
2186
    // The tablespace bg task is disabled. Nothing to do.
2187
0
    return;
2188
0
  }
2189
2190
17.2k
  const bool is_task_running = tablespace_bg_task_running_.exchange(true);
2191
17.2k
  if (is_task_running) {
2192
    // Task already running, nothing to do.
2193
16.8k
    return;
2194
16.8k
  }
2195
2196
414
  ScheduleRefreshTablespaceInfoTask(true /* schedule_now */);
2197
414
}
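
// The exchange(true) above is the usual "start at most once" guard for a
// self-rescheduling background task. A minimal standalone version; the names
// and ScheduleOnce() are illustrative, and the task must reset the flag on
// every exit path (as the refresh task does) or it can never be restarted:
#include <atomic>

std::atomic<bool> task_running{false};

void StartTaskIfStopped() {
  // exchange returns the previous value, so exactly one concurrent caller
  // observes false and becomes responsible for scheduling the task.
  if (task_running.exchange(true)) {
    return;  // Already running; nothing to do.
  }
  // ScheduleOnce();  // Illustrative: kick off the first iteration here.
}
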
2198
2199
947
void CatalogManager::ScheduleRefreshTablespaceInfoTask(const bool schedule_now) {
2200
947
  int wait_time = 0;
2201
2202
947
  if (!schedule_now) {
2203
533
    wait_time = GetAtomicFlag(&FLAGS_ysql_tablespace_info_refresh_secs);
2204
533
    if (wait_time <= 0) {
2205
      // The tablespace refresh task has been disabled.
2206
0
      tablespace_bg_task_running_ = false;
2207
0
      return;
2208
0
    }
2209
947
  }
2210
2211
947
  refresh_ysql_tablespace_info_task_.Schedule([this](const Status& status) {
2212
561
    Status s = background_tasks_thread_pool_->SubmitFunc(
2213
561
      std::bind(&CatalogManager::RefreshTablespaceInfoPeriodically, this));
2214
561
    if (!s.IsOk()) {
2215
      // Failed to submit task to the thread pool. Mark that the task is now
2216
      // no longer running.
2217
0
      LOG(WARNING) << "Failed to schedule: RefreshTablespaceInfoPeriodically";
2218
0
      tablespace_bg_task_running_ = false;
2219
0
    }
2220
561
  }, wait_time * 1s);
2221
947
}
2222
2223
560
void CatalogManager::RefreshTablespaceInfoPeriodically() {
2224
560
  if (!GetAtomicFlag(&FLAGS_enable_ysql_tablespaces_for_placement)) {
2225
0
    tablespace_bg_task_running_ = false;
2226
0
    return;
2227
0
  }
2228
2229
560
  if (!CheckIsLeaderAndReady().IsOk()) {
2230
27
    LOG(INFO) << "No longer the leader, so cancelling tablespace info task";
2231
27
    tablespace_bg_task_running_ = false;
2232
27
    return;
2233
27
  }
2234
2235
  // Refresh the tablespace info in memory.
2236
533
  Status s = DoRefreshTablespaceInfo();
2237
533
  if (!s.IsOk()) {
2238
50
    LOG(WARNING) << "Tablespace refresh task failed with error " << s.ToString();
2239
50
  }
2240
2241
  // Schedule the next iteration of the task.
2242
533
  ScheduleRefreshTablespaceInfoTask();
2243
533
}
2244
2245
468
Status CatalogManager::DoRefreshTablespaceInfo() {
2246
0
  VLOG(2) << "Running RefreshTablespaceInfoPeriodically task";
2247
2248
  // First refresh the tablespace info in memory.
2249
468
  auto tablespace_info = VERIFY_RESULT(GetYsqlTablespaceInfo());
2250
2251
  // Clear tablespace ids for transaction tables mapped to missing tablespaces.
2252
468
  RETURN_NOT_OK(UpdateTransactionStatusTableTablespaces(*tablespace_info));
2253
2254
468
  shared_ptr<TableToTablespaceIdMap> table_to_tablespace_map = nullptr;
2255
2256
468
  if (tablespace_info->size() > kYsqlNumDefaultTablespaces) {
2257
    // There exist custom tablespaces in the system. Fetch the table->tablespace
2258
    // map from PG catalog tables.
2259
0
    table_to_tablespace_map = VERIFY_RESULT(GetYsqlTableToTablespaceMap(*tablespace_info));
2260
0
  }
2261
2262
  // Update tablespace_manager_.
2263
468
  {
2264
468
    LockGuard lock(tablespace_mutex_);
2265
468
    tablespace_manager_ = std::make_shared<YsqlTablespaceManager>(tablespace_info,
2266
468
                                                                  table_to_tablespace_map);
2267
468
  }
2268
2269
468
  if (table_to_tablespace_map) {
2270
    // Trigger transaction table creates for tablespaces with tables and no transaction tables.
2271
0
    RETURN_NOT_OK(CreateTransactionStatusTablesForTablespaces(
2272
0
        *tablespace_info, *table_to_tablespace_map));
2273
0
  }
2274
2275
0
  VLOG(3) << "Refreshed tablespace information in memory";
2276
468
  return Status::OK();
2277
468
}
2278
2279
Status CatalogManager::AddIndexInfoToTable(const scoped_refptr<TableInfo>& indexed_table,
2280
                                           const IndexInfoPB& index_info,
2281
603
                                           CreateTableResponsePB* resp) {
2282
603
  LOG(INFO) << "AddIndexInfoToTable to " << indexed_table->ToString() << "  IndexInfo "
2283
603
            << yb::ToString(index_info);
2284
603
  TRACE("Locking indexed table");
2285
603
  auto l = DCHECK_NOTNULL(indexed_table)->LockForWrite();
2286
603
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
2287
2288
  // Make sure that the index appears to not have been added to the table until the tservers apply
2289
  // the alter and respond back.
2290
  // Heed issue #6233.
2291
603
  if (!l->pb.has_fully_applied_schema()) {
2292
591
    MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&l.mutable_data()->pb);
2293
591
  }
2294
2295
  // Add index info to indexed table and increment schema version.
2296
603
  auto& pb = l.mutable_data()->pb;
2297
603
  pb.add_indexes()->CopyFrom(index_info);
2298
603
  pb.set_version(l.mutable_data()->pb.version() + 1);
2299
603
  pb.set_updates_only_index_permissions(false);
2300
603
  l.mutable_data()->set_state(
2301
603
      SysTablesEntryPB::ALTERING,
2302
603
      Format("Add index info version=$0 ts=$1", pb.version(), LocalTimeAsString()));
2303
2304
  // Update sys-catalog with the new indexed table info.
2305
603
  TRACE("Updating indexed table metadata on disk");
2306
603
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table));
2307
2308
  // Update the in-memory state.
2309
603
  TRACE("Committing in-memory state");
2310
603
  l.Commit();
2311
2312
603
  RETURN_NOT_OK(SendAlterTableRequest(indexed_table));
2313
2314
603
  return Status::OK();
2315
603
}
2316
2317
Status CatalogManager::CreateCopartitionedTable(const CreateTableRequestPB& req,
2318
                                                CreateTableResponsePB* resp,
2319
                                                rpc::RpcContext* rpc,
2320
                                                Schema schema,
2321
0
                                                scoped_refptr<NamespaceInfo> ns) {
2322
0
  scoped_refptr<TableInfo> parent_table_info;
2323
0
  Status s;
2324
0
  PartitionSchema partition_schema;
2325
0
  std::vector<Partition> partitions;
2326
2327
0
  const NamespaceId& namespace_id = ns->id();
2328
0
  const NamespaceName& namespace_name = ns->name();
2329
2330
0
  LockGuard lock(mutex_);
2331
0
  TRACE("Acquired catalog manager lock");
2332
0
  parent_table_info = FindPtrOrNull(*table_ids_map_,
2333
0
                                    schema.table_properties().CopartitionTableId());
2334
0
  if (parent_table_info == nullptr) {
2335
0
    s = STATUS(NotFound, "The object does not exist: copartitioned table with id",
2336
0
               schema.table_properties().CopartitionTableId());
2337
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
2338
0
  }
2339
2340
0
  TableInfoPtr this_table_info;
2341
  // Verify that the table does not exist.
2342
0
  this_table_info = FindPtrOrNull(table_names_map_, {namespace_id, req.name()});
2343
2344
0
  if (this_table_info != nullptr) {
2345
0
    s = STATUS_SUBSTITUTE(AlreadyPresent,
2346
0
        "Object '$0.$1' already exists",
2347
0
        GetNamespaceNameUnlocked(this_table_info), this_table_info->name());
2348
0
    LOG(WARNING) << "Found table: " << this_table_info->ToStringWithState()
2349
0
                 << ". Failed creating copartitioned table with error: "
2350
0
                 << s.ToString() << " Request:\n" << req.DebugString();
2351
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
2352
0
  }
2353
  // Don't add copartitioned tables to Namespaces that aren't running.
2354
0
  if (ns->state() != SysNamespaceEntryPB::RUNNING) {
2355
0
    Status s = STATUS_SUBSTITUTE(TryAgain,
2356
0
        "Namespace not running (State=$0).  Cannot create $1.$2",
2357
0
        ns->state(), ns->name(), req.name());
2358
0
    return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
2359
0
  }
2360
2361
  // TODO: pass index_info for copartitioned index.
2362
0
  RETURN_NOT_OK(CreateTableInMemory(
2363
0
      req, schema, partition_schema, namespace_id, namespace_name,
2364
0
      partitions, nullptr, nullptr, resp, &this_table_info));
2365
2366
0
  TRACE("Inserted new table info into CatalogManager maps");
2367
2368
  // NOTE: the table is already locked for write at this point,
2369
  // since the CreateTableInfo function leaves it in that state.
2370
  // It will get committed at the end of this function.
2371
  // Sanity check: the table should be in "preparing" state.
2372
0
  CHECK_EQ(SysTablesEntryPB::PREPARING, this_table_info->metadata().dirty().pb.state());
2373
0
  TabletInfos tablets = parent_table_info->GetTablets();
2374
0
  for (auto tablet : tablets) {
2375
0
    tablet->mutable_metadata()->StartMutation();
2376
0
    tablet->mutable_metadata()->mutable_dirty()->pb.add_table_ids(this_table_info->id());
2377
0
  }
2378
2379
  // Update Tablets about new table id to sys-tablets.
2380
0
  s = sys_catalog_->Upsert(leader_ready_term(), tablets);
2381
0
  if (PREDICT_FALSE(!s.ok())) {
2382
0
    return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend(
2383
0
        Substitute("An error occurred while inserting to sys-tablets: $0", s.ToString())), resp);
2384
0
  }
2385
0
  TRACE("Wrote tablets to system table");
2386
2387
  // Update the on-disk table state to "running".
2388
0
  this_table_info->AddTablets(tablets);
2389
0
  this_table_info->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
2390
0
  s = sys_catalog_->Upsert(leader_ready_term(), this_table_info);
2391
0
  if (PREDICT_FALSE(!s.ok())) {
2392
0
    return AbortTableCreation(this_table_info.get(), tablets, s.CloneAndPrepend(
2393
0
        Substitute("An error occurred while inserting to sys-tablets: $0",
2394
0
                   s.ToString())), resp);
2395
0
  }
2396
0
  TRACE("Wrote table to system table");
2397
2398
  // Commit the in-memory state.
2399
0
  this_table_info->mutable_metadata()->CommitMutation();
2400
2401
0
  for (const auto& tablet : tablets) {
2402
0
    tablet->mutable_metadata()->CommitMutation();
2403
0
  }
2404
2405
0
  for (const auto& tablet : tablets) {
2406
0
    SendCopartitionTabletRequest(tablet, this_table_info);
2407
0
  }
2408
2409
0
  LOG(INFO) << "Successfully created table " << this_table_info->ToString()
2410
0
            << " per request from " << RequestorString(rpc);
2411
0
  return Status::OK();
2412
0
}
2413
2414
2415
template <class Req, class Resp, class Action>
2416
0
Status CatalogManager::PerformOnSysCatalogTablet(const Req& req, Resp* resp, const Action& action) {
2417
0
  auto tablet_peer = sys_catalog_->tablet_peer();
2418
0
  auto shared_tablet = tablet_peer ? tablet_peer->shared_tablet() : nullptr;
2419
0
  if (!shared_tablet) {
2420
0
    return SetupError(
2421
0
        resp->mutable_error(),
2422
0
        MasterErrorPB::TABLET_NOT_RUNNING,
2423
0
        STATUS(NotFound, "The sys catalog tablet was not found."));
2424
0
  }
2425
2426
0
  auto s = action(shared_tablet);
2427
0
  if (!s.ok()) {
2428
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
2429
0
  }
2430
2431
0
  return Status::OK();
2432
0
}
Unexecuted instantiation: catalog_manager.cc:_ZN2yb6master14CatalogManager25PerformOnSysCatalogTabletIPKNS0_24FlushSysCatalogRequestPBENS0_25FlushSysCatalogResponsePBEZNS1_15FlushSysCatalogES5_PS6_PNS_3rpc10RpcContextEE3$_4EENS_6StatusERKT_PT0_RKT1_
Unexecuted instantiation: catalog_manager.cc:_ZN2yb6master14CatalogManager25PerformOnSysCatalogTabletIPKNS0_26CompactSysCatalogRequestPBENS0_27CompactSysCatalogResponsePBEZNS1_17CompactSysCatalogES5_PS6_PNS_3rpc10RpcContextEE3$_5EENS_6StatusERKT_PT0_RKT1_
2433
2434
Status CatalogManager::FlushSysCatalog(
2435
    const FlushSysCatalogRequestPB* req,
2436
    FlushSysCatalogResponsePB* resp,
2437
0
    rpc::RpcContext* context) {
2438
0
  return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) {
2439
0
    return shared_tablet->Flush(tablet::FlushMode::kSync);
2440
0
  });
2441
0
}
2442
2443
Status CatalogManager::CompactSysCatalog(
2444
    const CompactSysCatalogRequestPB* req,
2445
    CompactSysCatalogResponsePB* resp,
2446
0
    rpc::RpcContext* context) {
2447
0
  return PerformOnSysCatalogTablet(req, resp, [](auto shared_tablet) {
2448
0
    return shared_tablet->ForceFullRocksDBCompact();
2449
0
  });
2450
0
}
2451
2452
namespace {
2453
2454
Result<std::array<PartitionPB, kNumSplitParts>> CreateNewTabletsPartition(
2455
45
    const TabletInfo& tablet_info, const std::string& split_partition_key) {
2456
45
  const auto& source_partition = tablet_info.LockForRead()->pb.partition();
2457
2458
45
  if (split_partition_key <= source_partition.partition_key_start() ||
2459
45
      (!source_partition.partition_key_end().empty() &&
2460
32
       split_partition_key >= source_partition.partition_key_end())) {
2461
0
    return STATUS_FORMAT(
2462
0
        InvalidArgument,
2463
0
        "Can't split tablet $0 (partition_key_start: $1 partition_key_end: $2) by partition "
2464
0
        "boundary (split_key: $3)",
2465
0
        tablet_info.tablet_id(), source_partition.partition_key_start(),
2466
0
        source_partition.partition_key_end(), split_partition_key);
2467
0
  }
2468
2469
45
  std::array<PartitionPB, kNumSplitParts> new_tablets_partition;
2470
2471
45
  new_tablets_partition.fill(source_partition);
2472
2473
45
  new_tablets_partition[0].set_partition_key_end(split_partition_key);
2474
45
  new_tablets_partition[1].set_partition_key_start(split_partition_key);
2475
45
  static_assert(kNumSplitParts == 2, "We expect tablet to be split into 2 new tablets here");
2476
2477
45
  return new_tablets_partition;
2478
45
}
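
// Worked through with illustrative values: starting from a source partition
// ["a", "z") and split key "m", the fill-then-adjust above yields ["a", "m")
// and ["m", "z"), so the two children exactly cover the parent's range. A
// self-contained analogue; Range and SplitRange are not from this codebase:
#include <array>
#include <cassert>
#include <string>

struct Range { std::string start, end; };  // Empty end means unbounded.

std::array<Range, 2> SplitRange(const Range& src, const std::string& key) {
  // Mirrors the boundary check above: the key must fall strictly inside.
  assert(key > src.start && (src.end.empty() || key < src.end));
  std::array<Range, 2> children{src, src};  // Both start as copies of src.
  children[0].end = key;    // Left child:  [src.start, key)
  children[1].start = key;  // Right child: [key, src.end)
  return children;
}
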
2479
2480
}  // namespace
2481
2482
CHECKED_STATUS CatalogManager::TEST_SplitTablet(
2483
    const TabletId& tablet_id, const std::string& split_encoded_key,
2484
0
    const std::string& split_partition_key) {
2485
0
  auto source_tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id));
2486
0
  return DoSplitTablet(source_tablet_info, split_encoded_key, split_partition_key,
2487
0
      true /* select_all_tablets_for_split */);
2488
0
}
2489
2490
Status CatalogManager::TEST_SplitTablet(
2491
0
    const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code) {
2492
0
  return DoSplitTablet(source_tablet_info, split_hash_code,
2493
0
      true /* select_all_tablets_for_split */);
2494
0
}
2495
2496
0
Status CatalogManager::TEST_IncrementTablePartitionListVersion(const TableId& table_id) {
2497
0
  auto table_info = GetTableInfo(table_id);
2498
0
  SCHECK(table_info != nullptr, NotFound, Format("Table $0 not found", table_id));
2499
2500
0
  LockGuard lock(mutex_);
2501
0
  auto table_lock = table_info->LockForWrite();
2502
0
  auto& table_pb = table_lock.mutable_data()->pb;
2503
0
  table_pb.set_partition_list_version(table_pb.partition_list_version() + 1);
2504
0
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table_info));
2505
0
  table_lock.Commit();
2506
0
  return Status::OK();
2507
0
}
2508
2509
Result<ReplicationInfoPB> CatalogManager::GetTableReplicationInfo(
2510
81
    const TabletInfo& tablet_info) const {
2511
81
  auto table = tablet_info.table();
2512
81
  {
2513
81
    auto table_lock = table->LockForRead();
2514
81
    if (table_lock->pb.has_replication_info()) {
2515
0
      return table_lock->pb.replication_info();
2516
0
    }
2517
81
  }
2518
2519
81
  auto replication_info_opt = VERIFY_RESULT(
2520
81
      GetTablespaceManager()->GetTableReplicationInfo(table));
2521
81
  if (replication_info_opt) {
2522
0
    return replication_info_opt.value();
2523
0
  }
2524
2525
81
  return cluster_config_->LockForRead()->pb.replication_info();
2526
81
}
2527
2528
bool CatalogManager::ShouldSplitValidCandidate(
2529
545k
    const TabletInfo& tablet_info, const TabletReplicaDriveInfo& drive_info) const {
2530
545k
  if (drive_info.may_have_orphaned_post_split_data) {
2531
87.0k
    return false;
2532
87.0k
  }
2533
458k
  ssize_t size = drive_info.sst_files_size;
2534
0
  DCHECK(size >= 0) << "Detected overflow in casting sst_files_size to signed int.";
2535
458k
  if (size < FLAGS_tablet_split_low_phase_size_threshold_bytes) {
2536
458k
    return false;
2537
458k
  }
2538
81
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
2539
2540
81
  size_t num_servers = 0;
2541
81
  auto table_replication_info_or_status = GetTableReplicationInfo(tablet_info);
2542
2543
  // If there is custom placement information present then
2544
  // only count the tservers which the table has access to
2545
  // according to the placement policy
2546
81
  if (table_replication_info_or_status.ok()
2547
81
      && table_replication_info_or_status->has_live_replicas()) {
2548
0
    auto pb = table_replication_info_or_status->live_replicas();
2549
0
    auto valid_tservers_res = FindTServersForPlacementInfo(
2550
0
      table_replication_info_or_status->live_replicas(), ts_descs);
2551
0
    if (!valid_tservers_res.ok()) {
2552
0
      num_servers = ts_descs.size();
2553
0
    } else {
2554
0
      num_servers = valid_tservers_res.get().size();
2555
0
    }
2556
81
  } else {
2557
81
    num_servers = ts_descs.size();
2558
81
  }
2559
2560
81
  int64 num_tablets_per_server = tablet_info.table()->NumPartitions() / num_servers;
2561
2562
81
  if (num_tablets_per_server < FLAGS_tablet_split_low_phase_shard_count_per_node) {
2563
0
    return size > FLAGS_tablet_split_low_phase_size_threshold_bytes;
2564
0
  }
2565
81
  if (num_tablets_per_server < FLAGS_tablet_split_high_phase_shard_count_per_node) {
2566
0
    return size > FLAGS_tablet_split_high_phase_size_threshold_bytes;
2567
0
  }
2568
81
  return size > FLAGS_tablet_force_split_threshold_bytes;
2569
81
}
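
// The decision above is tiered: the fewer tablets a table already has per
// server, the smaller the size threshold that triggers a split. A standalone
// sketch with placeholder thresholds; the real values come from the
// FLAGS_tablet_split_* flags, not from these constants:
#include <cstdint>

bool ShouldSplitSketch(int64_t sst_bytes, int64_t tablets_per_server) {
  constexpr int64_t kLowPhaseShards = 8;           // Placeholder.
  constexpr int64_t kHighPhaseShards = 24;         // Placeholder.
  constexpr int64_t kLowPhaseBytes = 512LL << 20;  // 512 MiB, placeholder.
  constexpr int64_t kHighPhaseBytes = 10LL << 30;  // 10 GiB, placeholder.
  constexpr int64_t kForceBytes = 100LL << 30;     // 100 GiB, placeholder.
  if (tablets_per_server < kLowPhaseShards) {
    return sst_bytes > kLowPhaseBytes;   // Low phase: split aggressively.
  }
  if (tablets_per_server < kHighPhaseShards) {
    return sst_bytes > kHighPhaseBytes;  // High phase: split larger tablets.
  }
  return sst_bytes > kForceBytes;        // Beyond that, only force-split.
}
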
2570
2571
Status CatalogManager::DoSplitTablet(
2572
    const scoped_refptr<TabletInfo>& source_tablet_info, std::string split_encoded_key,
2573
45
    std::string split_partition_key, bool select_all_tablets_for_split) {
2574
45
  auto source_table_lock = source_tablet_info->table()->LockForWrite();
2575
45
  auto source_tablet_lock = source_tablet_info->LockForWrite();
2576
2577
  // We must re-validate the split candidate here *after* grabbing locks on the table and tablet to
2578
  // ensure a backfill does not happen before we modify catalog metadata to include new subtablets.
2579
  // This process adds new subtablets in the CREATING state, which if encountered by backfill code
2580
  // will block the backfill process.
2581
45
  RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTable(*source_tablet_info->table()));
2582
45
  RETURN_NOT_OK(tablet_split_manager_.ValidateSplitCandidateTablet(*source_tablet_info));
2583
2584
45
  auto drive_info = VERIFY_RESULT(source_tablet_info->GetLeaderReplicaDriveInfo());
2585
45
  if (!select_all_tablets_for_split &&
2586
35
      !ShouldSplitValidCandidate(*source_tablet_info, drive_info)) {
2587
    // It is possible that we queued up a split candidate in TabletSplitManager which was, at the
2588
    // time, a valid split candidate, but by the time the candidate was actually processed here, the
2589
    // cluster may have changed, putting us in a new split threshold phase, and it may no longer be
2590
    // a valid candidate. This is not an unexpected error, but we should bail out of splitting this
2591
    // tablet regardless.
2592
0
    return STATUS_FORMAT(
2593
0
        InvalidArgument,
2594
0
        "Tablet split candidate $0 is no longer a valid split candidate.",
2595
0
        source_tablet_info->tablet_id());
2596
0
  }
2597
2598
  // Check if at least one child tablet is already registered.
2599
45
  if (source_tablet_lock->pb.split_tablet_ids().size() > 0) {
2600
27
    const auto child_tablet_id = source_tablet_lock->pb.split_tablet_ids(0);
2601
27
    const auto child_tablet = VERIFY_RESULT(GetTabletInfo(child_tablet_id));
2602
27
    const auto parent_partition = source_tablet_lock->pb.partition();
2603
27
    const auto child_partition = child_tablet->LockForRead()->pb.partition();
2604
2605
27
    if (parent_partition.partition_key_start() == child_partition.partition_key_start()) {
2606
27
      split_partition_key = child_partition.partition_key_end();
2607
0
    } else {
2608
0
      SCHECK_EQ(parent_partition.partition_key_end(), child_partition.partition_key_end(),
2609
0
        IllegalState, "Parent partion key end does not equal child partition key end");
2610
0
      split_partition_key = child_partition.partition_key_start();
2611
0
    }
2612
2613
    // Re-compute the encoded key
2614
    // to ensure we use the same partition boundary for both child tablets
2615
27
    split_encoded_key = PartitionSchema::GetEncodedKeyPrefix(
2616
27
      split_partition_key, source_table_lock->pb.partition_schema());
2617
27
  }
2618
2619
45
  LOG(INFO) << "Starting tablet split: " << source_tablet_info->ToString()
2620
45
            << " by partition key: " << Slice(split_partition_key).ToDebugHexString();
2621
2622
45
  std::array<PartitionPB, kNumSplitParts> new_tablets_partition = VERIFY_RESULT(
2623
45
      CreateNewTabletsPartition(*source_tablet_info, split_partition_key));
2624
2625
45
  std::array<TabletId, kNumSplitParts> new_tablet_ids;
2626
133
  for (int i = 0; i < kNumSplitParts; ++i) {
2627
88
    if (i < source_tablet_lock->pb.split_tablet_ids_size()) {
2628
      // Post-split tablet `i` has been already registered.
2629
52
      new_tablet_ids[i] = source_tablet_lock->pb.split_tablet_ids(i);
2630
36
    } else {
2631
36
      auto new_tablet_info = VERIFY_RESULT(RegisterNewTabletForSplit(
2632
36
          source_tablet_info.get(), new_tablets_partition[i],
2633
36
          &source_table_lock, &source_tablet_lock));
2634
2635
36
      new_tablet_ids[i] = new_tablet_info->id();
2636
36
    }
2637
88
  }
2638
45
  source_tablet_lock.Commit();
2639
45
  source_table_lock.Commit();
2640
2641
  // TODO(tsplit): what if the source tablet is deleted before or while the TS leader is processing
2642
  // the split? Add a unit test.
2643
45
  RETURN_NOT_OK(SendSplitTabletRequest(
2644
45
      source_tablet_info, new_tablet_ids, split_encoded_key, split_partition_key));
2645
2646
45
  return Status::OK();
2647
45
}
2648
2649
Status CatalogManager::DoSplitTablet(
2650
    const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code,
2651
0
    bool select_all_tablets_for_split) {
2652
0
  docdb::KeyBytes split_encoded_key;
2653
0
  docdb::DocKeyEncoderAfterTableIdStep(&split_encoded_key)
2654
0
      .Hash(split_hash_code, std::vector<docdb::PrimitiveValue>());
2655
2656
0
  const auto split_partition_key = PartitionSchema::EncodeMultiColumnHashValue(split_hash_code);
2657
2658
0
  return DoSplitTablet(source_tablet_info, split_encoded_key.ToStringBuffer(), split_partition_key,
2659
0
      select_all_tablets_for_split);
2660
0
}
2661
2662
80
Result<scoped_refptr<TabletInfo>> CatalogManager::GetTabletInfo(const TabletId& tablet_id) {
2663
80
  LockGuard lock(mutex_);
2664
80
  TRACE("Acquired catalog manager lock");
2665
2666
80
  const auto tablet_info = FindPtrOrNull(*tablet_map_, tablet_id);
2667
80
  SCHECK(tablet_info != nullptr, NotFound, Format("Tablet $0 not found", tablet_id));
2668
2669
80
  return tablet_info;
2670
80
}
2671
2672
void CatalogManager::SplitTabletWithKey(
2673
    const scoped_refptr<TabletInfo>& tablet, const std::string& split_encoded_key,
2674
45
    const std::string& split_partition_key, const bool select_all_tablets_for_split) {
2675
  // Note that DoSplitTablet() will trigger an async SplitTablet task, and will only return a non-OK
2676
  // status if it failed to submit that task. In other words, any failures here are not retriable, and
2677
  // success indicates that an async and automatically retrying task was submitted.
2678
45
  auto s = DoSplitTablet(
2679
45
      tablet, split_encoded_key, split_partition_key, select_all_tablets_for_split);
2680
45
  WARN_NOT_OK(s, Format("Failed to split tablet with GetSplitKey result for tablet: $0",
2681
45
                        tablet->tablet_id()));
2682
45
}
2683
2684
45
Status CatalogManager::SplitTablet(const TabletId& tablet_id, bool select_all_tablets_for_split) {
2685
45
  LOG(INFO) << "Got tablet to split: " << tablet_id;
2686
2687
45
  const auto tablet = VERIFY_RESULT(GetTabletInfo(tablet_id));
2688
2689
0
  VLOG(2) << "Scheduling GetSplitKey request to leader tserver for source tablet ID: "
2690
0
          << tablet->tablet_id();
2691
45
  auto call = std::make_shared<AsyncGetTabletSplitKey>(
2692
45
      master_, AsyncTaskPool(), tablet,
2693
45
      [this, tablet, select_all_tablets_for_split]
2694
45
          (const Result<AsyncGetTabletSplitKey::Data>& result) {
2695
45
        if (result.ok()) {
2696
45
          SplitTabletWithKey(tablet, result->split_encoded_key, result->split_partition_key,
2697
45
              select_all_tablets_for_split);
2698
0
        } else {
2699
0
          LOG(WARNING) << "AsyncGetTabletSplitKey task failed with status: " << result.status();
2700
0
        }
2701
45
      });
2702
45
  tablet->table()->AddTask(call);
2703
45
  return ScheduleTask(call);
2704
45
}
2705
2706
Status CatalogManager::SplitTablet(
2707
10
    const SplitTabletRequestPB* req, SplitTabletResponsePB* resp, rpc::RpcContext* rpc) {
2708
10
  const auto source_tablet_id = req->tablet_id();
2709
10
  return SplitTablet(source_tablet_id, true /* select_all_tablets_for_split */);
2710
10
}
2711
2712
Status CatalogManager::DeleteNotServingTablet(
2713
    const DeleteNotServingTabletRequestPB* req, DeleteNotServingTabletResponsePB* resp,
2714
8
    rpc::RpcContext* rpc) {
2715
8
  const auto& tablet_id = req->tablet_id();
2716
8
  const auto tablet_info = VERIFY_RESULT(GetTabletInfo(tablet_id));
2717
2718
8
  if (PREDICT_FALSE(FLAGS_TEST_reject_delete_not_serving_tablet_rpc)) {
2719
0
    TEST_SYNC_POINT("CatalogManager::DeleteNotServingTablet:Reject");
2720
0
    return STATUS(
2721
0
        InvalidArgument, "Rejecting due to FLAGS_TEST_reject_delete_not_serving_tablet_rpc");
2722
0
  }
2723
2724
8
  const auto& table_info = tablet_info->table();
2725
2726
8
  RETURN_NOT_OK(CheckIfForbiddenToDeleteTabletOf(table_info));
2727
2728
8
  RETURN_NOT_OK(CatalogManagerUtil::CheckIfCanDeleteSingleTablet(tablet_info));
2729
2730
7
  auto schedules_to_tables_map = VERIFY_RESULT(
2731
7
      MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE));
2732
7
  RepeatedBytes retained_by_snapshot_schedules;
2733
7
  FillRetainedBySnapshotSchedules(
2734
7
      schedules_to_tables_map, table_info->id(), &retained_by_snapshot_schedules);
2735
2736
7
  return DeleteTabletListAndSendRequests(
2737
7
      { tablet_info }, "Not serving tablet deleted upon request at " + LocalTimeAsString(),
2738
7
      retained_by_snapshot_schedules);
2739
7
}
2740
2741
Status CatalogManager::DdlLog(
2742
0
    const DdlLogRequestPB* req, DdlLogResponsePB* resp, rpc::RpcContext* rpc) {
2743
0
  return sys_catalog_->FetchDdlLog(resp->mutable_entries());
2744
0
}
2745
2746
namespace {
2747
2748
6.29k
CHECKED_STATUS ValidateCreateTableSchema(const Schema& schema, CreateTableResponsePB* resp) {
2749
6.29k
  if (schema.num_key_columns() <= 0) {
2750
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2751
0
                      STATUS(InvalidArgument, "Must specify at least one key column"));
2752
0
  }
2753
17.4k
  for (size_t i = 0; i < schema.num_key_columns(); i++) {
2754
11.1k
    if (!IsTypeAllowableInKey(schema.column(i).type_info())) {
2755
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2756
0
                        STATUS(InvalidArgument, "Invalid datatype for primary key column"));
2757
0
    }
2758
11.1k
  }
2759
6.29k
  return Status::OK();
2760
6.29k
}
2761
2762
}  // namespace
2763
2764
Status CatalogManager::CreateYsqlSysTable(const CreateTableRequestPB* req,
2765
2.26k
                                          CreateTableResponsePB* resp) {
2766
2.26k
  LOG(INFO) << "CreateYsqlSysTable: " << req->name();
2767
  // Lookup the namespace and verify that it exists.
2768
2.26k
  TRACE("Looking up namespace");
2769
2.26k
  auto ns = VERIFY_RESULT(FindNamespace(req->namespace_()));
2770
2.26k
  const NamespaceId& namespace_id = ns->id();
2771
2.26k
  const NamespaceName& namespace_name = ns->name();
2772
2773
2.26k
  Schema schema;
2774
2.26k
  RETURN_NOT_OK(SchemaFromPB(req->schema(), &schema));
2775
  // If the schema contains column ids, we are copying a Postgres table from one namespace to
2776
  // another. In either case, validate the schema.
2777
2.26k
  RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp));
2778
2.26k
  if (!schema.has_column_ids()) {
2779
0
    schema.InitColumnIdsByDefault();
2780
0
  }
2781
2.26k
  schema.mutable_table_properties()->set_is_ysql_catalog_table(true);
2782
2783
  // Verify no hash partition schema is specified.
2784
2.26k
  if (req->partition_schema().has_hash_schema()) {
2785
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2786
0
                      STATUS(InvalidArgument,
2787
0
                             "PostgreSQL system catalog tables are non-partitioned"));
2788
0
  }
2789
2790
2.26k
  if (req->table_type() != TableType::PGSQL_TABLE_TYPE) {
2791
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA,
2792
0
                      STATUS_FORMAT(
2793
0
                          InvalidArgument,
2794
0
                          "Expected table type to be PGSQL_TABLE_TYPE ($0), got $1 ($2)",
2795
0
                          PGSQL_TABLE_TYPE,
2796
0
                          TableType_Name(req->table_type())));
2797
2798
0
  }
2799
2800
  // Create partition schema and one partition.
2801
2.26k
  PartitionSchema partition_schema;
2802
2.26k
  vector<Partition> partitions;
2803
2.26k
  RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
2804
2805
  // Create table info in memory.
2806
2.26k
  scoped_refptr<TableInfo> table;
2807
2.26k
  scoped_refptr<TabletInfo> sys_catalog_tablet;
2808
2.26k
  {
2809
2.26k
    LockGuard lock(mutex_);
2810
2.26k
    TRACE("Acquired catalog manager lock");
2811
2812
    // Verify that the table does not exist, or has been deleted.
2813
2.26k
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
2814
2.26k
    if (table != nullptr && !table->is_deleted()) {
2815
0
      Status s = STATUS_SUBSTITUTE(AlreadyPresent,
2816
0
          "YSQL table '$0.$1' (ID: $2) already exists", ns->name(), table->name(), table->id());
2817
0
      LOG(WARNING) << "Found table: " << table->ToStringWithState()
2818
0
                   << ". Failed creating YSQL system table with error: "
2819
0
                   << s.ToString() << " Request:\n" << req->DebugString();
2820
      // Technically, the client already knows the table ID, but we set it anyway for unified
2821
      // AlreadyPresent errors. See comment in CreateTable()
2822
0
      resp->set_table_id(table->id());
2823
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
2824
0
    }
2825
2826
2.26k
    RETURN_NOT_OK(CreateTableInMemory(
2827
2.26k
        *req, schema, partition_schema, namespace_id, namespace_name,
2828
2.26k
        partitions, nullptr /* index_info */, nullptr /* tablets */, resp, &table));
2829
2830
2.26k
    sys_catalog_tablet = tablet_map_->find(kSysCatalogTabletId)->second;
2831
2.26k
  }
2832
2833
  // Tables with a transaction should be rolled back if the transaction does not get committed.
2834
  // Store this on the table persistent state until the transaction has been a verified success.
2835
2.26k
  TransactionMetadata txn;
2836
2.26k
  if (req->has_transaction() && FLAGS_enable_transactional_ddl_gc) {
2837
0
    table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()->
2838
0
        CopyFrom(req->transaction());
2839
0
    txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction()));
2840
0
    RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
2841
0
  }
2842
2843
2.26k
  {
2844
2.26k
    auto tablet_lock = sys_catalog_tablet->LockForWrite();
2845
2.26k
    tablet_lock.mutable_data()->pb.add_table_ids(table->id());
2846
2847
2.26k
    Status s = sys_catalog_->Upsert(leader_ready_term(), sys_catalog_tablet);
2848
2.26k
    if (PREDICT_FALSE(!s.ok())) {
2849
1
      return AbortTableCreation(table.get(), {}, s.CloneAndPrepend(
2850
1
        "An error occurred while inserting to sys-tablets: "), resp);
2851
1
    }
2852
2.26k
    table->set_is_system();
2853
2.26k
    table->AddTablet(sys_catalog_tablet.get());
2854
2.26k
    tablet_lock.Commit();
2855
2.26k
  }
2856
2.26k
  TRACE("Inserted new table info into CatalogManager maps");
2857
2858
  // Update the on-disk table state to "running".
2859
2.26k
  table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
2860
2.26k
  Status s = sys_catalog_->Upsert(leader_ready_term(), table);
2861
2.26k
  if (PREDICT_FALSE(!s.ok())) {
2862
0
    return AbortTableCreation(table.get(), {}, s.CloneAndPrepend(
2863
0
      "An error occurred while inserting to sys-tablets: "), resp);
2864
0
  }
2865
2.26k
  TRACE("Wrote table to system table");
2866
2867
  // Commit the in-memory state.
2868
2.26k
  table->mutable_metadata()->CommitMutation();
2869
2870
  // Verify Transaction gets committed, which occurs after table create finishes.
2871
2.26k
  if (req->has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) {
2872
0
    LOG(INFO) << "Enqueuing table for Transaction Verification: " << req->name();
2873
0
    std::function<Status(bool)> when_done =
2874
0
        std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1);
2875
0
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
2876
0
        std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)),
2877
0
                "Could not submit VerifyTransaction to thread pool");
2878
0
  }
2879
2880
2.26k
  tablet::ChangeMetadataRequestPB change_req;
2881
2.26k
  change_req.set_tablet_id(kSysCatalogTabletId);
2882
2.26k
  auto& add_table = *change_req.mutable_add_table();
2883
2884
2.26k
  add_table.set_table_id(req->table_id());
2885
2.26k
  add_table.set_table_type(TableType::PGSQL_TABLE_TYPE);
2886
2.26k
  add_table.set_table_name(req->name());
2887
2.26k
  SchemaToPB(schema, add_table.mutable_schema());
2888
2.26k
  add_table.set_schema_version(0);
2889
2890
2.26k
  partition_schema.ToPB(add_table.mutable_partition_schema());
2891
2892
2.26k
  RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation(
2893
2.26k
      &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term()));
2894
2895
2.26k
  if (initial_snapshot_writer_) {
2896
0
    initial_snapshot_writer_->AddMetadataChange(change_req);
2897
0
  }
2898
2.26k
  return Status::OK();
2899
2.26k
}
2900
2901
Status CatalogManager::ReservePgsqlOids(const ReservePgsqlOidsRequestPB* req,
2902
                                        ReservePgsqlOidsResponsePB* resp,
2903
380
                                        rpc::RpcContext* rpc) {
2904
0
  VLOG(1) << "ReservePgsqlOids request: " << req->ShortDebugString();
2905
2906
  // Lookup namespace
2907
380
  scoped_refptr<NamespaceInfo> ns;
2908
380
  {
2909
380
    SharedLock lock(mutex_);
2910
380
    ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id());
2911
380
  }
2912
380
  if (!ns) {
2913
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
2914
0
                      STATUS(NotFound, "Namespace not found", req->namespace_id()));
2915
0
  }
2916
2917
  // Reserve oids.
2918
380
  auto l = ns->LockForWrite();
2919
2920
380
  uint32_t begin_oid = l->pb.next_pg_oid();
2921
380
  if (begin_oid < req->next_oid()) {
2922
355
    begin_oid = req->next_oid();
2923
355
  }
2924
380
  if (begin_oid == std::numeric_limits<uint32_t>::max()) {
2925
0
    LOG(WARNING) << Format("No more object identifier is available for Postgres database $0 ($1)",
2926
0
                           l->pb.name(), req->namespace_id());
2927
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR,
2928
0
                      STATUS(InvalidArgument, "No more object identifier is available"));
2929
0
  }
2930
2931
380
  uint32_t end_oid = begin_oid + req->count();
2932
380
  if (end_oid < begin_oid) {
2933
0
    end_oid = std::numeric_limits<uint32_t>::max(); // Handle wraparound.
2934
0
  }
2935
2936
380
  resp->set_begin_oid(begin_oid);
2937
380
  resp->set_end_oid(end_oid);
2938
380
  l.mutable_data()->pb.set_next_pg_oid(end_oid);
2939
2940
  // Update the on-disk state.
2941
380
  const Status s = sys_catalog_->Upsert(leader_ready_term(), ns);
2942
380
  if (!s.ok()) {
2943
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
2944
0
  }
2945
2946
  // Commit the in-memory state.
2947
380
  l.Commit();
2948
2949
0
  VLOG(1) << "ReservePgsqlOids response: " << resp->ShortDebugString();
2950
2951
380
  return Status::OK();
2952
380
}
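
// The reservation above hands out the half-open OID range [begin_oid,
// end_oid) and clamps an overflowing end to uint32 max. The arithmetic in
// isolation; ReserveRange is an illustrative name, not from this codebase:
#include <algorithm>
#include <cstdint>
#include <limits>
#include <utility>

std::pair<uint32_t, uint32_t> ReserveRange(uint32_t next_oid, uint32_t hint,
                                           uint32_t count) {
  const uint32_t begin = std::max(next_oid, hint);  // Honor the caller's hint.
  uint32_t end = begin + count;
  if (end < begin) {
    // Unsigned wraparound: clamp rather than wrap, as the handler does.
    end = std::numeric_limits<uint32_t>::max();
  }
  return {begin, end};
}
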
2953
2954
Status CatalogManager::GetYsqlCatalogConfig(const GetYsqlCatalogConfigRequestPB* req,
2955
                                            GetYsqlCatalogConfigResponsePB* resp,
2956
0
                                            rpc::RpcContext* rpc) {
2957
0
  VLOG(1) << "GetYsqlCatalogConfig request: " << req->ShortDebugString();
2958
0
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead();
2959
0
  resp->set_version(l->pb.ysql_catalog_config().version());
2960
2961
0
  return Status::OK();
2962
0
}
2963
2964
Status CatalogManager::CopyPgsqlSysTables(const NamespaceId& namespace_id,
2965
22
                                          const std::vector<scoped_refptr<TableInfo>>& tables) {
2966
22
  const uint32_t database_oid = CHECK_RESULT(GetPgsqlDatabaseOid(namespace_id));
2967
22
  vector<TableId> source_table_ids;
2968
22
  vector<TableId> target_table_ids;
2969
2.68k
  for (const auto& table : tables) {
2970
2.68k
    CreateTableRequestPB table_req;
2971
2.68k
    CreateTableResponsePB table_resp;
2972
2973
2.68k
    const uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table->id()));
2974
2.68k
    const TableId table_id = GetPgsqlTableId(database_oid, table_oid);
2975
2976
    // Hold the read lock until the rows from the table have been copied as well.
2977
2.68k
    auto l = table->LockForRead();
2978
2979
    // Skip shared table.
2980
2.68k
    if (l->pb.is_pg_shared_table()) {
2981
420
      continue;
2982
420
    }
2983
2984
2.26k
    table_req.set_name(l->pb.name());
2985
2.26k
    table_req.mutable_namespace_()->set_id(namespace_id);
2986
2.26k
    table_req.set_table_type(PGSQL_TABLE_TYPE);
2987
2.26k
    table_req.mutable_schema()->CopyFrom(l->schema());
2988
2.26k
    table_req.set_is_pg_catalog_table(true);
2989
2.26k
    table_req.set_table_id(table_id);
2990
2991
2.26k
    if (IsIndex(l->pb)) {
2992
1.00k
      const uint32_t indexed_table_oid =
2993
1.00k
        VERIFY_RESULT(GetPgsqlTableOid(GetIndexedTableId(l->pb)));
2994
1.00k
      const TableId indexed_table_id = GetPgsqlTableId(database_oid, indexed_table_oid);
2995
2996
      // Set index_info.
2997
      // A previously created INDEX wouldn't have the index_info attribute.
2998
1.00k
      if (l->pb.has_index_info()) {
2999
1.00k
        table_req.mutable_index_info()->CopyFrom(l->pb.index_info());
3000
1.00k
        table_req.mutable_index_info()->set_indexed_table_id(indexed_table_id);
3001
1.00k
      }
3002
3003
      // Set deprecated field for index_info.
3004
1.00k
      table_req.set_indexed_table_id(indexed_table_id);
3005
1.00k
      table_req.set_is_local_index(PROTO_GET_IS_LOCAL(l->pb));
3006
1.00k
      table_req.set_is_unique_index(PROTO_GET_IS_UNIQUE(l->pb));
3007
1.00k
    }
3008
3009
2.26k
    auto s = CreateYsqlSysTable(&table_req, &table_resp);
3010
2.26k
    if (!s.ok()) {
3011
1
      return s.CloneAndPrepend(Substitute(
3012
1
          "Failure when creating PGSQL System Tables: $0", table_resp.error().ShortDebugString()));
3013
1
    }
3014
3015
2.26k
    source_table_ids.push_back(table->id());
3016
2.26k
    target_table_ids.push_back(table_id);
3017
2.26k
  }
3018
21
  RETURN_NOT_OK(
3019
21
      sys_catalog_->CopyPgsqlTables(source_table_ids, target_table_ids, leader_ready_term()));
3020
21
  return Status::OK();
3021
21
}
3022
3023
718
size_t CatalogManager::GetNumLiveTServersForPlacement(const PlacementId& placement_id) {
3024
718
  BlacklistSet blacklist = BlacklistSetFromPB();
3025
718
  TSDescriptorVector ts_descs;
3026
718
  master_->ts_manager()->GetAllLiveDescriptorsInCluster(&ts_descs, placement_id, blacklist);
3027
718
  return ts_descs.size();
3028
718
}
3029
3030
116k
TSDescriptorVector CatalogManager::GetAllLiveNotBlacklistedTServers() const {
3031
116k
  TSDescriptorVector ts_descs;
3032
116k
  BlacklistSet blacklist = BlacklistSetFromPB();
3033
116k
  master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist);
3034
116k
  return ts_descs;
3035
116k
}
3036
3037
namespace {
3038
3039
306k
size_t GetNumReplicasFromPlacementInfo(const PlacementInfoPB& placement_info) {
3040
306k
  return placement_info.num_replicas() > 0 ?
3041
301k
      placement_info.num_replicas() : FLAGS_replication_factor;
3042
306k
}
3043
3044
Status CheckNumReplicas(const PlacementInfoPB& placement_info,
3045
                        const TSDescriptorVector& ts_descs,
3046
                        const vector<Partition>& partitions,
3047
4.01k
                        CreateTableResponsePB* resp) {
3048
4.01k
  auto max_tablets = FLAGS_max_create_tablets_per_ts * ts_descs.size();
3049
4.01k
  auto num_replicas = GetNumReplicasFromPlacementInfo(placement_info);
3050
4.01k
  if (num_replicas > 1 && max_tablets > 0 && partitions.size() > max_tablets) {
3051
0
    std::string msg = Substitute("The requested number of tablets ($0) is over the permitted "
3052
0
                                 "maximum ($1)", partitions.size(), max_tablets);
3053
0
    Status s = STATUS(InvalidArgument, msg);
3054
0
    LOG(WARNING) << msg;
3055
0
    return SetupError(resp->mutable_error(), MasterErrorPB::TOO_MANY_TABLETS, s);
3056
0
  }
3057
3058
4.01k
  return Status::OK();
3059
4.01k
}
3060
3061
} // namespace
3062
3063

// Create a new table.
// See README file in this directory for a description of the design.
Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req,
                                   CreateTableResponsePB* resp,
                                   rpc::RpcContext* rpc) {
  DVLOG(3) << __PRETTY_FUNCTION__ << " Begin. " << orig_req->DebugString();

  const bool is_pg_table = orig_req->table_type() == PGSQL_TABLE_TYPE;
  const bool is_pg_catalog_table = is_pg_table && orig_req->is_pg_catalog_table();
  if (!is_pg_catalog_table || !FLAGS_hide_pg_catalog_table_creation_logs) {
    LOG(INFO) << "CreateTable from " << RequestorString(rpc)
                << ":\n" << orig_req->DebugString();
  } else {
    LOG(INFO) << "CreateTable from " << RequestorString(rpc) << ": " << orig_req->name();
  }

  const bool is_transactional = orig_req->schema().table_properties().is_transactional();
  // If this is a transactional table, we need to create the transaction status table (if it does
  // not exist already).
  if (is_transactional && (!is_pg_catalog_table || !FLAGS_create_initial_sys_catalog_snapshot)) {
    Status s = CreateGlobalTransactionStatusTableIfNeeded(rpc);
    if (!s.ok()) {
      return s.CloneAndPrepend("Error while creating transaction status table");
    }
  } else {
    VLOG(1)
        << "Not attempting to create a transaction status table:\n"
        << "  " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n "
        << "  " << EXPR_VALUE_FOR_LOG(is_pg_catalog_table) << "\n "
        << "  " << EXPR_VALUE_FOR_LOG(FLAGS_create_initial_sys_catalog_snapshot);
  }

  // If this is a transactional table and there is an associated tablespace, try to create a
  // local transaction status table for the tablespace if there is a placement attached to it
  // (and if it does not exist already).
  if (GetAtomicFlag(&FLAGS_auto_create_local_transaction_tables)) {
    if (is_transactional && orig_req->has_tablespace_id()) {
      const auto& tablespace_id = orig_req->tablespace_id();
      auto tablespace_pb = VERIFY_RESULT(GetTablespaceReplicationInfoWithRetry(tablespace_id));
      if (tablespace_pb) {
        RETURN_NOT_OK(CreateLocalTransactionStatusTableIfNeeded(rpc, tablespace_id));
      } else {
        VLOG(1)
            << "Not attempting to create a local transaction status table: "
            << "tablespace " << EXPR_VALUE_FOR_LOG(tablespace_id) << " has no placement\n";
      }
    } else {
      VLOG(1)
          << "Not attempting to create a local transaction status table:\n"
          << "  " << EXPR_VALUE_FOR_LOG(is_transactional) << "\n "
          << "  " << EXPR_VALUE_FOR_LOG(orig_req->has_tablespace_id());
    }
  }

  if (is_pg_catalog_table) {
    return CreateYsqlSysTable(orig_req, resp);
  }

  Status s;
  const char* const object_type = PROTO_PTR_IS_TABLE(orig_req) ? "table" : "index";

  // Copy the request, so we can fill in some defaults.
  CreateTableRequestPB req = *orig_req;

  // Look up the namespace and verify that it exists.
  TRACE("Looking up namespace");
  auto ns = VERIFY_RESULT(FindNamespace(req.namespace_()));
  bool colocated;
  NamespaceId namespace_id;
  NamespaceName namespace_name;
  {
    auto ns_lock = ns->LockForRead();
    if (ns->database_type() != GetDatabaseTypeForTable(req.table_type())) {
      Status s = STATUS(NotFound, "Namespace not found");
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
    }
    namespace_id = ns->id();
    namespace_name = ns->name();
    colocated = ns->colocated();
  }

  // For an index table, find the indexed table's info.
  scoped_refptr<TableInfo> indexed_table;
  if (IsIndex(req)) {
    TRACE("Looking up indexed table");
    indexed_table = GetTableInfo(req.indexed_table_id());
    if (indexed_table == nullptr) {
      return STATUS_SUBSTITUTE(
            NotFound, "The indexed table $0 does not exist", req.indexed_table_id());
    }

    TRACE("Locking indexed table");
    RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(indexed_table->LockForRead(), resp));
  }

  // Determine if this table should be colocated. If not specified, the table should be colocated
  // if and only if the namespace is colocated.
  if (!req.colocated()) {
    // Opt out of colocation if the request says so.
    colocated = false;
  } else if (indexed_table && !indexed_table->colocated()) {
    // Opt out of colocation if the indexed table opted out of colocation.
    colocated = false;
  }

  // TODO: If this is a colocated index table in a colocated database, convert any hash partition
  // columns into range partition columns. This is because postgres does not know that this index
  // table is in a colocated database. When we get to the "tablespaces" step where we store this
  // into PG metadata, then PG will know if db/table is colocated and do the work there.
  if ((colocated || req.has_tablegroup_id()) && IsIndex(req)) {
    for (auto& col_pb : *req.mutable_schema()->mutable_columns()) {
      col_pb.set_is_hash_key(false);
    }
  }

  // Validate schema.
  Schema schema;
  RETURN_NOT_OK(SchemaFromPB(req.schema(), &schema));
  RETURN_NOT_OK(ValidateCreateTableSchema(schema, resp));

  // Check that referenced user-defined types (if any) exist.
  {
    SharedLock lock(mutex_);
    for (size_t i = 0; i < schema.num_columns(); i++) {
      for (const auto& udt_id : schema.column(i).type()->GetUserDefinedTypeIds()) {
        if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) {
          Status s = STATUS(InvalidArgument, "Referenced user-defined type not found");
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
        }
      }
    }
  }
  // TODO (ENG-1860) The referenced namespace and types retrieved/checked above could be deleted
  // some time between this point and table creation below.

  // Usually the column ids are available if it's called on the backup-restoring code path
  // (from CatalogManager::RecreateTable). Else the column ids must be empty in the client schema.
  if (!schema.has_column_ids()) {
    schema.InitColumnIdsByDefault();
  }

  if (schema.table_properties().HasCopartitionTableId()) {
    return CreateCopartitionedTable(req, resp, rpc, schema, ns);
  }

  if (colocated || req.has_tablegroup_id()) {
    // If the table is colocated, then there should be no hash partition columns.
    // Do the same for tables that are being placed in tablegroups.
    if (schema.num_hash_key_columns() > 0) {
      Status s = STATUS(InvalidArgument, "Cannot colocate hash partitioned table");
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
    }
  } else if (
      !req.partition_schema().has_hash_schema() && !req.partition_schema().has_range_schema()) {
    // If neither hash nor range schema has been specified in the protobuf request, we assume the
    // table uses a hash schema, and we use the table_type and hash_key to determine the hashing
    // scheme (redis or multi-column) that should be used.
    if (req.table_type() == REDIS_TABLE_TYPE) {
      req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::REDIS_HASH_SCHEMA);
    } else if (schema.num_hash_key_columns() > 0) {
      req.mutable_partition_schema()->set_hash_schema(PartitionSchemaPB::MULTI_COLUMN_HASH_SCHEMA);
    } else {
      Status s = STATUS(InvalidArgument, "Unknown table type or partitioning method");
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
    }
  }

  // Verify that a custom placement policy has not been specified for a colocated table.
  const bool is_replication_info_set = IsReplicationInfoSet(req.replication_info());
  if (is_replication_info_set && colocated) {
    Status s = STATUS(InvalidArgument, "Custom placement policy should not be set for "
      "colocated tables");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s);
  }

  if (is_replication_info_set && req.table_type() == PGSQL_TABLE_TYPE) {
    const Status s = STATUS(InvalidArgument, "Cannot set placement policy for YSQL tables; "
        "use Tablespaces instead");
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
  }

  // Get placement info.
  const ReplicationInfoPB& replication_info = VERIFY_RESULT(
    GetTableReplicationInfo(req.replication_info(), req.tablespace_id()));
  const PlacementInfoPB& placement_info = replication_info.live_replicas();

  // Calculate the number of tablets to use. Priorities:
  //   1. The internally specified value from 'CreateTableRequestPB::num_tablets'.
  //   2. The user-specified value from
  //      'CreateTableRequestPB::SchemaPB::TablePropertiesPB::num_tablets'.
  //      Note that the number will be saved in the schema stored in the master's persistent
  //      SysCatalog irrespective of which way we choose the number of tablets to create.
  //      If nothing is specified in this field, nothing will be stored in the table's
  //      TablePropertiesPB for the number of tablets.
  //   3. Calculate our own value.
  int num_tablets = 0;
  if (req.has_num_tablets()) {
    num_tablets = req.num_tablets(); // Internal request.
  }

  if (num_tablets <= 0 && schema.table_properties().HasNumTablets()) {
    num_tablets = schema.table_properties().num_tablets(); // User request.
  }

  if (num_tablets <= 0) {
    // Use the default, as the client could have gotten the value before any tserver had
    // heartbeated to (a new) master leader.
    const auto num_live_tservers =
        GetNumLiveTServersForPlacement(placement_info.placement_uuid());
    num_tablets = narrow_cast<int>(
        num_live_tservers * (is_pg_table ? FLAGS_ysql_num_shards_per_tserver
                                         : FLAGS_yb_num_shards_per_tserver));
    LOG(INFO) << "Setting default tablets to " << num_tablets << " with "
              << num_live_tservers << " primary servers";
  }
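
  // Worked example (illustrative): with 3 live tservers and
  // FLAGS_ysql_num_shards_per_tserver = 2, a YSQL table that specifies no
  // tablet count anywhere defaults to 3 * 2 = 6 tablets; a YCQL table uses
  // FLAGS_yb_num_shards_per_tserver the same way.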

  // Create partitions.
  PartitionSchema partition_schema;
  vector<Partition> partitions;
  if (colocated || req.has_tablegroup_id()) {
    RETURN_NOT_OK(partition_schema.CreatePartitions(1, &partitions));
    req.clear_partition_schema();
    num_tablets = 1;
  } else {
    RETURN_NOT_OK(PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema));
    if (req.partitions_size() > 0) {
      if (req.partitions_size() != num_tablets) {
        Status s = STATUS(InvalidArgument, "Partitions are not defined for all tablets");
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
      }
      // Explicitly supplied partitions must be contiguous: each partition's start key has to
      // match the previous partition's end key, so that together they cover the keyspace.
      string last;
      for (const auto& p : req.partitions()) {
        Partition np;
        Partition::FromPB(p, &np);
        if (np.partition_key_start() != last) {
          Status s = STATUS(InvalidArgument,
                            "Partitions do not cover the full partition keyspace");
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
        }
        last = np.partition_key_end();
        partitions.push_back(std::move(np));
      }
    } else {
      // The supplied number of partitions is merely a suggestion; the actual number of
      // created partitions might differ.
      RETURN_NOT_OK(partition_schema.CreatePartitions(num_tablets, &partitions));
    }
    // The vector 'partitions' now contains the partitions that were actually set up, so the
    // variable should be updated.
    num_tablets = narrow_cast<int>(partitions.size());
  }

  TSDescriptorVector all_ts_descs;
  master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs);
  RETURN_NOT_OK(CheckNumReplicas(placement_info, all_ts_descs, partitions, resp));

  if (!FLAGS_TEST_skip_placement_validation_createtable_api) {
    ValidateReplicationInfoRequestPB validate_req;
    validate_req.mutable_replication_info()->CopyFrom(replication_info);
    ValidateReplicationInfoResponsePB validate_resp;
    RETURN_NOT_OK(ValidateReplicationInfo(&validate_req, &validate_resp));
  }

  LOG(INFO) << "Set number of tablets: " << num_tablets;
  req.set_num_tablets(num_tablets);

  // For an index table, populate the index info.
  IndexInfoPB index_info;

  const bool index_backfill_enabled =
      IsIndexBackfillEnabled(orig_req->table_type(), is_transactional);
  if (req.has_index_info()) {
    // Current message format.
    index_info.CopyFrom(req.index_info());

    // Assign the column ids that have just been computed to "index_info".
    if (!is_pg_table) {
      DCHECK_EQ(index_info.columns().size(), schema.num_columns())
        << "Number of columns differs between index_info and index_schema";
      for (size_t colidx = 0; colidx < schema.num_columns(); colidx++) {
        index_info.mutable_columns(narrow_cast<int>(colidx))->set_column_id(
            schema.column_id(colidx));
      }
    }
  } else if (req.has_indexed_table_id()) {
    // Old client message format, seen during rolling upgrades (no "index_info").
    IndexInfoBuilder index_info_builder(&index_info);
    index_info_builder.ApplyProperties(req.indexed_table_id(),
        req.is_local_index(), req.is_unique_index());
    if (orig_req->table_type() != PGSQL_TABLE_TYPE) {
      Schema indexed_schema;
      RETURN_NOT_OK(indexed_table->GetSchema(&indexed_schema));
      RETURN_NOT_OK(index_info_builder.ApplyColumnMapping(indexed_schema, schema));
    }
  }

  if ((req.has_index_info() || req.has_indexed_table_id()) &&
      index_backfill_enabled &&
      !req.skip_index_backfill()) {
    // Start off the index table with major compactions disabled. We need this to retain the
    // delete markers until the backfill process is completed. No need to set index_permissions
    // in the index table.
    schema.SetRetainDeleteMarkers(true);
  }

  LOG(INFO) << "CreateTable with IndexInfo " << AsString(index_info);

  scoped_refptr<TableInfo> table;
  TabletInfos tablets;
  bool tablets_exist;
  bool tablegroup_tablets_exist = false;

  {
    LockGuard lock(mutex_);
    auto ns_lock = ns->LockForRead();
    TRACE("Acquired catalog manager lock");

    tablets_exist =
        colocated && colocated_tablet_ids_map_.find(ns->id()) != colocated_tablet_ids_map_.end();
    // Verify that the table does not exist.
    table = FindPtrOrNull(table_names_map_, {namespace_id, req.name()});

    if (table != nullptr) {
      s = STATUS_SUBSTITUTE(AlreadyPresent,
              "Object '$0.$1' already exists", ns->name(), table->name());
      LOG(WARNING) << "Found table: " << table->ToStringWithState()
                   << ". Failed creating table with error: "
                   << s.ToString() << " Request:\n" << orig_req->DebugString();
      // If the table already exists, we set the response table_id field to the id of the table
      // that already exists. This is necessary because before we return the error to the client
      // (or success in case of a "CREATE TABLE IF NOT EXISTS" request) we want to wait for the
      // existing table to be available to receive requests. And we need the table id for that.
      resp->set_table_id(table->id());
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
    }

    // Namespace state validity check:
    // 1. Allow namespaces that are RUNNING.
    // 2. Allow namespaces that are PREPARING under two situations:
    //    2a. System namespaces.
    //    2b. The parent table of a colocated namespace.
    const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix;
    bool valid_ns_state = (ns->state() == SysNamespaceEntryPB::RUNNING) ||
      (ns->state() == SysNamespaceEntryPB::PREPARING &&
        (ns->name() == kSystemNamespaceName || req.name() == parent_table_name));
    if (!valid_ns_state) {
      Status s = STATUS_SUBSTITUTE(TryAgain, "Invalid Namespace State ($0). Cannot create $1.$2",
          SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), req.name());
      return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
    }

    // Check whether this CREATE TABLE request, which has a tablegroup_id, is for a normal user
    // table or is the request to create the parent table for the tablegroup. This is done by
    // checking the catalog manager maps.
    if (req.has_tablegroup_id() &&
        tablegroup_tablet_ids_map_.find(ns->id()) != tablegroup_tablet_ids_map_.end() &&
        tablegroup_tablet_ids_map_[ns->id()].find(req.tablegroup_id()) !=
        tablegroup_tablet_ids_map_[ns->id()].end()) {
      tablegroup_tablets_exist = true;
    }

    RETURN_NOT_OK(CreateTableInMemory(
        req, schema, partition_schema, namespace_id, namespace_name, partitions, &index_info,
        (!tablets_exist && !tablegroup_tablets_exist) ? &tablets : nullptr, resp, &table));

    // This section is executed when a table is either the parent table or a user table in a
    // tablegroup. It additionally sets the table metadata (and tablet metadata if this is the
    // parent table) to have the colocated property so we can take advantage of code reuse.
    if (req.has_tablegroup_id()) {
      table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
      if (tablegroup_tablets_exist) {
        // The table is not a tablegroup parent table, so look up the proper tablet to place
        // this table on as a child table.
        auto tablet = tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()];
        RSTATUS_DCHECK(
            tablet->colocated(), InternalError,
            "The tablet for tablegroup should be colocated.");
        tablets.push_back(tablet.get());
        auto tablet_lock = tablet->LockForWrite();
        tablet_lock.mutable_data()->pb.add_table_ids(table->id());
        RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet));
        tablet_lock.Commit();

        tablet->mutable_metadata()->StartMutation();
        table->AddTablet(tablet);
        tablegroup_ids_map_[req.tablegroup_id()]->AddChildTable(table->id());
      } else {
        // The table is a tablegroup parent table, so create a dummy tablet for the tablegroup
        // and update the catalog manager maps.
        RSTATUS_DCHECK_EQ(
            tablets.size(), 1U, InternalError,
            "Only one tablet should be created for each tablegroup");
        tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
        // Update catalog manager maps for tablegroups.
        tablegroup_tablet_ids_map_[ns->id()][req.tablegroup_id()] =
            tablet_map_->find(tablets[0]->id())->second;
      }
    } else if (colocated) {
      table->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
      // If the tablet already exists, add it to 'tablets'.
      if (tablets_exist) {
        auto tablet = colocated_tablet_ids_map_[ns->id()];
        RSTATUS_DCHECK(
            tablet->colocated(), InternalError,
            "The tablet for colocated database should be colocated.");
        tablets.push_back(tablet.get());
        auto tablet_lock = tablet->LockForWrite();
        tablet_lock.mutable_data()->pb.add_table_ids(table->id());
        RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet));
        tablet_lock.Commit();

        tablet->mutable_metadata()->StartMutation();
        table->AddTablet(tablet);
      } else {  // Record the tablet.
        RSTATUS_DCHECK_EQ(
            tablets.size(), 1U, InternalError,
            "Only one tablet should be created for each colocated database");
        tablets[0]->mutable_metadata()->mutable_dirty()->pb.set_colocated(true);
        colocated_tablet_ids_map_[ns->id()] = tablet_map_->find(tablets[0]->id())->second;
      }
    }
    if (req.has_matview_pg_table_id()) {
      matview_pg_table_ids_map_[req.table_id()] = req.matview_pg_table_id();
    }
  }

  // For create transaction table requests with a tablespace id, save the tablespace id.
  const auto is_transaction_status_table =
      orig_req->table_type() == TableType::TRANSACTION_STATUS_TABLE_TYPE;
  if (is_transaction_status_table && req.has_tablespace_id()) {
    table->mutable_metadata()->mutable_dirty()->pb.set_transaction_table_tablespace_id(
        req.tablespace_id());
  }

  // Tables created within a transaction should be rolled back if the transaction does not get
  // committed. Store this on the table's persistent state until the transaction has been
  // verified a success.
  TransactionMetadata txn;
  if (req.has_transaction() && FLAGS_enable_transactional_ddl_gc) {
    table->mutable_metadata()->mutable_dirty()->pb.mutable_transaction()->
        CopyFrom(req.transaction());
    txn = VERIFY_RESULT(TransactionMetadata::FromPB(req.transaction()));
    RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
  }

  if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_table_create_secs > 0) &&
      req.table_type() != TableType::TRANSACTION_STATUS_TABLE_TYPE) {
    LOG(INFO) << "Simulating slow table creation";
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_slow_table_create_secs));
  }

  // NOTE: the table and tablets are already locked for write at this point,
  // since the CreateTableInfo/CreateTabletInfo functions leave them in that state.
  // They will get committed at the end of this function.
  // Sanity check: the tables and tablets should all be in "preparing" state.
  CHECK_EQ(SysTablesEntryPB::PREPARING, table->metadata().dirty().pb.state());
  // Update the on-disk table state to "running".
  table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING);
  TRACE("Inserted new table and tablet info into CatalogManager maps");
  VLOG_WITH_PREFIX(1) << "Inserted new table and tablet info into CatalogManager maps";

  if (!tablets_exist && !tablegroup_tablets_exist) {
    // Write tablets to sys-tablets (in "preparing" state).
    for (const auto& tablet : tablets) {
      CHECK_EQ(SysTabletsEntryPB::PREPARING, tablet->metadata().dirty().pb.state());
    }
  }

  s = sys_catalog_->Upsert(leader_ready_term(), table, tablets);
  if (PREDICT_FALSE(!s.ok())) {
    return AbortTableCreation(
        table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting to sys-tablets"),
        resp);
  }
  TRACE("Wrote table and tablets to system table");

  // For an index table, insert the index info into the indexed table.
  if ((req.has_index_info() || req.has_indexed_table_id())) {
    if (index_backfill_enabled && !req.skip_index_backfill()) {
      if (is_pg_table) {
        // YSQL: start at some permission before backfill. The real enforcement happens with
        // the pg_index system table's indislive and indisready columns. Choose
        // WRITE_AND_DELETE because it will probably be less confusing.
        index_info.set_index_permissions(INDEX_PERM_WRITE_AND_DELETE);
      } else {
        // YCQL
        index_info.set_index_permissions(INDEX_PERM_DELETE_ONLY);
      }
    }
    s = AddIndexInfoToTable(indexed_table, index_info, resp);
    if (PREDICT_FALSE(!s.ok())) {
      return AbortTableCreation(
          table.get(), tablets, s.CloneAndPrepend("An error occurred while inserting index info"),
          resp);
    }
  }

  // Commit the in-memory state.
  table->mutable_metadata()->CommitMutation();

  for (const auto& tablet : tablets) {
    tablet->mutable_metadata()->CommitMutation();
  }

  if ((colocated && tablets_exist) || (req.has_tablegroup_id() && tablegroup_tablets_exist)) {
    auto call =
        std::make_shared<AsyncAddTableToTablet>(master_, AsyncTaskPool(), tablets[0], table);
    table->AddTask(call);
    WARN_NOT_OK(ScheduleTask(call), "Failed to send AddTableToTablet request");
  }

  if (req.has_creator_role_name()) {
    const NamespaceName& keyspace_name = req.namespace_().name();
    const TableName& table_name = req.name();
    RETURN_NOT_OK(permissions_manager_->GrantPermissions(
        req.creator_role_name(),
        get_canonical_table(keyspace_name, table_name),
        table_name,
        keyspace_name,
        all_permissions_for_resource(ResourceType::TABLE),
        ResourceType::TABLE,
        resp));
  }

  // Verify that the transaction gets committed, which occurs after the table create finishes.
  if (req.has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) {
    LOG(INFO) << "Enqueuing table for Transaction Verification: " << req.name();
    std::function<Status(bool)> when_done =
        std::bind(&CatalogManager::VerifyTablePgLayer, this, table, _1);
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
        std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(), txn, when_done)),
        "Could not submit VerifyTransaction to thread pool");
  }

  LOG(INFO) << "Successfully created " << object_type << " " << table->ToString() << " in "
            << ns->ToString() << " per request from " << RequestorString(rpc);
  background_tasks_->Wake();

  if (FLAGS_master_enable_metrics_snapshotter &&
      !(req.table_type() == TableType::YQL_TABLE_TYPE &&
        namespace_id == kSystemNamespaceId &&
        req.name() == kMetricsSnapshotsTableName)) {
    Status s = CreateMetricsSnapshotsTableIfNeeded(rpc);
    if (!s.ok()) {
      return s.CloneAndPrepend("Error while creating metrics snapshots table");
    }
  }

  // Increment the transaction status version if needed.
  if (is_transaction_status_table) {
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
  }

  DVLOG(3) << __PRETTY_FUNCTION__ << " Done.";
  return Status::OK();
}

Status CatalogManager::VerifyTablePgLayer(scoped_refptr<TableInfo> table, bool rpc_success) {
  // Upon transaction completion, check the pg system table using the OID to ensure SUCCESS.
  const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOidByTableId(table->id()));
  const auto pg_table_id = GetPgsqlTableId(database_oid, kPgClassTableOid);
  auto table_storage_id = GetPgsqlTableOid(table->id());
  {
    SharedLock lock(mutex_);
    if (matview_pg_table_ids_map_.find(table->id()) != matview_pg_table_ids_map_.end()) {
      table_storage_id = GetPgsqlTableOid(matview_pg_table_ids_map_[table->id()]);
    }
  }
  auto entry_exists = VERIFY_RESULT(
      ysql_transaction_->PgEntryExists(pg_table_id, table_storage_id));
  auto l = table->LockForWrite();
  auto& metadata = table->mutable_metadata()->mutable_dirty()->pb;

  SCHECK(metadata.state() == SysTablesEntryPB::RUNNING ||
         metadata.state() == SysTablesEntryPB::ALTERING, Aborted,
         Substitute("Unexpected table state ($0), abandoning transaction GC work for $1",
                    SysTablesEntryPB_State_Name(metadata.state()), table->ToString()));

  // #5981: Mark un-retryable RPC failures as passed to avoid infinite retry of GC'd txns.
  const bool txn_check_passed = entry_exists || !rpc_success;

  if (txn_check_passed) {
    // Remove the transaction from the entry since we're done processing it.
    metadata.clear_transaction();
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table));
    if (entry_exists) {
      LOG_WITH_PREFIX(INFO) << "Table transaction succeeded: " << table->ToString();
    } else {
      LOG_WITH_PREFIX(WARNING)
          << "Unknown RPC failure, removing transaction on table: " << table->ToString();
    }
    // Commit the in-memory state.
    l.Commit();
  } else {
    LOG(INFO) << "Table transaction failed, deleting: " << table->ToString();
    // Asynchronously enqueue the delete.
    DeleteTableRequestPB del_tbl_req;
    del_tbl_req.mutable_table()->set_table_name(table->name());
    del_tbl_req.mutable_table()->set_table_id(table->id());
    del_tbl_req.set_is_index_table(table->is_index());

    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc([this, del_tbl_req]() {
      DeleteTableResponsePB del_tbl_resp;
      WARN_NOT_OK(DeleteTable(&del_tbl_req, &del_tbl_resp, nullptr),
          "Failed to Delete Table with failed transaction");
    }));
  }
  return Status::OK();
}

Result<TabletInfos> CatalogManager::CreateTabletsFromTable(const vector<Partition>& partitions,
                                                           const TableInfoPtr& table) {
  TabletInfos tablets;
  // Create the TabletInfo objects in state PREPARING.
  for (const Partition& partition : partitions) {
    PartitionPB partition_pb;
    partition.ToPB(&partition_pb);
    tablets.push_back(CreateTabletInfo(table.get(), partition_pb));
  }

  // Add the table/tablets to the in-memory map for the assignment.
  table->AddTablets(tablets);
  auto tablet_map_checkout = tablet_map_.CheckOut();
  for (const TabletInfoPtr& tablet : tablets) {
    InsertOrDie(tablet_map_checkout.get_ptr(), tablet->tablet_id(), tablet);
  }

  return tablets;
}

Status CatalogManager::CheckValidPlacementInfo(const PlacementInfoPB& placement_info,
                                               const TSDescriptorVector& ts_descs,
                                               ValidateReplicationInfoResponsePB* resp) {
  size_t num_live_tservers = ts_descs.size();
  size_t num_replicas = GetNumReplicasFromPlacementInfo(placement_info);
  Status s;
  string msg;

  // Verify that the number of replicas isn't larger than the required number of live tservers.
  // To ensure quorum, we need n/2 + 1 live tservers.
  size_t replica_quorum_needed = num_replicas / 2 + 1;
  if (FLAGS_catalog_manager_check_ts_count_for_create_table &&
      replica_quorum_needed > num_live_tservers) {
    msg = Substitute("Not enough live tablet servers to create table with replication factor $0. "
                     "Need at least $1 tablet servers whereas $2 are alive.",
                     num_replicas, replica_quorum_needed, num_live_tservers);
    LOG(WARNING) << msg
                 << ". Placement info: " << placement_info.ShortDebugString()
                 << ", replication factor flag: " << FLAGS_replication_factor;
    s = STATUS(InvalidArgument, msg);
    return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
  }
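
  // Worked example (illustrative): for num_replicas = 3, replica_quorum_needed
  // is 3 / 2 + 1 = 2 (integer division), so a 3-replica table can be created
  // with only 2 live tservers, but not with 1.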

  // Verify that placement requests are reasonable.
  if (!placement_info.placement_blocks().empty()) {
    size_t minimum_sum = 0;
    for (const auto& pb : placement_info.placement_blocks()) {
      minimum_sum += pb.min_num_replicas();
      if (!pb.has_cloud_info()) {
        msg = Substitute("Got placement info without cloud info set: $0", pb.ShortDebugString());
        s = STATUS(InvalidArgument, msg);
        LOG(WARNING) << msg;
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
      }
    }
    // The total number of replicas requested should be at least the sum of the minimums
    // requested in the individual placement blocks.
    if (minimum_sum > num_replicas) {
      msg = Substitute("Sum of minimum replicas per placement ($0) is greater than num_replicas "
                       "($1)", minimum_sum, num_replicas);
      s = STATUS(InvalidArgument, msg);
      LOG(WARNING) << msg;
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
    }

    // Verify that there are enough tservers in the requested placements
    // to match the total required replication factor.
    auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs));

    // Fail if we don't have enough tablet servers in the areas requested.
    // We need n/2 + 1 for quorum.
    if (allowed_ts.size() < replica_quorum_needed) {
      msg = Substitute("Not enough tablet servers in the requested placements. "
                        "Need at least $0, have $1",
                        replica_quorum_needed, allowed_ts.size());
      s = STATUS(InvalidArgument, msg);
      LOG(WARNING) << msg;
      return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
    }

    // Try allocating tservers for the replicas and see if we can place a quorum
    // number of replicas.
    // Essentially, the logic is:
    // 1. We satisfy whatever we can from the minimums.
    // 2. We then satisfy whatever we can from the slack.
    //    Here it doesn't matter where we put the slack replicas, as long as
    //    the tservers are chosen from any of the valid placement blocks.
    // Overall, if in this process we are able to place n/2 + 1 replicas,
    // then we succeed; otherwise we fail.
    size_t total_extra_replicas = num_replicas - minimum_sum;
    size_t total_feasible_replicas = 0;
    size_t total_extra_servers = 0;
    for (const auto& pb : placement_info.placement_blocks()) {
      auto allowed_ts = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs));
      size_t allowed_ts_size = allowed_ts.size();
      size_t min_num_replicas = pb.min_num_replicas();
      // For every placement block, we can only satisfy up to the number of
      // tservers present in that particular placement block.
      total_feasible_replicas += min(allowed_ts_size, min_num_replicas);
      // Extra tablet servers beyond min_num_replicas will be used to place
      // the extra replicas over and above the minimums.
      if (allowed_ts_size > min_num_replicas) {
        total_extra_servers += allowed_ts_size - min_num_replicas;
      }
    }
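    // Worked example (illustrative): num_replicas = 5 across two placement
    // blocks with min_num_replicas of 2 and 1 (minimum_sum = 3, so 2 slack
    // replicas). If the blocks have 2 and 4 eligible tservers respectively,
    // the minimums contribute min(2, 2) + min(4, 1) = 3 feasible replicas and
    // leave 0 + 3 extra servers; the slack then adds min(2, 3) = 2, for a
    // total of 5 >= replica_quorum_needed (3), so the check below passes.
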
    // The total number of extra replicas that we can place cannot be more than
    // the total number of extra tablet servers.
    total_feasible_replicas += min(total_extra_replicas, total_extra_servers);

    // If we place the replicas in accordance with the above, we should be able to place
    // at least replica_quorum_needed; otherwise we fail.
    if (total_feasible_replicas < replica_quorum_needed) {
      msg = Substitute("Not enough tablet servers in the requested placements. "
                        "Can only find $0 tablet servers for the replicas but need at least "
                        "$1.", total_feasible_replicas, replica_quorum_needed);
      s = STATUS(InvalidArgument, msg);
      LOG(WARNING) << msg;
      return SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s);
    }
  }

  return Status::OK();
}

Status CatalogManager::CreateTableInMemory(const CreateTableRequestPB& req,
                                           const Schema& schema,
                                           const PartitionSchema& partition_schema,
                                           const NamespaceId& namespace_id,
                                           const NamespaceName& namespace_name,
                                           const std::vector<Partition>& partitions,
                                           IndexInfoPB* index_info,
                                           TabletInfos* tablets,
                                           CreateTableResponsePB* resp,
                                           scoped_refptr<TableInfo>* table) {
  // Add the new table in "preparing" state.
  *table = CreateTableInfo(req, schema, partition_schema, namespace_id, namespace_name, index_info);
  const TableId& table_id = (*table)->id();

  VLOG_WITH_PREFIX_AND_FUNC(2)
      << "Table: " << (**table).ToString() << ", create_tablets: " << (tablets ? "YES" : "NO");

  auto table_ids_map_checkout = table_ids_map_.CheckOut();
  (*table_ids_map_checkout)[table_id] = *table;
  // Do not add Postgres tables to the name map as the table name is not unique in a namespace.
  if (req.table_type() != PGSQL_TABLE_TYPE) {
    table_names_map_[{namespace_id, req.name()}] = *table;
  }

  if (req.table_type() == TRANSACTION_STATUS_TABLE_TYPE) {
    transaction_table_ids_set_.insert(table_id);
  }

  if (tablets) {
    *tablets = VERIFY_RESULT(CreateTabletsFromTable(partitions, *table));
  }

  if (resp != nullptr) {
    resp->set_table_id(table_id);
  }

  HandleNewTableId(table_id);

  return Status::OK();
}

Result<bool> CatalogManager::TableExists(
    const std::string& namespace_name, const std::string& table_name) const {
  TableIdentifierPB table_id_pb;
  table_id_pb.set_table_name(table_name);
  table_id_pb.mutable_namespace_()->set_name(namespace_name);
  return DoesTableExist(FindTable(table_id_pb));
}
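
// Usage note (illustrative): this is how the bootstrap paths below check for a
// pre-existing table, e.g. TableExists(kSystemNamespaceName,
// kGlobalTransactionsTableName) before creating the global transaction status
// table.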

CHECKED_STATUS CatalogManager::CreateTransactionStatusTable(
    const CreateTransactionStatusTableRequestPB* req, CreateTransactionStatusTableResponsePB* resp,
    rpc::RpcContext *rpc) {
  const string& table_name = req->table_name();
  Status s = CreateTransactionStatusTableInternal(rpc, table_name, nullptr /* tablespace_id */);
  if (s.IsAlreadyPresent()) {
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
  }
  if (!s.ok()) {
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
  }
  return Status::OK();
}

CHECKED_STATUS CatalogManager::CreateTransactionStatusTableInternal(
    rpc::RpcContext *rpc, const string& table_name, const TablespaceId* tablespace_id) {
  if (VERIFY_RESULT(TableExists(kSystemNamespaceName, table_name))) {
    return STATUS_SUBSTITUTE(AlreadyPresent, "Table already exists: $0", table_name);
  }

  LOG(INFO) << "Creating transaction status table " << table_name;
  // Set up a CreateTable request internally.
  CreateTableRequestPB req;
  CreateTableResponsePB resp;
  req.set_name(table_name);
  req.mutable_namespace_()->set_name(kSystemNamespaceName);
  req.set_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE);
  if (tablespace_id) {
    req.set_tablespace_id(*tablespace_id);
  }

  // Explicitly set the number of tablets if the corresponding flag is set; otherwise CreateTable
  // will use the same defaults as for regular tables.
  int num_tablets;
  if (FLAGS_transaction_table_num_tablets > 0) {
    num_tablets = FLAGS_transaction_table_num_tablets;
  } else {
    auto placement_uuid =
        cluster_config_->LockForRead()->pb.replication_info().live_replicas().placement_uuid();
    num_tablets = narrow_cast<int>(GetNumLiveTServersForPlacement(placement_uuid) *
                                   FLAGS_transaction_table_num_tablets_per_tserver);
  }
  req.mutable_schema()->mutable_table_properties()->set_num_tablets(num_tablets);

  ColumnSchema hash(kRedisKeyColumnName, BINARY, /* is_nullable */ false, /* is_hash_key */ true);
  ColumnSchemaToPB(hash, req.mutable_schema()->mutable_columns()->Add());

  Status s = CreateTable(&req, &resp, rpc);
  // We do not lock here, so it is technically possible that the table was already created.
  // If so, there is nothing to do, so we just ignore the "AlreadyPresent" error.
  if (!s.ok() && !s.IsAlreadyPresent()) {
    return s;
  }

  return Status::OK();
}
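
// Worked example (illustrative): with FLAGS_transaction_table_num_tablets
// unset (<= 0), 3 live tservers in the placement, and
// FLAGS_transaction_table_num_tablets_per_tserver = 4, the transaction status
// table above is created with 3 * 4 = 12 tablets.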

bool CatalogManager::DoesTransactionTableExistForTablespace(const TablespaceId& tablespace_id) {
  SharedLock lock(mutex_);
  for (const auto& table_id : transaction_table_ids_set_) {
    auto table = table_ids_map_->find(table_id);
    if (table == table_ids_map_->end()) {
      LOG(DFATAL) << "Table uuid " << table_id
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
      continue;
    }
    auto this_tablespace_id = GetTransactionStatusTableTablespace(table->second);
    if (this_tablespace_id && *this_tablespace_id == tablespace_id) {
      return true;
    }
  }
  return false;
}

CHECKED_STATUS CatalogManager::CreateLocalTransactionStatusTableIfNeeded(
    rpc::RpcContext *rpc, const TablespaceId& tablespace_id) {
  std::lock_guard<std::mutex> lock(tablespace_transaction_table_creation_mutex_);

  if (DoesTransactionTableExistForTablespace(tablespace_id)) {
    VLOG(1) << "Transaction status table already exists, not creating.";
    return Status::OK();
  }

  std::string table_name;
  if (FLAGS_TEST_name_transaction_tables_with_tablespace_id) {
    uint32_t tablespace_oid = VERIFY_RESULT(GetPgsqlTablespaceOid(tablespace_id));
    table_name = kTransactionTablePrefix + std::to_string(tablespace_oid);
  } else {
    std::string uuid;
    RETURN_NOT_OK(yb::Uuid::Generate().ToString(&uuid));
    table_name = kTransactionTablePrefix + uuid;
  }

  return CreateTransactionStatusTableInternal(rpc, table_name, &tablespace_id);
}

CHECKED_STATUS CatalogManager::CreateGlobalTransactionStatusTableIfNeeded(rpc::RpcContext *rpc) {
  Status s = CreateTransactionStatusTableInternal(
      rpc, kGlobalTransactionsTableName, nullptr /* tablespace_id */);
  if (s.IsAlreadyPresent()) {
    VLOG(1) << "Transaction status table already exists, not creating.";
    return Status::OK();
  }
  return s;
}

CHECKED_STATUS CatalogManager::GetGlobalTransactionStatusTablets(
    GetTransactionStatusTabletsResponsePB* resp) {
  TableIdentifierPB global_txn_table_identifier;
  global_txn_table_identifier.set_table_name(kGlobalTransactionsTableName);
  global_txn_table_identifier.mutable_namespace_()->set_name(kSystemNamespaceName);
  scoped_refptr<TableInfo> global_txn_table = VERIFY_RESULT(FindTable(global_txn_table_identifier));

  RETURN_NOT_OK(WaitForCreateTableToFinish(global_txn_table->id()));

  auto l = global_txn_table->LockForRead();
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));

  for (const auto& tablet : global_txn_table->GetTablets()) {
    TabletLocationsPB locs_pb;
    RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb));
    resp->add_global_tablet_id(tablet->tablet_id());
  }

  return Status::OK();
}

Result<std::vector<TableId>> CatalogManager::GetPlacementLocalTransactionStatusTables(
    const CloudInfoPB& placement) {
  std::vector<TableId> same_placement_transaction_tables;
  auto tablespace_manager = GetTablespaceManager();

  SharedLock lock(mutex_);
  for (const auto& table_id : transaction_table_ids_set_) {
    auto table = table_ids_map_->find(table_id);
    if (table == table_ids_map_->end()) {
      LOG(DFATAL) << "Table uuid " << table_id
                  << " in transaction_table_ids_set_ but not in table_ids_map_";
      continue;
    }
    // system.transactions is filtered out because it cannot have a placement set.
    auto table_info = table->second;
    auto lock = table_info->LockForRead();
    auto tablespace_id = GetTransactionStatusTableTablespace(table_info);
    auto cloud_info = lock->pb.replication_info();
    if (!IsReplicationInfoSet(cloud_info)) {
      if (tablespace_id) {
        const auto result = tablespace_manager->GetTablespaceReplicationInfo(*tablespace_id);
        if (!result.ok() || !*result || !IsReplicationInfoSet(**result)) {
          continue;
        }
        cloud_info = **result;
      }
    }
    const auto& txn_table_replicas = cloud_info.live_replicas();
    // Skip transaction tables spanning multiple regions, since using them will incur global
    // latencies. See #11268.
    if (CatalogManagerUtil::DoesPlacementInfoSpanMultipleRegions(txn_table_replicas)) {
      continue;
    }
    if (CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(txn_table_replicas, placement)) {
      same_placement_transaction_tables.push_back(table_id);
    }
  }

  return same_placement_transaction_tables;
}

CHECKED_STATUS CatalogManager::GetPlacementLocalTransactionStatusTablets(
    const CloudInfoPB& placement,
    GetTransactionStatusTabletsResponsePB* resp) {
  auto same_placement_transaction_tables = VERIFY_RESULT(GetPlacementLocalTransactionStatusTables(
      placement));

  if (!same_placement_transaction_tables.empty()) {
    for (const auto& table_id : same_placement_transaction_tables) {
      RETURN_NOT_OK(WaitForCreateTableToFinish(table_id));
    }

    SharedLock lock(mutex_);
    for (const auto& table_id : same_placement_transaction_tables) {
      if (!table_ids_map_->count(table_id)) {
        Status s = STATUS_FORMAT(
            NotFound, "Transaction table with id $0 does not exist", table_id);
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
      }

      auto& table_info = *table_ids_map_->at(table_id);
      auto lock = table_info.LockForRead();
      for (const auto& tablet : table_info.GetTablets()) {
        TabletLocationsPB locs_pb;
        RETURN_NOT_OK(BuildLocationsForTablet(tablet, &locs_pb));
        resp->add_placement_local_tablet_id(tablet->tablet_id());
      }
    }
  }

  return Status::OK();
}

CHECKED_STATUS CatalogManager::GetTransactionStatusTablets(
    const GetTransactionStatusTabletsRequestPB* req,
    GetTransactionStatusTabletsResponsePB* resp,
    rpc::RpcContext *rpc) {
  RETURN_NOT_OK(GetGlobalTransactionStatusTablets(resp));

  if (req->has_placement()) {
    RETURN_NOT_OK(GetPlacementLocalTransactionStatusTablets(req->placement(), resp));
  }

  return Status::OK();
}

Status CatalogManager::CreateMetricsSnapshotsTableIfNeeded(rpc::RpcContext *rpc) {
  if (VERIFY_RESULT(TableExists(kSystemNamespaceName, kMetricsSnapshotsTableName))) {
    return Status::OK();
  }

  // Set up a CreateTable request internally.
  CreateTableRequestPB req;
  CreateTableResponsePB resp;
  req.set_name(kMetricsSnapshotsTableName);
  req.mutable_namespace_()->set_name(kSystemNamespaceName);
  req.set_table_type(TableType::YQL_TABLE_TYPE);

  // Explicitly set the number of tablets if the corresponding flag is set; otherwise CreateTable
  // will use the same defaults as for regular tables.
  if (FLAGS_metrics_snapshots_table_num_tablets > 0) {
    req.mutable_schema()->mutable_table_properties()->set_num_tablets(
        FLAGS_metrics_snapshots_table_num_tablets);
  }

  // Schema description: "node" refers to the tserver uuid. "entity_type" can be either
  // "tserver" or "table". "entity_id" is the uuid of the corresponding tserver or table.
  // "metric" is the name of the metric and "value" is its value. "ts" is the time at
  // which the snapshot was recorded. "details" is a json column for future extensibility.

  YBSchemaBuilder schemaBuilder;
  schemaBuilder.AddColumn("node")->Type(STRING)->HashPrimaryKey()->NotNull();
  schemaBuilder.AddColumn("entity_type")->Type(STRING)->PrimaryKey()->NotNull();
  schemaBuilder.AddColumn("entity_id")->Type(STRING)->PrimaryKey()->NotNull();
  schemaBuilder.AddColumn("metric")->Type(STRING)->PrimaryKey()->NotNull();
  schemaBuilder.AddColumn("ts")->Type(TIMESTAMP)->PrimaryKey()->NotNull()->
    SetSortingType(SortingType::kDescending);
  schemaBuilder.AddColumn("value")->Type(INT64);
  schemaBuilder.AddColumn("details")->Type(JSONB);

  YBSchema ybschema;
  CHECK_OK(schemaBuilder.Build(&ybschema));

  auto schema = yb::client::internal::GetSchema(ybschema);
  SchemaToPB(schema, req.mutable_schema());

  Status s = CreateTable(&req, &resp, rpc);
  // We do not lock here, so it is technically possible that the table was already created.
  // If so, there is nothing to do, so we just ignore the "AlreadyPresent" error.
  if (s.IsAlreadyPresent()) {
    return Status::OK();
  }
  return s;
}
4107
4108
Status CatalogManager::IsCreateTableDone(const IsCreateTableDoneRequestPB* req,
4109
20.3k
                                         IsCreateTableDoneResponsePB* resp) {
4110
20.3k
  TRACE("Looking up table");
4111
  // 1. Lookup the table and verify if it exists.
4112
20.3k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
4113
4114
20.3k
  TRACE("Locking table");
4115
20.3k
  auto l = table->LockForRead();
4116
20.3k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
4117
20.3k
  const auto& pb = l->pb;
4118
4119
  // 2. Verify if the create is in-progress.
4120
20.3k
  TRACE("Verify if the table creation is in progress for $0", table->ToString());
4121
20.3k
  resp->set_done(!table->IsCreateInProgress());
4122
4123
  // 3. Set any current errors, if we are experiencing issues creating the table. This will be
4124
  // bubbled up to the MasterService layer. If it is an error, it gets wrapped around in
4125
  // MasterErrorPB::UNKNOWN_ERROR.
4126
20.3k
  RETURN_NOT_OK(table->GetCreateTableErrorStatus());
4127
4128
  // 4. If this is an index, we are not done until the index is in the indexed table's schema.  An
4129
  // exception is YSQL system table indexes, which don't get added to their indexed tables' schemas.
4130
20.3k
  if (resp->done() && IsIndex(pb)) {
4131
690
    auto& indexed_table_id = GetIndexedTableId(pb);
4132
    // For user indexes (which add index info to indexed table's schema),
4133
    // - if this index is created without backfill,
4134
    //   - waiting for the index to be in the indexed table's schema is sufficient, and, by that
4135
    //     point, things are fully created.
4136
    // - if this index is created with backfill
4137
    //   - and it's YCQL,
4138
    //     - waiting for the index to be in the indexed table's schema means waiting for the
4139
    //       DELETE_ONLY index permission, and it's fine to return to the client before the index
4140
    //       gets the rest of the permissions because the expectation is that backfill will be
4141
    //       completed asynchronously.
4142
    //   - and it's YSQL,
4143
    //     - waiting for the index to be in the indexed table's schema means just that (DocDB index
4144
    //       permissions don't really matter for YSQL besides being used for backfill purposes), and
4145
    //       it's a signal for postgres to continue the index backfill process, activating index
4146
    //       state flags then later triggering backfill and so on.
4147
    // For YSQL system indexes (which don't add index info to indexed table's schema),
4148
    // - there's nothing additional to wait on.
4149
    // Therefore, the only thing needed here is to check whether the index info is in the indexed
4150
    // table's schema for user indexes.
4151
690
    if (pb.table_type() == YQL_TABLE_TYPE ||
4152
690
        (pb.table_type() == PGSQL_TABLE_TYPE && IsUserCreatedTable(*table))) {
4153
690
      GetTableSchemaRequestPB get_schema_req;
4154
690
      GetTableSchemaResponsePB get_schema_resp;
4155
690
      get_schema_req.mutable_table()->set_table_id(indexed_table_id);
4156
690
      const bool get_fully_applied_indexes = true;
4157
690
      const Status s = GetTableSchemaInternal(&get_schema_req,
4158
690
                                              &get_schema_resp,
4159
690
                                              get_fully_applied_indexes);
4160
690
      if (!s.ok()) {
4161
0
        resp->mutable_error()->Swap(get_schema_resp.mutable_error());
4162
0
        return s;
4163
0
      }
4164
4165
690
      resp->set_done(false);
4166
1.30k
      for (const auto& index : get_schema_resp.indexes()) {
4167
1.30k
        if (index.has_table_id() && index.table_id() == table->id()) {
4168
606
          resp->set_done(true);
4169
606
          break;
4170
606
        }
4171
1.30k
      }
4172
690
    }
4173
690
  }
4174
4175
  // Sanity check that this table is present in system.partitions if it is a YCQL table.
4176
  // Only check if we are automatically generating the vtable on changes. If we are creating via
4177
  // the bg task, then there may be a delay.
4178
20.3k
  if (DCHECK_IS_ON() &&
4179
20.3k
      resp->done() &&
4180
7.46k
      IsYcqlTable(*table) &&
4181
1.91k
      YQLPartitionsVTable::GeneratePartitionsVTableOnChanges() &&
4182
1.91k
      FLAGS_TEST_catalog_manager_check_yql_partitions_exist_for_is_create_table_done) {
4183
1.91k
    Schema schema;
4184
1.91k
    RETURN_NOT_OK(table->GetSchema(&schema));
4185
    // Copartitioned tables don't actually create tablets currently (unimplemented), so ignore them.
4186
1.91k
    if (!schema.table_properties().HasCopartitionTableId()) {
4187
1.91k
      DCHECK(GetYqlPartitionsVtable().CheckTableIsPresent(table->id(), table->NumPartitions()));
4188
1.91k
    }
4189
1.91k
  }
4190
4191
  // If this is a transactional table, we are not done until the transaction status table is created.
4192
  // However, if we are currently initializing the system catalog snapshot, we don't create the
4193
  // transactions table.
4194
20.3k
  if (!FLAGS_create_initial_sys_catalog_snapshot &&
4195
20.3k
      resp->done() && pb.schema().table_properties().is_transactional()) {
4196
2.08k
    RETURN_NOT_OK(IsTransactionStatusTableCreated(resp));
4197
2.08k
  }
4198
4199
  // If the metrics snapshotter is enabled, we are not done until the metrics snapshots table itself is created.
4200
20.3k
  if (FLAGS_master_enable_metrics_snapshotter && resp->done() &&
4201
0
      !(table->GetTableType() == TableType::YQL_TABLE_TYPE &&
4202
0
        table->namespace_id() == kSystemNamespaceId &&
4203
0
        table->name() == kMetricsSnapshotsTableName)) {
4204
0
    RETURN_NOT_OK(IsMetricsSnapshotsTableCreated(resp));
4205
0
  }
4206
4207
  // If this is a colocated table and there is a pending AddTableToTablet task, then we are not done.
4208
20.3k
  if (resp->done() && pb.colocated()) {
4209
57
    resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_ADD_TABLE_TO_TABLET));
4210
57
  }
4211
4212
20.3k
  return Status::OK();
4213
20.3k
}
4214
4215
Status CatalogManager::IsCreateTableInProgress(const TableId& table_id,
4216
                                               CoarseTimePoint deadline,
4217
1.99k
                                               bool* create_in_progress) {
4218
1.99k
  DCHECK_ONLY_NOTNULL(create_in_progress);
4219
1.99k
  DCHECK(!table_id.empty());
4220
4221
1.99k
  IsCreateTableDoneRequestPB req;
4222
1.99k
  IsCreateTableDoneResponsePB resp;
4223
1.99k
  req.mutable_table()->set_table_id(table_id);
4224
1.99k
  RETURN_NOT_OK(IsCreateTableDone(&req, &resp));
4225
4226
1.99k
  if (resp.has_error()) {
4227
0
    return StatusFromPB(resp.error().status());
4228
0
  }
4229
4230
1.99k
  *create_in_progress = !resp.done();
4231
1.99k
  return Status::OK();
4232
1.99k
}
4233
4234
2.01k
Status CatalogManager::WaitForCreateTableToFinish(const TableId& table_id) {
4235
2.01k
  MonoDelta default_admin_operation_timeout(
4236
2.01k
      MonoDelta::FromSeconds(FLAGS_yb_client_admin_operation_timeout_sec));
4237
2.01k
  auto deadline = CoarseMonoClock::Now() + default_admin_operation_timeout;
4238
4239
2.01k
  return client::RetryFunc(
4240
2.01k
      deadline, "Waiting on Create Table to be completed", "Timed out waiting for Table Creation",
4241
2.01k
      std::bind(&CatalogManager::IsCreateTableInProgress, this, table_id, _1, _2));
4242
2.01k
}
4243
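// [Editor's note, illustrative] In the RetryFunc call above, std::bind's _1
// and _2 placeholders map to IsCreateTableInProgress's trailing
// (CoarseTimePoint deadline, bool* create_in_progress) parameters; RetryFunc
// keeps invoking the callback until it stops reporting "in progress" or the
// deadline expires. The same call spelled with a lambda (a sketch, not the
// original code):
//
//   return client::RetryFunc(
//       deadline, "Waiting on Create Table to be completed",
//       "Timed out waiting for Table Creation",
//       [this, &table_id](CoarseTimePoint d, bool* in_progress) {
//         return IsCreateTableInProgress(table_id, d, in_progress);
//       });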
4244
2.08k
Status CatalogManager::IsTransactionStatusTableCreated(IsCreateTableDoneResponsePB* resp) {
4245
2.08k
  IsCreateTableDoneRequestPB req;
4246
4247
2.08k
  req.mutable_table()->set_table_name(kGlobalTransactionsTableName);
4248
2.08k
  req.mutable_table()->mutable_namespace_()->set_name(kSystemNamespaceName);
4249
4250
2.08k
  return IsCreateTableDone(&req, resp);
4251
2.08k
}
4252
4253
0
Status CatalogManager::IsMetricsSnapshotsTableCreated(IsCreateTableDoneResponsePB* resp) {
4254
0
  IsCreateTableDoneRequestPB req;
4255
4256
0
  req.mutable_table()->set_table_name(kMetricsSnapshotsTableName);
4257
0
  req.mutable_table()->mutable_namespace_()->set_name(kSystemNamespaceName);
4258
0
  req.mutable_table()->mutable_namespace_()->set_database_type(YQLDatabase::YQL_DATABASE_CQL);
4259
4260
0
  return IsCreateTableDone(&req, resp);
4261
0
}
4262
4263
7
std::string CatalogManager::GenerateId(boost::optional<const SysRowEntryType> entity_type) {
4264
7
  SharedLock lock(mutex_);
4265
7
  return GenerateIdUnlocked(entity_type);
4266
7
}
4267
4268
std::string CatalogManager::GenerateIdUnlocked(
4269
83.9k
    boost::optional<const SysRowEntryType> entity_type) {
4270
83.9k
  while (true) {
4271
    // Generate id and make sure it is unique within its category.
4272
83.9k
    std::string id = GenerateObjectId();
4273
83.9k
    if (!entity_type) {
4274
7
      return id;
4275
7
    }
4276
83.9k
    switch (*entity_type) {
4277
2.03k
      case SysRowEntryType::NAMESPACE:
4278
2.03k
        if (FindPtrOrNull(namespace_ids_map_, id) == nullptr) return id;
4279
0
        break;
4280
27.9k
      case SysRowEntryType::TABLE:
4281
27.9k
        if (FindPtrOrNull(*table_ids_map_, id) == nullptr) return id;
4282
0
        break;
4283
53.7k
      case SysRowEntryType::TABLET:
4284
53.7k
        if (FindPtrOrNull(*tablet_map_, id) == nullptr) return id;
4285
0
        break;
4286
45
      case SysRowEntryType::UDTYPE:
4287
45
        if (FindPtrOrNull(udtype_ids_map_, id) == nullptr) return id;
4288
0
        break;
4289
0
      case SysRowEntryType::SNAPSHOT:
4290
0
        return id;
4291
157
      case SysRowEntryType::CDC_STREAM:
4292
157
        if (!CDCStreamExistsUnlocked(id)) return id;
4293
0
        break;
4294
0
      case SysRowEntryType::CLUSTER_CONFIG: FALLTHROUGH_INTENDED;
4295
0
      case SysRowEntryType::ROLE: FALLTHROUGH_INTENDED;
4296
0
      case SysRowEntryType::REDIS_CONFIG: FALLTHROUGH_INTENDED;
4297
0
      case SysRowEntryType::UNIVERSE_REPLICATION: FALLTHROUGH_INTENDED;
4298
0
      case SysRowEntryType::SYS_CONFIG: FALLTHROUGH_INTENDED;
4299
0
      case SysRowEntryType::SNAPSHOT_SCHEDULE: FALLTHROUGH_INTENDED;
4300
0
      case SysRowEntryType::DDL_LOG_ENTRY: FALLTHROUGH_INTENDED;
4301
0
      case SysRowEntryType::UNKNOWN:
4302
0
        LOG(DFATAL) << "Invalid id type: " << *entity_type;
4303
0
        return id;
4304
83.9k
    }
4305
83.9k
  }
4306
83.9k
}
4307
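// [Editor's note, illustrative] GenerateIdUnlocked() above is a plain
// "generate, then probe the category's id map for a collision" loop. The
// same pattern in generic form (hypothetical helper, not from this file):
//
//   template <typename MapType>
//   std::string GenerateUniqueIdFor(const MapType& existing_ids) {
//     while (true) {
//       std::string id = GenerateObjectId();  // random oid, as used above
//       if (existing_ids.find(id) == existing_ids.end()) {
//         return id;  // no collision within this category
//       }
//     }
//   }
//
// Note that uniqueness is only enforced within each category's own map, which
// is why the switch above probes a different map per entity type.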
4308
scoped_refptr<TableInfo> CatalogManager::CreateTableInfo(const CreateTableRequestPB& req,
4309
                                                         const Schema& schema,
4310
                                                         const PartitionSchema& partition_schema,
4311
                                                         const NamespaceId& namespace_id,
4312
                                                         const NamespaceName& namespace_name,
4313
31.6k
                                                         IndexInfoPB* index_info) {
4314
31.6k
  DCHECK(schema.has_column_ids());
4315
31.6k
  TableId table_id
4316
27.9k
      = !req.table_id().empty() ? req.table_id() : GenerateIdUnlocked(SysRowEntryType::TABLE);
4317
31.6k
  scoped_refptr<TableInfo> table = NewTableInfo(table_id);
4318
31.6k
  if (req.has_tablespace_id()) {
4319
3
    table->SetTablespaceIdForTableCreation(req.tablespace_id());
4320
3
  }
4321
31.6k
  table->mutable_metadata()->StartMutation();
4322
31.6k
  SysTablesEntryPB *metadata = &table->mutable_metadata()->mutable_dirty()->pb;
4323
31.6k
  metadata->set_state(SysTablesEntryPB::PREPARING);
4324
31.6k
  metadata->set_name(req.name());
4325
31.6k
  metadata->set_table_type(req.table_type());
4326
31.6k
  metadata->set_namespace_id(namespace_id);
4327
31.6k
  metadata->set_namespace_name(namespace_name);
4328
31.6k
  metadata->set_version(0);
4329
31.6k
  metadata->set_next_column_id(ColumnId(schema.max_col_id() + 1));
4330
31.6k
  if (req.has_replication_info()) {
4331
1
    metadata->mutable_replication_info()->CopyFrom(req.replication_info());
4332
1
  }
4333
  // Use the Schema object passed in, since it has the column IDs already assigned,
4334
  // whereas the user request PB does not.
4335
31.6k
  SchemaToPB(schema, metadata->mutable_schema());
4336
31.6k
  partition_schema.ToPB(metadata->mutable_partition_schema());
4337
  // For an index table, set index details (indexed table id and whether the index is local).
4338
31.6k
  if (req.has_index_info()) {
4339
1.59k
    metadata->mutable_index_info()->CopyFrom(req.index_info());
4340
4341
    // Set the deprecated fields also for compatibility reasons.
4342
1.59k
    metadata->set_indexed_table_id(req.index_info().indexed_table_id());
4343
1.59k
    metadata->set_is_local_index(req.index_info().is_local());
4344
1.59k
    metadata->set_is_unique_index(req.index_info().is_unique());
4345
4346
    // Setup index info.
4347
1.59k
    if (index_info != nullptr) {
4348
586
      index_info->set_table_id(table->id());
4349
586
      metadata->mutable_index_info()->CopyFrom(*index_info);
4350
586
    }
4351
30.0k
  } else if (req.has_indexed_table_id()) {
4352
    // Read data from the deprecated field and update the new fields.
4353
18
    metadata->mutable_index_info()->set_indexed_table_id(req.indexed_table_id());
4354
18
    metadata->mutable_index_info()->set_is_local(req.is_local_index());
4355
18
    metadata->mutable_index_info()->set_is_unique(req.is_unique_index());
4356
4357
    // Set the deprecated fields also for compatibility reasons.
4358
18
    metadata->set_indexed_table_id(req.indexed_table_id());
4359
18
    metadata->set_is_local_index(req.is_local_index());
4360
18
    metadata->set_is_unique_index(req.is_unique_index());
4361
4362
    // Setup index info.
4363
18
    if (index_info != nullptr) {
4364
18
      index_info->set_table_id(table->id());
4365
18
      metadata->mutable_index_info()->CopyFrom(*index_info);
4366
18
    }
4367
18
  }
4368
4369
31.6k
  if (req.is_pg_shared_table()) {
4370
0
    metadata->set_is_pg_shared_table(true);
4371
0
  }
4372
4373
31.6k
  return table;
4374
31.6k
}
4375
4376
TabletInfoPtr CatalogManager::CreateTabletInfo(TableInfo* table,
4377
53.7k
                                               const PartitionPB& partition) {
4378
53.7k
  auto tablet = make_scoped_refptr<TabletInfo>(table, GenerateIdUnlocked(SysRowEntryType::TABLET));
4379
0
  VLOG_WITH_PREFIX_AND_FUNC(2)
4380
0
      << "Table: " << table->ToString() << ", tablet: " << tablet->ToString();
4381
4382
53.7k
  tablet->mutable_metadata()->StartMutation();
4383
53.7k
  SysTabletsEntryPB *metadata = &tablet->mutable_metadata()->mutable_dirty()->pb;
4384
53.7k
  metadata->set_state(SysTabletsEntryPB::PREPARING);
4385
53.7k
  metadata->mutable_partition()->CopyFrom(partition);
4386
53.7k
  metadata->set_table_id(table->id());
4387
  // This is important: we are setting the first table id in the table_ids list
4388
  // to be the id of the original table that creates the tablet.
4389
53.7k
  metadata->add_table_ids(table->id());
4390
53.7k
  return tablet;
4391
53.7k
}
4392
4393
Status CatalogManager::RemoveTableIdsFromTabletInfo(
4394
    TabletInfoPtr tablet_info,
4395
48
    std::unordered_set<TableId> tables_to_remove) {
4396
48
  auto tablet_lock = tablet_info->LockForWrite();
4397
4398
48
  google::protobuf::RepeatedPtrField<std::string> new_table_ids;
4399
14.1k
  for (const auto& table_id : tablet_lock->pb.table_ids()) {
4400
14.1k
    if (tables_to_remove.find(table_id) == tables_to_remove.end()) {
4401
11.9k
      *new_table_ids.Add() = std::move(table_id);
4402
11.9k
    }
4403
14.1k
  }
4404
48
  tablet_lock.mutable_data()->pb.mutable_table_ids()->Swap(&new_table_ids);
4405
4406
48
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_info));
4407
44
  tablet_lock.Commit();
4408
44
  return Status::OK();
4409
48
}
4410
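// [Editor's note, illustrative] RemoveTableIdsFromTabletInfo() above follows
// the copy-on-write mutation protocol used throughout this file: lock for
// write, edit the dirty protobuf, persist via sys_catalog_->Upsert(), and
// only Commit() the in-memory copy once the write succeeds (an early return
// on error abandons the mutation, so readers never observe it). Condensed:
//
//   auto lock = info->LockForWrite();           // readers still see the old pb
//   lock.mutable_data()->pb.set_state(...);     // edit only the dirty copy
//   RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), info));  // disk first
//   lock.Commit();                              // then publish to readers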
4411
Result<scoped_refptr<TableInfo>> CatalogManager::FindTable(
4412
434k
    const TableIdentifierPB& table_identifier) const {
4413
434k
  SharedLock lock(mutex_);
4414
434k
  return FindTableUnlocked(table_identifier);
4415
434k
}
4416
4417
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableUnlocked(
4418
434k
    const TableIdentifierPB& table_identifier) const {
4419
434k
  if (table_identifier.has_table_id()) {
4420
320k
    return FindTableByIdUnlocked(table_identifier.table_id());
4421
320k
  }
4422
4423
114k
  if (table_identifier.has_table_name()) {
4424
114k
    auto namespace_info = VERIFY_RESULT(FindNamespaceUnlocked(table_identifier.namespace_()));
4425
4426
    // We can't look up a YSQL table by name because the Postgres concept of "schemas"
4427
    // introduces ambiguity.
4428
114k
    if (namespace_info->database_type() == YQL_DATABASE_PGSQL) {
4429
0
      return STATUS(InvalidArgument, "Cannot lookup YSQL table by name");
4430
0
    }
4431
4432
114k
    auto it = table_names_map_.find({namespace_info->id(), table_identifier.table_name()});
4433
114k
    if (it == table_names_map_.end()) {
4434
3.01k
      return STATUS_EC_FORMAT(
4435
3.01k
          NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
4436
3.01k
          "Table $0.$1 not found", namespace_info->name(), table_identifier.table_name());
4437
3.01k
    }
4438
111k
    return it->second;
4439
111k
  }
4440
4441
11
  return STATUS(InvalidArgument, "Neither table id or table name are specified",
4442
11
                table_identifier.ShortDebugString());
4443
11
}
4444
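// [Editor's note, illustrative] A TableIdentifierPB for the lookups above can
// name the table either by id or by (namespace, name); a sketch of the latter
// (the identifier values are placeholders):
//
//   TableIdentifierPB ident;
//   ident.set_table_name("my_table");
//   ident.mutable_namespace_()->set_name("my_keyspace");
//   auto table = VERIFY_RESULT(catalog_manager->FindTable(ident));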
4445
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableById(
4446
3.53k
    const TableId& table_id) const {
4447
3.53k
  SharedLock lock(mutex_);
4448
3.53k
  return FindTableByIdUnlocked(table_id);
4449
3.53k
}
4450
4451
Result<scoped_refptr<TableInfo>> CatalogManager::FindTableByIdUnlocked(
4452
323k
    const TableId& table_id) const {
4453
323k
  auto it = table_ids_map_->find(table_id);
4454
323k
  if (it == table_ids_map_->end()) {
4455
76
    return STATUS_EC_FORMAT(
4456
76
        NotFound, MasterError(MasterErrorPB::OBJECT_NOT_FOUND),
4457
76
        "Table with identifier $0 not found", table_id);
4458
76
  }
4459
323k
  return it->second;
4460
323k
}
4461
4462
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceById(
4463
623k
    const NamespaceId& id) const {
4464
623k
  SharedLock lock(mutex_);
4465
623k
  return FindNamespaceByIdUnlocked(id);
4466
623k
}
4467
4468
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceByIdUnlocked(
4469
659k
    const NamespaceId& id) const {
4470
659k
  auto it = namespace_ids_map_.find(id);
4471
659k
  if (it == namespace_ids_map_.end()) {
4472
0
    VLOG_WITH_FUNC(4) << "Not found: " << id << "\n" << GetStackTrace();
4473
3
    return STATUS(NotFound, "Keyspace identifier not found", id,
4474
3
                  MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4475
3
  }
4476
659k
  return it->second;
4477
659k
}
4478
4479
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespaceUnlocked(
4480
156k
    const NamespaceIdentifierPB& ns_identifier) const {
4481
156k
  if (ns_identifier.has_id()) {
4482
35.9k
    return FindNamespaceByIdUnlocked(ns_identifier.id());
4483
35.9k
  }
4484
4485
121k
  if (ns_identifier.has_name()) {
4486
120k
    auto db = GetDatabaseType(ns_identifier);
4487
120k
    auto it = namespace_names_mapper_[db].find(ns_identifier.name());
4488
120k
    if (it == namespace_names_mapper_[db].end()) {
4489
1.70k
      return STATUS(NotFound, "Keyspace name not found", ns_identifier.name(),
4490
1.70k
                    MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4491
1.70k
    }
4492
119k
    return it->second;
4493
119k
  }
4494
4495
4
  LOG(DFATAL) << __func__ << ": " << ns_identifier.ShortDebugString() << ", \n" << GetStackTrace();
4496
4
  return STATUS(NotFound, "Neither keyspace id nor keyspace name is specified",
4497
4
                ns_identifier.ShortDebugString(), MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
4498
4
}
4499
4500
Result<scoped_refptr<NamespaceInfo>> CatalogManager::FindNamespace(
4501
14.4k
    const NamespaceIdentifierPB& ns_identifier) const {
4502
14.4k
  SharedLock lock(mutex_);
4503
14.4k
  return FindNamespaceUnlocked(ns_identifier);
4504
14.4k
}
4505
4506
Result<TableDescription> CatalogManager::DescribeTable(
4507
0
    const TableIdentifierPB& table_identifier, bool succeed_if_create_in_progress) {
4508
0
  TRACE("Looking up table");
4509
0
  return DescribeTable(VERIFY_RESULT(FindTable(table_identifier)), succeed_if_create_in_progress);
4510
0
}
4511
4512
Result<TableDescription> CatalogManager::DescribeTable(
4513
7
    const TableInfoPtr& table_info, bool succeed_if_create_in_progress) {
4514
7
  TableDescription result;
4515
7
  result.table_info = table_info;
4516
7
  NamespaceId namespace_id;
4517
7
  {
4518
7
    TRACE("Locking table");
4519
7
    auto l = table_info->LockForRead();
4520
4521
7
    if (!succeed_if_create_in_progress && table_info->IsCreateInProgress()) {
4522
0
      return STATUS(IllegalState, "Table creation is in progress", table_info->ToString(),
4523
0
                    MasterError(MasterErrorPB::TABLE_CREATION_IS_IN_PROGRESS));
4524
0
    }
4525
4526
7
    result.tablet_infos = table_info->GetTablets();
4527
4528
7
    namespace_id = table_info->namespace_id();
4529
7
  }
4530
4531
7
  TRACE("Looking up namespace");
4532
7
  result.namespace_info = VERIFY_RESULT(FindNamespaceById(namespace_id));
4533
4534
7
  return result;
4535
7
}
4536
4537
0
Result<string> CatalogManager::GetPgSchemaName(const TableInfoPtr& table_info) {
4538
0
  RSTATUS_DCHECK_EQ(table_info->GetTableType(), PGSQL_TABLE_TYPE, InternalError,
4539
0
      Format("Expected YSQL table, got: $0", table_info->GetTableType()));
4540
4541
0
  const uint32_t database_oid = VERIFY_RESULT(GetPgsqlDatabaseOid(table_info->namespace_id()));
4542
0
  uint32_t table_oid = VERIFY_RESULT(GetPgsqlTableOid(table_info->id()));
4543
0
  {
4544
0
    if (matview_pg_table_ids_map_.find(table_info->id()) != matview_pg_table_ids_map_.end()) {
4545
0
      table_oid = VERIFY_RESULT(GetPgsqlTableOid(matview_pg_table_ids_map_[table_info->id()]));
4546
0
    }
4547
0
  }
4548
0
  const uint32_t relnamespace_oid = VERIFY_RESULT(
4549
0
      sys_catalog_->ReadPgClassRelnamespace(database_oid, table_oid));
4550
0
  return sys_catalog_->ReadPgNamespaceNspname(database_oid, relnamespace_oid);
4551
0
}
4552
4553
// Truncate a Table.
4554
Status CatalogManager::TruncateTable(const TruncateTableRequestPB* req,
4555
                                     TruncateTableResponsePB* resp,
4556
11.8k
                                     rpc::RpcContext* rpc) {
4557
11.8k
  LOG(INFO) << "Servicing TruncateTable request from " << RequestorString(rpc)
4558
11.8k
            << ": " << req->ShortDebugString();
4559
4560
14.9k
  for (int i = 0; i < req->table_ids_size(); i++) {
4561
3.05k
    RETURN_NOT_OK(TruncateTable(req->table_ids(i), resp, rpc));
4562
3.05k
  }
4563
4564
11.8k
  return Status::OK();
4565
11.8k
}
4566
4567
Status CatalogManager::TruncateTable(const TableId& table_id,
4568
                                     TruncateTableResponsePB* resp,
4569
6.43k
                                     rpc::RpcContext* rpc) {
4570
  // Lookup the table and verify if it exists.
4571
6.43k
  TRACE(Substitute("Looking up object by id $0", table_id));
4572
6.43k
  scoped_refptr<TableInfo> table;
4573
6.43k
  {
4574
6.43k
    SharedLock lock(mutex_);
4575
6.43k
    table = FindPtrOrNull(*table_ids_map_, table_id);
4576
6.43k
    if (table == nullptr) {
4577
0
      Status s = STATUS_SUBSTITUTE(NotFound, "The object with id $0 does not exist", table_id);
4578
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4579
0
    }
4580
6.43k
  }
4581
4582
6.43k
  TRACE(Substitute("Locking object with id $0", table_id));
4583
6.43k
  auto l = table->LockForRead();
4584
6.43k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
4585
4586
  // Truncate on a colocated table should not hit master because it should be handled by a write
4587
  // DML that creates a table-level tombstone.
4588
0
  LOG_IF(WARNING, table->IsColocatedUserTable()) << "cannot truncate a colocated table on master";
4589
4590
6.43k
  if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && IsCdcEnabled(*table)) {
4591
0
    return STATUS(NotSupported,
4592
0
                  "Cannot truncate a table in replication.",
4593
0
                  table_id,
4594
0
                  MasterError(MasterErrorPB::INVALID_REQUEST));
4595
0
  }
4596
4597
  // Send a Truncate() request to each tablet in the table.
4598
6.43k
  SendTruncateTableRequest(table);
4599
4600
6.43k
  LOG(INFO) << "Successfully initiated TRUNCATE for " << table->ToString() << " per request from "
4601
6.43k
            << RequestorString(rpc);
4602
6.43k
  background_tasks_->Wake();
4603
4604
  // Truncate indexes also.
4605
  // Note: PG table does not have references to indexes in the base table, so associated indexes
4606
  //       must be truncated from the PG code separately.
4607
6.43k
  const bool is_index = IsIndex(l->pb);
4608
0
  DCHECK(!is_index || l->pb.indexes().empty()) << "indexes should be empty for index table";
4609
3.37k
  for (const auto& index_info : l->pb.indexes()) {
4610
3.37k
    RETURN_NOT_OK(TruncateTable(index_info.table_id(), resp, rpc));
4611
3.37k
  }
4612
4613
6.43k
  return Status::OK();
4614
6.43k
}
4615
4616
6.43k
void CatalogManager::SendTruncateTableRequest(const scoped_refptr<TableInfo>& table) {
4617
53.6k
  for (const auto& tablet : table->GetTablets()) {
4618
53.6k
    SendTruncateTabletRequest(tablet);
4619
53.6k
  }
4620
6.43k
}
4621
4622
53.6k
void CatalogManager::SendTruncateTabletRequest(const scoped_refptr<TabletInfo>& tablet) {
4623
53.6k
  LOG_WITH_PREFIX(INFO) << "Truncating tablet " << tablet->id();
4624
53.6k
  auto call = std::make_shared<AsyncTruncate>(master_, AsyncTaskPool(), tablet);
4625
53.6k
  tablet->table()->AddTask(call);
4626
53.6k
  WARN_NOT_OK(
4627
53.6k
      ScheduleTask(call),
4628
53.6k
      Substitute("Failed to send truncate request for tablet $0", tablet->id()));
4629
53.6k
}
4630
4631
Status CatalogManager::IsTruncateTableDone(const IsTruncateTableDoneRequestPB* req,
4632
8.89k
                                           IsTruncateTableDoneResponsePB* resp) {
4633
8.89k
  LOG(INFO) << "Servicing IsTruncateTableDone request for table id " << req->table_id();
4634
4635
  // Lookup the truncated table.
4636
8.89k
  TRACE("Looking up table $0", req->table_id());
4637
8.89k
  scoped_refptr<TableInfo> table;
4638
8.89k
  {
4639
8.89k
    SharedLock lock(mutex_);
4640
8.89k
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
4641
8.89k
  }
4642
4643
8.89k
  if (table == nullptr) {
4644
0
    Status s = STATUS(NotFound, "The object does not exist: table with id", req->table_id());
4645
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4646
0
  }
4647
4648
8.89k
  TRACE("Locking table");
4649
8.89k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(table->LockForRead(), resp));
4650
4651
8.89k
  resp->set_done(!table->HasTasks(MonitoredTask::Type::ASYNC_TRUNCATE_TABLET));
4652
8.89k
  return Status::OK();
4653
8.89k
}
4654
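// [Editor's note, illustrative] A typical truncate-and-wait sequence against
// the two handlers above: TruncateTable() fans out per-tablet AsyncTruncate
// tasks, and IsTruncateTableDone() reports done once no ASYNC_TRUNCATE_TABLET
// task remains. A sketch (the poll interval is invented; cm is a CatalogManager*):
//
//   TruncateTableRequestPB req;
//   TruncateTableResponsePB resp;
//   req.add_table_ids(table_id);
//   RETURN_NOT_OK(cm->TruncateTable(&req, &resp, /* rpc = */ nullptr));
//
//   IsTruncateTableDoneRequestPB done_req;
//   done_req.set_table_id(table_id);
//   for (;;) {
//     IsTruncateTableDoneResponsePB done_resp;
//     RETURN_NOT_OK(cm->IsTruncateTableDone(&done_req, &done_resp));
//     if (done_resp.done()) break;
//     SleepFor(MonoDelta::FromMilliseconds(100));
//   }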
4655
// Note: only used by YSQL as of 2020-10-29.
4656
Status CatalogManager::BackfillIndex(
4657
    const BackfillIndexRequestPB* req,
4658
    BackfillIndexResponsePB* resp,
4659
89
    rpc::RpcContext* rpc) {
4660
89
  const TableIdentifierPB& index_table_identifier = req->index_identifier();
4661
4662
89
  scoped_refptr<TableInfo> index_table = VERIFY_RESULT(FindTable(index_table_identifier));
4663
4664
89
  if (index_table->GetTableType() != PGSQL_TABLE_TYPE) {
4665
    // This request is only supported for YSQL for now.  YCQL has its own mechanism.
4666
0
    return STATUS(
4667
0
        InvalidArgument,
4668
0
        "Unexpected non-YSQL table",
4669
0
        index_table_identifier.ShortDebugString());
4670
0
  }
4671
4672
  // Collect indexed_table.
4673
89
  scoped_refptr<TableInfo> indexed_table;
4674
89
  {
4675
89
    auto l = index_table->LockForRead();
4676
89
    TableId indexed_table_id = GetIndexedTableId(l->pb);
4677
89
    resp->mutable_table_identifier()->set_table_id(indexed_table_id);
4678
89
    indexed_table = GetTableInfo(indexed_table_id);
4679
89
  }
4680
4681
89
  if (indexed_table == nullptr) {
4682
0
    return STATUS(InvalidArgument, "Empty indexed table",
4683
0
                  index_table_identifier.ShortDebugString());
4684
0
  }
4685
4686
  // TODO(jason): when ready to use INDEX_PERM_DO_BACKFILL for resuming backfill across master
4687
  // leader changes, replace the following (issue #6218).
4688
4689
  // Collect index_info_pb.
4690
89
  IndexInfoPB index_info_pb;
4691
89
  indexed_table->GetIndexInfo(index_table->id()).ToPB(&index_info_pb);
4692
89
  if (index_info_pb.index_permissions() != INDEX_PERM_WRITE_AND_DELETE) {
4693
0
    return SetupError(
4694
0
        resp->mutable_error(),
4695
0
        MasterErrorPB::INVALID_SCHEMA,
4696
0
        STATUS_FORMAT(
4697
0
            InvalidArgument,
4698
0
            "Expected WRITE_AND_DELETE perm, got $0",
4699
0
            IndexPermissions_Name(index_info_pb.index_permissions())));
4700
0
  }
4701
4702
89
  return MultiStageAlterTable::StartBackfillingData(
4703
89
      this, indexed_table, {index_info_pb}, boost::none);
4704
89
}
4705
4706
Status CatalogManager::GetBackfillJobs(
4707
    const GetBackfillJobsRequestPB* req,
4708
    GetBackfillJobsResponsePB* resp,
4709
695
    rpc::RpcContext* rpc) {
4710
695
  TableIdentifierPB table_id = req->table_identifier();
4711
4712
695
  scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id));
4713
695
  if (indexed_table == nullptr) {
4714
0
    Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString());
4715
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4716
0
  }
4717
4718
695
  {
4719
695
    auto l = indexed_table->LockForRead();
4720
695
    resp->mutable_backfill_jobs()->CopyFrom(l->pb.backfill_jobs());
4721
695
  }
4722
695
  return Status::OK();
4723
695
}
4724
4725
Status CatalogManager::LaunchBackfillIndexForTable(
4726
    const LaunchBackfillIndexForTableRequestPB* req,
4727
    LaunchBackfillIndexForTableResponsePB* resp,
4728
1
    rpc::RpcContext* rpc) {
4729
1
  const TableIdentifierPB& table_id = req->table_identifier();
4730
4731
1
  scoped_refptr<TableInfo> indexed_table = VERIFY_RESULT(FindTable(table_id));
4732
1
  if (indexed_table == nullptr) {
4733
0
    Status s = STATUS(NotFound, "Requested table $0 does not exist", table_id.ShortDebugString());
4734
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
4735
0
  }
4736
1
  if (indexed_table->GetTableType() != YQL_TABLE_TYPE) {
4737
    // This request is only supported for YCQL for now.  YSQL has its own mechanism.
4738
0
    return STATUS(InvalidArgument, "Unexpected non-YCQL table $0", table_id.ShortDebugString());
4739
0
  }
4740
4741
1
  uint32_t current_version;
4742
1
  {
4743
1
    auto l = indexed_table->LockForRead();
4744
1
    if (l->pb.state() != SysTablesEntryPB::RUNNING) {
4745
0
      Status s = STATUS_SUBSTITUTE(TryAgain,
4746
0
                        "The table is in state $0. An alter may already be in progress.",
4747
0
                        SysTablesEntryPB_State_Name(l->pb.state()));
4748
0
      VLOG(2) << "Table " << indexed_table->ToString() << " is not running returning " << s;
4749
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
4750
0
    }
4751
1
    current_version = l->pb.version();
4752
1
  }
4753
4754
1
  auto s = MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(
4755
1
      this, indexed_table, current_version, /* respect deferrals for backfill */ false);
4756
1
  if (!s.ok()) {
4757
0
    VLOG(3) << __func__ << " Done failed " << s;
4758
0
    return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
4759
0
  }
4760
1
  return Status::OK();
4761
1
}
4762
4763
Status CatalogManager::MarkIndexInfoFromTableForDeletion(
4764
    const TableId& indexed_table_id, const TableId& index_table_id, bool multi_stage,
4765
372
    DeleteTableResponsePB* resp) {
4766
  // Lookup the indexed table and verify if it exists.
4767
372
  scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4768
372
  if (indexed_table == nullptr) {
4769
0
    LOG(WARNING) << "Indexed table " << indexed_table_id << " for index "
4770
0
                 << index_table_id << " not found";
4771
0
    return Status::OK();
4772
0
  }
4773
4774
372
  if (resp) {
4775
372
    auto ns_info = VERIFY_RESULT(master_->catalog_manager()->FindNamespaceById(
4776
372
        indexed_table->namespace_id()));
4777
372
    auto* resp_indexed_table = resp->mutable_indexed_table();
4778
372
    resp_indexed_table->mutable_namespace_()->set_name(ns_info->name());
4779
372
    resp_indexed_table->set_table_name(indexed_table->name());
4780
372
    resp_indexed_table->set_table_id(indexed_table_id);
4781
372
  }
4782
372
  if (multi_stage) {
4783
104
    RETURN_NOT_OK(MultiStageAlterTable::UpdateIndexPermission(
4784
104
        this, indexed_table,
4785
104
        {{index_table_id, IndexPermissions::INDEX_PERM_WRITE_AND_DELETE_WHILE_REMOVING}}));
4786
268
  } else {
4787
268
    RETURN_NOT_OK(DeleteIndexInfoFromTable(indexed_table_id, index_table_id));
4788
268
  }
4789
4790
  // Actual deletion of the index info will happen asynchronously after all the
4791
  // tablets move to the new IndexPermission of DELETE_ONLY_WHILE_REMOVING.
4792
372
  RETURN_NOT_OK(SendAlterTableRequest(indexed_table));
4793
372
  return Status::OK();
4794
372
}
4795
4796
Status CatalogManager::DeleteIndexInfoFromTable(
4797
268
    const TableId& indexed_table_id, const TableId& index_table_id) {
4798
268
  scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4799
268
  if (indexed_table == nullptr) {
4800
0
    LOG(WARNING) << "Indexed table " << indexed_table_id << " for index " << index_table_id
4801
0
                 << " not found";
4802
0
    return Status::OK();
4803
0
  }
4804
268
  TRACE("Locking indexed table");
4805
268
  auto l = indexed_table->LockForWrite();
4806
268
  auto &indexed_table_data = *l.mutable_data();
4807
4808
  // Heed issue #6233.
4809
268
  if (!l->pb.has_fully_applied_schema()) {
4810
159
    MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&indexed_table_data.pb);
4811
159
  }
4812
268
  auto *indexes = indexed_table_data.pb.mutable_indexes();
4813
290
  for (int i = 0; i < indexes->size(); i++) {
4814
290
    if (indexes->Get(i).table_id() == index_table_id) {
4815
4816
268
      indexes->DeleteSubrange(i, 1);
4817
4818
268
      indexed_table_data.pb.set_version(indexed_table_data.pb.version() + 1);
4819
      // TODO(Amit): Is this compatible with the previous version?
4820
268
      indexed_table_data.pb.set_updates_only_index_permissions(false);
4821
268
      indexed_table_data.set_state(
4822
268
          SysTablesEntryPB::ALTERING,
4823
268
          Format("Delete index info version=$0 ts=$1",
4824
268
                 indexed_table_data.pb.version(), LocalTimeAsString()));
4825
4826
      // Update sys-catalog with the deleted indexed table info.
4827
268
      TRACE("Updating indexed table metadata on disk");
4828
268
      RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), indexed_table));
4829
4830
      // Update the in-memory state.
4831
268
      TRACE("Committing in-memory state");
4832
268
      l.Commit();
4833
268
      return Status::OK();
4834
268
    }
4835
290
  }
4836
4837
0
  LOG(WARNING) << "Index " << index_table_id << " not found in indexed table " << indexed_table_id;
4838
0
  return Status::OK();
4839
268
}
4840
4841
Status CatalogManager::DeleteTable(
4842
2.49k
    const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) {
4843
2.49k
  LOG(INFO) << "Servicing DeleteTable request from " << RequestorString(rpc) << ": "
4844
2.49k
            << req->ShortDebugString();
4845
4846
2.47k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
4847
2.47k
  bool result = IsCdcEnabled(*table);
4848
2.47k
  if (!FLAGS_enable_delete_truncate_xcluster_replicated_table && result) {
4849
1
    return STATUS(NotSupported,
4850
1
                  "Cannot delete a table in replication.",
4851
1
                  req->ShortDebugString(),
4852
1
                  MasterError(MasterErrorPB::INVALID_REQUEST));
4853
1
  }
4854
4855
2.47k
  if (req->is_index_table()) {
4856
264
    TRACE("Looking up index");
4857
264
    TableId table_id = table->id();
4858
264
    resp->set_table_id(table_id);
4859
264
    TableId indexed_table_id;
4860
264
    {
4861
264
      auto l = table->LockForRead();
4862
264
      indexed_table_id = GetIndexedTableId(l->pb);
4863
264
    }
4864
264
    scoped_refptr<TableInfo> indexed_table = GetTableInfo(indexed_table_id);
4865
264
    const bool is_pg_table = indexed_table != nullptr &&
4866
264
                             indexed_table->GetTableType() == PGSQL_TABLE_TYPE;
4867
264
    bool is_transactional;
4868
264
    {
4869
264
      Schema index_schema;
4870
264
      RETURN_NOT_OK(table->GetSchema(&index_schema));
4871
264
      is_transactional = index_schema.table_properties().is_transactional();
4872
264
    }
4873
264
    const bool index_backfill_enabled =
4874
264
        IsIndexBackfillEnabled(table->GetTableType(), is_transactional);
4875
264
    if (!is_pg_table && index_backfill_enabled) {
4876
104
      return MarkIndexInfoFromTableForDeletion(
4877
104
          indexed_table_id, table_id, /* multi_stage */ true, resp);
4878
104
    }
4879
2.36k
  }
4880
4881
2.36k
  return DeleteTableInternal(req, resp, rpc);
4882
2.36k
}
4883
4884
// Delete a Table
4885
//  - Update the table state to "DELETING".
4886
//  - Issue DeleteTablet tasks to each of the table's tablets.
4887
//  - Update all the underlying tablet states as "DELETED".
4888
//
4889
// This order of events can help us guarantee that:
4890
//  - If a table is DELETING/DELETED, we do not add further tasks to it.
4891
//  - A DeleteTable is done when a table is either DELETING or DELETED and has no running tasks.
4892
//  - If a table is DELETING and it has no tasks on it, then it is safe to mark DELETED.
4893
//
4894
// We are lazy about deletions.
4895
//
4896
// IMPORTANT: If modifying, consider updating DeleteYsqlDBTables(), the bulk deletion API.
4897
Status CatalogManager::DeleteTableInternal(
4898
2.51k
    const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc) {
4899
2.51k
  auto schedules_to_tables_map = VERIFY_RESULT(
4900
2.51k
      MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::TABLE));
4901
4902
2.51k
  vector<DeletingTableData> tables;
4903
2.51k
  RETURN_NOT_OK(DeleteTableInMemory(req->table(), req->is_index_table(),
4904
2.51k
                                    true /* update_indexed_table */, schedules_to_tables_map,
4905
2.51k
                                    &tables, resp, rpc));
4906
4907
  // Update the in-memory state.
4908
2.47k
  TRACE("Committing in-memory state");
4909
2.47k
  std::unordered_set<TableId> sys_table_ids;
4910
2.74k
  for (auto& table : tables) {
4911
2.74k
    if (IsSystemTable(*table.info)) {
4912
0
      sys_table_ids.insert(table.info->id());
4913
0
    }
4914
2.74k
    table.write_lock.Commit();
4915
2.74k
  }
4916
4917
  // Delete any CDC streams that are set up on this table, after releasing the Table lock.
4918
2.47k
  TRACE("Deleting CDC streams on table");
4919
  // table_id for the requested table will be added to the end of the response.
4920
2.47k
  RSTATUS_DCHECK_GE(resp->deleted_table_ids_size(), 1, IllegalState,
4921
2.47k
      "DeleteTableInMemory expected to add the index id to resp");
4922
2.47k
  RETURN_NOT_OK(
4923
2.47k
      DeleteCDCStreamsForTable(resp->deleted_table_ids(resp->deleted_table_ids_size() - 1)));
4924
4925
2.47k
  if (PREDICT_FALSE(FLAGS_catalog_manager_inject_latency_in_delete_table_ms > 0)) {
4926
2
    LOG(INFO) << "Sleeping in CatalogManager::DeleteTable for " <<
4927
2
        FLAGS_catalog_manager_inject_latency_in_delete_table_ms << " ms";
4928
2
    SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_inject_latency_in_delete_table_ms));
4929
2
  }
4930
4931
  // Update the internal table maps. Exclude Postgres tables, which are not in the name map.
4932
  // Also exclude hidden tables, which were already removed from this map.
4933
2.47k
  if (std::any_of(tables.begin(), tables.end(), [](auto& t) { return t.remove_from_name_map; })) {
4934
1.26k
    TRACE("Removing tables from by-name map");
4935
1.26k
    LockGuard lock(mutex_);
4936
1.54k
    for (const auto& table : tables) {
4937
1.54k
      if (table.remove_from_name_map) {
4938
1.54k
        TableInfoByNameMap::key_type key = {table.info->namespace_id(), table.info->name()};
4939
1.54k
        if (table_names_map_.erase(key) != 1) {
4940
0
          LOG(WARNING) << "Could not remove table from map: " << key.first << "." << key.second;
4941
0
        }
4942
4943
        // Also remove from the system.partitions table.
4944
1.54k
        GetYqlPartitionsVtable().RemoveFromCache(table.info->id());
4945
4946
        // Remove matviews from matview to pg table id map
4947
1.54k
        matview_pg_table_ids_map_.erase(table.info->id());
4948
1.54k
      }
4949
1.54k
    }
4950
    // We commit another map (table_ids_map_) to increment its version and reset the cache,
4951
    // since table_names_map_ does not have a version.
4952
1.26k
    table_ids_map_.Commit();
4953
1.26k
  }
4954
4955
2.74k
  for (const auto& table : tables) {
4956
2.74k
    LOG(INFO) << "Deleting table: " << table.info->name() << ", retained by: "
4957
2.74k
              << AsString(table.retained_by_snapshot_schedules, &Uuid::TryFullyDecode);
4958
4959
    // Send a DeleteTablet() request to each tablet replica in the table.
4960
2.74k
    RETURN_NOT_OK(DeleteTabletsAndSendRequests(table.info, table.retained_by_snapshot_schedules));
4961
    // Send a RemoveTableFromTablet() request to each colocated parent tablet replica in the table.
4962
    // TODO(pitr) handle YSQL colocated tables.
4963
2.74k
    if (table.info->IsColocatedUserTable()) {
4964
15
      auto call = std::make_shared<AsyncRemoveTableFromTablet>(
4965
15
          master_, AsyncTaskPool(), table.info->GetColocatedTablet(), table.info);
4966
15
      table.info->AddTask(call);
4967
15
      WARN_NOT_OK(ScheduleTask(call), "Failed to send RemoveTableFromTablet request");
4968
15
    }
4969
2.74k
  }
4970
4971
  // If there are any permissions granted on this table, find them and delete them. This is necessary
4972
  // because we keep track of the permissions based on the canonical resource name which is a
4973
  // combination of the keyspace and table names, so if another table with the same name is created
4974
  // (in the same keyspace where the previous one existed), and the permissions were not deleted at
4975
  // the time of the previous table deletion, then the permissions that existed for the previous
4976
  // table will automatically be granted to the new table even though this wasn't the intention.
4977
2.47k
  string canonical_resource = get_canonical_table(req->table().namespace_().name(),
4978
2.47k
                                                  req->table().table_name());
4979
2.47k
  RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp));
4980
4981
  // Remove the system tables from system catalog.
4982
2.47k
  if (!sys_table_ids.empty()) {
4983
    // We do not expect system table deletions while forming the initial snapshot.
4984
0
    DCHECK(!initial_snapshot_writer_);
4985
4986
0
    TRACE("Sending system table delete RPCs");
4987
0
    for (auto& table_id : sys_table_ids) {
4988
      // "sys_catalog_->DeleteYsqlSystemTable(table_id)" won't work here
4989
      // as it only acts on the leader.
4990
0
      tablet::ChangeMetadataRequestPB change_req;
4991
0
      change_req.set_tablet_id(kSysCatalogTabletId);
4992
0
      change_req.set_remove_table_id(table_id);
4993
0
      RETURN_NOT_OK(tablet::SyncReplicateChangeMetadataOperation(
4994
0
          &change_req, sys_catalog_->tablet_peer().get(), leader_ready_term()));
4995
0
    }
4996
2.47k
  } else {
4997
2.47k
    TRACE("No system tables to delete");
4998
2.47k
  }
4999
5000
2.47k
  LOG(INFO) << "Successfully initiated deletion of "
5001
2.20k
            << (req->is_index_table() ? "index" : "table") << " with "
5002
2.47k
            << req->table().DebugString() << " per request from " << RequestorString(rpc);
5003
  // Asynchronously clean up the final memory traces of the deleted table.
5004
2.47k
  background_tasks_->Wake();
5005
2.47k
  return Status::OK();
5006
2.47k
}
5007
5008
Status CatalogManager::DeleteTableInMemory(
5009
    const TableIdentifierPB& table_identifier,
5010
    const bool is_index_table,
5011
    const bool update_indexed_table,
5012
    const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map,
5013
    vector<DeletingTableData>* tables,
5014
    DeleteTableResponsePB* resp,
5015
2.80k
    rpc::RpcContext* rpc) {
5016
  // TODO(NIC): How to handle a DeleteTable request when the namespace is being deleted?
5017
2.20k
  const char* const object_type = is_index_table ? "index" : "table";
5018
2.80k
  const bool cascade_delete_index = is_index_table && !update_indexed_table;
5019
5020
0
  VLOG_WITH_PREFIX_AND_FUNC(1) << YB_STRUCT_TO_STRING(
5021
0
      table_identifier, is_index_table, update_indexed_table) << "\n" << GetStackTrace();
5022
5023
  // Lookup the table and verify if it exists.
5024
2.80k
  TRACE(Substitute("Looking up $0", object_type));
5025
2.80k
  auto table_result = FindTable(table_identifier);
5026
2.80k
  if (!VERIFY_RESULT(DoesTableExist(table_result))) {
5027
0
    if (cascade_delete_index) {
5028
0
      LOG(WARNING) << "Index " << table_identifier.DebugString() << " not found";
5029
0
      return Status::OK();
5030
0
    } else {
5031
0
      return table_result.status();
5032
0
    }
5033
2.80k
  }
5034
2.80k
  auto table = std::move(*table_result);
5035
5036
2.80k
  TRACE(Substitute("Locking $0", object_type));
5037
2.80k
  auto data = DeletingTableData {
5038
2.80k
    .info = table,
5039
2.80k
    .write_lock = table->LockForWrite(),
5040
2.80k
    .retained_by_snapshot_schedules = RepeatedBytes(),
5041
2.80k
    .remove_from_name_map = false
5042
2.80k
  };
5043
2.80k
  auto& l = data.write_lock;
5044
  // table_id for the requested table will be added to the end of the response.
5045
2.80k
  *resp->add_deleted_table_ids() = table->id();
5046
5047
2.80k
  if (is_index_table == IsTable(l->pb)) {
5048
0
    Status s = STATUS(NotFound, "The object does not exist");
5049
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5050
0
  }
5051
5052
2.80k
  FillRetainedBySnapshotSchedules(
5053
2.80k
      schedules_to_tables_map, table->id(), &data.retained_by_snapshot_schedules);
5054
2.80k
  bool hide_only = !data.retained_by_snapshot_schedules.empty();
5055
5056
2.80k
  if (l->started_deleting() || (hide_only && l->started_hiding())) {
5057
34
    if (cascade_delete_index) {
5058
0
      LOG(WARNING) << "Index " << table_identifier.ShortDebugString() << " was "
5059
0
                   << (l->started_deleting() ? "deleted" : "hidden");
5060
0
      return Status::OK();
5061
34
    } else {
5062
34
      Status s = STATUS(NotFound, "The object was deleted", l->pb.state_msg());
5063
34
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5064
34
    }
5065
2.77k
  }
5066
5067
  // Determine if we have to remove from the name map here before we change the table state.
5068
2.77k
  data.remove_from_name_map = l.data().table_type() != PGSQL_TABLE_TYPE && !l->started_hiding();
5069
5070
2.77k
  TRACE("Updating metadata on disk");
5071
  // Update the metadata for the on-disk state.
5072
2.77k
  if (hide_only) {
5073
0
    l.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDING);
5074
2.77k
  } else {
5075
2.77k
    l.mutable_data()->set_state(SysTablesEntryPB::DELETING,
5076
2.77k
                                 Substitute("Started deleting at $0", LocalTimeAsString()));
5077
2.77k
  }
5078
5079
2.77k
  auto now = master_->clock()->Now();
5080
2.77k
  DdlLogEntry ddl_log_entry(now, table->id(), l->pb, "Drop");
5081
2.77k
  if (is_index_table) {
5082
563
    const auto& indexed_table_id = GetIndexedTableId(l->pb);
5083
563
    auto indexed_table = FindTableById(indexed_table_id);
5084
563
    if (indexed_table.ok()) {
5085
563
      auto lock = (**indexed_table).LockForRead();
5086
563
      ddl_log_entry = DdlLogEntry(
5087
563
          now, indexed_table_id, lock->pb, Format("Drop index $0", l->name()));
5088
563
    }
5089
563
  }
5090
5091
  // Update sys-catalog with the removed table state.
5092
2.77k
  Status s = sys_catalog_->Upsert(leader_ready_term(), &ddl_log_entry, table);
5093
5094
2.77k
  if (PREDICT_FALSE(FLAGS_TEST_simulate_crash_after_table_marked_deleting)) {
5095
1
    return Status::OK();
5096
1
  }
5097
5098
2.77k
  if (!s.ok()) {
5099
    // The mutation will be aborted when 'l' exits the scope on early return.
5100
2
    s = s.CloneAndPrepend("An error occurred while updating sys tables");
5101
2
    LOG(WARNING) << s;
5102
2
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
5103
2
  }
5104
5105
  // For a regular (indexed) table, delete all of its index tables, if any. For an index table,
5106
  // delete the index info from the indexed table.
5107
2.76k
  if (!is_index_table) {
5108
2.19k
    TableIdentifierPB index_identifier;
5109
294
    for (const auto& index : l->pb.indexes()) {
5110
294
      index_identifier.set_table_id(index.table_id());
5111
294
      RETURN_NOT_OK(DeleteTableInMemory(index_identifier, true /* is_index_table */,
5112
294
                                        false /* update_indexed_table */, schedules_to_tables_map,
5113
294
                                        tables, resp, rpc));
5114
294
    }
5115
574
  } else if (update_indexed_table) {
5116
268
    s = MarkIndexInfoFromTableForDeletion(
5117
268
        GetIndexedTableId(l->pb), table->id(), /* multi_stage */ false, resp);
5118
268
    if (!s.ok()) {
5119
0
      s = s.CloneAndPrepend(Substitute("An error occurred while deleting index info: $0",
5120
0
                                       s.ToString()));
5121
0
      LOG(WARNING) << s.ToString();
5122
0
      return CheckIfNoLongerLeaderAndSetupError(s, resp);
5123
0
    }
5124
2.76k
  }
5125
5126
2.76k
  if (!hide_only) {
5127
    // If table is being hidden we should not abort snapshot related tasks.
5128
2.75k
    table->AbortTasks();
5129
2.75k
  }
5130
5131
  // For a regular (indexed) table, insert the table info and lock at the front of the list; for
5132
  // an index table, append them to the end. This ensures we commit and delete the indexed table
5133
  // before its indexes.
5134
2.20k
  tables->insert(is_index_table ? tables->end() : tables->begin(), std::move(data));
5135
5136
2.76k
  return Status::OK();
5137
2.76k
}
5138
5139
3.41M
TableInfo::WriteLock CatalogManager::MaybeTransitionTableToDeleted(const TableInfoPtr& table) {
5140
3.41M
  if (!table) {
5141
0
    LOG_WITH_PREFIX(INFO) << "Finished deleting an Orphaned tablet. "
5142
0
                          << "Table Information is null. Skipping updating its state to DELETED.";
5143
0
    return TableInfo::WriteLock();
5144
0
  }
5145
3.41M
  if (table->HasTasks()) {
5146
0
    VLOG_WITH_PREFIX_AND_FUNC(2) << table->ToString() << " has tasks";
5147
54.3k
    return TableInfo::WriteLock();
5148
54.3k
  }
5149
3.35M
  bool hide_only;
5150
3.35M
  {
5151
3.35M
    auto lock = table->LockForRead();
5152
5153
    // For any table in DELETING state, we will want to mark it as DELETED once all its respective
5154
    // tablets have been successfully removed from tservers.
5155
    // For any hiding table we will want to mark it as HIDDEN once all its respective
5156
    // tablets have been successfully hidden on tservers.
5157
3.35M
    if (lock->is_deleted()) {
5158
      // Clear the tablets_ and partitions_ maps if table has already been DELETED.
5159
      // Usually this would have been done except for tables that were hidden and are now deleted.
5160
      // Also, this is a catch all in case any other path misses clearing the maps.
5161
183k
      table->ClearTabletMaps();
5162
183k
      return TableInfo::WriteLock();
5163
183k
    }
5164
3.17M
    hide_only = !lock->is_deleting();
5165
3.17M
    if (hide_only && !lock->is_hiding()) {
5166
3.16M
      return TableInfo::WriteLock();
5167
3.16M
    }
5168
6.12k
  }
5169
  // The current relevant order of operations during a DeleteTable is:
5170
  // 1) Mark the table as DELETING
5171
  // 2) Abort the current table tasks
5172
  // 3) Per tablet, send DeleteTable requests to all TS, then mark that tablet as DELETED
5173
  //
5174
  // This creates a race, wherein, after 2, HasTasks can be false, but we still have not
5175
  // gotten to point 3, which would add further tasks for the deletes.
5176
  //
5177
  // However, HasTasks is cheaper than AreAllTabletsDeletedOrHidden...
5178
6.12k
  auto all_tablets_done = hide_only ? table->AreAllTabletsHidden() : table->AreAllTabletsDeleted();
5179
18.4E
  VLOG_WITH_PREFIX_AND_FUNC(2)
5180
18.4E
      << table->ToString() << " hide only: " << hide_only << ", all tablets done: "
5181
18.4E
      << all_tablets_done;
5182
6.12k
  if (!all_tablets_done && !IsSystemTable(*table) && !table->IsColocatedUserTable()) {
5183
104
    return TableInfo::WriteLock();
5184
104
  }
5185
5186
6.01k
  auto lock = table->LockForWrite();
5187
6.01k
  if (lock->is_hiding()) {
5188
0
    LOG(INFO) << "Marking table as HIDDEN: " << table->ToString();
5189
0
    lock.mutable_data()->pb.set_hide_state(SysTablesEntryPB::HIDDEN);
5190
    // Erase all the tablets from partitions_ structure.
5191
0
    table->ClearTabletMaps(DeactivateOnly::kTrue);
5192
0
    return lock;
5193
0
  }
5194
6.01k
  if (lock->is_deleting()) {
5195
    // Update the metadata for the on-disk state.
5196
5.05k
    LOG(INFO) << "Marking table as DELETED: " << table->ToString();
5197
5.05k
    lock.mutable_data()->set_state(SysTablesEntryPB::DELETED,
5198
5.05k
        Substitute("Deleted with tablets at $0", LocalTimeAsString()));
5199
    // Erase all the tablets from tablets_ and partitions_ structures.
5200
5.05k
    table->ClearTabletMaps();
5201
5.05k
    return lock;
5202
5.05k
  }
5203
965
  return TableInfo::WriteLock();
5204
965
}
5205
5206
17.2k
void CatalogManager::CleanUpDeletedTables() {
5207
  // TODO(bogdan): Cache tables being deleted to make this iterate only over those?
5208
17.2k
  vector<scoped_refptr<TableInfo>> tables_to_delete;
5209
  // Garbage collecting.
5210
  // Go through all tables under the global lock, copying them so we don't hold the lock too long.
5211
17.2k
  TableInfoMap copy_of_table_by_id_map;
5212
17.2k
  {
5213
17.2k
    LockGuard lock(mutex_);
5214
17.2k
    copy_of_table_by_id_map = *table_ids_map_;
5215
17.2k
  }
5216
  // Mark the tables as DELETED and remove them from the in-memory maps.
5217
17.2k
  vector<TableInfo*> tables_to_update_on_disk;
5218
17.2k
  vector<TableInfo::WriteLock> table_locks;
5219
3.33M
  for (const auto& it : copy_of_table_by_id_map) {
5220
3.33M
    const auto& table = it.second;
5221
3.33M
    auto lock = MaybeTransitionTableToDeleted(table);
5222
3.33M
    if (lock.locked()) {
5223
2.29k
      table_locks.push_back(std::move(lock));
5224
2.29k
      tables_to_update_on_disk.push_back(table.get());
5225
2.29k
    }
5226
3.33M
  }
5227
17.2k
  if (tables_to_update_on_disk.size() > 0) {
5228
32
    Status s = sys_catalog_->Upsert(leader_ready_term(), tables_to_update_on_disk);
5229
32
    if (!s.ok()) {
5230
1
      LOG(WARNING) << "Error marking tables as DELETED: " << s.ToString();
5231
1
      return;
5232
1
    }
5233
    // Update the tables' in-memory info to DELETED after we've removed them from the maps.
5234
2.28k
    for (auto& lock : table_locks) {
5235
2.28k
      lock.Commit();
5236
2.28k
    }
5237
    // TODO: Check if we want to delete the totally deleted table from the sys_catalog here.
5238
    // TODO: SysCatalog::DeleteItem() if we've DELETED all user tables in a DELETING namespace.
5239
    // TODO: Also properly handle namespace_ids_map_.erase(table->namespace_id())
5240
31
  }
5241
17.2k
}
5242
5243
Status CatalogManager::IsDeleteTableDone(const IsDeleteTableDoneRequestPB* req,
5244
5.58k
                                         IsDeleteTableDoneResponsePB* resp) {
5245
  // Lookup the deleted table.
5246
5.58k
  TRACE("Looking up table $0", req->table_id());
5247
5.58k
  scoped_refptr<TableInfo> table;
5248
5.58k
  {
5249
5.58k
    SharedLock lock(mutex_);
5250
5.58k
    table = FindPtrOrNull(*table_ids_map_, req->table_id());
5251
5.58k
  }
5252
5253
5.58k
  if (table == nullptr) {
5254
2
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id "
5255
2
              << req->table_id() << ": deleted (not found)";
5256
2
    resp->set_done(true);
5257
2
    return Status::OK();
5258
2
  }
5259
5260
5.58k
  TRACE("Locking table");
5261
5.58k
  auto l = table->LockForRead();
5262
5263
5.58k
  if (!l->started_deleting() && !l->started_hiding()) {
5264
104
    LOG(WARNING) << "Servicing IsDeleteTableDone request for table id "
5265
104
                 << req->table_id() << ": NOT deleted";
5266
104
    Status s = STATUS(IllegalState, "The object was NOT deleted", l->pb.state_msg());
5267
104
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5268
104
  }
5269
5270
  // Temporary fix for github issue #5290.
5271
  // TODO: Wait until deletion completes for the tablegroup parent table.
5272
5.48k
  if (table->IsTablegroupParentTable()) {
5273
0
    LOG(INFO) << "Servicing IsDeleteTableDone request for tablegroup parent table id "
5274
0
              << req->table_id() << ": deleting. Skipping wait for DELETED state.";
5275
0
    resp->set_done(true);
5276
0
    return Status::OK();
5277
0
  }
5278
5279
5.48k
  if (l->is_deleted() || l->is_hidden()) {
5280
2.61k
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id "
5281
2.61k
              << req->table_id() << ": totally " << (l->is_hidden() ? "hidden" : "deleted");
5282
2.61k
    resp->set_done(true);
5283
2.86k
  } else {
5284
2.86k
    LOG(INFO) << "Servicing IsDeleteTableDone request for table id " << req->table_id()
5285
2.84k
              << ((!table->IsColocatedUserTable()) ? ": deleting tablets" : "");
5286
5287
2.86k
    std::vector<std::shared_ptr<TSDescriptor>> descs;
5288
2.86k
    master_->ts_manager()->GetAllDescriptors(&descs);
5289
8.52k
    for (auto& ts_desc : descs) {
5290
8.52k
      LOG(INFO) << "Deleting on " << ts_desc->permanent_uuid() << ": "
5291
8.52k
                << ts_desc->PendingTabletDeleteToString();
5292
8.52k
    }
5293
5294
2.86k
    resp->set_done(false);
5295
2.86k
  }
5296
5297
5.48k
  return Status::OK();
5298
5.48k
}
5299
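// [Editor's note, illustrative] DeleteTable() only initiates deletion; the
// table reaches DELETED lazily, once MaybeTransitionTableToDeleted() above
// sees all its tablets gone. A caller therefore polls IsDeleteTableDone(),
// e.g. (the poll interval is invented; cm is a CatalogManager*):
//
//   DeleteTableRequestPB req;
//   DeleteTableResponsePB resp;
//   req.mutable_table()->set_table_id(table_id);
//   RETURN_NOT_OK(cm->DeleteTable(&req, &resp, /* rpc = */ nullptr));
//
//   IsDeleteTableDoneRequestPB done_req;
//   done_req.set_table_id(table_id);
//   for (;;) {
//     IsDeleteTableDoneResponsePB done_resp;
//     RETURN_NOT_OK(cm->IsDeleteTableDone(&done_req, &done_resp));
//     if (done_resp.done()) break;  // DELETED, HIDDEN, or already gone
//     SleepFor(MonoDelta::FromMilliseconds(100));
//   }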
5300
namespace {
5301
5302
CHECKED_STATUS ApplyAlterSteps(server::Clock* clock,
5303
                               const TableId& table_id,
5304
                               const SysTablesEntryPB& current_pb,
5305
                               const AlterTableRequestPB* req,
5306
                               Schema* new_schema,
5307
                               ColumnId* next_col_id,
5308
276
                               std::vector<DdlLogEntry>* ddl_log_entries) {
5309
276
  const SchemaPB& current_schema_pb = current_pb.schema();
5310
276
  Schema cur_schema;
5311
276
  RETURN_NOT_OK(SchemaFromPB(current_schema_pb, &cur_schema));
5312
5313
276
  SchemaBuilder builder(cur_schema);
5314
276
  if (current_pb.has_next_column_id()) {
5315
276
    builder.set_next_column_id(ColumnId(current_pb.next_column_id()));
5316
276
  }
5317
276
  if (current_pb.has_colocated() && current_pb.colocated()) {
5318
2
    if (current_schema_pb.table_properties().is_ysql_catalog_table()) {
5319
0
      Uuid cotable_id;
5320
0
      RETURN_NOT_OK(cotable_id.FromHexString(req->table().table_id()));
5321
0
      builder.set_cotable_id(cotable_id);
5322
2
    } else {
5323
2
      uint32_t pgtable_id = VERIFY_RESULT(GetPgsqlTableOid(req->table().table_id()));
5324
2
      builder.set_pgtable_id(pgtable_id);
5325
2
    }
5326
2
  }
5327
5328
284
  for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) {
5329
284
    auto time = clock->Now();
5330
284
    switch (step.type()) {
5331
173
      case AlterTableRequestPB::ADD_COLUMN: {
5332
173
        if (!step.has_add_column()) {
5333
0
          return STATUS(InvalidArgument, "ADD_COLUMN missing column info");
5334
0
        }
5335
5336
        // Verify that encoding is appropriate for the new column's type.
5337
173
        ColumnSchemaPB new_col_pb = step.add_column().schema();
5338
173
        if (new_col_pb.has_id()) {
5339
0
          return STATUS_SUBSTITUTE(InvalidArgument,
5340
0
              "column $0: client should not specify column id", new_col_pb.ShortDebugString());
5341
0
        }
5342
173
        ColumnSchema new_col = ColumnSchemaFromPB(new_col_pb);
5343
5344
173
        RETURN_NOT_OK(builder.AddColumn(new_col, false));
5345
173
        ddl_log_entries->emplace_back(time, table_id, current_pb, Format("Add column $0", new_col));
5346
173
        break;
5347
173
      }
5348
5349
96
      case AlterTableRequestPB::DROP_COLUMN: {
5350
96
        if (!step.has_drop_column()) {
5351
0
          return STATUS(InvalidArgument, "DROP_COLUMN missing column info");
5352
0
        }
5353
5354
96
        if (cur_schema.is_key_column(step.drop_column().name())) {
5355
0
          return STATUS(InvalidArgument, "cannot remove a key column");
5356
0
        }
5357
5358
96
        RETURN_NOT_OK(builder.RemoveColumn(step.drop_column().name()));
5359
96
        ddl_log_entries->emplace_back(
5360
96
            time, table_id, current_pb, Format("Drop column $0", step.drop_column().name()));
5361
96
        break;
5362
96
      }
5363
5364
15
      case AlterTableRequestPB::RENAME_COLUMN: {
5365
15
        if (!step.has_rename_column()) {
5366
0
          return STATUS(InvalidArgument, "RENAME_COLUMN missing column info");
5367
0
        }
5368
5369
15
        RETURN_NOT_OK(builder.RenameColumn(
5370
15
            step.rename_column().old_name(),
5371
15
            step.rename_column().new_name()));
5372
15
        ddl_log_entries->emplace_back(
5373
15
            time, table_id, current_pb,
5374
15
            Format("Rename column $0 => $1", step.rename_column().old_name(),
5375
15
                   step.rename_column().new_name()));
5376
15
        break;
5377
15
      }
5378
5379
        // TODO: EDIT_COLUMN.
5380
5381
0
      default: {
5382
0
        return STATUS_SUBSTITUTE(InvalidArgument, "Invalid alter step type: $0", step.type());
5383
15
      }
5384
284
    }
5385
284
  }
5386
5387
276
  if (req->has_alter_properties()) {
5388
7
    RETURN_NOT_OK(builder.AlterProperties(req->alter_properties()));
5389
7
  }
5390
5391
276
  *new_schema = builder.Build();
5392
276
  *next_col_id = builder.next_column_id();
5393
276
  return Status::OK();
5394
276
}
5395
5396
} // namespace
5397
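A note on ApplyAlterSteps above: it folds each requested step into a SchemaBuilder and appends one DDL log entry per step, returning on the first invalid step, so *new_schema is assigned only after every step succeeds. Below is a minimal standalone sketch of that all-or-nothing fold; it uses plain std containers instead of YugabyteDB's SchemaBuilder, and every name in it is illustrative rather than the real API:

  #include <algorithm>
  #include <stdexcept>
  #include <string>
  #include <vector>

  enum class StepType { kAddColumn, kDropColumn, kRenameColumn };

  struct Step {
    StepType type;
    std::string name;      // column to add or drop, or the old name for a rename
    std::string new_name;  // used only by kRenameColumn
  };

  // Apply the steps in order; throw on the first invalid step, mirroring how
  // ApplyAlterSteps returns a bad Status and leaves the current schema untouched.
  std::vector<std::string> ApplySteps(std::vector<std::string> columns,
                                      const std::vector<Step>& steps) {
    for (const Step& step : steps) {
      auto it = std::find(columns.begin(), columns.end(), step.name);
      switch (step.type) {
        case StepType::kAddColumn:
          if (it != columns.end()) throw std::invalid_argument("duplicate column");
          columns.push_back(step.name);
          break;
        case StepType::kDropColumn:
          if (it == columns.end()) throw std::invalid_argument("no such column");
          columns.erase(it);
          break;
        case StepType::kRenameColumn:
          if (it == columns.end()) throw std::invalid_argument("no such column");
          *it = step.new_name;
          break;
      }
    }
    return columns;  // the caller swaps this in only after all steps succeed
  }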
5398
Status CatalogManager::AlterTable(const AlterTableRequestPB* req,
5399
                                  AlterTableResponsePB* resp,
5400
2.86k
                                  rpc::RpcContext* rpc) {
5401
2.86k
  LOG_WITH_PREFIX(INFO) << "Servicing " << __func__ << " request from " << RequestorString(rpc)
5402
2.86k
                        << ": " << req->ShortDebugString();
5403
5404
2.86k
  std::vector<DdlLogEntry> ddl_log_entries;
5405
5406
  // Lookup the table and verify if it exists.
5407
2.86k
  TRACE("Looking up table");
5408
2.86k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
5409
5410
2.86k
  NamespaceId new_namespace_id;
5411
5412
2.86k
  if (req->has_new_namespace()) {
5413
    // Lookup the new namespace and verify if it exists.
5414
46
    TRACE("Looking up new namespace");
5415
46
    scoped_refptr<NamespaceInfo> ns;
5416
46
    NamespaceIdentifierPB namespace_identifier = req->new_namespace();
5417
    // Use original namespace_id as new_namespace_id for YSQL tables.
5418
46
    if (table->GetTableType() == PGSQL_TABLE_TYPE && !namespace_identifier.has_id()) {
5419
42
      namespace_identifier.set_id(table->namespace_id());
5420
42
    }
5421
44
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(namespace_identifier), resp);
5422
5423
44
    auto ns_lock = ns->LockForRead();
5424
44
    new_namespace_id = ns->id();
5425
    // Don't use Namespaces that aren't running.
5426
44
    if (ns->state() != SysNamespaceEntryPB::RUNNING) {
5427
0
      Status s = STATUS_SUBSTITUTE(TryAgain,
5428
0
          "Namespace not running (State=$0). Cannot create $1.$2",
5429
0
          SysNamespaceEntryPB::State_Name(ns->state()), ns->name(), table->name() );
5430
0
      return SetupError(resp->mutable_error(), NamespaceMasterError(ns->state()), s);
5431
0
    }
5432
2.86k
  }
5433
2.86k
  if (req->has_new_namespace() || req->has_new_table_name()) {
5434
44
    if (new_namespace_id.empty()) {
5435
0
      const Status s = STATUS(InvalidArgument, "No namespace used");
5436
0
      return SetupError(resp->mutable_error(), MasterErrorPB::NO_NAMESPACE_USED, s);
5437
0
    }
5438
2.86k
  }
5439
5440
2.86k
  TRACE("Locking table");
5441
2.86k
  auto l = table->LockForWrite();
5442
2.86k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
5443
5444
2.86k
  bool has_changes = false;
5445
2.86k
  auto& table_pb = l.mutable_data()->pb;
5446
2.86k
  const TableName table_name = l->name();
5447
2.86k
  const NamespaceId namespace_id = l->namespace_id();
5448
2.82k
  const TableName new_table_name = req->has_new_table_name() ? req->new_table_name() : table_name;
5449
5450
  // Calculate new schema for the on-disk state, not persisted yet.
5451
2.86k
  Schema new_schema;
5452
2.86k
  ColumnId next_col_id = ColumnId(l->pb.next_column_id());
5453
2.86k
  if (req->alter_schema_steps_size() || req->has_alter_properties()) {
5454
276
    TRACE("Apply alter schema");
5455
276
    Status s = ApplyAlterSteps(
5456
276
        master_->clock(), table->id(), l->pb, req, &new_schema, &next_col_id, &ddl_log_entries);
5457
276
    if (!s.ok()) {
5458
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
5459
0
    }
5460
276
    DCHECK_NE(next_col_id, 0);
5461
276
    DCHECK_EQ(new_schema.find_column_by_id(next_col_id),
5462
276
              static_cast<int>(Schema::kColumnNotFound));
5463
276
    has_changes = true;
5464
276
  }
5465
5466
  // Try to acquire the new table name.
5467
2.86k
  if (req->has_new_namespace() || req->has_new_table_name()) {
5468
5469
    // Postgres handles name uniqueness constraints in its own layer.
5470
44
    if (l->table_type() != PGSQL_TABLE_TYPE) {
5471
2
      LockGuard lock(mutex_);
5472
0
      VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
5473
5474
2
      TRACE("Acquired catalog manager lock");
5475
5476
      // Verify that the table does not exist.
5477
2
      scoped_refptr<TableInfo> other_table = FindPtrOrNull(
5478
2
          table_names_map_, {new_namespace_id, new_table_name});
5479
2
      if (other_table != nullptr) {
5480
1
        Status s = STATUS_SUBSTITUTE(AlreadyPresent,
5481
1
            "Object '$0.$1' already exists",
5482
1
            GetNamespaceNameUnlocked(new_namespace_id), other_table->name());
5483
1
        LOG(WARNING) << "Found table: " << other_table->ToStringWithState()
5484
1
                     << ". Failed alterring table with error: "
5485
1
                     << s.ToString() << " Request:\n" << req->DebugString();
5486
1
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
5487
1
      }
5488
5489
      // Acquire the new table name (now we have 2 names for the same table).
5490
1
      table_names_map_[{new_namespace_id, new_table_name}] = table;
5491
1
    }
5492
5493
43
    table_pb.set_namespace_id(new_namespace_id);
5494
43
    table_pb.set_name(new_table_name);
5495
5496
43
    has_changes = true;
5497
43
  }
5498
5499
  // Check if there have been any changes to the placement policies for this table.
5500
2.86k
  if (req->has_replication_info()) {
5501
    // If this is a colocated table, it does not make sense to set placement
5502
    // policy for this table, as the tablet associated with it is shared by
5503
    // multiple tables.
5504
4
    if (table->colocated()) {
5505
0
      const Status s = STATUS(InvalidArgument,
5506
0
          "Placement policy cannot be altered for a colocated table");
5507
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
5508
0
    }
5509
4
    if (table->GetTableType() == PGSQL_TABLE_TYPE) {
5510
0
      const Status s = STATUS(InvalidArgument,
5511
0
            "Placement policy cannot be altered for YSQL tables, use Tablespaces");
5512
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
5513
0
    }
5514
    // Validate table replication info.
5515
4
    RETURN_NOT_OK(ValidateTableReplicationInfo(req->replication_info()));
5516
4
    table_pb.mutable_replication_info()->CopyFrom(req->replication_info());
5517
4
    has_changes = true;
5518
4
  }
5519
5520
  // TODO(hector): Simplify the AlterSchema workflow to avoid doing the same checks on every layer
5521
  // this request goes through: https://github.com/YugaByte/yugabyte-db/issues/1882.
5522
2.86k
  if (req->has_wal_retention_secs()) {
5523
2.54k
    if (has_changes) {
5524
0
      const Status s = STATUS(InvalidArgument,
5525
0
          "wal_retention_secs cannot be altered concurrently with other properties");
5526
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
5527
0
    }
5528
    // TODO(hector): Handle co-partitioned tables:
5529
    // https://github.com/YugaByte/yugabyte-db/issues/1905.
5530
2.54k
    table_pb.set_wal_retention_secs(req->wal_retention_secs());
5531
2.54k
    has_changes = true;
5532
2.54k
  }
5533
5534
2.86k
  if (!has_changes) {
5535
0
    if (req->has_force_send_alter_request() && req->force_send_alter_request()) {
5536
0
      RETURN_NOT_OK(SendAlterTableRequest(table, req));
5537
0
    }
5538
    // Skip empty requests...
5539
0
    return Status::OK();
5540
2.86k
  }
5541
5542
  // Serialize the schema and increment the version number.
5543
2.86k
  if (new_schema.initialized()) {
5544
276
    if (!l->pb.has_fully_applied_schema()) {
5545
      // The idea here is that if we are in the middle of updating the schema
5546
      // from one state to another, then YBClients will be given the older
5547
      // version until the schema is updated on all the tablets.
5548
      // As of Dec 2019, this may lead to some rejected operations/retries during
5549
      // the index backfill. See #3284 for possible optimizations.
5550
276
      MultiStageAlterTable::CopySchemaDetailsToFullyApplied(&table_pb);
5551
276
    }
5552
276
    SchemaToPB(new_schema, table_pb.mutable_schema());
5553
276
  }
5554
5555
  // Only increment the version number if it is a schema change (AddTable change goes through a
5556
  // different path and it's not processed here).
5557
2.86k
  if (!req->has_wal_retention_secs()) {
5558
323
    table_pb.set_version(table_pb.version() + 1);
5559
323
    table_pb.set_updates_only_index_permissions(false);
5560
323
  }
5561
2.86k
  table_pb.set_next_column_id(next_col_id);
5562
2.86k
  l.mutable_data()->set_state(
5563
2.86k
      SysTablesEntryPB::ALTERING,
5564
2.86k
      Substitute("Alter table version=$0 ts=$1", table_pb.version(), LocalTimeAsString()));
5565
5566
  // Update sys-catalog with the new table schema.
5567
2.86k
  TRACE("Updating metadata on disk");
5568
2.86k
  std::vector<const DdlLogEntry*> ddl_log_entry_pointers;
5569
2.86k
  ddl_log_entry_pointers.reserve(ddl_log_entries.size());
5570
284
  for (const auto& entry : ddl_log_entries) {
5571
284
    ddl_log_entry_pointers.push_back(&entry);
5572
284
  }
5573
2.86k
  Status s = sys_catalog_->Upsert(leader_ready_term(), ddl_log_entry_pointers, table);
5574
2.86k
  if (!s.ok()) {
5575
1
    s = s.CloneAndPrepend(
5576
1
        Substitute("An error occurred while updating sys-catalog tables entry: $0",
5577
1
                   s.ToString()));
5578
1
    LOG(WARNING) << s.ToString();
5579
1
    if (table->GetTableType() != PGSQL_TABLE_TYPE &&
5580
0
        (req->has_new_namespace() || req->has_new_table_name())) {
5581
0
      LockGuard lock(mutex_);
5582
0
      VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
5583
0
      CHECK_EQ(table_names_map_.erase({new_namespace_id, new_table_name}), 1);
5584
0
    }
5585
    // TableMetadataLock follows the RAII paradigm: when it leaves scope,
5586
    // 'l' will be unlocked, and the mutation will be aborted.
5587
1
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
5588
1
  }
5589
5590
  // Remove the old name. Not present if PGSQL.
5591
2.86k
  if (table->GetTableType() != PGSQL_TABLE_TYPE &&
5592
172
      (req->has_new_namespace() || req->has_new_table_name())) {
5593
1
    TRACE("Removing (namespace, table) combination ($0, $1) from by-name map",
5594
1
          namespace_id, table_name);
5595
1
    LockGuard lock(mutex_);
5596
1
    table_names_map_.erase({namespace_id, table_name});
5597
1
  }
5598
5599
  // Update the in-memory state.
5600
2.86k
  TRACE("Committing in-memory state");
5601
2.86k
  l.Commit();
5602
5603
2.86k
  RETURN_NOT_OK(SendAlterTableRequest(table, req));
5604
5605
  // Increment transaction status version if needed.
5606
2.86k
  if (table->GetTableType() == TableType::TRANSACTION_STATUS_TABLE_TYPE) {
5607
0
    RETURN_NOT_OK(IncrementTransactionTablesVersion());
5608
0
  }
5609
5610
2.86k
  LOG(INFO) << "Successfully initiated ALTER TABLE (pending tablet schema updates) for "
5611
2.86k
            << table->ToString() << " per request from " << RequestorString(rpc);
5612
2.86k
  return Status::OK();
5613
2.86k
}
5614
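AlterTable above is a compact example of the copy-on-write discipline described in the header comment of this file: mutate the dirty copy under a write lock, persist it with sys_catalog_->Upsert(), and only then publish it via l.Commit(); if the durable write fails, the RAII lock aborts the staged mutation. A toy sketch of that discipline follows. CowTable and PersistToSysCatalog are stand-ins, not real YugabyteDB types, and the single mutex is a simplification: the real PersistentTableInfo lets readers proceed against the committed copy without blocking on the writer.

  #include <mutex>
  #include <string>

  // Stand-in for a durable sys-catalog write; always succeeds in this sketch.
  bool PersistToSysCatalog(const std::string&) { return true; }

  // Toy copy-on-write object: readers see 'committed'; a writer stages edits
  // in 'dirty' and publishes them only after the durable write succeeds.
  struct CowTable {
    std::mutex m;
    std::string committed;
    std::string dirty;
  };

  bool AlterName(CowTable* t, const std::string& new_name) {
    std::lock_guard<std::mutex> l(t->m);  // plays the role of LockForWrite()
    t->dirty = new_name;                  // mutate only the dirty copy
    if (!PersistToSysCatalog(t->dirty)) {
      t->dirty = t->committed;            // abort: discard the staged mutation
      return false;
    }
    t->committed = t->dirty;              // plays the role of l.Commit()
    return true;
  }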
5615
Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req,
5616
649
                                        IsAlterTableDoneResponsePB* resp) {
5617
  // 1. Lookup the table and verify if it exists.
5618
649
  TRACE("Looking up table");
5619
649
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
5620
5621
649
  TRACE("Locking table");
5622
649
  auto l = table->LockForRead();
5623
649
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
5624
5625
  // 2. Verify if the alter is in-progress.
5626
649
  TRACE("Verify if there is an alter operation in progress for $0", table->ToString());
5627
649
  resp->set_schema_version(l->pb.version());
5628
649
  resp->set_done(l->pb.state() != SysTablesEntryPB::ALTERING);
5629
5630
649
  return Status::OK();
5631
649
}
5632
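IsAlterTableDone is the poll half of the asynchronous alter flow: AlterTable returns once the change is initiated, and resp->done() flips to true only when the table leaves the ALTERING state. A hedged sketch of a client-side wait loop, with the actual master RPC abstracted behind a caller-supplied is_done callback; only the polling shape is taken from the code above.

  #include <algorithm>
  #include <chrono>
  #include <functional>
  #include <thread>

  // Poll 'is_done' with capped exponential backoff until it reports true or
  // the deadline passes. The RPC that fills IsAlterTableDoneResponsePB is
  // abstracted away here.
  bool WaitForAlterDone(const std::function<bool()>& is_done,
                        std::chrono::steady_clock::duration timeout) {
    const auto deadline = std::chrono::steady_clock::now() + timeout;
    auto backoff = std::chrono::milliseconds(10);
    while (std::chrono::steady_clock::now() < deadline) {
      if (is_done()) return true;  // resp.done() came back true
      std::this_thread::sleep_for(backoff);
      backoff = std::min(backoff * 2, std::chrono::milliseconds(500));
    }
    return false;  // the caller surfaces a timeout to its own caller
  }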
5633
Result<TabletInfoPtr> CatalogManager::RegisterNewTabletForSplit(
5634
    TabletInfo* source_tablet_info, const PartitionPB& partition,
5635
36
    TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock) {
5636
36
  const auto tablet_lock = source_tablet_info->LockForRead();
5637
5638
36
  auto table = source_tablet_info->table();
5639
36
  TabletInfoPtr new_tablet;
5640
36
  {
5641
36
    LockGuard lock(mutex_);
5642
36
    new_tablet = CreateTabletInfo(table.get(), partition);
5643
36
  }
5644
36
  const auto& source_tablet_meta = tablet_lock->pb;
5645
5646
36
  auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb;
5647
36
  new_tablet_meta.set_state(SysTabletsEntryPB::CREATING);
5648
36
  new_tablet_meta.mutable_committed_consensus_state()->CopyFrom(
5649
36
      source_tablet_meta.committed_consensus_state());
5650
36
  new_tablet_meta.set_split_depth(source_tablet_meta.split_depth() + 1);
5651
36
  new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id());
5652
  // TODO(tsplit): consider and handle failure scenarios, for example:
5653
  // - Crash or leader failover before sending out the split tasks.
5654
  //   - A partition long enough while trying to send out the splits that they time out and
5655
  //     are not executed.
5656
36
  int new_partition_list_version;
5657
36
  {
5658
36
    LockGuard lock(mutex_);
5659
5660
36
    auto& table_pb = table_write_lock->mutable_data()->pb;
5661
36
    new_partition_list_version = table_pb.partition_list_version() + 1;
5662
36
    table_pb.set_partition_list_version(new_partition_list_version);
5663
5664
36
    tablet_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id());
5665
36
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), table, new_tablet, source_tablet_info));
5666
5667
36
    MAYBE_FAULT(FLAGS_TEST_crash_after_creating_single_split_tablet);
5668
5669
36
    table->AddTablet(new_tablet);
5670
    // TODO: We use this pattern in other places, but what if a concurrent thread accesses a
5671
    // not-yet-committed TabletInfo from the `table`?
5672
36
    new_tablet->mutable_metadata()->CommitMutation();
5673
5674
36
    auto tablet_map_checkout = tablet_map_.CheckOut();
5675
36
    (*tablet_map_checkout)[new_tablet->id()] = new_tablet;
5676
36
  }
5677
36
  LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id()
5678
36
            << " (" << AsString(partition) << ") to split the tablet "
5679
36
            << source_tablet_info->tablet_id()
5680
36
            << " (" << AsString(source_tablet_meta.partition())
5681
36
            << ") for table " << table->ToString()
5682
36
            << ", new partition_list_version: " << new_partition_list_version;
5683
5684
36
  return new_tablet;
5685
36
}
5686
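RegisterNewTabletForSplit bumps partition_list_version and registers the child tablet inside one LockGuard, so no reader can observe the new tablet without also seeing the version bump, and split_depth grows by one per generation. That invariant, reduced to std primitives (all names are illustrative):

  #include <map>
  #include <mutex>
  #include <string>

  struct PartitionRegistry {
    std::mutex m;
    int partition_list_version = 0;
    std::map<std::string, int> tablets;  // tablet id -> split depth
  };

  // Register a child produced by splitting 'parent'. The version bump and the
  // map insert share one critical section, as in the catalog manager.
  int RegisterChild(PartitionRegistry* r, const std::string& parent,
                    const std::string& child) {
    std::lock_guard<std::mutex> l(r->m);
    auto it = r->tablets.find(parent);
    const int parent_depth = (it == r->tablets.end()) ? 0 : it->second;
    r->tablets[child] = parent_depth + 1;  // split_depth = parent depth + 1
    return ++r->partition_list_version;    // the new partition_list_version
  }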
5687
Status CatalogManager::GetTableSchema(const GetTableSchemaRequestPB* req,
5688
117k
                                      GetTableSchemaResponsePB* resp) {
5689
0
  VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString();
5690
5691
  // Lookup the table and verify if it exists.
5692
117k
  TRACE("Looking up table");
5693
114k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
5694
5695
  // Due to differences in the way proxies handle version mismatches (pull for yql vs. push for sql):
5696
  // For YQL tables, we will return the "set of indexes" being applied instead of the ones
5697
  // that are fully completed.
5698
  // For PGSQL (and other) tables we want to return the fully applied schema.
5699
114k
  const bool get_fully_applied_indexes = table->GetTableType() != TableType::YQL_TABLE_TYPE;
5700
114k
  return GetTableSchemaInternal(req, resp, get_fully_applied_indexes);
5701
117k
}
5702
5703
Status CatalogManager::GetTableSchemaInternal(const GetTableSchemaRequestPB* req,
5704
                                              GetTableSchemaResponsePB* resp,
5705
115k
                                              bool get_fully_applied_indexes) {
5706
12
  VLOG(1) << "Servicing GetTableSchema request for " << req->ShortDebugString();
5707
5708
  // Lookup the table and verify if it exists.
5709
115k
  TRACE("Looking up table");
5710
115k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
5711
5712
115k
  TRACE("Locking table");
5713
115k
  auto l = table->LockForRead();
5714
115k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
5715
5716
115k
  if (l->pb.has_fully_applied_schema()) {
5717
    // An AlterTable is in progress; fully_applied_schema is the last
5718
    // schema that has reached every TS.
5719
1.16k
    DCHECK(l->pb.state() == SysTablesEntryPB::ALTERING);
5720
1.16k
    resp->mutable_schema()->CopyFrom(l->pb.fully_applied_schema());
5721
114k
  } else {
5722
    // There's no AlterTable; the regular schema is "fully applied".
5723
114k
    resp->mutable_schema()->CopyFrom(l->pb.schema());
5724
114k
  }
5725
5726
115k
  if (get_fully_applied_indexes && l->pb.has_fully_applied_schema()) {
5727
123
    resp->set_version(l->pb.fully_applied_schema_version());
5728
123
    resp->mutable_indexes()->CopyFrom(l->pb.fully_applied_indexes());
5729
123
    if (l->pb.has_fully_applied_index_info()) {
5730
0
      resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb));
5731
0
      *resp->mutable_index_info() = l->pb.fully_applied_index_info();
5732
0
    }
5733
0
    VLOG(1) << "Returning"
5734
0
            << "\nfully_applied_schema with version "
5735
0
            << l->pb.fully_applied_schema_version()
5736
0
            << ":\n"
5737
0
            << yb::ToString(l->pb.fully_applied_indexes())
5738
0
            << "\ninstead of schema with version "
5739
0
            << l->pb.version()
5740
0
            << ":\n"
5741
0
            << yb::ToString(l->pb.indexes());
5742
115k
  } else {
5743
115k
    resp->set_version(l->pb.version());
5744
115k
    resp->mutable_indexes()->CopyFrom(l->pb.indexes());
5745
115k
    if (l->pb.has_index_info()) {
5746
22.2k
      resp->set_obsolete_indexed_table_id(GetIndexedTableId(l->pb));
5747
22.2k
      *resp->mutable_index_info() = l->pb.index_info();
5748
22.2k
    }
5749
17
    VLOG(3) << "Returning"
5750
17
            << "\nschema with version "
5751
17
            << l->pb.version()
5752
17
            << ":\n"
5753
17
            << yb::ToString(l->pb.indexes());
5754
115k
  }
5755
115k
  resp->set_is_compatible_with_previous_version(l->pb.updates_only_index_permissions());
5756
115k
  resp->mutable_partition_schema()->CopyFrom(l->pb.partition_schema());
5757
115k
  if (IsReplicationInfoSet(l->pb.replication_info())) {
5758
2
    resp->mutable_replication_info()->CopyFrom(l->pb.replication_info());
5759
2
  }
5760
115k
  resp->set_create_table_done(!table->IsCreateInProgress());
5761
115k
  resp->set_table_type(table->metadata().state().pb.table_type());
5762
115k
  resp->mutable_identifier()->set_table_name(l->pb.name());
5763
115k
  resp->mutable_identifier()->set_table_id(table->id());
5764
115k
  resp->mutable_identifier()->mutable_namespace_()->set_id(table->namespace_id());
5765
115k
  auto nsinfo = FindNamespaceById(table->namespace_id());
5766
115k
  if (nsinfo.ok()) {
5767
115k
    resp->mutable_identifier()->mutable_namespace_()->set_name((**nsinfo).name());
5768
115k
  }
5769
5770
115k
  if (l->pb.has_wal_retention_secs()) {
5771
2.46k
    resp->set_wal_retention_secs(l->pb.wal_retention_secs());
5772
2.46k
  }
5773
5774
  // Get namespace name by id.
5775
115k
  SharedLock lock(mutex_);
5776
115k
  TRACE("Looking up namespace");
5777
115k
  const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, table->namespace_id());
5778
5779
115k
  if (ns == nullptr) {
5780
0
    Status s = STATUS_SUBSTITUTE(
5781
0
        NotFound, "Could not find namespace by namespace id $0 for request $1.",
5782
0
        table->namespace_id(), req->DebugString());
5783
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
5784
0
  }
5785
5786
115k
  resp->mutable_identifier()->mutable_namespace_()->set_name(ns->name());
5787
5788
115k
  resp->set_colocated(table->colocated());
5789
5790
18.4E
  VLOG(1) << "Serviced GetTableSchema request for " << req->ShortDebugString() << " with "
5791
18.4E
          << yb::ToString(*resp);
5792
115k
  return Status::OK();
5793
115k
}
5794
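The fully_applied_schema branch above is worth isolating: while an alter is propagating, clients get the last schema known to be on every tablet, and only once the alter finishes does the regular schema become the answer again. A standalone sketch of just that selection (SchemaPB here is a stand-in struct, not the real protobuf):

  #include <optional>
  #include <string>

  struct SchemaPB {
    std::string columns;
    int version = 0;
  };

  struct TableState {
    SchemaPB schema;                        // target schema, may still be propagating
    std::optional<SchemaPB> fully_applied;  // set only while the table is ALTERING
  };

  // Return the schema a client should see: during an alter, the fully applied
  // one; otherwise the regular schema, which by definition is fully applied.
  const SchemaPB& VisibleSchema(const TableState& t) {
    return t.fully_applied ? *t.fully_applied : t.schema;
  }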
5795
Status CatalogManager::GetTablegroupSchema(const GetTablegroupSchemaRequestPB* req,
5796
0
                                           GetTablegroupSchemaResponsePB* resp) {
5797
0
  VLOG(1) << "Servicing GetTablegroupSchema request for " << req->ShortDebugString();
5798
0
  if (!req->parent_tablegroup().has_id()) {
5799
0
    Status s = STATUS(InvalidArgument, "Invalid get tablegroup request (missing fields)");
5800
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
5801
0
  }
5802
5803
0
  const std::string& tablegroupId = req->parent_tablegroup().id();
5804
0
  if (!IsTablegroupParentTableId(tablegroupId)) {
5805
0
    Status s = STATUS(InvalidArgument, "Received a non tablegroup ID");
5806
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
5807
0
  }
5808
5809
  // Strip the suffix from the tablegroup ID request (since tablegroup_ids_map_
5810
  // only accepts the plain ID).
5811
0
  DCHECK(boost::algorithm::ends_with(tablegroupId, master::kTablegroupParentTableIdSuffix));
5812
0
  size_t tgid_len = tablegroupId.size() - strlen(master::kTablegroupParentTableIdSuffix);
5813
0
  TablegroupId tgid = tablegroupId.substr(0, tgid_len);
5814
5815
  // Lookup the tablegroup.
5816
0
  std::unordered_set<TableId> tablesInTablegroup;
5817
0
  {
5818
0
    SharedLock lock(mutex_);
5819
5820
0
    if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) {
5821
0
      return STATUS(NotFound, Substitute("Tablegroup not found for tablegroup id: $0",
5822
0
                                         req->parent_tablegroup().id()));
5823
0
    }
5824
0
    scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid];
5825
0
    tablesInTablegroup = tginfo->ChildTables();
5826
0
  }
5827
5828
0
  for (const auto& t : tablesInTablegroup) {
5829
0
    TRACE("Looking up table");
5830
0
    GetTableSchemaRequestPB schemaReq;
5831
0
    GetTableSchemaResponsePB schemaResp;
5832
0
    schemaReq.mutable_table()->set_table_id(t);
5833
0
    Status s = GetTableSchema(&schemaReq, &schemaResp);
5834
0
    if (!s.ok() || schemaResp.has_error()) {
5835
0
      LOG(ERROR) << "Error while getting table schema: " << s;
5836
0
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, s);
5837
0
    }
5838
0
    resp->add_get_table_schema_response_pbs()->Swap(&schemaResp);
5839
0
  }
5840
5841
0
  return Status::OK();
5842
0
}
5843
5844
Status CatalogManager::GetColocatedTabletSchema(const GetColocatedTabletSchemaRequestPB* req,
5845
0
                                                GetColocatedTabletSchemaResponsePB* resp) {
5846
0
  VLOG(1) << "Servicing GetColocatedTabletSchema request for " << req->ShortDebugString();
5847
5848
  // Lookup the given parent colocated table and verify if it exists.
5849
0
  TRACE("Looking up table");
5850
0
  auto parent_colocated_table = VERIFY_RESULT(FindTable(req->parent_colocated_table()));
5851
0
  {
5852
0
    TRACE("Locking table");
5853
0
    auto l = parent_colocated_table->LockForRead();
5854
0
    RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
5855
0
  }
5856
5857
0
  if (!parent_colocated_table->colocated() || !parent_colocated_table->IsColocatedParentTable()) {
5858
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_TYPE,
5859
0
                      STATUS(InvalidArgument, "Table provided is not a parent colocated table"));
5860
0
  }
5861
5862
  // Next get all the user tables that are in the database.
5863
0
  ListTablesRequestPB listTablesReq;
5864
0
  ListTablesResponsePB ListTablesResp;
5865
5866
0
  listTablesReq.mutable_namespace_()->set_id(parent_colocated_table->namespace_id());
5867
0
  listTablesReq.mutable_namespace_()->set_database_type(YQL_DATABASE_PGSQL);
5868
0
  listTablesReq.set_exclude_system_tables(true);
5869
0
  Status status = ListTables(&listTablesReq, &ListTablesResp);
5870
0
  if (!status.ok() || ListTablesResp.has_error()) {
5871
0
    LOG(ERROR) << "Error while listing tables: " << status;
5872
0
    return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
5873
0
  }
5874
5875
  // Get the table schema for each colocated table.
5876
0
  for (const auto& t : ListTablesResp.tables()) {
5877
    // Need to check if this table is colocated first.
5878
0
    TRACE("Looking up table");
5879
0
    scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTableById(t.id()));
5880
5881
0
    if (table->colocated()) {
5882
      // Now we can get the schema for this table.
5883
0
      GetTableSchemaRequestPB schemaReq;
5884
0
      GetTableSchemaResponsePB schemaResp;
5885
0
      schemaReq.mutable_table()->set_table_id(t.id());
5886
0
      status = GetTableSchema(&schemaReq, &schemaResp);
5887
0
      if (!status.ok() || schemaResp.has_error()) {
5888
0
        LOG(ERROR) << "Error while getting table schema: " << status;
5889
0
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
5890
0
      }
5891
0
      resp->add_get_table_schema_response_pbs()->Swap(&schemaResp);
5892
0
    }
5893
0
  }
5894
5895
0
  return Status::OK();
5896
0
}
5897
5898
Status CatalogManager::ListTables(const ListTablesRequestPB* req,
5899
1.83k
                                  ListTablesResponsePB* resp) {
5900
1.83k
  NamespaceId namespace_id;
5901
5902
  // Validate namespace.
5903
1.83k
  if (req->has_namespace_()) {
5904
    // Lookup the namespace and verify if it exists.
5905
349
    auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
5906
5907
176
    auto ns_lock = ns->LockForRead();
5908
176
    namespace_id = ns->id();
5909
5910
    // Don't list tables with a namespace that isn't running.
5911
176
    if (ns->state() != SysNamespaceEntryPB::RUNNING) {
5912
0
      LOG(INFO) << "ListTables request for a Namespace not running (State="
5913
0
                << SysNamespaceEntryPB::State_Name(ns->state()) << ")";
5914
0
      return Status::OK();
5915
0
    }
5916
1.66k
  }
5917
5918
1.66k
  bool has_rel_filter = req->relation_type_filter_size() > 0;
5919
1.50k
  bool include_user_table = has_rel_filter ? false : true;
5920
1.50k
  bool include_user_index = has_rel_filter ? false : true;
5921
198
  bool include_system_table = req->exclude_system_tables() ? false
5922
1.46k
      : (has_rel_filter ? false : true);
5923
5924
158
  for (const auto &relation : req->relation_type_filter()) {
5925
158
    if (relation == SYSTEM_TABLE_RELATION) {
5926
2
      include_system_table = true;
5927
156
    } else if (relation == USER_TABLE_RELATION) {
5928
155
      include_user_table = true;
5929
1
    } else if (relation == INDEX_TABLE_RELATION) {
5930
1
      include_user_index = true;
5931
1
    }
5932
158
  }
5933
5934
1.66k
  SharedLock lock(mutex_);
5935
1.66k
  RelationType relation_type;
5936
5937
213k
  for (const auto& entry : *table_ids_map_) {
5938
213k
    auto& table_info = *entry.second;
5939
213k
    auto ltm = table_info.LockForRead();
5940
5941
213k
    if (!ltm->visible_to_client() && !req->include_not_running()) {
5942
36
      continue;
5943
36
    }
5944
5945
213k
    if (!namespace_id.empty() && namespace_id != table_info.namespace_id()) {
5946
72.4k
      continue; // Skip tables from other namespaces.
5947
72.4k
    }
5948
5949
141k
    if (req->has_name_filter()) {
5950
2.70k
      size_t found = ltm->name().find(req->name_filter());
5951
2.70k
      if (found == string::npos) {
5952
2.69k
        continue;
5953
2.69k
      }
5954
138k
    }
5955
5956
138k
    if (IsUserIndexUnlocked(table_info)) {
5957
2
      if (!include_user_index) {
5958
0
        continue;
5959
0
      }
5960
2
      relation_type = INDEX_TABLE_RELATION;
5961
138k
    } else if (IsUserTableUnlocked(table_info)) {
5962
6.38k
      if (!include_user_table) {
5963
4
        continue;
5964
4
      }
5965
6.37k
      relation_type = USER_TABLE_RELATION;
5966
132k
    } else {
5967
132k
      if (!include_system_table) {
5968
110k
        continue;
5969
110k
      }
5970
21.7k
      relation_type = SYSTEM_TABLE_RELATION;
5971
21.7k
    }
5972
5973
28.1k
    NamespaceIdentifierPB ns_identifier;
5974
28.1k
    ns_identifier.set_id(ltm->namespace_id());
5975
28.1k
    auto ns = FindNamespaceUnlocked(ns_identifier);
5976
28.1k
    if (!ns.ok() || (**ns).state() != SysNamespaceEntryPB::RUNNING) {
5977
2
      if (PREDICT_FALSE(FLAGS_TEST_return_error_if_namespace_not_found)) {
5978
0
        VERIFY_NAMESPACE_FOUND(std::move(ns), resp);
5979
0
      }
5980
1
      LOG(ERROR) << "Unable to find namespace with id " << ltm->namespace_id()
5981
1
                 << " for table " << ltm->name();
5982
1
      continue;
5983
28.1k
    }
5984
5985
28.1k
    ListTablesResponsePB::TableInfo *table = resp->add_tables();
5986
28.1k
    {
5987
28.1k
      auto namespace_lock = (**ns).LockForRead();
5988
28.1k
      table->mutable_namespace_()->set_id((**ns).id());
5989
28.1k
      table->mutable_namespace_()->set_name(namespace_lock->name());
5990
28.1k
      table->mutable_namespace_()->set_database_type(namespace_lock->pb.database_type());
5991
28.1k
    }
5992
28.1k
    table->set_id(entry.second->id());
5993
28.1k
    table->set_name(ltm->name());
5994
28.1k
    table->set_table_type(ltm->table_type());
5995
28.1k
    table->set_relation_type(relation_type);
5996
28.1k
    table->set_state(ltm->pb.state());
5997
28.1k
    table->set_pgschema_name(ltm->schema().pgschema_name());
5998
28.1k
  }
5999
1.66k
  return Status::OK();
6000
1.66k
}
6001
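The filter setup at the top of ListTables is easy to misread: with an empty relation_type_filter everything is included (minus system tables when exclude_system_tables is set), while any non-empty filter flips the defaults to false and acts as an allow-list, even readmitting system tables past exclude_system_tables. The same logic isolated into a predicate (hypothetical names):

  #include <set>

  enum Relation { kUserTable, kUserIndex, kSystemTable };

  struct Filter {
    std::set<Relation> allow;     // empty means no relation_type_filter was given
    bool exclude_system = false;  // mirrors exclude_system_tables
  };

  bool Included(const Filter& f, Relation r) {
    if (!f.allow.empty()) return f.allow.count(r) > 0;  // allow-list mode
    if (r == kSystemTable && f.exclude_system) return false;
    return true;  // default: every relation type is listed
  }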
6002
0
boost::optional<TablegroupId> CatalogManager::FindTablegroupByTableId(const TableId& table_id) {
6003
0
  SharedLock lock(mutex_);
6004
6005
0
  for (const auto& tablegroup : tablegroup_ids_map_) {
6006
0
    const auto& tgid = tablegroup.first;
6007
0
    const auto& tginfo = tablegroup.second;
6008
0
    for (const auto& t : tginfo->ChildTables()) {
6009
0
      if (table_id == t) {
6010
0
        return boost::optional<TablegroupId>(tgid + kTablegroupParentTableIdSuffix);
6011
0
      }
6012
0
    }
6013
0
  }
6014
6015
0
  return boost::none;
6016
0
}
6017
6018
410k
scoped_refptr<TableInfo> CatalogManager::GetTableInfo(const TableId& table_id) {
6019
410k
  SharedLock lock(mutex_);
6020
410k
  return FindPtrOrNull(*table_ids_map_, table_id);
6021
410k
}
6022
6023
scoped_refptr<TableInfo> CatalogManager::GetTableInfoFromNamespaceNameAndTableName(
6024
0
    YQLDatabase db_type, const NamespaceName& namespace_name, const TableName& table_name) {
6025
0
  if (db_type == YQL_DATABASE_PGSQL)
6026
0
    return nullptr;
6027
0
  SharedLock lock(mutex_);
6028
0
  const auto ns = FindPtrOrNull(namespace_names_mapper_[db_type], namespace_name);
6029
0
  return ns
6030
0
    ? FindPtrOrNull(table_names_map_, {ns->id(), table_name})
6031
0
    : nullptr;
6032
0
}
6033
6034
243k
scoped_refptr<TableInfo> CatalogManager::GetTableInfoUnlocked(const TableId& table_id) {
6035
243k
  return FindPtrOrNull(*table_ids_map_, table_id);
6036
243k
}
6037
6038
46.5k
std::vector<TableInfoPtr> CatalogManager::GetTables(GetTablesMode mode) {
6039
46.5k
  std::vector<TableInfoPtr> result;
6040
46.5k
  {
6041
46.5k
    SharedLock lock(mutex_);
6042
46.5k
    result.reserve(table_ids_map_->size());
6043
1.11M
    for (const auto& e : *table_ids_map_) {
6044
1.11M
      result.push_back(e.second);
6045
1.11M
    }
6046
46.5k
  }
6047
46.5k
  switch (mode) {
6048
1
    case GetTablesMode::kAll:
6049
1
      return result;
6050
165
    case GetTablesMode::kRunning: {
6051
3.53k
      auto filter = [](const TableInfoPtr& table_info) { return !table_info->is_running(); };
6052
165
      EraseIf(filter, &result);
6053
165
      return result;
6054
0
    }
6055
46.3k
    case GetTablesMode::kVisibleToClient: {
6056
1.10M
      auto filter = [](const TableInfoPtr& table_info) {
6057
1.10M
        return !table_info->LockForRead()->visible_to_client();
6058
1.10M
      };
6059
46.3k
      EraseIf(filter, &result);
6060
46.3k
      return result;
6061
0
    }
6062
0
  }
6063
0
  FATAL_INVALID_ENUM_VALUE(GetTablesMode, mode);
6064
0
}
6065
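GetTables copies the whole id map under the shared lock and filters outside of it with EraseIf, which keeps the critical section down to a reserve-and-copy. EraseIf is presumably the usual erase-remove idiom; a minimal generic version for comparison:

  #include <algorithm>
  #include <vector>

  // Remove every element matching 'pred' from 'v' (the erase-remove idiom).
  template <class Pred, class T>
  void EraseIf(Pred pred, std::vector<T>* v) {
    v->erase(std::remove_if(v->begin(), v->end(), pred), v->end());
  }

  // Usage, mirroring GetTablesMode::kRunning on a vector of table pointers:
  //   EraseIf([](const auto& table) { return !table->is_running(); }, &result);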
6066
void CatalogManager::GetAllNamespaces(std::vector<scoped_refptr<NamespaceInfo>>* namespaces,
6067
13.6k
                                      bool includeOnlyRunningNamespaces) {
6068
13.6k
  namespaces->clear();
6069
13.6k
  SharedLock lock(mutex_);
6070
56.3k
  for (const NamespaceInfoMap::value_type& e : namespace_ids_map_) {
6071
56.3k
    if (includeOnlyRunningNamespaces && e.second->state() != SysNamespaceEntryPB::RUNNING) {
6072
3
      continue;
6073
3
    }
6074
56.3k
    namespaces->push_back(e.second);
6075
56.3k
  }
6076
13.6k
}
6077
6078
13.9k
void CatalogManager::GetAllUDTypes(std::vector<scoped_refptr<UDTypeInfo>>* types) {
6079
13.9k
  types->clear();
6080
13.9k
  SharedLock lock(mutex_);
6081
204
  for (const UDTypeInfoMap::value_type& e : udtype_ids_map_) {
6082
204
    types->push_back(e.second);
6083
204
  }
6084
13.9k
}
6085
6086
3
std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentTasks() {
6087
3
  return tasks_tracker_->GetTasks();
6088
3
}
6089
6090
0
std::vector<std::shared_ptr<MonitoredTask>> CatalogManager::GetRecentJobs() {
6091
0
  return jobs_tracker_->GetTasks();
6092
0
}
6093
6094
13.1k
NamespaceName CatalogManager::GetNamespaceNameUnlocked(const NamespaceId& id) const  {
6095
13.1k
  const scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_map_, id);
6096
13.1k
  return ns == nullptr ? NamespaceName() : ns->name();
6097
13.1k
}
6098
6099
19
NamespaceName CatalogManager::GetNamespaceName(const NamespaceId& id) const {
6100
19
  TRACE("Acquired catalog manager lock");
6101
19
  SharedLock lock(mutex_);
6102
19
  return GetNamespaceNameUnlocked(id);
6103
19
}
6104
6105
NamespaceName CatalogManager::GetNamespaceNameUnlocked(
6106
0
    const scoped_refptr<TableInfo>& table) const  {
6107
0
  return GetNamespaceNameUnlocked(table->namespace_id());
6108
0
}
6109
6110
0
NamespaceName CatalogManager::GetNamespaceName(const scoped_refptr<TableInfo>& table) const {
6111
0
  return GetNamespaceName(table->namespace_id());
6112
0
}
6113
6114
28.9M
bool CatalogManager::IsSystemTable(const TableInfo& table) const {
6115
28.9M
  return table.is_system();
6116
28.9M
}
6117
6118
// True if the table was created by a user.
6119
// The table can be a regular table or an index in this case.
6120
208
bool CatalogManager::IsUserCreatedTable(const TableInfo& table) const {
6121
208
  SharedLock lock(mutex_);
6122
208
  return IsUserCreatedTableUnlocked(table);
6123
208
}
6124
6125
277k
bool CatalogManager::IsUserCreatedTableUnlocked(const TableInfo& table) const {
6126
277k
  if (table.GetTableType() == PGSQL_TABLE_TYPE || table.GetTableType() == YQL_TABLE_TYPE) {
6127
276k
    if (!IsSystemTable(table) && !IsSequencesSystemTable(table) &&
6128
13.0k
        GetNamespaceNameUnlocked(table.namespace_id()) != kSystemNamespaceName &&
6129
12.9k
        !table.IsColocatedParentTable() &&
6130
12.9k
        !table.IsTablegroupParentTable()) {
6131
12.9k
      return true;
6132
12.9k
    }
6133
264k
  }
6134
264k
  return false;
6135
264k
}
6136
6137
198
bool CatalogManager::IsUserTable(const TableInfo& table) const {
6138
198
  SharedLock lock(mutex_);
6139
198
  return IsUserTableUnlocked(table);
6140
198
}
6141
6142
138k
bool CatalogManager::IsUserTableUnlocked(const TableInfo& table) const {
6143
138k
  return IsUserCreatedTableUnlocked(table) && table.indexed_table_id().empty();
6144
138k
}
6145
6146
18
bool CatalogManager::IsUserIndex(const TableInfo& table) const {
6147
18
  SharedLock lock(mutex_);
6148
18
  return IsUserIndexUnlocked(table);
6149
18
}
6150
6151
138k
bool CatalogManager::IsUserIndexUnlocked(const TableInfo& table) const {
6152
138k
  return IsUserCreatedTableUnlocked(table) && !table.indexed_table_id().empty();
6153
138k
}
6154
6155
0
bool CatalogManager::IsTablegroupParentTableId(const TableId& table_id) const {
6156
0
  return table_id.find(kTablegroupParentTableIdSuffix) != std::string::npos;
6157
0
}
6158
6159
0
bool CatalogManager::IsColocatedParentTableId(const TableId& table_id) const {
6160
0
  return table_id.find(kColocatedParentTableIdSuffix) != std::string::npos;
6161
0
}
6162
6163
13.0k
bool CatalogManager::IsSequencesSystemTable(const TableInfo& table) const {
6164
13.0k
  if (table.GetTableType() == PGSQL_TABLE_TYPE && !table.IsColocatedParentTable()
6165
10.3k
                                               && !table.IsTablegroupParentTable()) {
6166
    // This case commonly occurs during unit testing. Avoid unnecessary assert within Get().
6167
10.3k
    if (!IsPgsqlId(table.namespace_id()) || !IsPgsqlId(table.id())) {
6168
4
      LOG(WARNING) << "Not PGSQL IDs " << table.namespace_id() << ", " << table.id();
6169
4
      return false;
6170
4
    }
6171
10.3k
    Result<uint32_t> database_oid = GetPgsqlDatabaseOid(table.namespace_id());
6172
10.3k
    if (!database_oid.ok()) {
6173
0
      LOG(WARNING) << "Invalid Namespace ID " << table.namespace_id();
6174
0
      return false;
6175
0
    }
6176
10.3k
    Result<uint32_t> table_oid = GetPgsqlTableOid(table.id());
6177
10.3k
    if (!table_oid.ok()) {
6178
0
      LOG(WARNING) << "Invalid Table ID " << table.id();
6179
0
      return false;
6180
0
    }
6181
10.3k
    if (*database_oid == kPgSequencesDataDatabaseOid && *table_oid == kPgSequencesDataTableOid) {
6182
0
      return true;
6183
0
    }
6184
13.0k
  }
6185
13.0k
  return false;
6186
13.0k
}
6187
6188
void CatalogManager::NotifyTabletDeleteFinished(const TabletServerId& tserver_uuid,
6189
                                                const TabletId& tablet_id,
6190
47.8k
                                                const TableInfoPtr& table) {
6191
47.8k
  shared_ptr<TSDescriptor> ts_desc;
6192
47.8k
  if (!master_->ts_manager()->LookupTSByUUID(tserver_uuid, &ts_desc)) {
6193
0
    LOG(WARNING) << "Unable to find tablet server " << tserver_uuid;
6194
47.8k
  } else if (!ts_desc->IsTabletDeletePending(tablet_id)) {
6195
952
    LOG(WARNING) << "Pending delete for tablet " << tablet_id << " in ts "
6196
952
                 << tserver_uuid << " doesn't exist";
6197
46.9k
  } else {
6198
46.9k
    LOG(INFO) << "Clearing pending delete for tablet " << tablet_id << " in ts " << tserver_uuid;
6199
46.9k
    ts_desc->ClearPendingTabletDelete(tablet_id);
6200
46.9k
  }
6201
47.8k
  CheckTableDeleted(table);
6202
47.8k
}
6203
6204
bool CatalogManager::ReplicaMapDiffersFromConsensusState(const scoped_refptr<TabletInfo>& tablet,
6205
214k
                                                         const ConsensusStatePB& cstate) {
6206
214k
  auto locs = tablet->GetReplicaLocations();
6207
214k
  if (locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) {
6208
28.5k
    return true;
6209
28.5k
  }
6210
760k
  for (auto iter = cstate.config().peers().begin(); iter != cstate.config().peers().end(); iter++) {
6211
575k
      if (locs->find(iter->permanent_uuid()) == locs->end()) {
6212
0
        return true;
6213
0
      }
6214
575k
  }
6215
185k
  return false;
6216
185k
}
6217
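ReplicaMapDiffersFromConsensusState checks set equality the cheap way: if the sizes match and every config peer appears in the replica map, the key sets must be equal, so one direction of membership testing suffices. The same trick on std containers, assuming (as a Raft config guarantees) that peers carry no duplicate uuids:

  #include <string>
  #include <unordered_map>
  #include <vector>

  // True iff the keys of 'replicas' are not exactly the uuids in 'peers'.
  bool Differs(const std::unordered_map<std::string, int>& replicas,
               const std::vector<std::string>& peers) {
    if (replicas.size() != peers.size()) return true;  // different cardinality
    for (const std::string& uuid : peers) {
      if (replicas.find(uuid) == replicas.end()) return true;  // missing peer
    }
    return false;  // equal sizes plus subset implies equal sets
  }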
6218
namespace {
6219
6220
513k
int64_t GetCommittedConsensusStateOpIdIndex(const ReportedTabletPB& report) {
6221
513k
  if (!report.has_committed_consensus_state() ||
6222
510k
      !report.committed_consensus_state().config().has_opid_index()) {
6223
2.39k
    return consensus::kInvalidOpIdIndex;
6224
2.39k
  }
6225
6226
510k
  return report.committed_consensus_state().config().opid_index();
6227
510k
}
6228
6229
} // namespace
6230
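Step 6a of ProcessCommittedConsensusState below rejects stale heartbeats by comparing the reported term and committed-config opid_index against what the master has already accepted. Distilled into a predicate (the struct is a stand-in for the consensus protobufs, with -1 playing the role of kInvalidOpIdIndex):

  #include <cstdint>

  struct CStateView {
    int64_t current_term = 0;
    int64_t opid_index = -1;  // -1 stands in for consensus::kInvalidOpIdIndex
  };

  // A report is stale if it regresses the term, or if its committed config is
  // older than the one the master has already recorded for the tablet.
  bool IsStaleReport(const CStateView& reported, const CStateView& prev) {
    return reported.current_term < prev.current_term ||
           reported.opid_index < prev.opid_index;
  }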
6231
bool CatalogManager::ProcessCommittedConsensusState(
6232
    TSDescriptor* ts_desc,
6233
    bool is_incremental,
6234
    const ReportedTabletPB& report,
6235
    const TableInfo::WriteLock& table_lock,
6236
    const TabletInfoPtr& tablet,
6237
    const TabletInfo::WriteLock& tablet_lock,
6238
257k
    std::vector<RetryingTSRpcTaskPtr>* rpcs) {
6239
257k
  const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state();
6240
257k
  ConsensusStatePB cstate = report.committed_consensus_state();
6241
257k
  bool tablet_was_mutated = false;
6242
6243
  // 6a. The master only processes reports for replicas with committed
6244
  // consensus configurations since it needs the committed index to only
6245
  // cache the most up-to-date config. Since it's possible for TOMBSTONED
6246
  // replicas with no ConsensusMetadata on disk to be reported as having no
6247
  // committed config opid_index, we skip over those replicas.
6248
257k
  if (!cstate.config().has_opid_index()) {
6249
0
    LOG(WARNING) << "Missing opid_index in reported config: " << report.ShortDebugString();
6250
0
    return false;
6251
0
  }
6252
257k
  if (PREDICT_TRUE(FLAGS_master_ignore_stale_cstate) &&
6253
257k
        (cstate.current_term() < prev_cstate.current_term() ||
6254
253k
         GetCommittedConsensusStateOpIdIndex(report) < prev_cstate.config().opid_index())) {
6255
6.93k
    LOG(WARNING) << "Stale heartbeat for Tablet " << tablet->ToString()
6256
6.93k
                 << " on TS " << ts_desc->permanent_uuid()
6257
6.93k
                 << "cstate=" << cstate.ShortDebugString()
6258
6.93k
                 << ", prev_cstate=" << prev_cstate.ShortDebugString();
6259
6.93k
    return false;
6260
6.93k
  }
6261
6262
  // 6b. Disregard the leader state if the reported leader is not a member
6263
  // of the committed config.
6264
250k
  if (cstate.leader_uuid().empty() ||
6265
154k
      !IsRaftConfigMember(cstate.leader_uuid(), cstate.config())) {
6266
95.6k
    cstate.clear_leader_uuid();
6267
95.6k
    tablet_was_mutated = true;
6268
95.6k
  }
6269
6270
  // 6c. Mark the tablet as RUNNING if it makes sense to do so.
6271
  //
6272
  // We need to wait for a leader before marking a tablet as RUNNING, or
6273
  // else we could incorrectly consider a tablet created when only a
6274
  // minority of its replicas were successful. In that case, the tablet
6275
  // would be stuck in this bad state forever.
6276
  // - FLAG added to avoid waiting during mock tests.
6277
250k
  if (!tablet_lock->is_running() &&
6278
121k
      report.state() == tablet::RUNNING &&
6279
121k
        (cstate.has_leader_uuid() ||
6280
93.6k
        !FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader)) {
6281
0
    DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock->pb.state())
6282
0
        << "Tablet in unexpected state: " << tablet->ToString()
6283
0
        << ": " << tablet_lock->pb.ShortDebugString();
6284
0
    VLOG(1) << "Tablet " << tablet->ToString() << " is now online";
6285
28.1k
    tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING,
6286
28.1k
        "Tablet reported with an active leader");
6287
28.1k
    tablet_was_mutated = true;
6288
28.1k
  }
6289
6290
  // 6d. Update the consensus state if:
6291
  // - A config change operation was committed (reflected by a change to
6292
  //   the committed config's opid_index).
6293
  // - The new cstate has a leader, and either the old cstate didn't, or
6294
  //   there was a term change.
6295
250k
  if (cstate.config().opid_index() > prev_cstate.config().opid_index() ||
6296
247k
      (cstate.has_leader_uuid() &&
6297
151k
          (!prev_cstate.has_leader_uuid() ||
6298
123k
              cstate.current_term() > prev_cstate.current_term()))) {
6299
6300
    // 6d(i). Retain knowledge of the leader even if it wasn't reported in
6301
    // the latest config.
6302
    //
6303
    // When a config change is reported to the master, it may not include the
6304
    // leader because the follower doing the reporting may not know who the
6305
    // leader is yet (it may have just started up). It is safe to reuse
6306
    // the previous leader if the reported cstate has the same term as the
6307
    // previous cstate, and the leader was known for that term.
6308
35.7k
    if (cstate.current_term() == prev_cstate.current_term()) {
6309
2.72k
      if (!cstate.has_leader_uuid() && prev_cstate.has_leader_uuid()) {
6310
1
        cstate.set_leader_uuid(prev_cstate.leader_uuid());
6311
        // Sanity check to detect consensus divergence bugs.
6312
2.72k
      } else if (cstate.has_leader_uuid() && prev_cstate.has_leader_uuid() &&
6313
2.72k
          cstate.leader_uuid() != prev_cstate.leader_uuid()) {
6314
0
        string msg = Substitute("Previously reported cstate for tablet $0 gave "
6315
0
                                "a different leader for term $1 than the current cstate. "
6316
0
                                "Previous cstate: $2. Current cstate: $3.",
6317
0
            tablet->ToString(), cstate.current_term(),
6318
0
            prev_cstate.ShortDebugString(), cstate.ShortDebugString());
6319
0
        LOG(DFATAL) << msg;
6320
0
        return false;
6321
0
      }
6322
35.7k
    }
6323
6324
    // 6d(ii). Delete any replicas from the previous config that are not in the new one.
6325
35.7k
    if (FLAGS_master_tombstone_evicted_tablet_replicas) {
6326
35.7k
      std::unordered_set<string> current_member_uuids;
6327
107k
      for (const consensus::RaftPeerPB &peer : cstate.config().peers()) {
6328
107k
        InsertOrDie(&current_member_uuids, peer.permanent_uuid());
6329
107k
      }
6330
107k
      for (const consensus::RaftPeerPB &prev_peer : prev_cstate.config().peers()) {
6331
107k
        const string& peer_uuid = prev_peer.permanent_uuid();
6332
107k
        if (!ContainsKey(current_member_uuids, peer_uuid)) {
6333
          // Don't delete a tablet server that hasn't reported in yet (Bootstrapping).
6334
829
          shared_ptr<TSDescriptor> dummy_ts_desc;
6335
829
          if (!master_->ts_manager()->LookupTSByUUID(peer_uuid, &dummy_ts_desc)) {
6336
9
            continue;
6337
9
          }
6338
          // Otherwise, the TabletServer needs to remove this peer.
6339
820
          rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
6340
820
              master_, AsyncTaskPool(), peer_uuid, tablet->table(), tablet->tablet_id(),
6341
820
              TABLET_DATA_TOMBSTONED, prev_cstate.config().opid_index(),
6342
820
              Substitute("TS $0 not found in new config with opid_index $1",
6343
820
                  peer_uuid, cstate.config().opid_index())));
6344
820
        }
6345
107k
      }
6346
35.7k
    }
6347
    // 6d(iii). Update the in-memory ReplicaLocations for this tablet using the new config.
6348
3
    VLOG(2) << "Updating replicas for tablet " << tablet->tablet_id()
6349
3
          << " using config reported by " << ts_desc->permanent_uuid()
6350
3
          << " to that committed in log index " << cstate.config().opid_index()
6351
3
          << " with leader state from term " << cstate.current_term();
6352
35.7k
    ReconcileTabletReplicasInLocalMemoryWithReport(
6353
35.7k
      tablet, ts_desc->permanent_uuid(), cstate, report);
6354
6355
    // 6d(iv). Update the consensus state. Don't use 'prev_cstate' after this.
6356
35.7k
    LOG(INFO) << "Tablet: " << tablet->tablet_id() << " reported consensus state change."
6357
35.7k
              << " New consensus state: " << cstate.ShortDebugString()
6358
35.7k
              << " from " << ts_desc->permanent_uuid();
6359
35.7k
    *tablet_lock.mutable_data()->pb.mutable_committed_consensus_state() = cstate;
6360
35.7k
    tablet_was_mutated = true;
6361
214k
  } else {
6362
    // The reported opid_index is equal to the previous opid_index. If some
6363
    // replica is reporting the same consensus configuration we already know about, but we
6364
    // haven't yet heard from all the tservers in the config, update the in-memory
6365
    // ReplicaLocations.
6366
214k
    LOG(INFO) << "Peer " << ts_desc->permanent_uuid() << " sent "
6367
213k
              << (is_incremental ? "incremental" : "full tablet")
6368
214k
              << " report for " << tablet->tablet_id()
6369
214k
              << ", prev state op id: " << prev_cstate.config().opid_index()
6370
214k
              << ", prev state term: " << prev_cstate.current_term()
6371
214k
              << ", prev state has_leader_uuid: " << prev_cstate.has_leader_uuid()
6372
214k
              << ". Consensus state: " << cstate.ShortDebugString();
6373
214k
    if (GetAtomicFlag(&FLAGS_enable_register_ts_from_raft) &&
6374
214k
        ReplicaMapDiffersFromConsensusState(tablet, cstate)) {
6375
28.5k
       ReconcileTabletReplicasInLocalMemoryWithReport(
6376
28.5k
         tablet, ts_desc->permanent_uuid(), cstate, report);
6377
185k
    } else {
6378
185k
      UpdateTabletReplicaInLocalMemory(ts_desc, &cstate, report, tablet);
6379
185k
    }
6380
214k
  }
6381
6382
250k
  if (FLAGS_use_create_table_leader_hint &&
6383
249k
      !cstate.has_leader_uuid() && cstate.current_term() == 0) {
6384
91.8k
    StartElectionIfReady(cstate, tablet.get());
6385
91.8k
  }
6386
6387
  // 7. Send an AlterSchema RPC if the tablet has an old schema version.
6388
250k
  if (report.has_schema_version() &&
6389
250k
      report.schema_version() != table_lock->pb.version()) {
6390
45
    if (report.schema_version() > table_lock->pb.version()) {
6391
0
      LOG(ERROR) << "TS " << ts_desc->permanent_uuid()
6392
0
                 << " has reported a schema version greater than the current one "
6393
0
                 << " for tablet " << tablet->ToString()
6394
0
                 << ". Expected version " << table_lock->pb.version()
6395
0
                 << " got " << report.schema_version()
6396
0
                 << " (corruption)";
6397
45
    } else {
6398
      // TODO: For Alter (rolling apply to tablets), this is an expected transitory state.
6399
45
      LOG(INFO) << "TS " << ts_desc->permanent_uuid()
6400
45
                << " does not have the latest schema for tablet " << tablet->ToString()
6401
45
                << ". Expected version " << table_lock->pb.version()
6402
45
                << " got " << report.schema_version();
6403
45
    }
6404
    // It's possible that the tablet being reported is a laggy replica, and in fact
6405
    // the leader has already received an AlterTable RPC. That's OK, though --
6406
    // it'll safely ignore it if we send another.
6407
45
    TransactionId txn_id = TransactionId::Nil();
6408
45
    if (table_lock->pb.has_transaction() &&
6409
8
        table_lock->pb.transaction().has_transaction_id()) {
6410
8
      LOG(INFO) << "Parsing transaction ID for tablet ID " << tablet->tablet_id();
6411
8
      auto txn_id_res = FullyDecodeTransactionId(table_lock->pb.transaction().transaction_id());
6412
8
      if (!txn_id_res.ok()) {
6413
0
        LOG(WARNING) << "Parsing transaction ID failed for tablet ID " << tablet->tablet_id();
6414
0
        return false;
6415
0
      }
6416
8
      txn_id = txn_id_res.get();
6417
8
    }
6418
45
    LOG(INFO) << "Triggering AlterTable with transaction ID " << txn_id
6419
45
              << " due to heartbeat delay for tablet ID " << tablet->tablet_id();
6420
45
    rpcs->push_back(std::make_shared<AsyncAlterTable>(
6421
45
        master_, AsyncTaskPool(), tablet, tablet->table(), txn_id));
6422
45
  }
6423
6424
250k
  return tablet_was_mutated;
6425
250k
}
6426
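ProcessTabletReportBatch below leans on the classic deadlock-avoidance rule spelled out in its first comment: take all locks in a single global order, here ascending id order, which an ordered std::map yields for free. A sketch of ordered acquisition over plain mutexes; two threads locking overlapping sets this way cannot deadlock, since neither can hold a later id while waiting on an earlier one.

  #include <map>
  #include <mutex>
  #include <set>
  #include <string>

  // Lock every mutex named in 'ids' in ascending id order and hand the guards
  // back to the caller, who releases them all by letting the map go out of scope.
  std::map<std::string, std::unique_lock<std::mutex>> LockAllInOrder(
      std::map<std::string, std::mutex>* mutexes,
      const std::set<std::string>& ids) {
    std::map<std::string, std::unique_lock<std::mutex>> held;
    for (const std::string& id : ids) {  // std::set iterates in ascending order
      held.emplace(id, std::unique_lock<std::mutex>(mutexes->at(id)));
    }
    return held;
  }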
6427
Status CatalogManager::ProcessTabletReportBatch(
6428
    TSDescriptor* ts_desc,
6429
    bool is_incremental,
6430
    ReportedTablets::const_iterator begin,
6431
    ReportedTablets::const_iterator end,
6432
    TabletReportUpdatesPB* full_report_update,
6433
259k
    std::vector<RetryingTSRpcTaskPtr>* rpcs) {
6434
  // 1. First Pass. Iterate in TabletId Order to discover all Table locks we'll need. Even though
6435
  //    read locks are sufficient here, take write locks since we'll be writing to the tablet while
6436
  //    holding this.
6437
  //    Need to acquire both types of locks in Id order to prevent deadlock.
6438
259k
  std::map<TableId, TableInfo::WriteLock> table_write_locks;
6439
518k
  for (auto it = begin; it != end; ++it) {
6440
258k
    auto& lock = table_write_locks[it->info->table()->id()];
6441
258k
    if (!lock.locked()) {
6442
258k
      lock = it->info->table()->LockForWrite();
6443
258k
    }
6444
258k
  }
6445
6446
259k
  map<TabletId, TabletInfo::WriteLock> tablet_write_locks; // used for unlock.
6447
  // 2. Second Pass.  Process each tablet. This may not be in the order that the tablets
6448
  // appear in 'full_report', but that has no bearing on correctness.
6449
259k
  vector<TabletInfo*> mutated_tablets; // refcount protected by 'tablet_infos'
6450
519k
  for (auto it = begin; it != end; ++it) {
6451
259k
    const auto& tablet_id = it->tablet_id;
6452
259k
    const TabletInfoPtr& tablet = it->info;
6453
259k
    const ReportedTabletPB& report = *it->report;
6454
259k
    const TableInfoPtr& table = tablet->table();
6455
6456
    // Prepare a heartbeat response entry for this tablet, now that we're going to process it.
6457
    // Every tablet in the report that is processed gets one, even if there are no changes to it.
6458
259k
    ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
6459
259k
    update->set_tablet_id(tablet_id);
6460
6461
    // Get tablet lock on demand.  This works in the batch case because the loop is ordered.
6462
259k
    tablet_write_locks[tablet_id] = tablet->LockForWrite();
6463
259k
    auto& table_lock = table_write_locks[table->id()];
6464
259k
    auto& tablet_lock = tablet_write_locks[tablet_id];
6465
6466
259k
    TRACE_EVENT1("master", "HandleReportedTablet", "tablet_id", report.tablet_id());
6467
259k
    RETURN_NOT_OK_PREPEND(CheckIsLeaderAndReady(),
6468
259k
        Substitute("This master is no longer the leader, unable to handle report for tablet $0",
6469
259k
            tablet_id));
6470
6471
18.4E
    VLOG(3) << "tablet report: " << report.ShortDebugString();
6472
6473
    // 3. Delete the tablet if it (or its table) has been deleted.
6474
259k
    if (tablet_lock->is_deleted() ||
6475
259k
        table_lock->started_deleting()) {
6476
244
      const string msg = tablet_lock->pb.state_msg();
6477
244
      update->set_state_msg(msg);
6478
244
      LOG(INFO) << "Got report from deleted tablet " << tablet->ToString()
6479
244
                << " (" << msg << "): Sending delete request for this tablet";
6480
      // TODO(unknown): Cancel tablet creation, instead of deleting, in cases
6481
      // where that might be possible (tablet creation timeout & replacement).
6482
244
      rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
6483
244
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
6484
244
          TABLET_DATA_DELETED, boost::none, msg));
6485
244
      ts_desc->AddPendingTabletDelete(tablet_id);
6486
244
      continue;
6487
244
    }
6488
6489
259k
    if (!table_lock->is_running()) {
6490
0
      const string msg = tablet_lock->pb.state_msg();
6491
0
      LOG(INFO) << "Got report from tablet " << tablet->tablet_id()
6492
0
                << " for non-running table " << table->ToString() << ": " << msg;
6493
0
      update->set_state_msg(msg);
6494
0
      continue;
6495
0
    }
6496
6497
    // 4. Tombstone a replica that is no longer part of the Raft config (and
6498
    // not already tombstoned or deleted outright).
6499
    //
6500
    // If the report includes a committed raft config, we only tombstone if
6501
    // the opid_index is strictly less than the latest reported committed
6502
    // config. This prevents us from spuriously deleting replicas that have
6503
    // just been added to the committed config and are in the process of copying.
6504
259k
    const ConsensusStatePB& prev_cstate = tablet_lock->pb.committed_consensus_state();
6505
259k
    const int64_t prev_opid_index = prev_cstate.config().opid_index();
6506
259k
    const int64_t report_opid_index = GetCommittedConsensusStateOpIdIndex(report);
6507
259k
    if (FLAGS_master_tombstone_evicted_tablet_replicas &&
6508
259k
        report.tablet_data_state() != TABLET_DATA_TOMBSTONED &&
6509
259k
        report.tablet_data_state() != TABLET_DATA_DELETED &&
6510
259k
        report_opid_index < prev_opid_index &&
6511
6.02k
        !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.config())) {
6512
173
      const string delete_msg = (report_opid_index == consensus::kInvalidOpIdIndex) ?
6513
16
          "Replica has no consensus available" :
6514
157
          Substitute("Replica with old config index $0", report_opid_index);
6515
173
      rpcs->push_back(std::make_shared<AsyncDeleteReplica>(
6516
173
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
6517
173
          TABLET_DATA_TOMBSTONED, prev_opid_index,
6518
173
          Substitute("$0 (current committed config index is $1)",
6519
173
              delete_msg, prev_opid_index)));
6520
173
      ts_desc->AddPendingTabletDelete(tablet_id);
6521
173
      continue;
6522
173
    }
6523
6524
    // 5. Skip a non-deleted tablet which reports an error.
6525
259k
    if (report.has_error()) {
6526
0
      Status s = StatusFromPB(report.error());
6527
0
      DCHECK(!s.ok());
6528
0
      DCHECK_EQ(report.state(), tablet::FAILED);
6529
0
      LOG(WARNING) << "Tablet " << tablet->ToString() << " has failed on TS "
6530
0
                   << ts_desc->permanent_uuid() << ": " << s.ToString();
6531
0
      continue;
6532
0
    }
6533
6534
    // Hide the tablet if it (or its table) has been hidden in the master but this replica is not hidden yet.
6535
259k
    if ((tablet_lock->is_hidden() ||
6536
259k
        table_lock->started_hiding()) &&
6537
0
        report.has_is_hidden() &&
6538
0
        !report.is_hidden()) {
6539
0
      const string msg = tablet_lock->pb.state_msg();
6540
0
      LOG(INFO) << "Got report from hidden tablet " << tablet->ToString()
6541
0
                << " (" << msg << "): Sending hide request for this tablet";
6542
0
      auto task = std::make_shared<AsyncDeleteReplica>(
6543
0
          master_, AsyncTaskPool(), ts_desc->permanent_uuid(), table, tablet_id,
6544
0
          TABLET_DATA_DELETED, boost::none, msg);
6545
0
      task->set_hide_only(true);
6546
0
      ts_desc->AddPendingTabletDelete(tablet_id);
6547
0
      rpcs->push_back(task);
6548
0
    }
6549
6550
    // 6. Process the report's consensus state.
6551
    // The report will not have a committed_consensus_state if it is in the
6552
    // middle of starting up, such as during tablet bootstrap.
6553
    // If we received an incremental report, and the tablet is starting up, we will update the
6554
    // replica so that the balancer knows how many tablets are in the middle of remote bootstrap.
6555
259k
    if (report.has_committed_consensus_state()) {
6556
257k
      if (ProcessCommittedConsensusState(
6557
131k
              ts_desc, is_incremental, report, table_lock, tablet, tablet_lock, rpcs)) {
6558
        // 7. If the tablet was mutated, add it to the tablets to be re-persisted.
6559
        //
6560
        // Done here and not on a per-mutation basis to avoid duplicate entries.
6561
131k
        mutated_tablets.push_back(tablet.get());
6562
131k
      }
6563
2.35k
    } else if (is_incremental &&
6564
2.29k
        (report.state() == tablet::NOT_STARTED || report.state() == tablet::BOOTSTRAPPING)) {
6565
      // When a tablet server is restarted, it sends a full tablet report with all of its tablets
6566
      // in the NOT_STARTED state, which would make the load balancer think that all the
6567
      // tablets are being remote bootstrapped at once. So only process incremental reports here.
6568
2.29k
      UpdateTabletReplicaInLocalMemory(ts_desc, nullptr /* consensus */, report, tablet);
6569
2.29k
    }
6570
259k
  } // Finished one round of batch processing.
6571
6572
  // 8. Unlock the tables; we no longer need to access their state.
6573
259k
  for (auto& l : table_write_locks) {
6574
259k
    l.second.Unlock();
6575
259k
  }
6576
259k
  table_write_locks.clear();
6577
6578
  // 9. Write all tablet mutations to the catalog table.
6579
  //
6580
  // SysCatalogTable::Write will short-circuit the case where the data has not
6581
  // in fact changed since the previous version and avoid any unnecessary mutations.
6582
259k
  if (!mutated_tablets.empty()) {
6583
131k
    Status s = sys_catalog_->Upsert(leader_ready_term(), mutated_tablets);
6584
131k
    if (!s.ok()) {
6585
0
      LOG(WARNING) << "Error updating tablets: " << s;
6586
0
      return s;
6587
0
    }
6588
259k
  }
6589
  // Filter the mutated tablets down to the ones relevant to system.partitions. The tablet state
6590
  // must actually be committed before updating the system.partitions table, so compute this list first.
6591
259k
  vector<TabletInfoPtr> yql_partitions_mutated_tablets =
6592
259k
      VERIFY_RESULT(GetYqlPartitionsVtable().FilterRelevantTablets(mutated_tablets));
6593
6594
  // 10. Publish the in-memory tablet mutations and release the locks.
6595
259k
  for (auto& l : tablet_write_locks) {
6596
259k
    l.second.Commit();
6597
259k
  }
6598
259k
  tablet_write_locks.clear();
6599
6600
  // Update the relevant tablet entries in system.partitions.
6601
259k
  if (!yql_partitions_mutated_tablets.empty()) {
6602
17.7k
    Status s = GetYqlPartitionsVtable()
6603
17.7k
        .ProcessMutatedTablets(yql_partitions_mutated_tablets, tablet_write_locks);
6604
17.7k
  }
6605
6606
  // 11. Third Pass. Process all tablet schema version changes.
6607
  // (This is separate from tablet state mutations because only table on-disk state is changed.)
6608
519k
  for (auto it = begin; it != end; ++it) {
6609
259k
    const ReportedTabletPB& report = *it->report;
6610
259k
    if (!report.has_schema_version()) {
6611
0
      continue;
6612
0
    }
6613
259k
    const TabletInfoPtr& tablet = it->info;
6614
259k
    auto leader = tablet->GetLeader();
6615
259k
    if (leader.ok() && leader.get()->permanent_uuid() == ts_desc->permanent_uuid()) {
6616
36.3k
      RETURN_NOT_OK(HandleTabletSchemaVersionReport(tablet.get(), report.schema_version()));
6617
36.3k
    }
6618
259k
  }
6619
6620
259k
  return Status::OK();
6621
259k
}
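
The two-pass locking above is the load-bearing detail: pass one discovers every table the batch touches and takes each write lock exactly once, in a globally consistent order, before pass two mutates any tablet. A minimal standalone sketch of the same deadlock-avoidance idiom, using plain std::mutex stand-ins rather than the real TableInfo/TabletInfo copy-on-write lock API:

#include <map>
#include <mutex>
#include <set>
#include <string>
#include <vector>

using TableId = std::string;

// Deadlock-free batch locking: collect the distinct table ids first, then
// acquire the per-table mutexes in sorted-id order. Since every thread uses
// the same global order, two batches can never wait on each other in a cycle.
std::vector<std::unique_lock<std::mutex>> LockTablesInIdOrder(
    const std::vector<TableId>& batch,
    std::map<TableId, std::mutex>& table_mutexes) {
  std::set<TableId> distinct(batch.begin(), batch.end());  // pass 1: discover
  std::vector<std::unique_lock<std::mutex>> held;
  held.reserve(distinct.size());
  for (const auto& id : distinct) {                        // pass 2: lock in id order
    held.emplace_back(table_mutexes.at(id));
  }
  return held;  // caller processes the batch; locks release on destruction
}

The tablet locks taken later in the function follow the same rule implicitly: the batch arrives pre-sorted by tablet_id, so on-demand acquisition still happens in one consistent global order.
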
6622
6623
Status CatalogManager::ProcessTabletReport(TSDescriptor* ts_desc,
6624
                                           const TabletReportPB& full_report,
6625
                                           TabletReportUpdatesPB* full_report_update,
6626
383k
                                           RpcContext* rpc) {
6627
383k
  int num_tablets = full_report.updated_tablets_size();
6628
383k
  TRACE_EVENT2("master", "ProcessTabletReport",
6629
383k
               "requestor", rpc->requestor_string(),
6630
383k
               "num_tablets", num_tablets);
6631
6632
566
  VLOG_WITH_PREFIX(2) << "Received tablet report from " << RequestorString(rpc) << "("
6633
566
                      << ts_desc->permanent_uuid() << "): " << full_report.DebugString();
6634
6635
383k
  if (!ts_desc->has_tablet_report() && full_report.is_incremental()) {
6636
5.45k
    LOG_WITH_PREFIX(WARNING)
6637
5.45k
        << "Invalid tablet report from " << ts_desc->permanent_uuid()
6638
5.45k
        << ": Received an incremental tablet report when a full one was needed";
6639
    // We should respond with success in order to send reply that we need full report.
6640
5.45k
    return Status::OK();
6641
5.45k
  }
6642
6643
  // TODO: on a full tablet report, we may want to iterate over the tablets we think
6644
  // the server should have, compare vs the ones being reported, and somehow mark
6645
  // any that have been "lost" (eg somehow the tablet metadata got corrupted or something).
6646
6647
377k
  ReportedTablets reported_tablets;
6648
6649
  // Tablet Deletes to process after the catalog lock below.
6650
377k
  set<TabletId> tablets_to_delete;
6651
6652
377k
  {
6653
    // Lock the catalog to iterate over tablet_ids_map_ & table_ids_map_.
6654
377k
    SharedLock lock(mutex_);
6655
6656
    // Fill the above variables before processing
6657
377k
    full_report_update->mutable_tablets()->Reserve(num_tablets);
6658
261k
    for (const ReportedTabletPB& report : full_report.updated_tablets()) {
6659
261k
      const string& tablet_id = report.tablet_id();
6660
6661
      // 1a. Find the tablet, deleting/skipping it if it can't be found.
6662
261k
      scoped_refptr<TabletInfo> tablet = FindPtrOrNull(*tablet_map_, tablet_id);
6663
261k
      if (!tablet) {
6664
        // If a TS reported an unknown tablet, send a delete tablet rpc to the TS.
6665
0
        LOG(INFO) << "Null tablet reported, possibly the TS was not around when the"
6666
0
                      " table was being deleted. Sending Delete tablet RPC to this TS.";
6667
0
        tablets_to_delete.insert(tablet_id);
6668
        // Every tablet in the report that is processed gets a heartbeat response entry.
6669
0
        ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
6670
0
        update->set_tablet_id(tablet_id);
6671
0
        continue;
6672
0
      }
6673
261k
      if (!tablet->table() || FindOrNull(*table_ids_map_, tablet->table()->id()) == nullptr) {
6674
0
        auto table_id = tablet->table() == nullptr ? "(null)" : tablet->table()->id();
6675
0
        LOG(INFO) << "Got report from an orphaned tablet " << tablet_id << " on table " << table_id;
6676
0
        tablets_to_delete.insert(tablet_id);
6677
        // Every tablet in the report that is processed gets a heartbeat response entry.
6678
0
        ReportedTabletUpdatesPB* update = full_report_update->add_tablets();
6679
0
        update->set_tablet_id(tablet_id);
6680
0
        continue;
6681
0
      }
6682
6683
      // 1b. Found the tablet, update local state.
6684
261k
      reported_tablets.push_back(ReportedTablet {
6685
261k
        .tablet_id = tablet_id,
6686
261k
        .info = tablet,
6687
261k
        .report = &report,
6688
261k
      });
6689
261k
    }
6690
377k
  }
6691
6692
122k
  std::sort(reported_tablets.begin(), reported_tablets.end(), [](const auto& lhs, const auto& rhs) {
6693
122k
    return lhs.tablet_id < rhs.tablet_id;
6694
122k
  });
6695
6696
  // Process any delete requests from orphaned tablets, identified above.
6697
0
  for (auto tablet_id : tablets_to_delete) {
6698
0
    SendDeleteTabletRequest(tablet_id, TABLET_DATA_DELETED, boost::none, nullptr, ts_desc,
6699
0
        "Report from an orphaned tablet");
6700
0
  }
6701
6702
  // Calculate the deadline for this expensive loop coming up.
6703
377k
  const auto safe_deadline = rpc->GetClientDeadline() -
6704
377k
    (FLAGS_heartbeat_rpc_timeout_ms * 1ms * FLAGS_heartbeat_safe_deadline_ratio);
6705
6706
  // Process tablets by batches.
6707
636k
  for (auto tablet_iter = reported_tablets.begin(); tablet_iter != reported_tablets.end();) {
6708
259k
    auto batch_begin = tablet_iter;
6709
259k
    tablet_iter += std::min<size_t>(
6710
259k
        reported_tablets.end() - tablet_iter, FLAGS_catalog_manager_report_batch_size);
6711
6712
    // Keeps track of all RPCs that should be sent when we're done with a single batch.
6713
259k
    std::vector<RetryingTSRpcTaskPtr> rpcs;
6714
259k
    auto status = ProcessTabletReportBatch(
6715
259k
        ts_desc, full_report.is_incremental(), batch_begin, tablet_iter, full_report_update, &rpcs);
6716
259k
    if (!status.ok()) {
6717
0
      for (auto& rpc : rpcs) {
6718
0
        rpc->AbortAndReturnPrevState(status);
6719
0
      }
6720
0
      return status;
6721
0
    }
6722
6723
    // 13. Send all queued RPCs.
6724
259k
    for (auto& rpc : rpcs) {
6725
1.28k
      DCHECK(rpc->table());
6726
1.28k
      rpc->table()->AddTask(rpc);
6727
1.28k
      WARN_NOT_OK(ScheduleTask(rpc), Substitute("Failed to send $0", rpc->description()));
6728
1.28k
    }
6729
259k
    rpcs.clear();
6730
6731
    // 14. Check deadline. Need to exit before processing all batches if we're close to timing out.
6732
259k
    if (ts_desc->HasCapability(CAPABILITY_TabletReportLimit) &&
6733
259k
        tablet_iter != reported_tablets.end()) {
6734
      // [TESTING] Inject latency before processing a batch to test deadline.
6735
91.1k
      if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_tablet_report_ms > 0)) {
6736
0
        LOG(INFO) << "Sleeping in CatalogManager::ProcessTabletReport for "
6737
0
                  << FLAGS_TEST_inject_latency_during_tablet_report_ms << " ms";
6738
0
        SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_inject_latency_during_tablet_report_ms));
6739
0
      }
6740
6741
      // Return from here at the configured safe heartbeat deadline to give the response packet time to be sent.
6742
91.1k
      if (safe_deadline < CoarseMonoClock::Now()) {
6743
842
        LOG(INFO) << "Reached Heartbeat deadline. Returning early after processing "
6744
842
                  << full_report_update->tablets_size() << " tablets";
6745
842
        full_report_update->set_processing_truncated(true);
6746
842
        return Status::OK();
6747
842
      }
6748
91.1k
    }
6749
259k
  } // Loop to process the next batch until fully iterated.
6750
6751
377k
  if (!full_report.is_incremental()) {
6752
    // A full report may take multiple heartbeats.
6753
    // The TS communicates how much is left to process for the full report beyond this specific HB.
6754
5.58k
    bool completed_full_report = !full_report.has_remaining_tablet_count()
6755
5.58k
                               || full_report.remaining_tablet_count() == 0;
6756
5.58k
    if (full_report.updated_tablets_size() == 0) {
6757
5.44k
      LOG(INFO) << ts_desc->permanent_uuid() << " sent full tablet report with 0 tablets.";
6758
137
    } else if (!ts_desc->has_tablet_report()) {
6759
137
      LOG(INFO) << ts_desc->permanent_uuid()
6760
137
                << (completed_full_report ? " finished" : " receiving") << " first full report: "
6761
137
                << full_report.updated_tablets_size() << " tablets.";
6762
137
    }
6763
    // We have a tablet report only once we're done processing all the chunks of the initial report.
6764
5.58k
    ts_desc->set_has_tablet_report(completed_full_report);
6765
5.58k
  }
6766
6767
  // 15. Queue background processing if we had updates.
6768
377k
  if (full_report.updated_tablets_size() > 0) {
6769
168k
    background_tasks_->WakeIfHasPendingUpdates();
6770
168k
  }
6771
6772
377k
  return Status::OK();
6773
377k
}
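
The outer loop above is a reusable shape: chunk a large work list, flush per-chunk side effects (the queued RPCs), and respect a soft deadline so the heartbeat response still goes out before the client times out. A hedged sketch under assumed names (ProcessInBatches and kBatchSize are illustrative stand-ins, not the real API):

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <vector>

constexpr std::size_t kBatchSize = 512;  // stands in for FLAGS_catalog_manager_report_batch_size

// Returns true if everything was processed, false if we stopped early at the
// deadline; the caller then marks the response as truncated so the sender
// retries with the remainder on the next heartbeat.
template <typename T, typename Fn>
bool ProcessInBatches(std::vector<T>& items, Fn process_batch,
                      std::chrono::steady_clock::time_point safe_deadline) {
  for (auto it = items.begin(); it != items.end();) {
    const auto remaining = static_cast<std::size_t>(items.end() - it);
    const auto batch_end = it + std::min(remaining, kBatchSize);
    process_batch(it, batch_end);  // e.g. process one batch, then send its RPCs
    it = batch_end;
    if (it != items.end() && std::chrono::steady_clock::now() > safe_deadline) {
      return false;  // mirrors full_report_update->set_processing_truncated(true)
    }
  }
  return true;
}
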
6774
6775
Status CatalogManager::CreateTablegroup(const CreateTablegroupRequestPB* req,
6776
                                        CreateTablegroupResponsePB* resp,
6777
3
                                        rpc::RpcContext* rpc) {
6778
6779
3
  CreateTableRequestPB ctreq;
6780
3
  CreateTableResponsePB ctresp;
6781
6782
  // Sanity check for PB fields.
6783
3
  if (!req->has_id() || !req->has_namespace_id() || !req->has_namespace_name()) {
6784
0
    Status s = STATUS(InvalidArgument, "Improper CREATE TABLEGROUP request (missing fields).");
6785
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
6786
0
  }
6787
6788
  // Use the tablegroup id as the prefix for the parent table id.
6789
3
  const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix;
6790
3
  const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix;
6791
3
  ctreq.set_name(parent_table_name);
6792
3
  ctreq.set_table_id(parent_table_id);
6793
3
  ctreq.mutable_namespace_()->set_name(req->namespace_name());
6794
3
  ctreq.mutable_namespace_()->set_id(req->namespace_id());
6795
3
  ctreq.set_table_type(PGSQL_TABLE_TYPE);
6796
3
  ctreq.set_tablegroup_id(req->id());
6797
3
  ctreq.set_tablespace_id(req->tablespace_id());
6798
6799
3
  YBSchemaBuilder schemaBuilder;
6800
3
  schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull();
6801
3
  YBSchema ybschema;
6802
3
  CHECK_OK(schemaBuilder.Build(&ybschema));
6803
3
  auto schema = yb::client::internal::GetSchema(ybschema);
6804
3
  SchemaToPB(schema, ctreq.mutable_schema());
6805
3
  if (!FLAGS_TEST_tablegroup_master_only) {
6806
2
    ctreq.mutable_schema()->mutable_table_properties()->set_is_transactional(true);
6807
2
  }
6808
6809
  // Create a parent table, which will create the tablet.
6810
3
  Status s = CreateTable(&ctreq, &ctresp, rpc);
6811
3
  resp->set_parent_table_id(ctresp.table_id());
6812
3
  resp->set_parent_table_name(parent_table_name);
6813
6814
  // Carry over error.
6815
3
  if (ctresp.has_error()) {
6816
0
    resp->mutable_error()->Swap(ctresp.mutable_error());
6817
0
  }
6818
6819
  // We do not lock here so it is technically possible that the table was already created.
6820
  // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
6821
3
  if (!s.ok() && !s.IsAlreadyPresent()) {
6822
0
    LOG(WARNING) << "Tablegroup creation failed: " << s.ToString();
6823
0
    return s;
6824
0
  }
6825
6826
  // Update catalog manager maps
6827
3
  SharedLock lock(mutex_);
6828
3
  TRACE("Acquired catalog manager lock");
6829
3
  TablegroupInfo *tg = new TablegroupInfo(req->id(), req->namespace_id());
6830
3
  tablegroup_ids_map_[req->id()] = tg;
6831
6832
3
  return s;
6833
3
}
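
CreateTablegroup is deliberately thin: it derives a deterministic parent-table id and name from the tablegroup id and delegates the heavy lifting to CreateTable. A sketch of that derivation; the concrete suffix strings below are assumptions for illustration, the real constants live in the master headers:

#include <string>

// Assumed suffix values, chosen here only to make the sketch self-contained.
const std::string kTablegroupParentTableIdSuffix = ".tablegroup.parent.uuid";
const std::string kTablegroupParentTableNameSuffix = ".tablegroup.parent.tablename";

// The parent table id/name are pure functions of the tablegroup id, which is
// why CreateTablegroup and DeleteTablegroup can each re-derive them on demand.
std::string ParentTableId(const std::string& tablegroup_id) {
  return tablegroup_id + kTablegroupParentTableIdSuffix;
}

std::string ParentTableName(const std::string& tablegroup_id) {
  return tablegroup_id + kTablegroupParentTableNameSuffix;
}
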
6834
6835
Status CatalogManager::DeleteTablegroup(const DeleteTablegroupRequestPB* req,
6836
                                        DeleteTablegroupResponsePB* resp,
6837
2
                                        rpc::RpcContext* rpc) {
6838
2
  DeleteTableRequestPB dtreq;
6839
2
  DeleteTableResponsePB dtresp;
6840
6841
  // Sanity check for PB fields
6842
2
  if (!req->has_id() || !req->has_namespace_id()) {
6843
0
    Status s = STATUS(InvalidArgument, "Improper DELETE TABLEGROUP request (missing fields).");
6844
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
6845
0
  }
6846
6847
  // Use the tablegroup id as the prefix for the parent table id.
6848
2
  const auto parent_table_id = req->id() + kTablegroupParentTableIdSuffix;
6849
2
  const auto parent_table_name = req->id() + kTablegroupParentTableNameSuffix;
6850
6851
2
  dtreq.mutable_table()->set_table_name(parent_table_name);
6852
2
  dtreq.mutable_table()->set_table_id(parent_table_id);
6853
2
  dtreq.set_is_index_table(false);
6854
6855
2
  Status s = DeleteTable(&dtreq, &dtresp, rpc);
6856
2
  resp->set_parent_table_id(dtresp.table_id());
6857
6858
  // Carry over error.
6859
2
  if (dtresp.has_error()) {
6860
0
    resp->mutable_error()->Swap(dtresp.mutable_error());
6861
0
    return s;
6862
0
  }
6863
6864
  // Perform map updates.
6865
2
  SharedLock lock(mutex_);
6866
2
  TRACE("Acquired catalog manager lock");
6867
2
  tablegroup_ids_map_.erase(req->id());
6868
2
  tablegroup_tablet_ids_map_[req->namespace_id()].erase(req->id());
6869
6870
2
  LOG(INFO) << "Deleted table " << parent_table_name;
6871
2
  return s;
6872
2
}
6873
6874
Status CatalogManager::ListTablegroups(const ListTablegroupsRequestPB* req,
6875
                                       ListTablegroupsResponsePB* resp,
6876
3
                                       rpc::RpcContext* rpc) {
6877
3
  SharedLock lock(mutex_);
6878
6879
3
  if (!req->has_namespace_id()) {
6880
0
    Status s = STATUS(InvalidArgument, "Improper ListTablegroups request (missing fields).");
6881
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s);
6882
0
  }
6883
6884
3
  if (tablegroup_tablet_ids_map_.find(req->namespace_id()) == tablegroup_tablet_ids_map_.end()) {
6885
0
    return STATUS(NotFound, "Tablegroups not found for namespace id: ", req->namespace_id());
6886
0
  }
6887
6888
3
  for (const auto& entry : tablegroup_tablet_ids_map_[req->namespace_id()]) {
6889
3
    const TablegroupId tgid = entry.first;
6890
3
    if (tablegroup_ids_map_.find(tgid) == tablegroup_ids_map_.end()) {
6891
0
      LOG(WARNING) << "Tablegroup info in " << req->namespace_id()
6892
0
                   << " not found for tablegroup id: " << tgid;
6893
0
      continue;
6894
0
    }
6895
3
    scoped_refptr<TablegroupInfo> tginfo = tablegroup_ids_map_[tgid];
6896
6897
3
    TablegroupIdentifierPB *tg = resp->add_tablegroups();
6898
3
    tg->set_id(tginfo->id());
6899
3
    tg->set_namespace_id(tginfo->namespace_id());
6900
3
  }
6901
3
  return Status::OK();
6902
3
}
6903
6904
1
bool CatalogManager::HasTablegroups() {
6905
1
  SharedLock lock(mutex_);
6906
1
  return !tablegroup_ids_map_.empty();
6907
1
}
6908
6909
Status CatalogManager::CreateNamespace(const CreateNamespaceRequestPB* req,
6910
                                       CreateNamespaceResponsePB* resp,
6911
2.08k
                                       rpc::RpcContext* rpc) {
6912
2.08k
  Status return_status;
6913
6914
  // Copy the request, so we can fill in some defaults.
6915
2.08k
  LOG(INFO) << "CreateNamespace from " << RequestorString(rpc)
6916
2.08k
            << ": " << req->DebugString();
6917
6918
2.08k
  scoped_refptr<NamespaceInfo> ns;
6919
2.08k
  std::vector<scoped_refptr<TableInfo>> pgsql_tables;
6920
2.08k
  TransactionMetadata txn;
6921
2.08k
  const auto db_type = GetDatabaseType(*req);
6922
2.08k
  {
6923
2.08k
    LockGuard lock(mutex_);
6924
2.08k
    TRACE("Acquired catalog manager lock");
6925
6926
    // Validate the user request.
6927
6928
    // Verify that the namespace does not already exist.
6929
2.08k
    ns = FindPtrOrNull(namespace_ids_map_, req->namespace_id()); // Same ID.
6930
2.08k
    if (ns == nullptr && db_type != YQL_DATABASE_PGSQL) {
6931
      // PGSQL databases have name uniqueness handled at a different layer, so ignore overlaps.
6932
2.00k
      ns = FindPtrOrNull(namespace_names_mapper_[db_type], req->name());
6933
2.00k
    }
6934
2.08k
    if (ns != nullptr) {
6935
4
      resp->set_id(ns->id());
6936
4
      return_status = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists",
6937
4
                                        req->name());
6938
4
      LOG(WARNING) << "Found keyspace: " << ns->id() << ". Failed creating keyspace with error: "
6939
4
                   << return_status.ToString() << " Request:\n" << req->DebugString();
6940
4
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_ALREADY_PRESENT,
6941
4
                        return_status);
6942
4
    }
6943
6944
    // Add the new namespace.
6945
6946
    // Create unique id for this new namespace.
6947
2.07k
    NamespaceId new_id = !req->namespace_id().empty()
6948
2.03k
        ? req->namespace_id() : GenerateIdUnlocked(SysRowEntryType::NAMESPACE);
6949
2.07k
    ns = new NamespaceInfo(new_id);
6950
2.07k
    ns->mutable_metadata()->StartMutation();
6951
2.07k
    SysNamespaceEntryPB *metadata = &ns->mutable_metadata()->mutable_dirty()->pb;
6952
2.07k
    metadata->set_name(req->name());
6953
2.07k
    metadata->set_database_type(db_type);
6954
2.07k
    metadata->set_colocated(req->colocated());
6955
2.07k
    metadata->set_state(SysNamespaceEntryPB::PREPARING);
6956
6957
    // For a namespace created for a Postgres database, save the list of tables and indexes for
6958
    // the database that need to be copied.
6959
2.07k
    if (db_type == YQL_DATABASE_PGSQL) {
6960
81
      if (req->source_namespace_id().empty()) {
6961
59
        metadata->set_next_pg_oid(req->next_pg_oid());
6962
22
      } else {
6963
22
        const auto source_oid = GetPgsqlDatabaseOid(req->source_namespace_id());
6964
22
        if (!source_oid.ok()) {
6965
0
          return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
6966
0
                            source_oid.status());
6967
0
        }
6968
13.1k
        for (const auto& iter : *table_ids_map_) {
6969
13.1k
          const auto& table_id = iter.first;
6970
13.1k
          const auto& table = iter.second;
6971
13.1k
          if (IsPgsqlId(table_id) && CHECK_RESULT(GetPgsqlDatabaseOid(table_id)) == *source_oid) {
6972
            // Since indexes have dependencies on the base tables, put the tables in the front.
6973
2.81k
            const bool is_table = table->indexed_table_id().empty();
6974
1.58k
            pgsql_tables.insert(is_table ? pgsql_tables.begin() : pgsql_tables.end(), table);
6975
2.81k
          }
6976
13.1k
        }
6977
6978
22
        scoped_refptr<NamespaceInfo> source_ns = FindPtrOrNull(namespace_ids_map_,
6979
22
                                                               req->source_namespace_id());
6980
22
        if (!source_ns) {
6981
0
          return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND,
6982
0
                            STATUS(NotFound, "Source keyspace not found",
6983
0
                                   req->source_namespace_id()));
6984
0
        }
6985
22
        auto source_ns_lock = source_ns->LockForRead();
6986
22
        metadata->set_next_pg_oid(source_ns_lock->pb.next_pg_oid());
6987
22
      }
6988
81
    }
6989
6990
    // NS with a Transaction should be rolled back if the transaction does not get Committed.
6991
    // Store this on the NS for now and use it later.
6992
2.07k
    if (req->has_transaction() && PREDICT_TRUE(FLAGS_enable_transactional_ddl_gc)) {
6993
22
      metadata->mutable_transaction()->CopyFrom(req->transaction());
6994
22
      txn = VERIFY_RESULT(TransactionMetadata::FromPB(req->transaction()));
6995
22
      RSTATUS_DCHECK(!txn.status_tablet.empty(), Corruption, "Given incomplete Transaction");
6996
22
    }
6997
6998
    // Add the namespace to the in-memory map for the assignment.
6999
2.07k
    namespace_ids_map_[ns->id()] = ns;
7000
2.07k
    namespace_names_mapper_[db_type][req->name()] = ns;
7001
7002
2.07k
    resp->set_id(ns->id());
7003
2.07k
  }
7004
2.07k
  TRACE("Inserted new keyspace info into CatalogManager maps");
7005
7006
  // Update the on-disk system catalog.
7007
2.07k
  return_status = sys_catalog_->Upsert(leader_ready_term(), ns);
7008
2.07k
  if (!return_status.ok()) {
7009
8
    LOG(WARNING) << "Keyspace creation failed: " << return_status.ToString();
7010
8
    {
7011
8
      LockGuard lock(mutex_);
7012
8
      namespace_ids_map_.erase(ns->id());
7013
8
      namespace_names_mapper_[db_type].erase(req->name());
7014
8
    }
7015
8
    ns->mutable_metadata()->AbortMutation();
7016
8
    return CheckIfNoLongerLeaderAndSetupError(return_status, resp);
7017
8
  }
7018
2.07k
  TRACE("Wrote keyspace to sys-catalog");
7019
  // Commit the namespace in-memory state.
7020
2.07k
  ns->mutable_metadata()->CommitMutation();
7021
7022
2.07k
  LOG(INFO) << "Created keyspace " << ns->ToString();
7023
7024
2.07k
  if (req->has_creator_role_name()) {
7025
904
    RETURN_NOT_OK(permissions_manager_->GrantPermissions(
7026
904
        req->creator_role_name(),
7027
904
        get_canonical_keyspace(req->name()),
7028
904
        req->name() /* resource name */,
7029
904
        req->name() /* keyspace name */,
7030
904
        all_permissions_for_resource(ResourceType::KEYSPACE),
7031
904
        ResourceType::KEYSPACE,
7032
904
        resp));
7033
904
  }
7034
7035
  // Colocated databases need to create a parent tablet to serve as the base storage location.
7036
2.07k
  if (req->colocated()) {
7037
6
    CreateTableRequestPB req;
7038
6
    CreateTableResponsePB resp;
7039
6
    const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix;
7040
6
    const auto parent_table_name = ns->id() + kColocatedParentTableNameSuffix;
7041
6
    req.set_name(parent_table_name);
7042
6
    req.set_table_id(parent_table_id);
7043
6
    req.mutable_namespace_()->set_name(ns->name());
7044
6
    req.mutable_namespace_()->set_id(ns->id());
7045
6
    req.set_table_type(GetTableTypeForDatabase(ns->database_type()));
7046
6
    req.set_colocated(true);
7047
7048
6
    YBSchemaBuilder schemaBuilder;
7049
6
    schemaBuilder.AddColumn("parent_column")->Type(BINARY)->PrimaryKey()->NotNull();
7050
6
    YBSchema ybschema;
7051
6
    CHECK_OK(schemaBuilder.Build(&ybschema));
7052
6
    auto schema = yb::client::internal::GetSchema(ybschema);
7053
6
    SchemaToPB(schema, req.mutable_schema());
7054
6
    req.mutable_schema()->mutable_table_properties()->set_is_transactional(true);
7055
7056
    // Create a parent table, which will create the tablet.
7057
6
    Status s = CreateTable(&req, &resp, rpc);
7058
    // We do not lock here so it is technically possible that the table was already created.
7059
    // If so, there is nothing to do so we just ignore the "AlreadyPresent" error.
7060
6
    if (!s.ok() && !s.IsAlreadyPresent()) {
7061
0
      LOG(WARNING) << "Keyspace creation failed: " << s.ToString();
7062
      // TODO: We should verify this behavior works end-to-end.
7063
      // Diverging in-memory state from disk so the user can issue a delete if no new leader.
7064
0
      auto l = ns->LockForWrite();
7065
0
      SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7066
0
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7067
0
      l.Commit();
7068
0
      return s;
7069
0
    }
7070
2.07k
  }
7071
7072
2.07k
  if ((db_type == YQL_DATABASE_PGSQL && !pgsql_tables.empty()) ||
7073
2.04k
      PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) {
7074
    // Process the subsequent work in the background thread (normally PGSQL).
7075
24
    LOG(INFO) << "Keyspace create enqueued for later processing: " << ns->ToString();
7076
24
    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
7077
24
        std::bind(&CatalogManager::ProcessPendingNamespace, this, ns->id(), pgsql_tables, txn)));
7078
24
    return Status::OK();
7079
2.04k
  } else {
7080
    // All work is done, it's now safe to online the namespace (normally YQL).
7081
2.04k
    auto l = ns->LockForWrite();
7082
2.04k
    SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7083
2.04k
    if (metadata.state() == SysNamespaceEntryPB::PREPARING) {
7084
2.04k
      metadata.set_state(SysNamespaceEntryPB::RUNNING);
7085
2.04k
      return_status = sys_catalog_->Upsert(leader_ready_term(), ns);
7086
2.04k
      if (!return_status.ok()) {
7087
        // Diverging in-memory state from disk so the user can issue a delete if no new leader.
7088
2
        LOG(WARNING) << "Keyspace creation failed: " << return_status.ToString();
7089
2
        metadata.set_state(SysNamespaceEntryPB::FAILED);
7090
2
        return_status = CheckIfNoLongerLeaderAndSetupError(return_status, resp);
7091
2.04k
      } else {
7092
2.04k
        TRACE("Activated keyspace in sys-catalog");
7093
2.04k
        LOG(INFO) << "Activated keyspace: " << ns->ToString();
7094
2.04k
      }
7095
      // Commit the namespace in-memory state.
7096
2.04k
      l.Commit();
7097
0
    } else {
7098
0
      LOG(WARNING) << "Keyspace has invalid state (" << metadata.state() << "), aborting create";
7099
0
    }
7100
2.04k
  }
7101
2.04k
  return return_status;
7102
2.07k
}
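
The mutation lifecycle CreateNamespace leans on (StartMutation, edit the dirty protobuf, persist, then CommitMutation or AbortMutation) is a copy-on-write pattern: readers keep seeing the committed copy until the writer publishes. A simplified stand-in, not the real CowObject API:

#include <mutex>
#include <string>

struct Metadata {
  std::string name;
  std::string state;
};

class CowMetadata {
 public:
  void StartMutation() { dirty_ = committed_; }       // writer edits a copy
  Metadata& mutable_dirty() { return dirty_; }
  void CommitMutation() {                             // publish atomically
    std::lock_guard<std::mutex> g(read_mutex_);
    committed_ = dirty_;
  }
  void AbortMutation() { dirty_ = committed_; }       // discard failed edits
  Metadata Read() const {                             // readers see committed state only
    std::lock_guard<std::mutex> g(read_mutex_);
    return committed_;
  }

 private:
  mutable std::mutex read_mutex_;
  Metadata committed_;
  Metadata dirty_;
};

CreateNamespace follows exactly this shape: mutate the dirty copy, Upsert it to the sys-catalog, then CommitMutation on success or AbortMutation plus map cleanup on failure, so a sys-catalog write error never leaks a half-created namespace to readers.
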
7103
7104
void CatalogManager::ProcessPendingNamespace(
7105
    NamespaceId id,
7106
    std::vector<scoped_refptr<TableInfo>> template_tables,
7107
25
    TransactionMetadata txn) {
7108
25
  LOG(INFO) << "ProcessPendingNamespace started for " << id;
7109
7110
  // Ensure that we are currently the Leader before handling DDL operations.
7111
25
  {
7112
25
    SCOPED_LEADER_SHARED_LOCK(l, this);
7113
25
    if (!l.catalog_status().ok()) {
7114
0
      LOG(WARNING) << "Catalog status failure: " << l.catalog_status().ToString();
7115
      // Don't try again, we have to reset in-memory state after losing leader election.
7116
0
      return;
7117
0
    }
7118
25
    if (!l.leader_status().ok()) {
7119
0
      LOG(WARNING) << "Leader status failure: " << l.leader_status().ToString();
7120
      // Don't try again, we have to reset in-memory state after losing leader election.
7121
0
      return;
7122
0
    }
7123
25
  }
7124
7125
25
  if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_hang_on_namespace_transition))) {
7126
2
    LOG(INFO) << "Artificially waiting (" << FLAGS_catalog_manager_bg_task_wait_ms
7127
2
              << "ms) on namespace creation for " << id;
7128
2
    SleepFor(MonoDelta::FromMilliseconds(FLAGS_catalog_manager_bg_task_wait_ms));
7129
2
    WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
7130
2
        std::bind(&CatalogManager::ProcessPendingNamespace, this, id, template_tables, txn)),
7131
2
        "Could not submit ProcessPendingNamespaces to thread pool");
7132
2
    return;
7133
2
  }
7134
7135
23
  scoped_refptr<NamespaceInfo> ns;
7136
23
  {
7137
23
    LockGuard lock(mutex_);
7138
23
    ns = FindPtrOrNull(namespace_ids_map_, id);;
7139
23
  }
7140
23
  if (ns == nullptr) {
7141
0
    LOG(WARNING) << "Pending Namespace not found to finish creation: " << id;
7142
0
    return;
7143
0
  }
7144
7145
  // Copy the system tables necessary to create this namespace.  This can be time-intensive.
7146
23
  bool success = true;
7147
23
  if (!template_tables.empty()) {
7148
22
    auto s = CopyPgsqlSysTables(ns->id(), template_tables);
7149
22
    WARN_NOT_OK(s, "Error Copying PGSQL System Tables for Pending Namespace");
7150
22
    success = s.ok();
7151
22
  }
7152
7153
  // All work is done, change the namespace state regardless of success or failure.
7154
23
  {
7155
23
    auto l = ns->LockForWrite();
7156
23
    SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7157
23
    if (metadata.state() == SysNamespaceEntryPB::PREPARING) {
7158
22
      metadata.set_state(success ? SysNamespaceEntryPB::RUNNING : SysNamespaceEntryPB::FAILED);
7159
23
      auto s = sys_catalog_->Upsert(leader_ready_term(), ns);
7160
23
      if (s.ok()) {
7161
22
        TRACE("Done processing keyspace");
7162
22
        LOG(INFO) << (success ? "Processed" : "Failed") << " keyspace: " << ns->ToString();
7163
7164
        // Verify Transaction gets committed, which occurs after namespace create finishes.
7165
22
        if (success && metadata.has_transaction()) {
7166
21
          LOG(INFO) << "Enqueuing keyspace for Transaction Verification: " << ns->ToString();
7167
21
          std::function<Status(bool)> when_done =
7168
21
              std::bind(&CatalogManager::VerifyNamespacePgLayer, this, ns, _1);
7169
21
          WARN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
7170
21
              std::bind(&YsqlTransactionDdl::VerifyTransaction, ysql_transaction_.get(),
7171
21
                        txn, when_done)),
7172
21
              "Could not submit VerifyTransaction to thread pool");
7173
21
        }
7174
1
      } else {
7175
1
        metadata.set_state(SysNamespaceEntryPB::FAILED);
7176
1
        if (s.IsIllegalState() || s.IsAborted()) {
7177
0
          s = STATUS(ServiceUnavailable,
7178
0
              "operation requested can only be executed on a leader master, but this"
7179
0
              " master is no longer the leader", s.ToString());
7180
1
        } else {
7181
1
          s = s.CloneAndPrepend(Substitute(
7182
1
              "An error occurred while modifying keyspace to $0 in sys-catalog: $1",
7183
1
              metadata.state(), s.ToString()));
7184
1
        }
7185
1
        LOG(WARNING) << s.ToString();
7186
1
      }
7187
      // Commit the namespace in-memory state.
7188
23
      l.Commit();
7189
0
    } else {
7190
0
      LOG(WARNING) << "Bad keyspace state (" << metadata.state()
7191
0
                   << "), abandoning creation work for " << ns->ToString();
7192
0
    }
7193
23
  }
7194
23
}
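
The retry-by-resubmission idiom at the top of ProcessPendingNamespace (sleep briefly, then re-enqueue the same call instead of blocking a pool thread) generalizes well. A small sketch; the ThreadPool here is a stand-in whose SubmitFunc merely mimics the shape of the real thread-pool interface, and `pool` is assumed to outlive the work:

#include <chrono>
#include <functional>
#include <string>
#include <thread>

struct ThreadPool {
  // Stand-in: run the callback on some worker thread.
  void SubmitFunc(std::function<void()> fn) { std::thread(std::move(fn)).detach(); }
};

void ProcessPending(ThreadPool* pool, std::string id, int wait_ms, bool blocked) {
  if (blocked) {
    // Cannot make progress yet: park briefly, then re-enqueue the same work
    // rather than holding a pool thread hostage.
    std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    pool->SubmitFunc([pool, id, wait_ms] {
      ProcessPending(pool, id, wait_ms, /* blocked = */ false);
    });
    return;
  }
  // ... do the actual namespace work for `id` ...
}
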
7195
7196
Status CatalogManager::VerifyNamespacePgLayer(
7197
21
    scoped_refptr<NamespaceInfo> ns, bool rpc_success) {
7198
  // Upon Transaction completion, check pg system table using OID to ensure SUCCESS.
7199
21
  const auto pg_table_id = GetPgsqlTableId(atoi(kSystemNamespaceId), kPgDatabaseTableOid);
7200
21
  auto entry_exists = VERIFY_RESULT(
7201
21
      ysql_transaction_->PgEntryExists(pg_table_id, GetPgsqlDatabaseOid(ns->id())));
7202
21
  auto l = ns->LockForWrite();
7203
21
  SysNamespaceEntryPB& metadata = ns->mutable_metadata()->mutable_dirty()->pb;
7204
7205
  // #5981: Mark un-retryable rpc failures as pass to avoid infinite retry of GC'd txns.
7206
21
  bool txn_check_passed = entry_exists || !rpc_success;
7207
7208
21
  if (txn_check_passed) {
7209
    // Passed checks.  Remove the transaction from the entry since we're done processing it.
7210
21
    SCHECK_EQ(metadata.state(), SysNamespaceEntryPB::RUNNING, Aborted,
7211
21
              Substitute("Invalid Namespace state ($0), abandoning transaction GC work for $1",
7212
21
                 SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString()));
7213
20
    metadata.clear_transaction();
7214
20
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns));
7215
20
    if (entry_exists) {
7216
20
      LOG(INFO) << "Namespace transaction succeeded: " << ns->ToString();
7217
0
    } else {
7218
0
      LOG(WARNING) << "Unknown RPC Failure, removing transaction on namespace: " << ns->ToString();
7219
0
    }
7220
    // Commit the namespace in-memory state.
7221
20
    l.Commit();
7222
0
  } else {
7223
    // Transaction failed.  We need to delete this Database now.
7224
0
    SCHECK(metadata.state() == SysNamespaceEntryPB::RUNNING ||
7225
0
           metadata.state() == SysNamespaceEntryPB::FAILED, Aborted,
7226
0
           Substitute("Invalid Namespace state ($0), aborting delete for $1.",
7227
0
                      SysNamespaceEntryPB_State_Name(metadata.state()), ns->ToString()));
7228
0
    LOG(INFO) << "Namespace transaction failed, deleting: " << ns->ToString();
7229
0
    metadata.set_state(SysNamespaceEntryPB::DELETING);
7230
0
    metadata.clear_transaction();
7231
0
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ns));
7232
    // Commit the namespace in-memory state.
7233
0
    l.Commit();
7234
    // Async enqueue delete.
7235
0
    RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
7236
0
        std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, ns)));
7237
0
  }
7238
20
  return Status::OK();
7239
21
}
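
The #5981 guard above (entry_exists || !rpc_success) is easy to misread, so here is its decision logic spelled out as a small sketch:

// txn_check_passed truth table for VerifyNamespacePgLayer:
//   entry_exists  rpc_success  -> outcome
//   true          true         -> pass  (pg row exists: transaction committed)
//   true          false        -> pass  (RPC unreliable; don't retry forever)
//   false         false        -> pass  (same: un-retryable RPC failure)
//   false         true         -> fail  (definitively rolled back: delete the DB)
bool TxnCheckPassed(bool entry_exists, bool rpc_success) {
  return entry_exists || !rpc_success;
}
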
7240
7241
// Get the information about an in-progress create operation.
7242
Status CatalogManager::IsCreateNamespaceDone(const IsCreateNamespaceDoneRequestPB* req,
7243
2.13k
                                             IsCreateNamespaceDoneResponsePB* resp) {
7244
2.13k
  auto ns_pb = req->namespace_();
7245
7246
  // 1. Lookup the namespace and verify it exists.
7247
2.13k
  TRACE("Looking up keyspace");
7248
2.13k
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(ns_pb), resp);
7249
7250
2.13k
  TRACE("Locking keyspace");
7251
2.13k
  auto l = ns->LockForRead();
7252
2.13k
  auto metadata = l->pb;
7253
7254
2.13k
  switch (metadata.state()) {
7255
    // Success cases. Done and working.
7256
1.88k
    case SysNamespaceEntryPB::RUNNING:
7257
1.88k
      if (!ns->colocated()) {
7258
1.86k
        resp->set_done(true);
7259
11
      } else {
7260
        // Verify system table created as well, if colocated.
7261
11
        IsCreateTableDoneRequestPB table_req;
7262
11
        IsCreateTableDoneResponsePB table_resp;
7263
11
        const auto parent_table_id = ns->id() + kColocatedParentTableIdSuffix;
7264
11
        table_req.mutable_table()->set_table_id(parent_table_id);
7265
11
        auto s = IsCreateTableDone(&table_req, &table_resp);
7266
11
        resp->set_done(table_resp.done());
7267
11
        if (!s.ok()) {
7268
0
          if (table_resp.has_error()) {
7269
0
            resp->mutable_error()->Swap(table_resp.mutable_error());
7270
0
          }
7271
0
          return s;
7272
0
        }
7273
1.88k
      }
7274
1.88k
      break;
7275
    // These states indicate that a create completed but a subsequent remove was requested.
7276
0
    case SysNamespaceEntryPB::DELETING:
7277
0
    case SysNamespaceEntryPB::DELETED:
7278
0
      resp->set_done(true);
7279
0
      break;
7280
    // Pending cases.  NOT DONE
7281
258
    case SysNamespaceEntryPB::PREPARING:
7282
258
      resp->set_done(false);
7283
258
      break;
7284
    // Failure cases.  Done, but we need to give the user an error message.
7285
1
    case SysNamespaceEntryPB::FAILED:
7286
1
      resp->set_done(true);
7287
1
      return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, STATUS(InternalError,
7288
1
              "Namespace Create Failed: not onlined."));
7289
0
    default:
7290
0
      Status s = STATUS_SUBSTITUTE(IllegalState, "IsCreateNamespaceDone failure: state=$0",
7291
0
                                   SysNamespaceEntryPB_State_Name(metadata.state()));
7292
0
      LOG(WARNING) << s.ToString();
7293
0
      resp->set_done(true);
7294
0
      return SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s);
7295
2.13k
  }
7296
7297
2.13k
  return Status::OK();
7298
2.13k
}
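
A caller consumes IsCreateNamespaceDone by polling; note that the FAILED state reports done=true together with an error, so the error must be checked even when done. A client-side sketch where check_done stands in for the actual RPC wrapper:

#include <chrono>
#include <stdexcept>
#include <string>
#include <thread>

// The part of the response a polling caller cares about.
struct DoneResponse {
  bool done = false;
  bool has_error = false;
  std::string error;
};

template <typename CheckFn>
void WaitForNamespaceCreate(CheckFn check_done, std::chrono::milliseconds poll) {
  for (;;) {
    DoneResponse resp = check_done();
    if (resp.has_error) {
      throw std::runtime_error(resp.error);  // terminal (e.g. FAILED state)
    }
    if (resp.done) {
      return;                                // RUNNING (or DELETING/DELETED)
    }
    std::this_thread::sleep_for(poll);       // still PREPARING; poll again
  }
}
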
7299
7300
Status CatalogManager::DeleteNamespace(const DeleteNamespaceRequestPB* req,
7301
                                       DeleteNamespaceResponsePB* resp,
7302
1.55k
                                       rpc::RpcContext* rpc) {
7303
1.55k
  auto status = DoDeleteNamespace(req, resp, rpc);
7304
1.55k
  if (!status.ok()) {
7305
10
    return SetupError(resp->mutable_error(), status);
7306
10
  }
7307
1.54k
  return status;
7308
1.54k
}
7309
7310
Status CatalogManager::DoDeleteNamespace(const DeleteNamespaceRequestPB* req,
7311
                                         DeleteNamespaceResponsePB* resp,
7312
1.55k
                                         rpc::RpcContext* rpc) {
7313
1.55k
  LOG(INFO) << "Servicing DeleteNamespace request from " << RequestorString(rpc)
7314
1.55k
            << ": " << req->ShortDebugString();
7315
7316
  // Look up the namespace and verify that it exists.
7317
1.55k
  TRACE("Looking up keyspace");
7318
1.55k
  auto ns = VERIFY_RESULT(FindNamespace(req->namespace_()));
7319
7320
1.55k
  if (req->has_database_type() && req->database_type() != ns->database_type()) {
7321
    // Could not find the right database to delete.
7322
0
    return STATUS(NotFound, "Keyspace not found", ns->name(),
7323
0
                  MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
7324
0
  }
7325
1.55k
  {
7326
    // Don't allow deletion if the namespace is in a transient state.
7327
1.55k
    auto cur_state = ns->state();
7328
1.55k
    if (cur_state != SysNamespaceEntryPB::RUNNING && cur_state != SysNamespaceEntryPB::FAILED) {
7329
2
      if (cur_state == SysNamespaceEntryPB::DELETED) {
7330
1
        return STATUS(NotFound, "Keyspace already deleted", ns->name(),
7331
1
                      MasterError(MasterErrorPB::NAMESPACE_NOT_FOUND));
7332
1
      } else {
7333
1
        return STATUS_EC_FORMAT(
7334
1
            TryAgain, MasterError(MasterErrorPB::IN_TRANSITION_CAN_RETRY),
7335
1
            "Namespace deletion not allowed when State = $0",
7336
1
            SysNamespaceEntryPB::State_Name(cur_state));
7337
1
      }
7338
1.55k
    }
7339
1.55k
  }
7340
7341
  // PGSQL has a completely forked implementation because it allows non-empty namespaces on delete.
7342
1.55k
  if (ns->database_type() == YQL_DATABASE_PGSQL) {
7343
52
    return DeleteYsqlDatabase(req, resp, rpc);
7344
52
  }
7345
7346
1.50k
  TRACE("Locking keyspace");
7347
1.50k
  auto l = ns->LockForWrite();
7348
7349
  // Only an empty namespace can be deleted.
7350
1.50k
  TRACE("Looking for tables in the keyspace");
7351
1.50k
  {
7352
1.50k
    SharedLock lock(mutex_);
7353
0
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7354
7355
29.5k
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
7356
29.5k
      auto ltm = entry.second->LockForRead();
7357
7358
29.5k
      if (!ltm->started_deleting() && ltm->namespace_id() == ns->id()) {
7359
2
        return STATUS_EC_FORMAT(
7360
2
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7361
2
            "Cannot delete keyspace which has $0: $1 [id=$2], request: $3",
7362
2
            IsTable(ltm->pb) ? "table" : "index", ltm->name(), entry.second->id(),
7363
2
            req->ShortDebugString());
7364
2
      }
7365
29.5k
    }
7366
7367
    // Only an empty namespace can be deleted.
7368
1.50k
    TRACE("Looking for types in the keyspace");
7369
7370
0
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
7371
0
      auto ltm = entry.second->LockForRead();
7372
7373
0
      if (ltm->namespace_id() == ns->id()) {
7374
0
        return STATUS_EC_FORMAT(
7375
0
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7376
0
            "Cannot delete keyspace which has type: $0 [id=$1], request: $2",
7377
0
            ltm->name(), entry.second->id(), req->ShortDebugString());
7378
0
      }
7379
0
    }
7380
1.50k
  }
7381
7382
  // Disallow deleting namespaces with snapshot schedules.
7383
1.50k
  auto map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(SysRowEntryType::NAMESPACE));
7384
0
  for (const auto& schedule_and_objects : map) {
7385
0
    for (const auto& id : schedule_and_objects.second) {
7386
0
      if (id == ns->id()) {
7387
0
        return STATUS_EC_FORMAT(
7388
0
            InvalidArgument, MasterError(MasterErrorPB::NAMESPACE_IS_NOT_EMPTY),
7389
0
            "Cannot delete keyspace which has schedule: $0, request: $1",
7390
0
            schedule_and_objects.first, req->ShortDebugString());
7391
0
      }
7392
0
    }
7393
0
  }
7394
7395
  // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace.
7396
1.50k
  TRACE("Updating metadata on disk");
7397
  // Update sys-catalog.
7398
1.50k
  Status s = sys_catalog_->Delete(leader_ready_term(), ns);
7399
1.50k
  if (!s.ok()) {
7400
    // The mutation will be aborted when 'l' exits the scope on early return.
7401
0
    s = s.CloneAndPrepend("An error occurred while updating sys-catalog");
7402
0
    LOG(WARNING) << s;
7403
0
    return CheckIfNoLongerLeader(s);
7404
0
  }
7405
7406
  // Update the in-memory state.
7407
1.50k
  TRACE("Committing in-memory state");
7408
1.50k
  l.Commit();
7409
7410
  // Remove the namespace from all CatalogManager mappings.
7411
1.50k
  {
7412
1.50k
    LockGuard lock(mutex_);
7413
1.50k
    if (namespace_names_mapper_[ns->database_type()].erase(ns->name()) < 1) {
7414
0
      LOG(WARNING) << Format("Could not remove namespace from names map, id=$0", ns->id());
7415
0
    }
7416
1.50k
    if (namespace_ids_map_.erase(ns->id()) < 1) {
7417
0
      LOG(WARNING) << Format("Could not remove namespace from ids map, id=$0", ns->id());
7418
0
    }
7419
1.50k
  }
7420
7421
  // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for
7422
  // more details.
7423
1.50k
  string canonical_resource = get_canonical_keyspace(req->namespace_().name());
7424
1.50k
  RETURN_NOT_OK(permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, resp));
7425
7426
1.50k
  LOG(INFO) << "Successfully deleted keyspace " << ns->ToString()
7427
1.50k
            << " per request from " << RequestorString(rpc);
7428
1.50k
  return Status::OK();
7429
1.50k
}
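
The "only empty keyspaces are deletable" scan is worth isolating: take a shared (read) lock over the catalog maps and reject the delete if any live table still references the namespace. A sketch with simplified stand-in types:

#include <map>
#include <shared_mutex>
#include <string>

struct TableEntry {
  std::string namespace_id;
  bool deleting = false;  // corresponds to started_deleting()
};

bool NamespaceIsEmpty(const std::string& ns_id,
                      const std::map<std::string, TableEntry>& tables,
                      std::shared_mutex& catalog_mutex) {
  std::shared_lock<std::shared_mutex> lock(catalog_mutex);  // readers don't block readers
  for (const auto& [table_id, entry] : tables) {
    if (!entry.deleting && entry.namespace_id == ns_id) {
      return false;  // a live table (or index) still belongs to this keyspace
    }
  }
  return true;
}
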
7430
7431
0
void CatalogManager::DeleteYcqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) {
7432
0
  TRACE("Locking keyspace");
7433
0
  auto l = database->LockForWrite();
7434
7435
  // Only an empty namespace can be deleted.
7436
0
  TRACE("Looking for tables in the keyspace");
7437
0
  {
7438
0
    SharedLock lock(mutex_);
7439
0
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7440
7441
0
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
7442
0
      auto ltm = entry.second->LockForRead();
7443
7444
0
      if (!ltm->started_deleting() && ltm->namespace_id() == database->id()) {
7445
0
        LOG(WARNING) << "Cannot delete keyspace which has " << ltm->name()
7446
0
          << " with id=" << entry.second->id();
7447
0
        return;
7448
0
      }
7449
0
    }
7450
0
  }
7451
7452
  // Only an empty namespace can be deleted.
7453
0
  TRACE("Looking for types in the keyspace");
7454
0
  {
7455
0
    SharedLock lock(mutex_);
7456
0
    VLOG_WITH_FUNC(3) << "Acquired the catalog manager lock";
7457
7458
0
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
7459
0
      auto ltm = entry.second->LockForRead();
7460
7461
0
      if (ltm->namespace_id() == database->id()) {
7462
0
        LOG(WARNING) << "Cannot delete keyspace which has type: " << ltm->name()
7463
0
          << " with id=" << entry.second->id();
7464
0
        return;
7465
0
      }
7466
0
    }
7467
0
  }
7468
7469
  // [Delete]. Skip the DELETING->DELETED state, since no tables are present in this namespace.
7470
0
  TRACE("Updating metadata on disk");
7471
  // Update sys-catalog.
7472
0
  Status s = sys_catalog_->Delete(leader_ready_term(), database);
7473
0
  if (!s.ok()) {
7474
    // The mutation will be aborted when 'l' exits the scope on early return.
7475
0
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0",
7476
0
                                     s.ToString()));
7477
0
    LOG(WARNING) << s.ToString();
7478
0
    return;
7479
0
  }
7480
7481
  // Update the in-memory state.
7482
0
  TRACE("Committing in-memory state");
7483
0
  l.Commit();
7484
7485
  // Remove the namespace from all CatalogManager mappings.
7486
0
  {
7487
0
    LockGuard lock(mutex_);
7488
0
    namespace_names_mapper_[database->database_type()].erase(database->name());
7489
0
    if (namespace_ids_map_.erase(database->id()) < 1) {
7490
0
      LOG(WARNING) << Format("Could not remove namespace from maps, id=$0", database->id());
7491
0
    }
7492
0
  }
7493
7494
  // Delete any permissions granted on this keyspace to any role. See comment in DeleteTable() for
7495
  // more details.
7496
0
  string canonical_resource = get_canonical_keyspace(database->name());
7497
0
  DeleteNamespaceResponsePB resp;
7498
0
  s = permissions_manager_->RemoveAllPermissionsForResource(canonical_resource, &resp);
7499
0
  if (s.ok()) {
7500
0
    LOG(INFO) << "Successfully deleted keyspace " << database->ToString();
7501
0
  } else {
7502
0
    LOG(WARNING) << "Error deleting keyspace " << database->ToString() << ": " << s;
7503
0
  }
7504
0
}
7505
7506
Status CatalogManager::DeleteYsqlDatabase(const DeleteNamespaceRequestPB* req,
7507
                                          DeleteNamespaceResponsePB* resp,
7508
52
                                          rpc::RpcContext* rpc) {
7509
  // Lookup database.
7510
52
  auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
7511
7512
  // Make sure this is a YSQL database.
7513
52
  if (database->database_type() != YQL_DATABASE_PGSQL) {
7514
    // A non-YSQL namespace was found, but the RPC requested dropping a YSQL database.
7515
0
    Status s = STATUS(NotFound, "YSQL database not found", database->name());
7516
0
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
7517
0
  }
7518
7519
  // Set the Namespace to DELETING.
7520
52
  TRACE("Locking database");
7521
52
  auto l = database->LockForWrite();
7522
52
  SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb;
7523
52
  if (metadata.state() == SysNamespaceEntryPB::RUNNING ||
7524
52
      metadata.state() == SysNamespaceEntryPB::FAILED) {
7525
52
    metadata.set_state(SysNamespaceEntryPB::DELETING);
7526
52
    RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database));
7527
47
    TRACE("Marked keyspace for deletion in sys-catalog");
7528
    // Commit the namespace in-memory state.
7529
47
    l.Commit();
7530
0
  } else {
7531
0
    Status s = STATUS_SUBSTITUTE(IllegalState,
7532
0
        "Keyspace ($0) has invalid state ($1), aborting delete",
7533
0
        database->name(), metadata.state());
7534
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
7535
0
  }
7536
7537
47
  return background_tasks_thread_pool_->SubmitFunc(
7538
47
    std::bind(&CatalogManager::DeleteYsqlDatabaseAsync, this, database));
7539
47
}
7540
7541
49
void CatalogManager::DeleteYsqlDatabaseAsync(scoped_refptr<NamespaceInfo> database) {
7542
49
  TEST_PAUSE_IF_FLAG(TEST_hang_on_namespace_transition);
7543
7544
  // Lock database before removing content.
7545
49
  TRACE("Locking database");
7546
49
  auto l = database->LockForWrite();
7547
49
  SysNamespaceEntryPB &metadata = database->mutable_metadata()->mutable_dirty()->pb;
7548
7549
  // A DELETED Namespace already finished deleting; it was tombstoned to avoid immediately reusing the same ID.
7550
  // We consider a restart enough elapsed time, so we just need to remove it from the SysCatalog.
7551
49
  if (metadata.state() == SysNamespaceEntryPB::DELETED) {
7552
0
    Status s = sys_catalog_->Delete(leader_ready_term(), database);
7553
0
    WARN_NOT_OK(s, "SysCatalog DeleteItem for Namespace");
7554
0
    if (!s.ok()) {
7555
0
      return;
7556
0
    }
7557
49
  } else if (metadata.state() == SysNamespaceEntryPB::DELETING) {
7558
    // Delete all tables in the database.
7559
48
    TRACE("Delete all tables in YSQL database");
7560
48
    Status s = DeleteYsqlDBTables(database);
7561
48
    WARN_NOT_OK(s, "DeleteYsqlDBTables failed");
7562
48
    if (!s.ok()) {
7563
      // Move to FAILED so DeleteNamespace can be reissued by the user.
7564
5
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7565
5
      l.Commit();
7566
5
      return;
7567
5
    }
7568
7569
    // Once all user-facing data has been offlined, move the Namespace to DELETED state.
7570
43
    metadata.set_state(SysNamespaceEntryPB::DELETED);
7571
43
    s = sys_catalog_->Upsert(leader_ready_term(), database);
7572
43
    WARN_NOT_OK(s, "SysCatalog Update for Namespace");
7573
43
    if (!s.ok()) {
7574
      // Move to FAILED so DeleteNamespace can be reissued by the user.
7575
0
      metadata.set_state(SysNamespaceEntryPB::FAILED);
7576
0
      l.Commit();
7577
0
      return;
7578
0
    }
7579
43
    TRACE("Marked keyspace as deleted in sys-catalog");
7580
1
  } else {
7581
1
    LOG(WARNING) << "Keyspace (" << database->name() << ") has invalid state ("
7582
1
                 << metadata.state() << "), aborting delete";
7583
1
    return;
7584
1
  }
7585
7586
  // Remove namespace from CatalogManager name mapping.  Will remove ID map after all Tables gone.
7587
43
  {
7588
43
    LockGuard lock(mutex_);
7589
43
    if (namespace_names_mapper_[database->database_type()].erase(database->name()) < 1) {
7590
0
      LOG(WARNING) << Format("Could not remove namespace from maps, name=$0, id=$1",
7591
0
                             database->name(), database->id());
7592
0
    }
7593
43
  }
7594
7595
  // Update the in-memory state.
7596
43
  TRACE("Committing in-memory state");
7597
43
  l.Commit();
7598
7599
  // DROP completed. Return status.
7600
43
  LOG(INFO) << "Successfully deleted YSQL database " << database->ToString();
7601
43
}
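
DeleteYsqlDatabaseAsync is a small state machine: a namespace in DELETING either reaches DELETED (tombstoned, swept on a later pass) or falls back to FAILED so the user can reissue DeleteNamespace. A sketch of just the transitions, with the table-deletion and persistence steps abstracted as callbacks:

#include <functional>

enum class NsState { RUNNING, DELETING, DELETED, FAILED };

NsState RunDeleteStep(NsState state,
                      const std::function<bool()>& delete_tables,   // e.g. DeleteYsqlDBTables
                      const std::function<bool()>& persist) {       // e.g. sys-catalog Upsert
  if (state != NsState::DELETING) {
    return state;  // DELETED is handled separately; anything else is invalid here
  }
  if (!delete_tables()) {
    return NsState::FAILED;  // user can reissue DeleteNamespace
  }
  if (!persist()) {
    return NsState::FAILED;  // sys-catalog write failed; also retryable
  }
  return NsState::DELETED;   // tombstoned until a later pass removes the row
}
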

// IMPORTANT: If modifying, consider updating DeleteTable(), the singular deletion API.
Status CatalogManager::DeleteYsqlDBTables(const scoped_refptr<NamespaceInfo>& database) {
  TabletInfoPtr sys_tablet_info;
  vector<pair<scoped_refptr<TableInfo>, TableInfo::WriteLock>> tables;
  std::unordered_set<TableId> sys_table_ids;
  {
    // Lock the catalog to iterate over table_ids_map_.
    SharedLock lock(mutex_);

    sys_tablet_info = tablet_map_->find(kSysCatalogTabletId)->second;

    // Populate tables and sys_table_ids.
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
      scoped_refptr<TableInfo> table = entry.second;
      if (table->namespace_id() != database->id()) {
        continue;
      }
      auto l = table->LockForWrite();
      if (l->started_deleting()) {
        continue;
      }
      RSTATUS_DCHECK(
          !l->pb.is_pg_shared_table(), Corruption, "Shared table found in database");

      if (IsSystemTable(*table)) {
        sys_table_ids.insert(table->id());
      }

      // For a regular (indexed) table, insert the table info and lock at the front of the list;
      // for an index table, append it to the end. This way we commit and delete an indexed
      // table before its indexes.
      if (IsTable(l->pb)) {
        tables.insert(tables.begin(), {table, std::move(l)});
      } else {
        tables.push_back({table, std::move(l)});
      }
    }
  }
  // Remove the system tables from RAFT.
  TRACE("Sending system table delete RPCs");
  for (auto &table_id : sys_table_ids) {
    RETURN_NOT_OK(sys_catalog_->DeleteYsqlSystemTable(table_id));
  }
  // Remove the system tables from the system catalog TabletInfo.
  RETURN_NOT_OK(RemoveTableIdsFromTabletInfo(sys_tablet_info, sys_table_ids));

  // Set all table states to DELETING as one batch RPC call.
  TRACE("Sending delete table batch RPC to sys catalog");
  vector<TableInfo *> tables_rpc;
  tables_rpc.reserve(tables.size());
  for (auto &table_and_lock : tables) {
    tables_rpc.push_back(table_and_lock.first.get());
    auto &l = table_and_lock.second;
    // Mark the table state as DELETING tablets.
    l.mutable_data()->set_state(SysTablesEntryPB::DELETING,
        Substitute("Started deleting at $0", LocalTimeAsString()));
  }
  // Update all the table states in raft in bulk.
  Status s = sys_catalog_->Upsert(leader_ready_term(), tables_rpc);
  if (!s.ok()) {
    // The mutation will be aborted when 'l' exits the scope on early return.
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys tables: $0",
                                     s.ToString()));
    LOG(WARNING) << s.ToString();
    return CheckIfNoLongerLeader(s);
  }
  for (auto &table_and_lock : tables) {
    auto &table = table_and_lock.first;
    auto &l = table_and_lock.second;
    // Cancel all table busywork and commit the DELETING change.
    l.Commit();
    table->AbortTasks();
  }

  // Batch remove all relevant CDC streams; handled after releasing the table locks.
  TRACE("Deleting CDC streams on table");
  vector<TableId> id_list;
  id_list.reserve(tables.size());
  for (auto &table_and_lock : tables) {
    id_list.push_back(table_and_lock.first->id());
  }
  RETURN_NOT_OK(DeleteCDCStreamsForTables(id_list));

  // Send a DeleteTablet() RPC request to each tablet replica in the table.
  for (auto &table_and_lock : tables) {
    auto &table = table_and_lock.first;
    // TODO(pitr) undelete for YSQL tables
    RETURN_NOT_OK(DeleteTabletsAndSendRequests(table, {}));
  }

  // Invoke any background tasks and return (notably, table cleanup).
  background_tasks_->Wake();
  return Status::OK();
}
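The front-insert/back-append ordering above guarantees that an indexed table is committed and deleted before its index tables. A std::deque expresses the same intent directly; the sketch below uses stand-in types (Table, is_index) rather than the real TableInfo.

#include <deque>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for TableInfo: just a name and whether the table is an index.
struct Table { std::string name; bool is_index; };

int main() {
  std::vector<Table> discovered = {
      {"idx_a", true}, {"users", false}, {"idx_b", true}, {"orders", false}};

  // Same ordering policy as DeleteYsqlDBTables: regular tables go to the front,
  // index tables to the back, so indexed tables are processed before their indexes.
  std::deque<Table> ordered;
  for (const auto& t : discovered) {
    if (t.is_index) {
      ordered.push_back(t);
    } else {
      ordered.push_front(t);
    }
  }

  for (const auto& t : ordered) {
    std::cout << "delete " << t.name << "\n";  // orders, users, idx_a, idx_b
  }
}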

// Get the information about an in-progress delete operation.
Status CatalogManager::IsDeleteNamespaceDone(const IsDeleteNamespaceDoneRequestPB* req,
                                             IsDeleteNamespaceDoneResponsePB* resp) {
  auto ns_pb = req->namespace_();

  // Lookup the namespace and verify that it exists.
  TRACE("Looking up keyspace");
  auto ns = FindNamespace(ns_pb);
  if (!ns.ok()) {
    // The namespace no longer existing means the delete succeeded.
    LOG(INFO) << "Servicing IsDeleteNamespaceDone request for "
              << ns_pb.DebugString() << ": deleted (not found)";
    resp->set_done(true);
    return Status::OK();
  }

  TRACE("Locking keyspace");
  auto l = (**ns).LockForRead();
  auto& metadata = l->pb;

  if (metadata.state() == SysNamespaceEntryPB::DELETED) {
    resp->set_done(true);
  } else if (metadata.state() == SysNamespaceEntryPB::DELETING) {
    resp->set_done(false);
  } else {
    Status s = STATUS_SUBSTITUTE(IllegalState,
        "Servicing IsDeleteNamespaceDone request for $0: NOT deleted (state=$1)",
        ns_pb.DebugString(), metadata.state());
    LOG(WARNING) << s.ToString();
    // Done != Successful.  We just want to let the user know the delete has finished processing.
    resp->set_done(true);
    return SetupError(resp->mutable_error(), MasterErrorPB::INTERNAL_ERROR, s);
  }
  return Status::OK();
}
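IsDeleteNamespaceDone is the polling half of an asynchronous delete: clients call it repeatedly until done becomes true. A hypothetical client-side wait loop with capped exponential backoff might look like the following; is_done stands in for the actual RPC and is not a YugabyteDB API.

#include <algorithm>
#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

// Polls 'is_done' until it reports true or the deadline passes.
bool WaitUntilDone(const std::function<bool()>& is_done,
                   std::chrono::milliseconds deadline_from_now) {
  using Clock = std::chrono::steady_clock;
  const auto deadline = Clock::now() + deadline_from_now;
  auto delay = std::chrono::milliseconds(10);
  while (Clock::now() < deadline) {
    if (is_done()) {
      return true;
    }
    std::this_thread::sleep_for(delay);
    delay = std::min(delay * 2, std::chrono::milliseconds(1000));  // Capped backoff.
  }
  return false;  // Timed out; the delete may still be in progress.
}

int main() {
  int calls = 0;
  // Simulated RPC: reports done on the fourth poll.
  bool ok = WaitUntilDone([&] { return ++calls >= 4; }, std::chrono::seconds(5));
  std::cout << (ok ? "namespace deleted" : "timed out") << " after " << calls << " polls\n";
}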

Status CatalogManager::AlterNamespace(const AlterNamespaceRequestPB* req,
                                      AlterNamespaceResponsePB* resp,
                                      rpc::RpcContext* rpc) {
  LOG(INFO) << "Servicing AlterNamespace request from " << RequestorString(rpc)
            << ": " << req->ShortDebugString();

  auto database = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);

  if (req->namespace_().has_database_type() &&
      database->database_type() != req->namespace_().database_type()) {
    Status s = STATUS(NotFound, "Database not found", database->name());
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
  }

  TRACE("Locking database");
  auto l = database->LockForWrite();

  // Don't allow an alter if the namespace isn't running.
  if (l->pb.state() != SysNamespaceEntryPB::RUNNING) {
    Status s = STATUS_SUBSTITUTE(TryAgain, "Namespace not running.  State = $0",
                                 SysNamespaceEntryPB::State_Name(l->pb.state()));
    return SetupError(resp->mutable_error(), NamespaceMasterError(l->pb.state()), s);
  }

  const string old_name = l->pb.name();

  if (req->has_new_name() && req->new_name() != old_name) {
    const string new_name = req->new_name();

    // Verify that the new name does not exist.
    NamespaceIdentifierPB ns_identifier;
    ns_identifier.set_name(new_name);
    if (req->namespace_().has_database_type()) {
      ns_identifier.set_database_type(req->namespace_().database_type());
    }
    // TODO: This check will only work for YSQL once we add support for YSQL namespaces in
    // namespace_name_map (#1476).
    LockGuard lock(mutex_);
    TRACE("Acquired catalog manager lock");
    auto ns = FindNamespaceUnlocked(ns_identifier);
    if (ns.ok() && req->namespace_().has_database_type() &&
        (**ns).database_type() == req->namespace_().database_type()) {
      Status s = STATUS_SUBSTITUTE(AlreadyPresent, "Keyspace '$0' already exists", (**ns).name());
      LOG(WARNING) << "Found keyspace: " << (**ns).id() << ". Failed altering keyspace with error: "
                   << s << " Request:\n" << req->DebugString();
      return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_ALREADY_PRESENT, s);
    }

    namespace_names_mapper_[req->namespace_().database_type()][new_name] = database;
    namespace_names_mapper_[req->namespace_().database_type()].erase(old_name);

    l.mutable_data()->pb.set_name(new_name);
  }

  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), database));

  TRACE("Committing in-memory state");
  l.Commit();

  LOG(INFO) << "Successfully altered keyspace " << req->namespace_().name()
            << " per request from " << RequestorString(rpc);
  return Status::OK();
}

Status CatalogManager::ListNamespaces(const ListNamespacesRequestPB* req,
                                      ListNamespacesResponsePB* resp) {
  NamespaceInfoMap namespace_ids_copy;
  {
    SharedLock lock(mutex_);
    namespace_ids_copy = namespace_ids_map_;
  }

  for (const auto& entry : namespace_ids_copy) {
    const auto& namespace_info = *entry.second;
    // If the request asks for namespaces of a specific database type, filter by that type.
    if (req->has_database_type() && namespace_info.database_type() != req->database_type()) {
      continue;
    }
    // Only return RUNNING namespaces.
    if (namespace_info.state() != SysNamespaceEntryPB::RUNNING) {
      continue;
    }

    NamespaceIdentifierPB *ns = resp->add_namespaces();
    ns->set_id(namespace_info.id());
    ns->set_name(namespace_info.name());
    ns->set_database_type(namespace_info.database_type());
  }
  return Status::OK();
}
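ListNamespaces copies namespace_ids_map_ under a short SharedLock and then builds the response from the private copy, so the catalog mutex is never held while iterating. A minimal model of that snapshot-then-iterate pattern, with a std::map standing in for the real namespace map:

#include <iostream>
#include <map>
#include <shared_mutex>
#include <string>

class Catalog {
 public:
  void Add(const std::string& id, const std::string& name) {
    std::unique_lock<std::shared_mutex> g(mu_);
    namespaces_[id] = name;
  }

  // The reader lock is held only for the duration of the copy.
  std::map<std::string, std::string> SnapshotNamespaces() const {
    std::shared_lock<std::shared_mutex> g(mu_);
    return namespaces_;
  }

 private:
  mutable std::shared_mutex mu_;
  std::map<std::string, std::string> namespaces_;
};

int main() {
  Catalog catalog;
  catalog.Add("ns-1", "yugabyte");
  catalog.Add("ns-2", "postgres");

  // Iterate lock-free on the snapshot; concurrent writers are unaffected.
  for (const auto& [id, name] : catalog.SnapshotNamespaces()) {
    std::cout << id << " -> " << name << "\n";
  }
}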

Status CatalogManager::GetNamespaceInfo(const GetNamespaceInfoRequestPB* req,
                                        GetNamespaceInfoResponsePB* resp,
                                        rpc::RpcContext* rpc) {
  LOG(INFO) << __func__ << " from " << RequestorString(rpc) << ": " << req->ShortDebugString();

  // Look up the namespace and verify that it exists.
  TRACE("Looking up namespace");
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);

  resp->mutable_namespace_()->set_id(ns->id());
  resp->mutable_namespace_()->set_name(ns->name());
  resp->mutable_namespace_()->set_database_type(ns->database_type());
  resp->set_colocated(ns->colocated());
  return Status::OK();
}

Status CatalogManager::RedisConfigSet(
    const RedisConfigSetRequestPB* req, RedisConfigSetResponsePB* resp, rpc::RpcContext* rpc) {
  DCHECK(req->has_keyword());
  const auto& key = req->keyword();
  SysRedisConfigEntryPB config_entry;
  config_entry.set_key(key);
  *config_entry.mutable_args() = req->args();

  TRACE("Acquired catalog manager lock");
  LockGuard lock(mutex_);
  scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword());
  if (cfg == nullptr) {
    cfg = new RedisConfigInfo(key);
    redis_config_map_[key] = cfg;
  }

  auto wl = cfg->LockForWrite();
  wl.mutable_data()->pb = std::move(config_entry);
  // Whether the entry was just created or already existed, write it through to the sys catalog.
  CHECK_OK(sys_catalog_->Upsert(leader_ready_term(), cfg));
  wl.Commit();
  return Status::OK();
}

Status CatalogManager::RedisConfigGet(
    const RedisConfigGetRequestPB* req, RedisConfigGetResponsePB* resp, rpc::RpcContext* rpc) {
  DCHECK(req->has_keyword());
  resp->set_keyword(req->keyword());
  TRACE("Acquired catalog manager lock");
  SharedLock lock(mutex_);
  scoped_refptr<RedisConfigInfo> cfg = FindPtrOrNull(redis_config_map_, req->keyword());
  if (cfg == nullptr) {
    Status s = STATUS_SUBSTITUTE(NotFound, "Redis config for $0 does not exist", req->keyword());
    return SetupError(resp->mutable_error(), MasterErrorPB::REDIS_CONFIG_NOT_FOUND, s);
  }
  auto rci = cfg->LockForRead();
  resp->mutable_args()->CopyFrom(rci->pb.args());
  return Status::OK();
}

Status CatalogManager::CreateUDType(const CreateUDTypeRequestPB* req,
                                    CreateUDTypeResponsePB* resp,
                                    rpc::RpcContext* rpc) {
  LOG(INFO) << "CreateUDType from " << RequestorString(rpc)
            << ": " << req->DebugString();

  Status s;
  scoped_refptr<UDTypeInfo> tp;
  scoped_refptr<NamespaceInfo> ns;

  // Lookup the namespace and verify that it exists.
  if (req->has_namespace_()) {
    TRACE("Looking up namespace");
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->namespace_()), resp);
    if (ns->database_type() != YQLDatabase::YQL_DATABASE_CQL) {
      Status s = STATUS(NotFound, "Namespace not found");
      return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
    }
  }

  // Get all the referenced types (if any).
  std::vector<std::string> referenced_udts;
  for (const QLTypePB& field_type : req->field_types()) {
    QLType::GetUserDefinedTypeIds(field_type, /* transitive = */ true, &referenced_udts);
  }

  {
    TRACE("Acquired catalog manager lock");
    LockGuard lock(mutex_);

    // Verify that the type does not exist.
    tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->name()));

    if (tp != nullptr) {
      s = STATUS_SUBSTITUTE(AlreadyPresent,
          "Type '$0.$1' already exists", ns->name(), req->name());
      LOG(WARNING) << "Found type: " << tp->id() << ". Failed creating type with error: "
                   << s.ToString() << " Request:\n" << req->DebugString();
      return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_ALREADY_PRESENT, s);
    }

    // Verify that all referenced types actually exist.
    for (const auto& udt_id : referenced_udts) {
      if (FindPtrOrNull(udtype_ids_map_, udt_id) == nullptr) {
        // This may be caused by a stale cache (e.g. a referenced type name resolves to an old,
        // deleted type). Return InvalidArgument so the query layer will clear its cache and retry.
        s = STATUS_SUBSTITUTE(InvalidArgument,
            "Type id '$0' referenced by type '$1' does not exist", udt_id, req->name());
        return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
      }
    }

    // Construct the new type (generate a fresh ID and set the fields).
    UDTypeId new_id = GenerateIdUnlocked(SysRowEntryType::UDTYPE);
    tp = new UDTypeInfo(new_id);
    tp->mutable_metadata()->StartMutation();
    SysUDTypeEntryPB *metadata = &tp->mutable_metadata()->mutable_dirty()->pb;
    metadata->set_name(req->name());
    metadata->set_namespace_id(ns->id());
    for (const string& field_name : req->field_names()) {
      metadata->add_field_names(field_name);
    }

    for (const QLTypePB& field_type : req->field_types()) {
      metadata->add_field_types()->CopyFrom(field_type);
    }

    // Add the type to the in-memory maps.
    udtype_ids_map_[tp->id()] = tp;
    udtype_names_map_[std::make_pair(ns->id(), req->name())] = tp;
    resp->set_id(tp->id());
  }
  TRACE("Inserted new user-defined type info into CatalogManager maps");

  // Update the on-disk system catalog.
  s = sys_catalog_->Upsert(leader_ready_term(), tp);
  if (!s.ok()) {
    s = s.CloneAndPrepend(Substitute(
        "An error occurred while inserting user-defined type to sys-catalog: $0", s.ToString()));
    LOG(WARNING) << s.ToString();
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
  }
  TRACE("Wrote user-defined type to sys-catalog");

  // Commit the in-memory state.
  tp->mutable_metadata()->CommitMutation();
  LOG(INFO) << "Created user-defined type " << tp->ToString();
  return Status::OK();
}
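The referenced-type validation above relies on QLType::GetUserDefinedTypeIds collecting type ids transitively before the existence check. The sketch below models that collection as a depth-first walk over a toy registry; Field, TypeRegistry, and the type names are hypothetical stand-ins, not the real QLType API.

#include <iostream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Field {
  std::string udt_id;  // Empty when the field is a primitive type.
};

using TypeRegistry = std::unordered_map<std::string, std::vector<Field>>;

// Depth-first walk collecting all transitively referenced UDT ids.
void CollectReferencedUdts(const TypeRegistry& registry, const std::string& udt_id,
                           std::unordered_set<std::string>* out) {
  if (!out->insert(udt_id).second) return;  // Already visited; guards against cycles.
  auto it = registry.find(udt_id);
  if (it == registry.end()) return;         // Unknown id: the caller reports the error.
  for (const Field& f : it->second) {
    if (!f.udt_id.empty()) CollectReferencedUdts(registry, f.udt_id, out);
  }
}

int main() {
  TypeRegistry registry = {
      {"address", {{"geo_point"}, {"zip_rule"}}},  // 'zip_rule' was dropped: stale reference.
      {"geo_point", {{""}}},                       // Only primitive fields.
  };

  // A new type with one field of type 'address' transitively references all of these.
  std::unordered_set<std::string> refs;
  CollectReferencedUdts(registry, "address", &refs);
  for (const auto& id : refs) {
    bool exists = registry.count(id) > 0;
    std::cout << id << (exists ? " exists\n" : " MISSING -> reject request\n");
  }
}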

Status CatalogManager::DeleteUDType(const DeleteUDTypeRequestPB* req,
                                    DeleteUDTypeResponsePB* resp,
                                    rpc::RpcContext* rpc) {
  LOG(INFO) << "Servicing DeleteUDType request from " << RequestorString(rpc)
            << ": " << req->ShortDebugString();

  scoped_refptr<UDTypeInfo> tp;
  scoped_refptr<NamespaceInfo> ns;

  if (!req->has_type()) {
    Status s = STATUS(InvalidArgument, "No type given", req->DebugString());
    return SetupError(resp->mutable_error(), MasterErrorPB::NAMESPACE_NOT_FOUND, s);
  }

  // Validate the namespace.
  if (req->type().has_namespace_()) {
    // Lookup the namespace and verify that it exists.
    TRACE("Looking up namespace");
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp);
  }

  {
    LockGuard lock(mutex_);
    TRACE("Acquired catalog manager lock");

    if (req->type().has_type_id()) {
      tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id());
    } else if (req->type().has_type_name()) {
      tp = FindPtrOrNull(udtype_names_map_, {ns->id(), req->type().type_name()});
    }

    if (tp == nullptr) {
      Status s = STATUS(NotFound, "The type does not exist", req->DebugString());
      return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
    }

    // Check if any table uses this type.
    // TODO: this could be more efficient.
    for (const TableInfoMap::value_type& entry : *table_ids_map_) {
      auto ltm = entry.second->LockForRead();
      if (!ltm->started_deleting()) {
        for (const auto &col : ltm->schema().columns()) {
          if (col.type().main() == DataType::USER_DEFINED_TYPE &&
              col.type().udtype_info().id() == tp->id()) {
            Status s = STATUS(QLError,
                Substitute("Cannot delete type '$0.$1'. It is used in column $2 of table $3",
                    ns->name(), tp->name(), col.name(), ltm->name()));
            return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
          }
        }
      }
    }

    // Check if any other type uses this type (i.e. in the case of nested types).
    // TODO: this could be more efficient.
    for (const UDTypeInfoMap::value_type& entry : udtype_ids_map_) {
      auto ltm = entry.second->LockForRead();

      for (int i = 0; i < ltm->field_types_size(); i++) {
        // We only need to check direct (non-transitive) type dependencies here.
        // This also means we report more precise errors for in-use types.
        if (QLType::DoesUserDefinedTypeIdExist(ltm->field_types(i),
                                               false /* transitive */,
                                               tp->id())) {
          Status s = STATUS(QLError,
              Substitute("Cannot delete type '$0.$1'. It is used in field $2 of type '$3'",
                  ns->name(), tp->name(), ltm->field_names(i), ltm->name()));
          return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_REQUEST, s);
        }
      }
    }
  }

  auto l = tp->LockForWrite();

  Status s = sys_catalog_->Delete(leader_ready_term(), tp);
  if (!s.ok()) {
    // The mutation will be aborted when 'l' exits the scope on early return.
    s = s.CloneAndPrepend(Substitute("An error occurred while updating sys-catalog: $0",
        s.ToString()));
    LOG(WARNING) << s.ToString();
    return CheckIfNoLongerLeaderAndSetupError(s, resp);
  }

  // Remove it from the maps.
  {
    TRACE("Removing from maps");
    LockGuard lock(mutex_);
    if (udtype_ids_map_.erase(tp->id()) < 1) {
      PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name());
    }
    if (udtype_names_map_.erase({ns->id(), tp->name()}) < 1) {
      PANIC_RPC(rpc, "Could not remove user defined type from map, name=" + l->name());
    }
  }

  // Update the in-memory state.
  TRACE("Committing in-memory state");
  l.Commit();

  LOG(INFO) << "Successfully deleted user-defined type " << tp->ToString()
            << " per request from " << RequestorString(rpc);

  return Status::OK();
}
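As the comment in the nested-type loop notes, checking only direct dependencies is sufficient: every type already in the catalog passed the same check when it was created, so a transitive user is always protected through a chain of direct users, and the error can name the immediate dependent. A toy model of that delete-time check follows; the types and names are stand-ins, not the catalog structures.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct UdType {
  std::string name;
  std::vector<std::string> direct_field_type_names;  // UDTs used directly by fields.
};

// Returns the name of a type that directly uses 'candidate', or "" if none does.
std::string FindDirectUser(const std::unordered_map<std::string, UdType>& types,
                           const std::string& candidate) {
  for (const auto& [name, type] : types) {
    for (const auto& dep : type.direct_field_type_names) {
      if (dep == candidate) return name;
    }
  }
  return "";
}

int main() {
  std::unordered_map<std::string, UdType> types = {
      {"geo_point", {"geo_point", {}}},
      {"address", {"address", {"geo_point"}}},  // address nests geo_point.
      {"customer", {"customer", {"address"}}},  // customer nests address (transitively geo_point).
  };

  // geo_point cannot be deleted: 'address' uses it directly. Naming the direct user is
  // exactly the "more precise error" the comment in DeleteUDType refers to.
  std::string user = FindDirectUser(types, "geo_point");
  if (!user.empty()) {
    std::cout << "Cannot delete type 'geo_point'. It is used in type '" << user << "'\n";
  }
}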

Status CatalogManager::GetUDTypeInfo(const GetUDTypeInfoRequestPB* req,
                                     GetUDTypeInfoResponsePB* resp,
                                     rpc::RpcContext* rpc) {
  LOG(INFO) << "GetUDTypeInfo from " << RequestorString(rpc)
            << ": " << req->DebugString();
  Status s;
  scoped_refptr<UDTypeInfo> tp;
  scoped_refptr<NamespaceInfo> ns;

  if (!req->has_type()) {
    s = STATUS(InvalidArgument, "Cannot get type, no type identifier given", req->DebugString());
    return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
  }

  if (req->type().has_type_id()) {
    tp = FindPtrOrNull(udtype_ids_map_, req->type().type_id());
  } else if (req->type().has_type_name() && req->type().has_namespace_()) {
    // Lookup the namespace and verify that it exists.
    TRACE("Looking up namespace");
    ns = VERIFY_NAMESPACE_FOUND(FindNamespace(req->type().namespace_()), resp);

    tp = FindPtrOrNull(udtype_names_map_, std::make_pair(ns->id(), req->type().type_name()));
  }

  if (tp == nullptr) {
    s = STATUS(InvalidArgument, "Couldn't find type", req->DebugString());
    return SetupError(resp->mutable_error(), MasterErrorPB::TYPE_NOT_FOUND, s);
  }

  {
    auto type_lock = tp->LockForRead();

    UDTypeInfoPB* type_info = resp->mutable_udtype();

    type_info->set_name(tp->name());
    type_info->set_id(tp->id());
    type_info->mutable_namespace_()->set_id(type_lock->namespace_id());

    for (int i = 0; i < type_lock->field_names_size(); i++) {
      type_info->add_field_names(type_lock->field_names(i));
    }
    for (int i = 0; i < type_lock->field_types_size(); i++) {
      type_info->add_field_types()->CopyFrom(type_lock->field_types(i));
    }

    LOG(INFO) << "Retrieved user-defined type " << tp->ToString();
  }
  return Status::OK();
}

Status CatalogManager::ListUDTypes(const ListUDTypesRequestPB* req,
                                   ListUDTypesResponsePB* resp) {
  SharedLock lock(mutex_);

  // Lookup the namespace and verify that it exists.
  auto ns = VERIFY_NAMESPACE_FOUND(FindNamespaceUnlocked(req->namespace_()), resp);

  for (const UDTypeInfoByNameMap::value_type& entry : udtype_names_map_) {
    auto ltm = entry.second->LockForRead();

    // The key is a pair <namespace_id, type_name>.
    if (!ns->id().empty() && ns->id() != entry.first.first) {
      continue; // Skip types from other namespaces.
    }

    UDTypeInfoPB* udtype = resp->add_udtypes();
    udtype->set_id(entry.second->id());
    udtype->set_name(ltm->name());
    for (int i = 0; i < ltm->field_names_size(); i++) {
      udtype->add_field_names(ltm->field_names(i));
    }
    for (int i = 0; i < ltm->field_types_size(); i++) {
      udtype->add_field_types()->CopyFrom(ltm->field_types(i));
    }

    if (CHECK_NOTNULL(ns.get())) {
      auto l = ns->LockForRead();
      udtype->mutable_namespace_()->set_id(ns->id());
      udtype->mutable_namespace_()->set_name(ns->name());
    }
  }
  return Status::OK();
}

// For non-enterprise builds, this is a no-op.
Status CatalogManager::DeleteCDCStreamsForTable(const TableId& table) {
  return Status::OK();
}

Status CatalogManager::DeleteCDCStreamsForTables(const vector<TableId>& table_ids) {
  return Status::OK();
}

bool CatalogManager::CDCStreamExistsUnlocked(const CDCStreamId& stream_id) {
  return false;
}

Result<uint64_t> CatalogManager::IncrementYsqlCatalogVersion() {
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite();
  uint64_t new_version = l->pb.ysql_catalog_config().version() + 1;
  l.mutable_data()->pb.mutable_ysql_catalog_config()->set_version(new_version);

  // Write to sys_catalog and in memory.
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), ysql_catalog_config_));
  l.Commit();

  if (FLAGS_log_ysql_catalog_versions) {
    LOG_WITH_FUNC(WARNING) << "set catalog version: " << new_version
                           << " (using old protobuf method)";
  }

  return new_version;
}

Status CatalogManager::InitDbFinished(Status initdb_status, int64_t term) {
  if (initdb_status.ok()) {
    LOG(INFO) << "initdb completed successfully";
  } else {
    LOG(ERROR) << "initdb failed: " << initdb_status;
  }

  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForWrite();
  auto* mutable_ysql_catalog_config = l.mutable_data()->pb.mutable_ysql_catalog_config();
  mutable_ysql_catalog_config->set_initdb_done(true);
  if (!initdb_status.ok()) {
    mutable_ysql_catalog_config->set_initdb_error(initdb_status.ToString());
  } else {
    mutable_ysql_catalog_config->clear_initdb_error();
  }

  RETURN_NOT_OK(sys_catalog_->Upsert(term, ysql_catalog_config_));
  l.Commit();
  return Status::OK();
}

CHECKED_STATUS CatalogManager::IsInitDbDone(
    const IsInitDbDoneRequestPB* req,
    IsInitDbDoneResponsePB* resp) {
  auto l = CHECK_NOTNULL(ysql_catalog_config_.get())->LockForRead();
  const auto& ysql_catalog_config = l->pb.ysql_catalog_config();
  resp->set_pg_proc_exists(pg_proc_exists_.load(std::memory_order_acquire));
  resp->set_done(ysql_catalog_config.initdb_done());
  if (ysql_catalog_config.has_initdb_error() &&
      !ysql_catalog_config.initdb_error().empty()) {
    resp->set_initdb_error(ysql_catalog_config.initdb_error());
  }
  return Status::OK();
}

Status CatalogManager::GetYsqlCatalogVersion(uint64_t* catalog_version,
                                             uint64_t* last_breaking_version) {
  auto table_info = GetTableInfo(kPgYbCatalogVersionTableId);
  if (table_info != nullptr) {
    RETURN_NOT_OK(sys_catalog_->ReadYsqlCatalogVersion(kPgYbCatalogVersionTableId,
                                                       catalog_version,
                                                       last_breaking_version));
    // If the version is properly initialized, we're done.
    if ((!catalog_version || *catalog_version > 0) &&
        (!last_breaking_version || *last_breaking_version > 0)) {
      return Status::OK();
    }
    // However, it's possible for the table to have no entries mid-migration or if the migration
    // fails. In that case we fall back to the legacy approach below.
  }

  auto l = ysql_catalog_config_->LockForRead();
  // last_breaking_version is the last version (change) that invalidated ongoing transactions.
  // When using the old (protobuf-based) version method we have no information about breaking
  // changes, so we assume every change is a breaking change.
  if (catalog_version) {
    *catalog_version = l->pb.ysql_catalog_config().version();
  }
  if (last_breaking_version) {
    *last_breaking_version = l->pb.ysql_catalog_config().version();
  }
  return Status::OK();
}
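GetYsqlCatalogVersion reads the new per-table catalog version when that table exists and is initialized, and otherwise falls back to the legacy protobuf config. The two-source read reduces to the following shape, with plain optionals standing in for the catalog table and the config:

#include <iostream>
#include <optional>

struct VersionSources {
  std::optional<uint64_t> table_version;  // New source; 0 means "not initialized yet".
  uint64_t legacy_version = 0;            // Old source; always present.
};

uint64_t GetCatalogVersion(const VersionSources& src) {
  if (src.table_version.has_value() && *src.table_version > 0) {
    return *src.table_version;  // New source exists and is initialized.
  }
  return src.legacy_version;    // Absent or uninitialized: legacy fallback.
}

int main() {
  std::cout << GetCatalogVersion({std::nullopt, 7}) << "\n";  // 7 (no new source)
  std::cout << GetCatalogVersion({0, 7}) << "\n";             // 7 (mid-migration)
  std::cout << GetCatalogVersion({42, 7}) << "\n";            // 42 (new source wins)
}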

Status CatalogManager::InitializeTransactionTablesConfig(int64_t term) {
  SysTransactionTablesConfigEntryPB transaction_tables_config;
  transaction_tables_config.set_version(0);

  // Create the in-memory object.
  transaction_tables_config_ = new SysConfigInfo(kTransactionTablesConfigType);

  // Prepare the write.
  auto l = transaction_tables_config_->LockForWrite();
  *l.mutable_data()->pb.mutable_transaction_tables_config() = std::move(transaction_tables_config);

  // Write to sys_catalog and in memory.
  RETURN_NOT_OK(sys_catalog_->Upsert(term, transaction_tables_config_));
  l.Commit();

  return Status::OK();
}

Status CatalogManager::IncrementTransactionTablesVersion() {
  auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForWrite();
  uint64_t new_version = l->pb.transaction_tables_config().version() + 1;
  l.mutable_data()->pb.mutable_transaction_tables_config()->set_version(new_version);

  // Write to sys_catalog and in memory.
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), transaction_tables_config_));
  l.Commit();

  LOG(INFO) << "Set transaction tables version: " << new_version;

  return Status::OK();
}

uint64_t CatalogManager::GetTransactionTablesVersion() {
  auto l = CHECK_NOTNULL(transaction_tables_config_.get())->LockForRead();
  return l->pb.transaction_tables_config().version();
}

Status CatalogManager::RegisterTsFromRaftConfig(const consensus::RaftPeerPB& peer) {
  NodeInstancePB instance_pb;
  instance_pb.set_permanent_uuid(peer.permanent_uuid());
  instance_pb.set_instance_seqno(0);

  TSRegistrationPB registration_pb;
  auto* common = registration_pb.mutable_common();
  *common->mutable_private_rpc_addresses() = peer.last_known_private_addr();
  *common->mutable_broadcast_addresses() = peer.last_known_broadcast_addr();
  *common->mutable_cloud_info() = peer.cloud_info();

  // TODO(Rahul): May need to be changed when we implement table level overrides.
  {
    auto l = cluster_config_->LockForRead();
    // If the config has no replication info, use an empty string for the placement uuid;
    // otherwise calculate it from the reported peer.
    auto placement_uuid = l->pb.has_replication_info()
        ? VERIFY_RESULT(CatalogManagerUtil::GetPlacementUuidFromRaftPeer(
                            l->pb.replication_info(), peer))
        : "";
    common->set_placement_uuid(placement_uuid);
  }
  return master_->ts_manager()->RegisterTS(instance_pb, registration_pb, master_->MakeCloudInfoPB(),
                                           &master_->proxy_cache(),
                                           RegisteredThroughHeartbeat::kFalse);
}

void CatalogManager::ReconcileTabletReplicasInLocalMemoryWithReport(
    const scoped_refptr<TabletInfo>& tablet,
    const std::string& sender_uuid,
    const ConsensusStatePB& consensus_state,
    const ReportedTabletPB& report) {
  auto replica_locations = std::make_shared<TabletReplicaMap>();
  auto prev_rl = tablet->GetReplicaLocations();

  for (const consensus::RaftPeerPB& peer : consensus_state.config().peers()) {
    shared_ptr<TSDescriptor> ts_desc;
    if (!peer.has_permanent_uuid()) {
      LOG_WITH_PREFIX(WARNING) << "Missing UUID for peer " << peer.ShortDebugString();
      continue;
    }
    if (!master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
      if (!GetAtomicFlag(&FLAGS_enable_register_ts_from_raft)) {
        LOG_WITH_PREFIX(WARNING) << "Tablet server has never reported in. "
                                 << "Not including in replica locations map yet. Peer: "
                                 << peer.ShortDebugString() << "; Tablet: " << tablet->ToString();
        continue;
      }

      LOG_WITH_PREFIX(INFO) << "Tablet server has never reported in. Registering the ts using "
                            << "the raft config. Peer: " << peer.ShortDebugString()
                            << "; Tablet: " << tablet->ToString();
      Status s = RegisterTsFromRaftConfig(peer);
      if (!s.ok()) {
        LOG_WITH_PREFIX(WARNING) << "Could not register ts from raft config: " << s
                                 << " Skip updating the replica map.";
        continue;
      }

      // Guaranteed to find the ts since we just registered it.
      master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc);
      if (!ts_desc.get()) {
        LOG_WITH_PREFIX(WARNING) << "Could not find ts with uuid " << peer.permanent_uuid()
                                 << " after registering from raft config. Skip updating the replica"
                                 << " map.";
        continue;
      }
    }

    // Do not update replicas in the NOT_STARTED or BOOTSTRAPPING state (unless they are stale).
    bool use_existing = false;
    const TabletReplica* existing_replica = nullptr;
    auto it = prev_rl->find(ts_desc->permanent_uuid());
    if (it != prev_rl->end()) {
      existing_replica = &it->second;
    }
    if (existing_replica && peer.permanent_uuid() != sender_uuid) {
      // IsStarting returns true if state == NOT_STARTED or state == BOOTSTRAPPING.
      use_existing = existing_replica->IsStarting() && !existing_replica->IsStale();
    }
    if (use_existing) {
      InsertOrDie(replica_locations.get(), existing_replica->ts_desc->permanent_uuid(),
          *existing_replica);
    } else {
      TabletReplica replica;
      CreateNewReplicaForLocalMemory(ts_desc.get(), &consensus_state, report, &replica);
      auto result = replica_locations.get()->insert({replica.ts_desc->permanent_uuid(), replica});
      LOG_IF(FATAL, !result.second) << "duplicate uuid: " << replica.ts_desc->permanent_uuid();
      if (existing_replica) {
        result.first->second.UpdateDriveInfo(existing_replica->drive_info);
      }
    }
  }

  // Update the local tablet replica set. This deviates from persistent state during bootstrapping.
  tablet->SetReplicaLocations(replica_locations);
  tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel);
}
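The reconcile loop above rebuilds the replica map from the reported raft config, preferring a previously known replica entry only when it is still starting up and not stale, and implicitly dropping peers that left the config. A simplified model of that policy follows; the field names are stand-ins for the real TabletReplica state.

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Replica {
  std::string state;  // e.g. "RUNNING", "BOOTSTRAPPING"
  bool stale = false;
};

std::map<std::string, Replica> Reconcile(
    const std::vector<std::string>& raft_peers,
    const std::map<std::string, Replica>& previous,
    const std::string& sender_uuid,
    const Replica& reported) {
  std::map<std::string, Replica> next;
  for (const auto& uuid : raft_peers) {
    auto it = previous.find(uuid);
    // Keep the old entry only if it is a non-sender replica that is still starting
    // up and not stale; otherwise adopt what the report tells us.
    const bool keep_existing = it != previous.end() && uuid != sender_uuid &&
                               it->second.state == "BOOTSTRAPPING" && !it->second.stale;
    next[uuid] = keep_existing ? it->second : reported;
  }
  return next;  // Peers absent from the raft config simply drop out.
}

int main() {
  std::map<std::string, Replica> prev = {
      {"ts-1", {"RUNNING"}}, {"ts-2", {"BOOTSTRAPPING"}}, {"ts-gone", {"RUNNING"}}};
  auto next = Reconcile({"ts-1", "ts-2", "ts-3"}, prev, /*sender_uuid=*/"ts-1",
                        /*reported=*/{"RUNNING"});
  for (const auto& [uuid, r] : next) {
    std::cout << uuid << ": " << r.state << "\n";  // ts-2 keeps BOOTSTRAPPING; ts-gone dropped.
  }
}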

void CatalogManager::UpdateTabletReplicaInLocalMemory(TSDescriptor* ts_desc,
                                                      const ConsensusStatePB* consensus_state,
                                                      const ReportedTabletPB& report,
                                                      const scoped_refptr<TabletInfo>& tablet) {
  TabletReplica replica;
  CreateNewReplicaForLocalMemory(ts_desc, consensus_state, report, &replica);
  tablet->UpdateReplicaLocations(replica);
  tablet_locations_version_.fetch_add(1, std::memory_order_acq_rel);
}

void CatalogManager::CreateNewReplicaForLocalMemory(TSDescriptor* ts_desc,
                                                    const ConsensusStatePB* consensus_state,
                                                    const ReportedTabletPB& report,
                                                    TabletReplica* new_replica) {
  // Tablets in the NOT_STARTED or BOOTSTRAPPING state don't have a consensus.
  if (consensus_state == nullptr) {
    new_replica->role = PeerRole::NON_PARTICIPANT;
    new_replica->member_type = PeerMemberType::UNKNOWN_MEMBER_TYPE;
  } else {
    new_replica->role = GetConsensusRole(ts_desc->permanent_uuid(), *consensus_state);
    new_replica->member_type = GetConsensusMemberType(ts_desc->permanent_uuid(), *consensus_state);
  }
  if (report.has_should_disable_lb_move()) {
    new_replica->should_disable_lb_move = report.should_disable_lb_move();
  }
  if (report.has_fs_data_dir()) {
    new_replica->fs_data_dir = report.fs_data_dir();
  }
  new_replica->state = report.state();
  new_replica->ts_desc = ts_desc;
  if (!ts_desc->registered_through_heartbeat()) {
    new_replica->time_updated = MonoTime::Now() - ts_desc->TimeSinceHeartbeat();
  }
}

Status CatalogManager::GetTabletPeer(const TabletId& tablet_id,
                                     std::shared_ptr<TabletPeer>* ret_tablet_peer) const {
  // Note: the CatalogManager has only one table, the sys_catalog, with only one tablet.

  if (PREDICT_FALSE(!IsInitialized())) {
    // The master puts up the consensus service first and then initiates the catalog manager's
    // creation asynchronously. So this case is possible, but harmless: the RPC will simply be
    // retried. Previously, because we weren't checking for this condition, we would fatal
    // downstream.
    const string& reason = "CatalogManager is not yet initialized";
    YB_LOG_EVERY_N(WARNING, 1000) << reason;
    return STATUS(ServiceUnavailable, reason);
  }

  CHECK(sys_catalog_) << "sys_catalog_ must be initialized!";

  if (master_->opts().IsShellMode()) {
    return STATUS_SUBSTITUTE(NotFound,
        "In shell mode: no tablet_id $0 exists in CatalogManager.", tablet_id);
  }

  if (sys_catalog_->tablet_id() == tablet_id && sys_catalog_->tablet_peer().get() != nullptr &&
      sys_catalog_->tablet_peer()->CheckRunning().ok()) {
    *ret_tablet_peer = tablet_peer();
  } else {
    return STATUS_SUBSTITUTE(NotFound,
        "no SysTable in the RUNNING state exists with tablet_id $0 in CatalogManager", tablet_id);
  }
  return Status::OK();
}

const NodeInstancePB& CatalogManager::NodeInstance() const {
  return master_->instance_pb();
}

Status CatalogManager::GetRegistration(ServerRegistrationPB* reg) const {
  return master_->GetRegistration(reg, server::RpcOnly::kTrue);
}

Status CatalogManager::UpdateMastersListInMemoryAndDisk() {
  DCHECK(master_->opts().IsShellMode());

  if (!master_->opts().IsShellMode()) {
    return STATUS(IllegalState, "Cannot update master's info when process is not in shell mode.");
  }

  consensus::ConsensusStatePB consensus_state;
  RETURN_NOT_OK(GetCurrentConfig(&consensus_state));

  if (!consensus_state.has_config()) {
    return STATUS(NotFound, "No Raft config found.");
  }

  RETURN_NOT_OK(sys_catalog_->ConvertConfigToMasterAddresses(consensus_state.config()));
  RETURN_NOT_OK(sys_catalog_->CreateAndFlushConsensusMeta(master_->fs_manager(),
                                                          consensus_state.config(),
                                                          consensus_state.current_term()));

  return Status::OK();
}

Status CatalogManager::EnableBgTasks() {
  LockGuard lock(mutex_);
  // Initialize refresh_ysql_tablespace_info_task_. This will be used to manage the background
  // task that refreshes tablespace info. The task will be started by the CatalogManagerBgTasks
  // below.
  refresh_ysql_tablespace_info_task_.Bind(&master_->messenger()->scheduler());

  background_tasks_.reset(new CatalogManagerBgTasks(this));
  RETURN_NOT_OK_PREPEND(background_tasks_->Init(),
                        "Failed to initialize catalog manager background tasks");

  // Add a bg thread to rebuild yql system partitions.
  refresh_yql_partitions_task_.Bind(&master_->messenger()->scheduler());

  RETURN_NOT_OK(background_tasks_thread_pool_->SubmitFunc(
      [this]() { RebuildYQLSystemPartitions(); }));

  return Status::OK();
}

Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) {
  const TabletId& tablet_id = req.tablet_id();
  std::unique_lock<std::mutex> l(remote_bootstrap_mtx_, std::try_to_lock);
  if (!l.owns_lock()) {
    return STATUS_SUBSTITUTE(AlreadyPresent,
        "Remote bootstrap of tablet $0 already in progress", tablet_id);
  }

  if (!master_->opts().IsShellMode()) {
    return STATUS(IllegalState, "Cannot bootstrap a master which is not in shell mode.");
  }

  LOG(INFO) << "Starting remote bootstrap: " << req.ShortDebugString();

  HostPort bootstrap_peer_addr = HostPortFromPB(DesiredHostPort(
      req.source_broadcast_addr(), req.source_private_addr(), req.source_cloud_info(),
      master_->MakeCloudInfoPB()));

  const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid();
  int64_t leader_term = req.caller_term();

  std::shared_ptr<TabletPeer> old_tablet_peer;
  RaftGroupMetadataPtr meta;
  bool replacing_tablet = false;

  if (tablet_exists_) {
    old_tablet_peer = tablet_peer();
    // Nothing to recover if the remote bootstrap client start failed the last time.
    if (old_tablet_peer) {
      meta = old_tablet_peer->tablet_metadata();
      replacing_tablet = true;
    }
  }

  if (replacing_tablet) {
    // Make sure the existing tablet peer is shut down and tombstoned.
    RETURN_NOT_OK(tserver::HandleReplacingStaleTablet(meta,
                                                      old_tablet_peer,
                                                      tablet_id,
                                                      master_->fs_manager()->uuid(),
                                                      leader_term));
  }

  LOG_WITH_PREFIX(INFO) << "Initiating remote bootstrap from peer " << bootstrap_peer_uuid
                        << " (" << bootstrap_peer_addr.ToString() << ").";

  auto rb_client = std::make_unique<tserver::RemoteBootstrapClient>(
      tablet_id, master_->fs_manager());

  // Download and persist the remote superblock in the TABLET_DATA_COPYING state.
  if (replacing_tablet) {
    RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term));
  }
  RETURN_NOT_OK(rb_client->Start(
      bootstrap_peer_uuid, &master_->proxy_cache(), bootstrap_peer_addr, &meta));
  // This SetupTabletPeer is needed by rb_client to perform the remote bootstrap/fetch.
  // The SetupTablet below, which performs the "local bootstrap", cannot be done until the
  // remote fetch has succeeded, so the two are kept separate for now.
  sys_catalog_->SetupTabletPeer(meta);
  if (PREDICT_FALSE(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs)) {
    LOG(INFO) << "Injecting " << FLAGS_TEST_inject_latency_during_remote_bootstrap_secs
              << " seconds of latency for test";
    SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_inject_latency_during_remote_bootstrap_secs));
  }

  // From this point onward, the superblock is persisted in the TABLET_DATA_COPYING state, and
  // we need to tombstone the tablet if additional steps prior to getting to the
  // TABLET_DATA_READY state fail.
  tablet_exists_ = true;

  // Download all of the remote files.
  TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer()->status_listener()),
                   meta,
                   master_->fs_manager()->uuid(),
                   Substitute("Remote bootstrap: Unable to fetch data from remote peer $0 ($1)",
                              bootstrap_peer_uuid, bootstrap_peer_addr.ToString()),
                   nullptr);

  // Write out the last files to make the new replica visible and update the TabletDataState in
  // the superblock to TABLET_DATA_READY.
  // Finish() will call EndRemoteSession() and wait for the leader to successfully submit a
  // ChangeConfig request (to change this master's role from PRE_VOTER or PRE_OBSERVER to VOTER
  // or OBSERVER respectively). If the RPC times out, we will ignore the error (since the leader
  // could have successfully submitted the ChangeConfig request and failed to respond in time)
  // and check the committed config until we find that this master's role has changed, or until
  // we time out, which will cause us to tombstone the tablet.
  TOMBSTONE_NOT_OK(rb_client->Finish(),
                   meta,
                   master_->fs_manager()->uuid(),
                   "Remote bootstrap: Failed calling Finish()",
                   nullptr);

  // Synchronous tablet open for "local bootstrap".
  RETURN_NOT_OK(tserver::ShutdownAndTombstoneTabletPeerNotOk(
      sys_catalog_->OpenTablet(meta), sys_catalog_->tablet_peer(), meta,
      master_->fs_manager()->uuid(), "Remote bootstrap: Failed opening sys catalog"));

  // Set up the in-memory master list and also flush the cmeta.
  RETURN_NOT_OK(UpdateMastersListInMemoryAndDisk());

  master_->SetShellMode(false);

  // Call VerifyChangeRoleSucceeded only after we have set shell mode to false. Otherwise,
  // CatalogManager::GetTabletPeer will always return an error, and the consensus will never get
  // updated.
  auto status = rb_client->VerifyChangeRoleSucceeded(
      sys_catalog_->tablet_peer()->shared_consensus());

  if (!status.ok()) {
    LOG_WITH_PREFIX(WARNING) << "Remote bootstrap finished. "
                             << "Failed calling VerifyChangeRoleSucceeded: "
                             << status.ToString();
  } else {
    LOG_WITH_PREFIX(INFO) << "Remote bootstrap finished successfully";
  }

  LOG(INFO) << "Master completed remote bootstrap and is out of shell mode.";

  RETURN_NOT_OK(EnableBgTasks());

  return Status::OK();
}
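The std::try_to_lock acquisition at the top of StartRemoteBootstrap rejects a concurrent bootstrap immediately instead of queueing behind the one in flight. A minimal, self-contained demonstration of that admission pattern:

#include <iostream>
#include <mutex>
#include <string>
#include <thread>

std::mutex remote_bootstrap_mtx;

std::string StartBootstrap(const std::string& tablet_id) {
  // Non-blocking acquisition: fail fast if a bootstrap already holds the lock.
  std::unique_lock<std::mutex> l(remote_bootstrap_mtx, std::try_to_lock);
  if (!l.owns_lock()) {
    return "AlreadyPresent: remote bootstrap of tablet " + tablet_id + " already in progress";
  }
  // ... long-running bootstrap work happens here while the lock is held ...
  return "OK";
}

int main() {
  // Simulate an in-flight bootstrap holding the lock while a second request arrives
  // on another thread.
  std::unique_lock<std::mutex> in_flight(remote_bootstrap_mtx);
  std::thread t([] { std::cout << StartBootstrap("tablet-1") << "\n"; });  // Rejected.
  t.join();
  in_flight.unlock();
  std::cout << StartBootstrap("tablet-1") << "\n";  // OK.
}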

CHECKED_STATUS CatalogManager::SendAlterTableRequest(const scoped_refptr<TableInfo>& table,
                                                     const AlterTableRequestPB* req) {
  auto tablets = table->GetTablets();

  bool is_ysql_table_with_transaction_metadata =
      table->GetTableType() == TableType::PGSQL_TABLE_TYPE &&
      req != nullptr &&
      req->has_transaction() &&
      req->transaction().has_transaction_id();

  bool alter_table_has_add_or_drop_column_step = false;
  if (req && (req->alter_schema_steps_size() || req->has_alter_properties())) {
    for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) {
      if (step.type() == AlterTableRequestPB::ADD_COLUMN ||
          step.type() == AlterTableRequestPB::DROP_COLUMN) {
        alter_table_has_add_or_drop_column_step = true;
        break;
      }
    }
  }

  TransactionId txn_id = TransactionId::Nil();
  if (is_ysql_table_with_transaction_metadata && alter_table_has_add_or_drop_column_step) {
    {
      LOG(INFO) << "Persist transaction metadata into SysTableEntryPB for table ID " << table->id();
      TRACE("Locking table");
      auto l = table->LockForWrite();
      auto& tablet_data = *l.mutable_data();
      auto& table_pb = tablet_data.pb;
      table_pb.mutable_transaction()->CopyFrom(req->transaction());

      // Update sys-catalog with the transaction ID.
      TRACE("Updating table metadata on disk");
      RETURN_NOT_OK(master_->catalog_manager_impl()->sys_catalog_->Upsert(
          master_->catalog_manager()->leader_ready_term(), table.get()));

      // Update the in-memory state.
      TRACE("Committing in-memory state");
      l.Commit();
    }
    txn_id = VERIFY_RESULT(FullyDecodeTransactionId(req->transaction().transaction_id()));
  }

  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
    auto call = std::make_shared<AsyncAlterTable>(master_, AsyncTaskPool(), tablet, table, txn_id);
    tablet->table()->AddTask(call);
    if (PREDICT_FALSE(FLAGS_TEST_slowdown_alter_table_rpcs_ms > 0)) {
      LOG(INFO) << "Sleeping for " << tablet->id() << " "
                << FLAGS_TEST_slowdown_alter_table_rpcs_ms
                << "ms before sending async alter table request";
      SleepFor(MonoDelta::FromMilliseconds(FLAGS_TEST_slowdown_alter_table_rpcs_ms));
    }
    RETURN_NOT_OK(ScheduleTask(call));
  }
  return Status::OK();
}
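The loop at the end of SendAlterTableRequest fans out one asynchronous AlterTable task per tablet and returns without waiting for the RPCs to complete. The sketch below models that fan-out with std::async standing in for the master's async task pool; it is an illustration of the pattern, not the real task machinery.

#include <future>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> tablet_ids = {"t-1", "t-2", "t-3"};

  std::vector<std::future<std::string>> tasks;
  tasks.reserve(tablet_ids.size());
  for (const auto& id : tablet_ids) {
    // In the real code this is an AsyncAlterTable task tracked on the table (so it can
    // be aborted later); here each task just reports what it would send.
    tasks.push_back(std::async(std::launch::async, [id] {
      return "sent AlterTable RPC for tablet " + id;
    }));
  }

  // The catalog manager does not block on its tasks; we join here only to print results.
  for (auto& t : tasks) {
    std::cout << t.get() << "\n";
  }
}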

void CatalogManager::SendCopartitionTabletRequest(const scoped_refptr<TabletInfo>& tablet,
                                                  const scoped_refptr<TableInfo>& table) {
  auto call = std::make_shared<AsyncCopartitionTable>(master_, AsyncTaskPool(), tablet, table);
  table->AddTask(call);
  WARN_NOT_OK(ScheduleTask(call), "Failed to send copartition table request");
}

Status CatalogManager::SendSplitTabletRequest(
    const scoped_refptr<TabletInfo>& tablet, std::array<TabletId, kNumSplitParts> new_tablet_ids,
    const std::string& split_encoded_key, const std::string& split_partition_key) {
  VLOG(2) << "Scheduling SplitTablet request to leader tserver for source tablet ID: "
          << tablet->tablet_id() << ", after-split tablet IDs: " << AsString(new_tablet_ids);
  auto call = std::make_shared<AsyncSplitTablet>(
      master_, AsyncTaskPool(), tablet, new_tablet_ids, split_encoded_key, split_partition_key,
      &tablet_split_manager_);
  tablet->table()->AddTask(call);
  return ScheduleTask(call);
}

void CatalogManager::DeleteTabletReplicas(
    TabletInfo* tablet, const std::string& msg, HideOnly hide_only) {
  auto locations = tablet->GetReplicaLocations();
  LOG(INFO) << "Sending DeleteTablet for " << locations->size()
            << " replicas of tablet " << tablet->tablet_id();
  for (const auto& r : *locations) {
    SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, boost::none, tablet->table(),
                            r.second.ts_desc, msg, hide_only);
  }
}

Status CatalogManager::CheckIfForbiddenToDeleteTabletOf(const scoped_refptr<TableInfo>& table) {
  // Do not delete the system catalog tablet.
  if (IsSystemTable(*table)) {
    return STATUS(InvalidArgument, "It is not allowed to delete system tables");
  }
  // Do not delete the tablet of a colocated table.
  if (table->IsColocatedUserTable()) {
    return STATUS(InvalidArgument, "It is not allowed to delete tablets of colocated tables.");
  }
  return Status::OK();
}

Status CatalogManager::DeleteTabletsAndSendRequests(
    const TableInfoPtr& table, const RepeatedBytes& retained_by_snapshot_schedules) {
  // Silently fail if tablet deletion is forbidden, so table deletion can continue executing.
  if (!CheckIfForbiddenToDeleteTabletOf(table).ok()) {
    return Status::OK();
  }

  auto tablets = table->GetTablets(IncludeInactive::kTrue);

  std::sort(tablets.begin(), tablets.end(), [](const auto& lhs, const auto& rhs) {
    return lhs->tablet_id() < rhs->tablet_id();
  });

  string deletion_msg = "Table deleted at " + LocalTimeAsString();
  RETURN_NOT_OK(DeleteTabletListAndSendRequests(
      tablets, deletion_msg, retained_by_snapshot_schedules));

  if (table->IsColocatedParentTable()) {
    SharedLock lock(mutex_);
    colocated_tablet_ids_map_.erase(table->namespace_id());
  } else if (table->IsTablegroupParentTable()) {
    // In the case of a dropped tablegroup parent table, we need to delete the tablegroup info.
    SharedLock lock(mutex_);
    tablegroup_ids_map_.erase(table->id().substr(0, 32));
  }
  return Status::OK();
}
8765
8766
Status CatalogManager::DeleteTabletListAndSendRequests(
8767
    const std::vector<scoped_refptr<TabletInfo>>& tablets, const std::string& deletion_msg,
8768
2.76k
    const google::protobuf::RepeatedPtrField<std::string>& retained_by_snapshot_schedules) {
8769
2.76k
  struct TabletData {
8770
2.76k
    TabletInfoPtr tablet;
8771
2.76k
    TabletInfo::WriteLock lock;
8772
2.76k
    HideOnly hide_only;
8773
2.76k
  };
8774
2.76k
  std::vector<TabletData> tablets_data;
8775
2.76k
  tablets_data.reserve(tablets.size());
8776
2.76k
  std::vector<TabletInfo*> tablet_infos;
8777
2.76k
  tablet_infos.reserve(tablets_data.size());
8778
2.76k
  std::vector<TabletInfoPtr> marked_as_hidden;
8779
8780
  // Grab tablets and tablet write locks. The list should already be in tablet_id sorted order.
8781
2.76k
  {
8782
2.76k
    SharedLock read_lock(mutex_);
8783
15.6k
    for (const auto& tablet : tablets) {
8784
15.6k
      tablets_data.push_back(TabletData {
8785
15.6k
        .tablet = tablet,
8786
15.6k
        .lock = tablet->LockForWrite(),
8787
        // Hide tablet if it is retained by snapshot schedule, or is part of a cdc stream.
8788
15.6k
        .hide_only = HideOnly(!retained_by_snapshot_schedules.empty()),
8789
15.6k
      });
8790
15.6k
      if (!tablets_data.back().hide_only) {
8791
        // Also check if this tablet is part of a cdc stream and is not already hidden. If this is
8792
        // a cdc stream producer and is already hidden, then we should delete this tablet.
8793
15.6k
        tablets_data.back().hide_only = HideOnly(
8794
15.6k
            IsTableCdcProducer(*tablet->table()) && !tablets_data.back().lock->ListedAsHidden());
8795
15.6k
      }
8796
8797
15.6k
      tablet_infos.emplace_back(tablet.get());
8798
15.6k
    }
8799
2.76k
  }
8800
8801
  // Use the same hybrid time for all hidden tablets.
8802
2.76k
  HybridTime hide_hybrid_time = master_->clock()->Now();
8803
8804
  // Mark the tablets as deleted.
8805
15.6k
  for (auto& tablet_data : tablets_data) {
8806
15.6k
    auto& tablet = tablet_data.tablet;
8807
15.6k
    auto& tablet_lock = tablet_data.lock;
8808
8809
15.6k
    bool was_hidden = tablet_lock->ListedAsHidden();
8810
    // The tablet is now inactive, so remove it from partitions_.
8811
    // After all the tablets have been deleted from the tservers, we remove it from tablets_.
8812
15.6k
    tablet->table()->RemoveTablet(tablet->id(), DeactivateOnly::kTrue);
8813
8814
15.6k
    if (tablet_data.hide_only) {
8815
0
      LOG(INFO) << "Hiding tablet " << tablet->tablet_id();
8816
0
      tablet_lock.mutable_data()->pb.set_hide_hybrid_time(hide_hybrid_time.ToUint64());
8817
0
      *tablet_lock.mutable_data()->pb.mutable_retained_by_snapshot_schedules() =
8818
0
          retained_by_snapshot_schedules;
8819
15.6k
    } else {
8820
15.6k
      LOG(INFO) << "Deleting tablet " << tablet->tablet_id();
8821
15.6k
      tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg);
8822
15.6k
    }
8823
15.6k
    if (tablet_lock->ListedAsHidden() && !was_hidden) {
8824
0
      marked_as_hidden.push_back(tablet);
8825
0
    }
8826
15.6k
  }
8827
8828
  // Update all the tablet states in raft in bulk.
8829
2.76k
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), tablet_infos));
8830
8831
  // Commit the change.
8832
15.6k
  for (auto& tablet_data : tablets_data) {
8833
15.6k
    auto& tablet = tablet_data.tablet;
8834
15.6k
    auto& tablet_lock = tablet_data.lock;
8835
8836
15.6k
    tablet_lock.Commit();
8837
15.6k
    LOG(INFO) << (tablet_data.hide_only ? "Hid tablet " : "Deleted tablet ") << tablet->tablet_id();
8838
8839
15.6k
    DeleteTabletReplicas(tablet.get(), deletion_msg, tablet_data.hide_only);
8840
15.6k
  }
8841
8842
2.76k
  if (!marked_as_hidden.empty()) {
8843
0
    LockGuard lock(mutex_);
8844
0
    hidden_tablets_.insert(hidden_tablets_.end(), marked_as_hidden.begin(), marked_as_hidden.end());
8845
0
  }
8846
8847
2.76k
  return Status::OK();
8848
2.76k
}
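
The function above follows the catalog manager's usual copy-on-write sequence: take tablet write locks, mutate only the dirty copies, persist every entry with a single bulk sys-catalog write, and commit the in-memory copies last so readers never observe unpersisted state. A minimal standalone sketch of that sequence (CowRecord and PersistAll are illustrative stand-ins, not the YugabyteDB API):

#include <cassert>
#include <string>
#include <vector>

// Illustrative copy-on-write record: readers see 'committed' while a writer
// prepares 'dirty'; Commit() publishes the dirty copy. Locking is elided.
struct CowRecord {
  std::string committed;
  std::string dirty;

  void StartMutation() { dirty = committed; }   // take write lock, copy
  void Commit()        { committed = dirty; }   // publish to readers
};

// Stand-in for the bulk sys-catalog write; assume all-or-nothing semantics.
bool PersistAll(const std::vector<CowRecord*>&) { return true; }

int main() {
  std::vector<CowRecord> tablets(3);
  std::vector<CowRecord*> dirty_records;
  for (auto& tablet : tablets) {
    tablet.StartMutation();          // 1. lock and copy
    tablet.dirty = "DELETED";        // 2. mutate only the dirty copy
    dirty_records.push_back(&tablet);
  }
  if (PersistAll(dirty_records)) {   // 3. one bulk write for every tablet
    for (auto& tablet : tablets) {
      tablet.Commit();               // 4. commit in memory last
    }
  }
  assert(tablets[0].committed == "DELETED");
}
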
8849
8850
void CatalogManager::SendDeleteTabletRequest(
8851
    const TabletId& tablet_id,
8852
    TabletDataState delete_type,
8853
    const boost::optional<int64_t>& cas_config_opid_index_less_or_equal,
8854
    const scoped_refptr<TableInfo>& table,
8855
    TSDescriptor* ts_desc,
8856
    const string& reason,
8857
46.6k
    bool hide_only) {
8858
46.6k
  if (PREDICT_FALSE(GetAtomicFlag(&FLAGS_TEST_disable_tablet_deletion))) {
8859
0
    return;
8860
0
  }
8861
46.6k
  LOG_WITH_PREFIX(INFO)
8862
46.6k
      << (hide_only ? "Hiding" : "Deleting") << " tablet " << tablet_id << " on peer "
8863
46.6k
      << ts_desc->permanent_uuid() << " with delete type "
8864
46.6k
      << TabletDataState_Name(delete_type) << " (" << reason << ")";
8865
46.6k
  auto call = std::make_shared<AsyncDeleteReplica>(master_, AsyncTaskPool(),
8866
46.6k
      ts_desc->permanent_uuid(), table, tablet_id, delete_type,
8867
46.6k
      cas_config_opid_index_less_or_equal, reason);
8868
46.6k
  if (hide_only) {
8869
0
    call->set_hide_only(hide_only);
8870
0
  }
8871
46.6k
  if (table != nullptr) {
8872
46.6k
    table->AddTask(call);
8873
46.6k
  }
8874
8875
46.6k
  auto status = ScheduleTask(call);
8876
46.6k
  WARN_NOT_OK(status, Substitute("Failed to send delete request for tablet $0", tablet_id));
8877
  // TODO(bogdan): does the pending delete semantics need to change?
8878
46.6k
  if (status.ok()) {
8879
46.6k
    ts_desc->AddPendingTabletDelete(tablet_id);
8880
46.6k
  }
8881
46.6k
}
8882
8883
void CatalogManager::SendLeaderStepDownRequest(
8884
    const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate,
8885
    const string& change_config_ts_uuid, bool should_remove,
8886
5.81k
    const string& new_leader_uuid) {
8887
5.81k
  auto task = std::make_shared<AsyncTryStepDown>(
8888
5.81k
      master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid, should_remove,
8889
5.81k
      new_leader_uuid);
8890
5.81k
  tablet->table()->AddTask(task);
8891
5.81k
  Status status = ScheduleTask(task);
8892
5.81k
  WARN_NOT_OK(status, Substitute("Failed to send new $0 request", task->type_name()));
8893
5.81k
}
8894
8895
// TODO: refactor this into a joint method with the add one.
8896
void CatalogManager::SendRemoveServerRequest(
8897
    const scoped_refptr<TabletInfo>& tablet, const ConsensusStatePB& cstate,
8898
794
    const string& change_config_ts_uuid) {
8899
  // Check if the user wants the leader to be stepped down.
8900
794
  auto task = std::make_shared<AsyncRemoveServerTask>(
8901
794
      master_, AsyncTaskPool(), tablet, cstate, change_config_ts_uuid);
8902
794
  tablet->table()->AddTask(task);
8903
794
  WARN_NOT_OK(ScheduleTask(task), Substitute("Failed to send new $0 request", task->type_name()));
8904
794
}
8905
8906
void CatalogManager::SendAddServerRequest(
8907
    const scoped_refptr<TabletInfo>& tablet, PeerMemberType member_type,
8908
1.04k
    const ConsensusStatePB& cstate, const string& change_config_ts_uuid) {
8909
1.04k
  auto task = std::make_shared<AsyncAddServerTask>(master_, AsyncTaskPool(), tablet, member_type,
8910
1.04k
      cstate, change_config_ts_uuid);
8911
1.04k
  tablet->table()->AddTask(task);
8912
1.04k
  WARN_NOT_OK(
8913
1.04k
      ScheduleTask(task),
8914
1.04k
      Substitute("Failed to send AddServer of tserver $0 to tablet $1",
8915
1.04k
                 change_config_ts_uuid, tablet.get()->ToString()));
8916
1.04k
}
8917
8918
void CatalogManager::GetPendingServerTasksUnlocked(
8919
    const TableId &table_uuid,
8920
    TabletToTabletServerMap *add_replica_tasks_map,
8921
    TabletToTabletServerMap *remove_replica_tasks_map,
8922
121k
    TabletToTabletServerMap *stepdown_leader_tasks_map) {
8923
8924
121k
  auto table = GetTableInfoUnlocked(table_uuid);
8925
111k
  for (const auto& task : table->GetTasks()) {
8926
111k
    TabletToTabletServerMap* outputMap = nullptr;
8927
111k
    if (task->type() == MonitoredTask::ASYNC_ADD_SERVER) {
8928
173
      outputMap = add_replica_tasks_map;
8929
110k
    } else if (task->type() == MonitoredTask::ASYNC_REMOVE_SERVER) {
8930
343
      outputMap = remove_replica_tasks_map;
8931
110k
    } else if (task->type() == MonitoredTask::ASYNC_TRY_STEP_DOWN) {
8932
      // Store new_leader_uuid instead of change_config_ts_uuid.
8933
96
      auto raft_task = static_cast<AsyncTryStepDown*>(task.get());
8934
96
      (*stepdown_leader_tasks_map)[raft_task->tablet_id()] = raft_task->new_leader_uuid();
8935
96
      continue;
8936
96
    }
8937
110k
    if (outputMap) {
8938
516
      auto raft_task = static_cast<CommonInfoForRaftTask*>(task.get());
8939
516
      (*outputMap)[raft_task->tablet_id()] = raft_task->change_config_ts_uuid();
8940
516
    }
8941
110k
  }
8942
121k
}
8943
8944
void CatalogManager::ExtractTabletsToProcess(
8945
    TabletInfos *tablets_to_delete,
8946
90.0k
    TableToTabletInfos *tablets_to_process) {
8947
90.0k
  SharedLock lock(mutex_);
8948
8949
  // TODO: At the moment we loop through all the tablets
8950
  //       we can keep a set of tablets waiting for "assignment"
8951
  //       or just a counter, to avoid taking the lock and looping through the tablets
8952
  //       if everything is "stable".
8953
8954
3.61M
  for (const TabletInfoMap::value_type& entry : *tablet_map_) {
8955
3.61M
    scoped_refptr<TabletInfo> tablet = entry.second;
8956
3.61M
    auto table = tablet->table();
8957
3.61M
    if (!table) {
8958
      // The tablet is orphaned or in the preparing state; continue.
8959
0
      continue;
8960
0
    }
8961
8962
    // acquire table lock before tablets.
8963
3.61M
    auto table_lock = table->LockForRead();
8964
3.61M
    auto tablet_lock = tablet->LockForRead();
8965
8966
    // If the table is deleted or the tablet was replaced at table creation time.
8967
3.61M
    if (tablet_lock->is_deleted() || table_lock->started_deleting()) {
8968
      // Process this table deletion only once (tombstones for table may remain longer).
8969
793k
      if (table_ids_map_->find(tablet->table()->id()) != table_ids_map_->end()) {
8970
793k
        tablets_to_delete->push_back(tablet);
8971
793k
      }
8972
      // Don't process deleted tables regardless.
8973
793k
      continue;
8974
793k
    }
8975
8976
    // Running tablets.
8977
2.82M
    if (tablet_lock->is_running()) {
8978
      // TODO: handle last update > not responding timeout?
8979
2.76M
      continue;
8980
2.76M
    }
8981
8982
    // Tablets not yet assigned or with a report just received.
8983
55.6k
    (*tablets_to_process)[tablet->table()->id()].push_back(tablet);
8984
55.6k
  }
8985
90.0k
}
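
The TODO above suggests tracking tablets awaiting assignment with a set or counter, so this background pass can skip the full scan when the cluster is stable. A hedged sketch of the counter variant; the names and wiring are hypothetical:

#include <atomic>
#include <cstdio>

// Hypothetical gate: producers bump the counter when a tablet enters a
// not-yet-running state, and decrement it once the tablet is running.
std::atomic<int> pending_assignments{0};

void OnTabletNeedsAssignment() { pending_assignments.fetch_add(1); }
void OnTabletRunning()         { pending_assignments.fetch_sub(1); }

// The background task would consult this before taking the catalog lock and
// walking every tablet, skipping the scan entirely when everything is stable.
bool ShouldScanTablets() {
  return pending_assignments.load(std::memory_order_relaxed) > 0;
}

int main() {
  std::printf("scan? %d\n", ShouldScanTablets());  // 0: stable, skip the scan
  OnTabletNeedsAssignment();
  std::printf("scan? %d\n", ShouldScanTablets());  // 1: a tablet needs work
}
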
8986
8987
72.7k
bool CatalogManager::AreTablesDeleting() {
8988
72.7k
  SharedLock lock(mutex_);
8989
8990
8.20M
  for (const TableInfoMap::value_type& entry : *table_ids_map_) {
8991
8.20M
    scoped_refptr<TableInfo> table(entry.second);
8992
8.20M
    auto table_lock = table->LockForRead();
8993
    // TODO(jason): possibly change this to started_deleting when we begin removing DELETED tables
8994
    // from table_ids_map_ (see CleanUpDeletedTables).
8995
8.20M
    if (table_lock->is_deleting()) {
8996
17
      return true;
8997
17
    }
8998
8.20M
  }
8999
72.7k
  return false;
9000
72.7k
}
9001
9002
struct DeferredAssignmentActions {
9003
  std::vector<TabletInfo*> modified_tablets;
9004
  std::vector<TabletInfo*> needs_create_rpc;
9005
};
9006
9007
void CatalogManager::HandleAssignPreparingTablet(TabletInfo* tablet,
9008
28.3k
                                                 DeferredAssignmentActions* deferred) {
9009
  // The tablet was just created (probably by a CreateTable RPC).
9010
  // Update the state to "creating" to be ready for the creation request.
9011
28.3k
  tablet->mutable_metadata()->mutable_dirty()->set_state(
9012
28.3k
    SysTabletsEntryPB::CREATING, "Sending initial creation of tablet");
9013
28.3k
  deferred->modified_tablets.push_back(tablet);
9014
28.3k
  deferred->needs_create_rpc.push_back(tablet);
9015
0
  VLOG(1) << "Assign new tablet " << tablet->ToString();
9016
28.3k
}
9017
9018
void CatalogManager::HandleAssignCreatingTablet(TabletInfo* tablet,
9019
                                                DeferredAssignmentActions* deferred,
9020
27.2k
                                                vector<scoped_refptr<TabletInfo>>* new_tablets) {
9021
27.2k
  MonoDelta time_since_updated =
9022
27.2k
      MonoTime::Now().GetDeltaSince(tablet->last_update_time());
9023
27.2k
  int64_t remaining_timeout_ms =
9024
27.2k
      FLAGS_tablet_creation_timeout_ms - time_since_updated.ToMilliseconds();
9025
9026
27.2k
  if (tablet->LockForRead()->pb.has_split_parent_tablet_id()) {
9027
    // No need to recreate post-split tablets, since this is always done on source tablet replicas.
9028
0
    VLOG(2) << "Post-split tablet " << AsString(tablet) << " still being created.";
9029
118
    return;
9030
118
  }
9031
  // Skip the tablet if the assignment timeout is not yet expired.
9032
27.1k
  if (remaining_timeout_ms > 0) {
9033
0
    VLOG(2) << "Tablet " << tablet->ToString() << " still being created. "
9034
0
            << remaining_timeout_ms << "ms remain until timeout.";
9035
27.1k
    return;
9036
27.1k
  }
9037
9038
13
  const PersistentTabletInfo& old_info = tablet->metadata().state();
9039
9040
  // The "tablet creation" was already sent, but we didn't receive an answer
9041
  // within the timeout. So the tablet will be replaced by a new one.
9042
13
  TabletInfoPtr replacement;
9043
13
  {
9044
13
    LockGuard lock(mutex_);
9045
13
    replacement = CreateTabletInfo(tablet->table().get(), old_info.pb.partition());
9046
13
  }
9047
13
  LOG(WARNING) << "Tablet " << tablet->ToString() << " was not created within "
9048
13
               << "the allowed timeout. Replacing with a new tablet "
9049
13
               << replacement->tablet_id();
9050
9051
13
  tablet->table()->ReplaceTablet(tablet, replacement);
9052
13
  {
9053
13
    LockGuard lock(mutex_);
9054
13
    auto tablet_map_checkout = tablet_map_.CheckOut();
9055
13
    (*tablet_map_checkout)[replacement->tablet_id()] = replacement;
9056
13
  }
9057
9058
  // Mark old tablet as replaced.
9059
13
  tablet->mutable_metadata()->mutable_dirty()->set_state(
9060
13
    SysTabletsEntryPB::REPLACED,
9061
13
    Substitute("Replaced by $0 at $1",
9062
13
               replacement->tablet_id(), LocalTimeAsString()));
9063
9064
  // Mark new tablet as being created.
9065
13
  replacement->mutable_metadata()->mutable_dirty()->set_state(
9066
13
    SysTabletsEntryPB::CREATING,
9067
13
    Substitute("Replacement for $0", tablet->tablet_id()));
9068
9069
13
  deferred->modified_tablets.push_back(tablet);
9070
13
  deferred->modified_tablets.push_back(replacement.get());
9071
13
  deferred->needs_create_rpc.push_back(replacement.get());
9072
0
  VLOG(1) << "Replaced tablet " << tablet->tablet_id()
9073
0
          << " with " << replacement->tablet_id()
9074
0
          << " (table " << tablet->table()->ToString() << ")";
9075
9076
13
  new_tablets->push_back(replacement);
9077
13
}
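
The replacement decision above is plain clock arithmetic: a tablet stuck in CREATING is replaced once FLAGS_tablet_creation_timeout_ms has elapsed since its last update. A self-contained illustration of the check, with made-up values:

#include <chrono>
#include <cstdint>
#include <cstdio>

using Clock = std::chrono::steady_clock;

// True once the creation deadline has passed and the tablet should be
// replaced; timeout_ms plays the role of FLAGS_tablet_creation_timeout_ms.
bool CreationTimedOut(Clock::time_point last_update, int64_t timeout_ms) {
  auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
      Clock::now() - last_update).count();
  return timeout_ms - elapsed_ms <= 0;
}

int main() {
  // Made-up values: 45s since the last update against a 30s timeout.
  auto last_update = Clock::now() - std::chrono::seconds(45);
  std::printf("timed out: %d\n", CreationTimedOut(last_update, 30000));  // 1
}
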
9078
9079
// TODO: we could batch the IO onto a background thread.
9080
Status CatalogManager::HandleTabletSchemaVersionReport(
9081
57.0k
    TabletInfo *tablet, uint32_t version, const scoped_refptr<TableInfo>& table_info) {
9082
57.0k
  scoped_refptr<TableInfo> table;
9083
57.0k
  if (table_info) {
9084
20.5k
    table = table_info;
9085
36.4k
  } else {
9086
36.4k
    table = tablet->table();
9087
36.4k
  }
9088
9089
  // Update the schema version if it's the latest.
9090
57.0k
  tablet->set_reported_schema_version(table->id(), version);
9091
110
  VLOG_WITH_PREFIX_AND_FUNC(1)
9092
110
      << "Tablet " << tablet->tablet_id() << " reported version " << version;
9093
9094
  // Verify if it's the last tablet report, and the alter completed.
9095
57.0k
  {
9096
57.0k
    auto l = table->LockForRead();
9097
57.0k
    if (l->pb.state() != SysTablesEntryPB::ALTERING) {
9098
18.4E
      VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " is not altering";
9099
38.7k
      return Status::OK();
9100
38.7k
    }
9101
9102
18.2k
    uint32_t current_version = l->pb.version();
9103
18.2k
    if (table->IsAlterInProgress(current_version)) {
9104
0
      VLOG_WITH_PREFIX_AND_FUNC(2) << "Table " << table->ToString() << " has IsAlterInProgress ("
9105
0
                                   << current_version << ")";
9106
12.7k
      return Status::OK();
9107
12.7k
    }
9108
5.53k
  }
9109
9110
5.53k
  return MultiStageAlterTable::LaunchNextTableInfoVersionIfNecessary(this, table, version);
9111
5.53k
}
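
The version check above completes a multi-stage alter only when no tablet still reports an older schema version. A minimal model of what IsAlterInProgress plausibly computes; the types and names here are illustrative, not the actual implementation:

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

// Illustrative model only: an alter is still in progress while any tablet
// has reported a schema version older than the table's current one.
bool IsAlterInProgress(const std::unordered_map<std::string, uint32_t>& reported,
                       uint32_t current_version) {
  for (const auto& [tablet_id, version] : reported) {
    if (version < current_version) {
      return true;
    }
  }
  return false;
}

int main() {
  std::unordered_map<std::string, uint32_t> reported = {
      {"t1", 3}, {"t2", 3}, {"t3", 2}};
  std::printf("altering: %d\n", IsAlterInProgress(reported, 3));  // 1
  reported["t3"] = 3;  // the last tablet catches up
  std::printf("altering: %d\n", IsAlterInProgress(reported, 3));  // 0
}
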
9112
9113
Status CatalogManager::ProcessPendingAssignmentsPerTable(
9114
12.6k
    const TableId& table_id, const TabletInfos& tablets, CMGlobalLoadState* global_load_state) {
9115
0
  VLOG(1) << "Processing pending assignments";
9116
9117
12.6k
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
9118
9119
  // Initialize this table load state.
9120
12.6k
  CMPerTableLoadState table_load_state(global_load_state);
9121
12.6k
  InitializeTableLoadState(table_id, ts_descs, &table_load_state);
9122
12.6k
  table_load_state.SortLoad();
9123
9124
  // Take write locks on all tablets to be processed, and ensure that they are
9125
  // unlocked at the end of this scope.
9126
55.6k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9127
55.6k
    tablet->mutable_metadata()->StartMutation();
9128
55.6k
  }
9129
12.6k
  ScopedInfoCommitter<TabletInfo> unlocker_in(&tablets);
9130
9131
  // Any tablets created by the helper functions will also be created in a
9132
  // locked state, so we must ensure they are unlocked before we return to
9133
  // avoid deadlocks.
9134
12.6k
  TabletInfos new_tablets;
9135
12.6k
  ScopedInfoCommitter<TabletInfo> unlocker_out(&new_tablets);
9136
9137
12.6k
  DeferredAssignmentActions deferred;
9138
9139
  // Iterate over each of the tablets and handle it, whatever state
9140
  // it may be in. The actions required for the tablet are collected
9141
  // into 'deferred'.
9142
55.6k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9143
55.6k
    SysTabletsEntryPB::State t_state = tablet->metadata().state().pb.state();
9144
9145
55.6k
    switch (t_state) {
9146
28.3k
      case SysTabletsEntryPB::PREPARING:
9147
28.3k
        HandleAssignPreparingTablet(tablet.get(), &deferred);
9148
28.3k
        break;
9149
9150
27.2k
      case SysTabletsEntryPB::CREATING:
9151
27.2k
        HandleAssignCreatingTablet(tablet.get(), &deferred, &new_tablets);
9152
27.2k
        break;
9153
9154
50
      default:
9155
0
        VLOG(2) << "Nothing to do for tablet " << tablet->tablet_id() << ": state = "
9156
0
                << SysTabletsEntryPB_State_Name(t_state);
9157
50
        break;
9158
55.6k
    }
9159
55.6k
  }
9160
9161
  // Nothing to do.
9162
12.6k
  if (deferred.modified_tablets.empty() &&
9163
8.68k
      deferred.needs_create_rpc.empty()) {
9164
8.68k
    return Status::OK();
9165
8.68k
  }
9166
9167
  // For those tablets which need to be created in this round, assign replicas.
9168
3.98k
  Status s;
9169
3.98k
  std::unordered_set<TableInfo*> ok_status_tables;
9170
28.3k
  for (TabletInfo *tablet : deferred.needs_create_rpc) {
9171
    // NOTE: if we fail to select replicas on the first pass (due to
9172
    // insufficient Tablet Servers being online), we will still try
9173
    // again unless the tablet/table creation is cancelled.
9174
28.3k
    LOG(INFO) << "Selecting replicas for tablet " << tablet->id();
9175
28.3k
    s = SelectReplicasForTablet(ts_descs, tablet, &table_load_state, global_load_state);
9176
28.3k
    if (!s.ok()) {
9177
0
      s = s.CloneAndPrepend(Substitute(
9178
0
          "An error occurred while selecting replicas for tablet $0: $1",
9179
0
          tablet->tablet_id(), s.ToString()));
9180
0
      tablet->table()->SetCreateTableErrorStatus(s);
9181
0
      break;
9182
28.3k
    } else {
9183
28.3k
      ok_status_tables.emplace(tablet->table().get());
9184
28.3k
    }
9185
28.3k
  }
9186
9187
  // Update the sys catalog with the new set of tablets/metadata.
9188
3.98k
  if (s.ok()) {
9189
    // If any of the ok_status_tables had an error in the previous iterations, we
9190
    // need to clear the error status to reflect that all the tablet creations have now
9191
    // succeeded.
9192
3.98k
    for (TableInfo* table : ok_status_tables) {
9193
3.98k
      table->SetCreateTableErrorStatus(Status::OK());
9194
3.98k
    }
9195
9196
3.98k
    s = sys_catalog_->Upsert(leader_ready_term(), deferred.modified_tablets);
9197
3.98k
    if (!s.ok()) {
9198
2
      s = s.CloneAndPrepend("An error occurred while persisting the updated tablet metadata");
9199
2
    }
9200
3.98k
  }
9201
9202
3.98k
  if (!s.ok()) {
9203
2
    LOG(WARNING) << "Aborting the current task due to error: " << s.ToString();
9204
    // If there was an error, abort any mutations started by the current task.
9205
    // NOTE: Lock order should be lock_ -> table -> tablet.
9206
    // We currently have a bunch of tablets locked and need to unlock first to ensure this holds.
9207
9208
0
    std::sort(new_tablets.begin(), new_tablets.end(), [](const auto& lhs, const auto& rhs) {
9209
0
      return lhs->table().get() < rhs->table().get();
9210
0
    });
9211
2
    {
9212
2
      std::string current_table_name;
9213
2
      TableInfoPtr current_table;
9214
0
      for (auto& tablet_to_remove : new_tablets) {
9215
0
        if (tablet_to_remove->table()->RemoveTablet(tablet_to_remove->tablet_id())) {
9216
0
          if (VLOG_IS_ON(1)) {
9217
0
            if (current_table != tablet_to_remove->table()) {
9218
0
              current_table = tablet_to_remove->table();
9219
0
              current_table_name = current_table->name();
9220
0
            }
9221
0
            LOG(INFO) << "Removed tablet " << tablet_to_remove->tablet_id() << " from table "
9222
0
                      << current_table_name;
9223
0
          }
9224
0
        }
9225
0
      }
9226
2
    }
9227
9228
2
    unlocker_out.Abort();  // tablet.unlock
9229
2
    unlocker_in.Abort();
9230
9231
2
    {
9232
2
      LockGuard lock(mutex_); // lock_.lock
9233
2
      auto tablet_map_checkout = tablet_map_.CheckOut();
9234
0
      for (auto& tablet_to_remove : new_tablets) {
9235
        // Potential race condition above, but it's okay if a background thread deleted this.
9236
0
        tablet_map_checkout->erase(tablet_to_remove->tablet_id());
9237
0
      }
9238
2
    }
9239
2
    return s;
9240
2
  }
9241
9242
  // Send DeleteTablet requests to tablet servers serving deleted tablets.
9243
  // This is asynchronous / non-blocking.
9244
28.3k
  for (auto* tablet : deferred.modified_tablets) {
9245
28.3k
    if (tablet->metadata().dirty().is_deleted()) {
9246
      // This is an actual delete, because we are deleting the tablet's replicas.
9247
13
      DeleteTabletReplicas(tablet, tablet->metadata().dirty().pb.state_msg(), HideOnly::kFalse);
9248
13
    }
9249
28.3k
  }
9250
  // Send the CreateTablet() requests to the servers. This is asynchronous / non-blocking.
9251
3.98k
  return SendCreateTabletRequests(deferred.needs_create_rpc);
9252
3.98k
}
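
The two ScopedInfoCommitter instances above guarantee that every StartMutation() is paired with a commit or abort on all return paths, including the early error returns. A stripped-down sketch of such a guard (not the actual class):

#include <vector>

struct Info {
  void StartMutation() {}
  void CommitMutation() {}
  void AbortMutation() {}
};

// Stripped-down guard: on destruction it commits every started mutation
// unless Abort() ran first, so early returns cannot leak write locks.
class ScopedCommitter {
 public:
  explicit ScopedCommitter(const std::vector<Info*>* infos) : infos_(infos) {}
  ~ScopedCommitter() {
    if (!done_) {
      for (auto* info : *infos_) info->CommitMutation();
    }
  }
  void Abort() {
    for (auto* info : *infos_) info->AbortMutation();
    done_ = true;
  }

 private:
  const std::vector<Info*>* infos_;
  bool done_ = false;
};

int main() {
  Info a, b;
  std::vector<Info*> infos = {&a, &b};
  for (auto* info : infos) info->StartMutation();
  ScopedCommitter committer(&infos);
  // Any return from here on commits both infos; committer.Abort() would
  // roll them back instead.
}
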
9253
9254
Status CatalogManager::SelectReplicasForTablet(
9255
    const TSDescriptorVector& ts_descs, TabletInfo* tablet,
9256
28.3k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9257
28.3k
  auto table_guard = tablet->table()->LockForRead();
9258
9259
28.3k
  if (!table_guard->pb.IsInitialized()) {
9260
0
    return STATUS_SUBSTITUTE(InvalidArgument,
9261
0
        "TableInfo for tablet $0 is not initialized (aborted CreateTable attempt?)",
9262
0
        tablet->tablet_id());
9263
0
  }
9264
9265
28.3k
  const auto& replication_info =
9266
28.3k
    VERIFY_RESULT(GetTableReplicationInfo(table_guard->pb.replication_info(),
9267
28.3k
          tablet->table()->TablespaceIdForTableCreation()));
9268
9269
  // Select the set of replicas for the tablet.
9270
28.3k
  ConsensusStatePB* cstate = tablet->mutable_metadata()->mutable_dirty()
9271
28.3k
          ->pb.mutable_committed_consensus_state();
9272
0
  VLOG_WITH_FUNC(3) << "Committed consensus state: " << AsString(cstate);
9273
28.3k
  cstate->set_current_term(kMinimumTerm);
9274
28.3k
  consensus::RaftConfigPB *config = cstate->mutable_config();
9275
28.3k
  config->set_opid_index(consensus::kInvalidOpIdIndex);
9276
9277
28.3k
  Status s = HandlePlacementUsingReplicationInfo(
9278
28.3k
      replication_info, ts_descs, config, per_table_state, global_state);
9279
28.3k
  if (!s.ok()) {
9280
0
    return s;
9281
0
  }
9282
9283
28.3k
  std::ostringstream out;
9284
28.3k
  out << "Initial tserver uuids for tablet " << tablet->tablet_id() << ": ";
9285
82.1k
  for (const RaftPeerPB& peer : config->peers()) {
9286
82.1k
    out << peer.permanent_uuid() << " ";
9287
82.1k
  }
9288
9289
28.3k
  if (VLOG_IS_ON(0)) {
9290
28.3k
    out.str();
9291
28.3k
  }
9292
9293
0
  VLOG_WITH_FUNC(3) << "Committed consensus state has been updated to: " << AsString(cstate);
9294
9295
28.3k
  return Status::OK();
9296
28.3k
}
9297
9298
void CatalogManager::GetTsDescsFromPlacementInfo(const PlacementInfoPB& placement_info,
9299
                                                 const TSDescriptorVector& all_ts_descs,
9300
60.7k
                                                 TSDescriptorVector* ts_descs) {
9301
60.7k
  ts_descs->clear();
9302
177k
  for (const auto& ts_desc : all_ts_descs) {
9303
177k
    if (placement_info.has_placement_uuid()) {
9304
3.01k
      string placement_uuid = placement_info.placement_uuid();
9305
3.01k
      if (ts_desc->placement_uuid() == placement_uuid) {
9306
1.89k
        ts_descs->push_back(ts_desc);
9307
1.89k
      }
9308
174k
    } else if (ts_desc->placement_uuid() == "") {
9309
      // Since the placement info has no placement uuid, we know it is live, so we add this ts.
9310
174k
      ts_descs->push_back(ts_desc);
9311
174k
    }
9312
177k
  }
9313
60.7k
}
9314
9315
Status CatalogManager::HandlePlacementUsingReplicationInfo(
9316
    const ReplicationInfoPB& replication_info,
9317
    const TSDescriptorVector& all_ts_descs,
9318
    consensus::RaftConfigPB* config,
9319
    CMPerTableLoadState* per_table_state,
9320
28.3k
    CMGlobalLoadState* global_state) {
9321
  // Validate if we have enough tservers to put the replicas.
9322
28.3k
  ValidateReplicationInfoRequestPB req;
9323
28.3k
  req.mutable_replication_info()->CopyFrom(replication_info);
9324
28.3k
  ValidateReplicationInfoResponsePB resp;
9325
28.3k
  RETURN_NOT_OK(ValidateReplicationInfo(&req, &resp));
9326
9327
28.3k
  TSDescriptorVector ts_descs;
9328
28.3k
  GetTsDescsFromPlacementInfo(replication_info.live_replicas(), all_ts_descs, &ts_descs);
9329
28.3k
  RETURN_NOT_OK(HandlePlacementUsingPlacementInfo(
9330
28.3k
      replication_info.live_replicas(), ts_descs, PeerMemberType::VOTER,
9331
28.3k
      config, per_table_state, global_state));
9332
28.4k
  for (int i = 0; i < replication_info.read_replicas_size(); i++) {
9333
92
    GetTsDescsFromPlacementInfo(replication_info.read_replicas(i), all_ts_descs, &ts_descs);
9334
92
    RETURN_NOT_OK(HandlePlacementUsingPlacementInfo(
9335
92
        replication_info.read_replicas(i), ts_descs, PeerMemberType::OBSERVER,
9336
92
        config, per_table_state, global_state));
9337
92
  }
9338
28.3k
  return Status::OK();
9339
28.3k
}
9340
9341
Status CatalogManager::HandlePlacementUsingPlacementInfo(const PlacementInfoPB& placement_info,
9342
                                                         const TSDescriptorVector& ts_descs,
9343
                                                         PeerMemberType member_type,
9344
                                                         consensus::RaftConfigPB* config,
9345
                                                         CMPerTableLoadState* per_table_state,
9346
28.4k
                                                         CMGlobalLoadState* global_state) {
9347
28.4k
  size_t nreplicas = GetNumReplicasFromPlacementInfo(placement_info);
9348
28.4k
  size_t ntservers = ts_descs.size();
9349
  // Keep track of servers we've already selected, so that we don't attempt to
9350
  // put two replicas on the same host.
9351
28.4k
  set<TabletServerId> already_selected_ts;
9352
28.4k
  if (placement_info.placement_blocks().empty()) {
9353
    // If we don't have placement info, just place the replicas as before, distributed across the
9354
    // whole cluster.
9355
    // We cannot put more than ntservers replicas.
9356
28.2k
    nreplicas = min(nreplicas, ntservers);
9357
28.2k
    SelectReplicas(ts_descs, nreplicas, config, &already_selected_ts, member_type,
9358
28.2k
                   per_table_state, global_state);
9359
132
  } else {
9360
    // TODO(bogdan): move to separate function
9361
    //
9362
    // If we do have placement info, we'll try to use the same power of two algorithm, but also
9363
    // match the requested policies. We'll assign the minimum requested replicas in each combination
9364
    // of cloud.region.zone and then if we still have leftover replicas, we'll assign those
9365
    // in any of the allowed areas.
9366
132
    auto all_allowed_ts = VERIFY_RESULT(FindTServersForPlacementInfo(placement_info, ts_descs));
9367
9368
    // Loop through placements and assign to respective available TSs.
9369
132
    size_t min_replica_count_sum = 0;
9370
252
    for (const auto& pb : placement_info.placement_blocks()) {
9371
      // This works because currently we don't allow placement blocks to overlap.
9372
252
      auto available_ts_descs = VERIFY_RESULT(FindTServersForPlacementBlock(pb, ts_descs));
9373
252
      size_t available_ts_descs_size = available_ts_descs.size();
9374
252
      size_t min_num_replicas = pb.min_num_replicas();
9375
      // We cannot put more than the available tablet servers in that placement block.
9376
252
      size_t num_replicas = min(min_num_replicas, available_ts_descs_size);
9377
252
      min_replica_count_sum += min_num_replicas;
9378
252
      SelectReplicas(available_ts_descs, num_replicas, config, &already_selected_ts, member_type,
9379
252
                     per_table_state, global_state);
9380
252
    }
9381
9382
132
    size_t replicas_left = nreplicas - min_replica_count_sum;
9383
132
    size_t max_tservers_left = all_allowed_ts.size() - already_selected_ts.size();
9384
    // Upper bounded by the tservers left.
9385
132
    replicas_left = min(replicas_left, max_tservers_left);
9386
132
    DCHECK_GE(replicas_left, 0);
9387
132
    if (replicas_left > 0) {
9388
      // No need for an extra check here: we verified early on that there are enough tservers to
9389
      // cover all requested placements, and we checked per placement block that the requested
9390
      // minimums could be covered.
9391
12
      SelectReplicas(all_allowed_ts, replicas_left, config, &already_selected_ts, member_type,
9392
12
                     per_table_state, global_state);
9393
12
    }
9394
132
  }
9395
28.4k
  return Status::OK();
9396
28.4k
}
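
For concreteness, the leftover arithmetic above with nreplicas = 5, three placement blocks each requesting min_num_replicas = 1, and six allowed unselected tservers: three replicas are pinned to the blocks and 5 - 3 = 2 are spread over any allowed tserver. A compact numeric restatement with assumed inputs:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  size_t nreplicas = 5;                            // replication factor
  std::vector<size_t> block_minimums = {1, 1, 1};  // min_num_replicas per block
  size_t tservers_left_after_blocks = 6;           // allowed, not yet selected

  size_t min_replica_count_sum = 0;
  for (size_t min_count : block_minimums) {
    min_replica_count_sum += min_count;            // 3 replicas pinned to blocks
  }

  // Assumes the minimums never exceed the replication factor; with size_t
  // operands an excess would wrap around rather than go negative.
  size_t replicas_left = nreplicas - min_replica_count_sum;        // 5 - 3 = 2
  replicas_left = std::min(replicas_left, tservers_left_after_blocks);
  std::printf("replicas to place in any allowed zone: %zu\n", replicas_left);
}
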
9397
9398
Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementInfo(
9399
    const PlacementInfoPB& placement_info,
9400
237
    const TSDescriptorVector& ts_descs) const {
9401
9402
237
  vector<shared_ptr<TSDescriptor>> all_allowed_ts;
9403
1.00k
  for (const auto& ts : ts_descs) {
9404
1.65k
    for (const auto& pb : placement_info.placement_blocks()) {
9405
1.65k
      if (ts->MatchesCloudInfo(pb.cloud_info())) {
9406
1.00k
        all_allowed_ts.push_back(ts);
9407
1.00k
        break;
9408
1.00k
      }
9409
1.65k
    }
9410
1.00k
  }
9411
9412
237
  return all_allowed_ts;
9413
237
}
9414
9415
Result<vector<shared_ptr<TSDescriptor>>> CatalogManager::FindTServersForPlacementBlock(
9416
    const PlacementBlockPB& placement_block,
9417
491
    const TSDescriptorVector& ts_descs) {
9418
9419
491
  vector<shared_ptr<TSDescriptor>> allowed_ts;
9420
491
  const auto& cloud_info = placement_block.cloud_info();
9421
2.20k
  for (const auto& ts : ts_descs) {
9422
2.20k
    if (ts->MatchesCloudInfo(cloud_info)) {
9423
1.00k
      allowed_ts.push_back(ts);
9424
1.00k
    }
9425
2.20k
  }
9426
9427
491
  return allowed_ts;
9428
491
}
9429
9430
3.97k
Status CatalogManager::SendCreateTabletRequests(const vector<TabletInfo*>& tablets) {
9431
3.97k
  auto schedules_to_tablets_map = VERIFY_RESULT(MakeSnapshotSchedulesToObjectIdsMap(
9432
3.97k
      SysRowEntryType::TABLET));
9433
28.3k
  for (TabletInfo *tablet : tablets) {
9434
28.3k
    const consensus::RaftConfigPB& config =
9435
28.3k
        tablet->metadata().dirty().pb.committed_consensus_state().config();
9436
28.3k
    tablet->set_last_update_time(MonoTime::Now());
9437
28.3k
    std::vector<SnapshotScheduleId> schedules;
9438
0
    for (const auto& pair : schedules_to_tablets_map) {
9439
0
      if (std::binary_search(pair.second.begin(), pair.second.end(), tablet->id())) {
9440
0
        schedules.push_back(pair.first);
9441
0
      }
9442
0
    }
9443
82.1k
    for (const RaftPeerPB& peer : config.peers()) {
9444
82.1k
      auto task = std::make_shared<AsyncCreateReplica>(master_, AsyncTaskPool(),
9445
82.1k
          peer.permanent_uuid(), tablet, schedules);
9446
82.1k
      tablet->table()->AddTask(task);
9447
82.1k
      WARN_NOT_OK(ScheduleTask(task), "Failed to send new tablet request");
9448
82.1k
    }
9449
28.3k
  }
9450
9451
3.97k
  return Status::OK();
9452
3.97k
}
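
The schedule lookup above depends on each schedule's tablet-id list being sorted, since std::binary_search is only valid on ordered ranges. A small standalone demonstration of the same membership test:

#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Assumed shape of the schedules-to-tablets map; each id list must be
  // sorted for std::binary_search to be valid.
  std::map<std::string, std::vector<std::string>> schedules_to_tablets = {
      {"sched-1", {"tablet-a", "tablet-c", "tablet-f"}},
      {"sched-2", {"tablet-b", "tablet-d"}},
  };
  const std::string tablet_id = "tablet-c";

  std::vector<std::string> schedules;
  for (const auto& [schedule_id, tablet_ids] : schedules_to_tablets) {
    if (std::binary_search(tablet_ids.begin(), tablet_ids.end(), tablet_id)) {
      schedules.push_back(schedule_id);  // this tablet is retained here
    }
  }
  std::printf("matched %zu schedule(s)\n", schedules.size());  // 1
}
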
9453
9454
// If responses have been received from sufficient replicas (including hinted leader),
9455
// pick proposed leader and start election.
9456
void CatalogManager::StartElectionIfReady(
9457
91.8k
    const consensus::ConsensusStatePB& cstate, TabletInfo* tablet) {
9458
91.8k
  auto replicas = tablet->GetReplicaLocations();
9459
91.8k
  int num_voters = 0;
9460
274k
  for (const auto& peer : cstate.config().peers()) {
9461
274k
    if (peer.member_type() == PeerMemberType::VOTER) {
9462
273k
      ++num_voters;
9463
273k
    }
9464
274k
  }
9465
91.8k
  int majority_size = num_voters / 2 + 1;
9466
91.8k
  int running_voters = 0;
9467
274k
  for (const auto& replica : *replicas) {
9468
274k
    if (replica.second.member_type == PeerMemberType::VOTER) {
9469
273k
      ++running_voters;
9470
273k
    }
9471
274k
  }
9472
9473
1
  VLOG_WITH_PREFIX(4)
9474
1
      << __func__ << ": T " << tablet->tablet_id() << ": " << AsString(*replicas) << ", voters: "
9475
1
      << running_voters << "/" << majority_size;
9476
9477
91.8k
  if (running_voters < majority_size) {
9478
0
    VLOG_WITH_PREFIX(4) << __func__ << ": Not enough voters";
9479
0
    return;
9480
0
  }
9481
9482
91.8k
  ReplicationInfoPB replication_info;
9483
91.8k
  {
9484
91.8k
    auto l = cluster_config_->LockForRead();
9485
91.8k
    replication_info = l->pb.replication_info();
9486
91.8k
  }
9487
9488
  // Find tservers that can be leaders for a tablet.
9489
91.8k
  TSDescriptorVector ts_descs = GetAllLiveNotBlacklistedTServers();
9490
9491
91.8k
  std::vector<std::string> possible_leaders;
9492
274k
  for (const auto& replica : *replicas) {
9493
555k
    for (const auto& ts_desc : ts_descs) {
9494
555k
      if (ts_desc->permanent_uuid() == replica.first) {
9495
274k
        if (ts_desc->IsAcceptingLeaderLoad(replication_info)) {
9496
272k
          possible_leaders.push_back(replica.first);
9497
272k
        }
9498
274k
        break;
9499
274k
      }
9500
555k
    }
9501
274k
  }
9502
9503
91.8k
  if (FLAGS_TEST_create_table_leader_hint_min_lexicographic) {
9504
6
    std::string min_lexicographic;
9505
18
    for (const auto& peer : cstate.config().peers()) {
9506
18
      if (peer.member_type() == PeerMemberType::VOTER) {
9507
18
        if (min_lexicographic.empty() || peer.permanent_uuid() < min_lexicographic) {
9508
6
          min_lexicographic = peer.permanent_uuid();
9509
6
        }
9510
18
      }
9511
18
    }
9512
6
    if (min_lexicographic.empty() || !replicas->count(min_lexicographic)) {
9513
0
      LOG_WITH_PREFIX(INFO)
9514
0
          << __func__ << ": Min lexicographic is not yet ready: " << min_lexicographic;
9515
0
      return;
9516
0
    }
9517
6
    possible_leaders = { min_lexicographic };
9518
6
  }
9519
9520
91.8k
  if (possible_leaders.empty()) {
9521
0
    VLOG_WITH_PREFIX(4) << __func__ << ": Cannot pick candidate";
9522
64
    return;
9523
64
  }
9524
9525
91.7k
  if (!tablet->InitiateElection()) {
9526
0
    VLOG_WITH_PREFIX(4) << __func__ << ": Already initiated";
9527
63.7k
    return;
9528
63.7k
  }
9529
9530
28.0k
  const auto& protege = RandomElement(possible_leaders);
9531
9532
28.0k
  LOG_WITH_PREFIX(INFO)
9533
28.0k
      << "Starting election at " << tablet->tablet_id() << " in favor of " << protege;
9534
9535
28.0k
  auto task = std::make_shared<AsyncStartElection>(master_, AsyncTaskPool(), protege, tablet);
9536
28.0k
  tablet->table()->AddTask(task);
9537
28.0k
  WARN_NOT_OK(task->Run(), "Failed to send new tablet start election request");
9538
28.0k
}
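
The readiness test above reduces to integer arithmetic: with num_voters voters, a majority is num_voters / 2 + 1, so an RF-3 tablet needs 2 running voters and an RF-4 tablet needs 3. The same computation in isolation:

#include <cstdio>

// Majority of voters required before the master starts an election.
int MajoritySize(int num_voters) { return num_voters / 2 + 1; }

int main() {
  for (int voters : {1, 3, 4, 5}) {
    std::printf("voters=%d majority=%d\n", voters, MajoritySize(voters));
  }
  // voters=1 -> 1, voters=3 -> 2, voters=4 -> 3, voters=5 -> 3
}
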
9539
9540
shared_ptr<TSDescriptor> CatalogManager::SelectReplica(
9541
    const TSDescriptorVector& ts_descs,
9542
    set<TabletServerId>* excluded,
9543
82.1k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9544
82.1k
  shared_ptr<TSDescriptor> found_ts;
9545
83.7k
  for (const auto& sorted_load : per_table_state->sorted_load_) {
9546
    // Don't consider a tserver that has already been considered for this tablet.
9547
83.7k
    if (excluded->count(sorted_load)) {
9548
3
      continue;
9549
3
    }
9550
    // Only choose from the set of allowed tservers for this tablet.
9551
168k
    auto it = std::find_if(ts_descs.begin(), ts_descs.end(), [&sorted_load](const auto& ts) {
9552
168k
      return ts->permanent_uuid() == sorted_load;
9553
168k
    });
9554
9555
83.7k
    if (it != ts_descs.end()) {
9556
82.1k
      found_ts = *it;
9557
82.1k
      break;
9558
82.1k
    }
9559
83.7k
  }
9560
9561
82.1k
  return found_ts;
9562
82.1k
}
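
SelectReplica walks the per-table load list, kept sorted from least to most loaded, and returns the first candidate that is allowed for this tablet and not already selected. The same idea in isolation, with sorted_load_ modeled as a plain vector:

#include <algorithm>
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Returns the least-loaded candidate that is allowed for this tablet and not
// already selected, or an empty string when nothing qualifies.
std::string PickReplica(const std::vector<std::string>& sorted_by_load,
                        const std::vector<std::string>& allowed,
                        const std::set<std::string>& excluded) {
  for (const auto& uuid : sorted_by_load) {
    if (excluded.count(uuid)) continue;  // already holds a replica
    if (std::find(allowed.begin(), allowed.end(), uuid) != allowed.end()) {
      return uuid;
    }
  }
  return {};
}

int main() {
  std::vector<std::string> sorted_by_load = {"ts-2", "ts-1", "ts-3"};
  std::vector<std::string> allowed = {"ts-1", "ts-3"};
  std::set<std::string> excluded = {"ts-1"};
  // ts-2 is not allowed and ts-1 is excluded, so ts-3 is picked.
  std::printf("picked: %s\n",
              PickReplica(sorted_by_load, allowed, excluded).c_str());
}
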
9563
9564
void CatalogManager::SelectReplicas(
9565
    const TSDescriptorVector& ts_descs, size_t nreplicas, consensus::RaftConfigPB* config,
9566
    set<TabletServerId>* already_selected_ts, PeerMemberType member_type,
9567
28.5k
    CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state) {
9568
28.5k
  DCHECK_LE(nreplicas, ts_descs.size());
9569
9570
110k
  for (size_t i = 0; i < nreplicas; ++i) {
9571
82.1k
    shared_ptr<TSDescriptor> ts = SelectReplica(
9572
82.1k
        ts_descs, already_selected_ts, per_table_state, global_state);
9573
82.1k
    InsertOrDie(already_selected_ts, ts->permanent_uuid());
9574
    // Update the load state at global and table level.
9575
82.1k
    per_table_state->per_ts_load_[ts->permanent_uuid()]++;
9576
82.1k
    global_state->per_ts_load_[ts->permanent_uuid()]++;
9577
82.1k
    per_table_state->SortLoad();
9578
9579
    // Increment the number of pending replicas so that we take this selection into
9580
    // account when assigning replicas for other tablets of the same table. This
9581
    // value decays back to 0 over time.
9582
82.1k
    ts->IncrementRecentReplicaCreations();
9583
9584
82.1k
    TSRegistrationPB reg = ts->GetRegistration();
9585
9586
82.1k
    RaftPeerPB *peer = config->add_peers();
9587
82.1k
    peer->set_permanent_uuid(ts->permanent_uuid());
9588
9589
    // TODO: This is temporary, we will use only UUIDs.
9590
82.1k
    TakeRegistration(reg.mutable_common(), peer);
9591
82.1k
    peer->set_member_type(member_type);
9592
82.1k
  }
9593
28.5k
}
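
IncrementRecentReplicaCreations feeds a load signal that decays back to zero, so a burst of placements discourages further placements on the same tserver only temporarily. One way such a decaying counter can be modeled; the exponential half-life here is an assumption, not the actual implementation:

#include <chrono>
#include <cmath>
#include <cstdio>

// Illustrative decaying counter: the value halves every half_life_secs, so
// recent placements weigh more than old ones. A model of the behavior
// described above, not the actual implementation.
class DecayingCounter {
 public:
  using Clock = std::chrono::steady_clock;

  explicit DecayingCounter(double half_life_secs)
      : half_life_secs_(half_life_secs), last_(Clock::now()) {}

  void Increment() {
    value_ = Value() + 1.0;
    last_ = Clock::now();
  }

  double Value() const {
    double secs = std::chrono::duration<double>(Clock::now() - last_).count();
    return value_ * std::pow(0.5, secs / half_life_secs_);
  }

 private:
  double half_life_secs_;
  double value_ = 0.0;
  Clock::time_point last_;
};

int main() {
  DecayingCounter recent_creations(60.0);  // 60s half-life, made up
  recent_creations.Increment();
  recent_creations.Increment();
  std::printf("recent load ~%.2f\n", recent_creations.Value());  // ~2.00
}
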
9594
9595
Status CatalogManager::ConsensusStateToTabletLocations(const consensus::ConsensusStatePB& cstate,
9596
159k
                                                       TabletLocationsPB* locs_pb) {
9597
416k
  for (const consensus::RaftPeerPB& peer : cstate.config().peers()) {
9598
416k
    TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
9599
416k
    if (!peer.has_permanent_uuid()) {
9600
0
      return STATUS_SUBSTITUTE(IllegalState, "Missing UUID $0", peer.ShortDebugString());
9601
0
    }
9602
416k
    replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate));
9603
416k
    if (peer.has_member_type()) {
9604
416k
      replica_pb->set_member_type(peer.member_type());
9605
2
    } else {
9606
2
      replica_pb->set_member_type(PeerMemberType::UNKNOWN_MEMBER_TYPE);
9607
2
    }
9608
416k
    TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info();
9609
416k
    tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
9610
416k
    CopyRegistration(peer, tsinfo_pb);
9611
416k
  }
9612
159k
  return Status::OK();
9613
159k
}
9614
9615
Status CatalogManager::BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet,
9616
                                               TabletLocationsPB* locs_pb,
9617
366k
                                               IncludeInactive include_inactive) {
9618
366k
  {
9619
366k
    auto l_tablet = tablet->LockForRead();
9620
366k
    if (l_tablet->is_hidden() && !include_inactive) {
9621
0
      return STATUS_FORMAT(NotFound, "Tablet hidden", tablet->id());
9622
0
    }
9623
366k
    locs_pb->set_table_id(l_tablet->pb.table_id());
9624
366k
    *locs_pb->mutable_table_ids() = l_tablet->pb.table_ids();
9625
366k
  }
9626
9627
  // For system tables, the set of replicas is always the set of masters.
9628
366k
  if (system_tablets_.find(tablet->id()) != system_tablets_.end()) {
9629
159k
    consensus::ConsensusStatePB master_consensus;
9630
159k
    RETURN_NOT_OK(GetCurrentConfig(&master_consensus));
9631
159k
    locs_pb->set_tablet_id(tablet->tablet_id());
9632
159k
    locs_pb->set_stale(false);
9633
159k
    const auto initial_size = locs_pb->replicas_size();
9634
159k
    RETURN_NOT_OK(ConsensusStateToTabletLocations(master_consensus, locs_pb));
9635
159k
    const auto capabilities = Capabilities();
9636
    // Set capabilities of master node for all newly created system table locations.
9637
159k
    for (auto i = locs_pb->mutable_replicas()->begin() + initial_size,
9638
574k
        end = locs_pb->mutable_replicas()->end(); i != end; ++i) {
9639
415k
      *i->mutable_ts_info()->mutable_capabilities() = google::protobuf::RepeatedField<CapabilityId>(
9640
415k
          capabilities.begin(), capabilities.end());
9641
415k
    }
9642
159k
    return Status::OK();
9643
206k
  }
9644
9645
206k
  TSRegistrationPB reg;
9646
9647
206k
  std::shared_ptr<const TabletReplicaMap> locs;
9648
206k
  consensus::ConsensusStatePB cstate;
9649
206k
  {
9650
206k
    auto l_tablet = tablet->LockForRead();
9651
206k
    if (PREDICT_FALSE(l_tablet->is_deleted())) {
9652
324
      std::vector<TabletId> split_tablet_ids;
9653
4
      for (const auto& split_tablet_id : l_tablet->pb.split_tablet_ids()) {
9654
4
        split_tablet_ids.push_back(split_tablet_id);
9655
4
      }
9656
324
      return STATUS(
9657
324
          NotFound, "Tablet deleted", l_tablet->pb.state_msg(),
9658
324
          SplitChildTabletIdsData(split_tablet_ids));
9659
324
    }
9660
9661
206k
    if (PREDICT_FALSE(!l_tablet->is_running())) {
9662
8.80k
      return STATUS_FORMAT(ServiceUnavailable, "Tablet $0 not running", tablet->id());
9663
8.80k
    }
9664
9665
197k
    locs = tablet->GetReplicaLocations();
9666
197k
    if (locs->empty() && l_tablet->pb.has_committed_consensus_state()) {
9667
220
      cstate = l_tablet->pb.committed_consensus_state();
9668
220
    }
9669
9670
197k
    const auto& metadata = tablet->metadata().state().pb;
9671
197k
    locs_pb->mutable_partition()->CopyFrom(metadata.partition());
9672
197k
    locs_pb->set_split_depth(metadata.split_depth());
9673
197k
    locs_pb->set_split_parent_tablet_id(metadata.split_parent_tablet_id());
9674
150
    for (const auto& split_tablet_id : metadata.split_tablet_ids()) {
9675
150
      *locs_pb->add_split_tablet_ids() = split_tablet_id;
9676
150
    }
9677
197k
  }
9678
9679
197k
  locs_pb->set_tablet_id(tablet->tablet_id());
9680
197k
  locs_pb->set_stale(locs->empty());
9681
9682
  // If the locations are cached.
9683
197k
  if (!locs->empty()) {
9684
197k
    if (cstate.IsInitialized() &&
9685
0
            locs->size() != implicit_cast<size_t>(cstate.config().peers_size())) {
9686
0
      LOG(WARNING) << "Cached tablet replicas " << locs->size() << " does not match consensus "
9687
0
                   << cstate.config().peers_size();
9688
0
    }
9689
9690
584k
    for (const auto& replica : *locs) {
9691
584k
      TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
9692
584k
      replica_pb->set_role(replica.second.role);
9693
584k
      replica_pb->set_member_type(replica.second.member_type);
9694
584k
      auto tsinfo_pb = replica.second.ts_desc->GetTSInformationPB();
9695
9696
584k
      TSInfoPB* out_ts_info = replica_pb->mutable_ts_info();
9697
584k
      out_ts_info->set_permanent_uuid(tsinfo_pb->tserver_instance().permanent_uuid());
9698
584k
      CopyRegistration(tsinfo_pb->registration().common(), out_ts_info);
9699
584k
      out_ts_info->set_placement_uuid(tsinfo_pb->registration().common().placement_uuid());
9700
584k
      *out_ts_info->mutable_capabilities() = tsinfo_pb->registration().capabilities();
9701
584k
    }
9702
197k
    return Status::OK();
9703
197k
  }
9704
9705
  // If the locations were not cached.
9706
  // TODO: Why would this ever happen? See KUDU-759.
9707
291
  if (cstate.IsInitialized()) {
9708
220
    RETURN_NOT_OK(ConsensusStateToTabletLocations(cstate, locs_pb));
9709
220
  }
9710
9711
291
  return Status::OK();
9712
291
}
9713
9714
643k
Result<shared_ptr<tablet::AbstractTablet>> CatalogManager::GetSystemTablet(const TabletId& id) {
9715
643k
  const auto iter = system_tablets_.find(id);
9716
643k
  if (iter == system_tablets_.end()) {
9717
0
    return STATUS_SUBSTITUTE(InvalidArgument, "$0 is not a valid system tablet id", id);
9718
0
  }
9719
643k
  return iter->second;
9720
643k
}
9721
9722
Status CatalogManager::GetTabletLocations(
9723
9.22k
    const TabletId& tablet_id, TabletLocationsPB* locs_pb, IncludeInactive include_inactive) {
9724
9.22k
  scoped_refptr<TabletInfo> tablet_info;
9725
9.22k
  {
9726
9.22k
    SharedLock lock(mutex_);
9727
9.22k
    if (!FindCopy(*tablet_map_, tablet_id, &tablet_info)) {
9728
0
      return STATUS_SUBSTITUTE(NotFound, "Unknown tablet $0", tablet_id);
9729
0
    }
9730
9.22k
  }
9731
9.22k
  Status s = GetTabletLocations(tablet_info, locs_pb, include_inactive);
9732
9733
9.22k
  auto num_replicas = GetReplicationFactorForTablet(tablet_info);
9734
9.22k
  if (num_replicas.ok() && *num_replicas > 0 &&
9735
9.22k
      implicit_cast<size_t>(locs_pb->replicas().size()) != *num_replicas) {
9736
610
    YB_LOG_EVERY_N_SECS(WARNING, 1)
9737
284
        << "Expected replicas " << num_replicas << " but found "
9738
284
        << locs_pb->replicas().size() << " for tablet " << tablet_info->id() << ": "
9739
284
        << locs_pb->ShortDebugString() << THROTTLE_MSG;
9740
610
  }
9741
9.22k
  return s;
9742
9.22k
}
9743
9744
Status CatalogManager::GetTabletLocations(
9745
    scoped_refptr<TabletInfo> tablet_info,
9746
    TabletLocationsPB* locs_pb,
9747
59.3k
    IncludeInactive include_inactive) {
9748
59.3k
  DCHECK_EQ(locs_pb->replicas().size(), 0);
9749
59.3k
  locs_pb->mutable_replicas()->Clear();
9750
59.3k
  return BuildLocationsForTablet(tablet_info, locs_pb, include_inactive);
9751
59.3k
}
9752
9753
Status CatalogManager::GetTableLocations(
9754
    const GetTableLocationsRequestPB* req,
9755
167k
    GetTableLocationsResponsePB* resp) {
9756
18.4E
  VLOG(4) << "GetTableLocations: " << req->ShortDebugString();
9757
9758
  // If start-key > end-key, report an error instead of swapping the two,
9759
  // since there is probably something wrong app-side.
9760
167k
  if (req->has_partition_key_start() && req->has_partition_key_end()
9761
1
      && req->partition_key_start() > req->partition_key_end()) {
9762
1
    return STATUS(InvalidArgument, "start partition key is greater than the end partition key");
9763
1
  }
9764
9765
167k
  if (req->max_returned_locations() <= 0) {
9766
0
    return STATUS(InvalidArgument, "max_returned_locations must be greater than 0");
9767
0
  }
9768
9769
167k
  scoped_refptr<TableInfo> table = VERIFY_RESULT(FindTable(req->table()));
9770
9771
167k
  if (table->IsCreateInProgress()) {
9772
7.98k
    resp->set_creating(true);
9773
7.98k
  }
9774
9775
167k
  auto l = table->LockForRead();
9776
167k
  RETURN_NOT_OK(CheckIfTableDeletedOrNotVisibleToClient(l, resp));
9777
9778
167k
  vector<scoped_refptr<TabletInfo>> tablets;
9779
167k
  table->GetTabletsInRange(req, &tablets);
9780
9781
167k
  IncludeInactive include_inactive(req->has_include_inactive() && req->include_inactive());
9782
167k
  bool require_tablets_running = req->require_tablets_running();
9783
9784
167k
  int expected_live_replicas = 0;
9785
167k
  int expected_read_replicas = 0;
9786
167k
  GetExpectedNumberOfReplicas(&expected_live_replicas, &expected_read_replicas);
9787
267k
  for (const scoped_refptr<TabletInfo>& tablet : tablets) {
9788
267k
    TabletLocationsPB* locs_pb = resp->add_tablet_locations();
9789
267k
    locs_pb->set_expected_live_replicas(expected_live_replicas);
9790
267k
    locs_pb->set_expected_read_replicas(expected_read_replicas);
9791
267k
    auto status = BuildLocationsForTablet(tablet, locs_pb, include_inactive);
9792
267k
    if (!status.ok()) {
9793
      // Not running.
9794
8.78k
      if (require_tablets_running) {
9795
8.67k
        resp->mutable_tablet_locations()->Clear();
9796
8.67k
        return SetupError(resp->mutable_error(), MasterErrorPB::OBJECT_NOT_FOUND, status);
9797
8.67k
      }
9798
113
      resp->mutable_tablet_locations()->RemoveLast();
9799
113
    }
9800
267k
  }
9801
9802
158k
  resp->set_table_type(l->pb.table_type());
9803
158k
  resp->set_partition_list_version(l->pb.partition_list_version());
9804
9805
158k
  return Status::OK();
9806
167k
}
9807
9808
568k
Status CatalogManager::GetCurrentConfig(consensus::ConsensusStatePB* cpb) const {
9809
568k
  auto tablet_peer = sys_catalog_->tablet_peer();
9810
556k
  auto consensus = tablet_peer ? tablet_peer->shared_consensus() : nullptr;
9811
568k
  if (!consensus) {
9812
11.6k
    std::string uuid = master_->fs_manager()->uuid();
9813
11.6k
    return STATUS_FORMAT(IllegalState, "Node $0 peer not initialized.", uuid);
9814
11.6k
  }
9815
9816
557k
  *cpb = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED);
9817
9818
557k
  return Status::OK();
9819
557k
}
9820
9821
0
void CatalogManager::DumpState(std::ostream* out, bool on_disk_dump) const {
9822
0
  NamespaceInfoMap namespace_ids_copy;
9823
0
  TableInfoMap ids_copy;
9824
0
  TableInfoByNameMap names_copy;
9825
0
  TabletInfoMap tablets_copy;
9826
9827
  // Copy the internal state so that, if the output stream blocks,
9828
  // we don't end up holding the lock for a long time.
9829
0
  {
9830
0
    SharedLock lock(mutex_);
9831
0
    namespace_ids_copy = namespace_ids_map_;
9832
0
    ids_copy = *table_ids_map_;
9833
0
    names_copy = table_names_map_;
9834
0
    tablets_copy = *tablet_map_;
9835
0
  }
9836
9837
0
  *out << "Dumping current state of master.\nNamespaces:\n";
9838
0
  for (const NamespaceInfoMap::value_type& e : namespace_ids_copy) {
9839
0
    NamespaceInfo* t = e.second.get();
9840
0
    auto l = t->LockForRead();
9841
0
    const NamespaceName& name = l->name();
9842
9843
0
    *out << t->id() << ":\n";
9844
0
    *out << "  name: \"" << strings::CHexEscape(name) << "\"\n";
9845
0
    *out << "  metadata: " << l->pb.ShortDebugString() << "\n";
9846
0
  }
9847
9848
0
  *out << "Tables:\n";
9849
0
  for (const TableInfoMap::value_type& e : ids_copy) {
9850
0
    TableInfo* t = e.second.get();
9851
0
    TabletInfos table_tablets;
9852
0
    {
9853
0
      auto l = t->LockForRead();
9854
0
      const TableName& name = l->name();
9855
0
      const NamespaceId& namespace_id = l->namespace_id();
9856
      // Find namespace by its ID.
9857
0
      scoped_refptr<NamespaceInfo> ns = FindPtrOrNull(namespace_ids_copy, namespace_id);
9858
9859
0
      *out << t->id() << ":\n";
9860
0
      *out << "  namespace id: \"" << strings::CHexEscape(namespace_id) << "\"\n";
9861
9862
0
      if (ns != nullptr) {
9863
0
        *out << "  namespace name: \"" << strings::CHexEscape(ns->name()) << "\"\n";
9864
0
      }
9865
9866
0
      *out << "  name: \"" << strings::CHexEscape(name) << "\"\n";
9867
      // Erase from the map, so later we can check that we don't have
9868
      // any orphaned tables in the by-name map that aren't in the
9869
      // by-id map.
9870
0
      if (names_copy.erase({namespace_id, name}) != 1) {
9871
0
        *out << "  [not present in by-name map]\n";
9872
0
      }
9873
0
      *out << "  metadata: " << l->pb.ShortDebugString() << "\n";
9874
9875
0
      *out << "  tablets:\n";
9876
0
      table_tablets = t->GetTablets();
9877
0
    }
9878
0
    for (const scoped_refptr<TabletInfo>& tablet : table_tablets) {
9879
0
      auto l_tablet = tablet->LockForRead();
9880
0
      *out << "    " << tablet->tablet_id() << ": "
9881
0
           << l_tablet->pb.ShortDebugString() << "\n";
9882
9883
0
      if (tablets_copy.erase(tablet->tablet_id()) != 1) {
9884
0
        *out << "  [ERROR: not present in CM tablet map!]\n";
9885
0
      }
9886
0
    }
9887
0
  }
9888
9889
0
  if (!tablets_copy.empty()) {
9890
0
    *out << "Orphaned tablets (not referenced by any table):\n";
9891
0
    for (const TabletInfoMap::value_type& entry : tablets_copy) {
9892
0
      const scoped_refptr<TabletInfo>& tablet = entry.second;
9893
0
      auto l_tablet = tablet->LockForRead();
9894
0
      *out << "    " << tablet->tablet_id() << ": "
9895
0
           << l_tablet->pb.ShortDebugString() << "\n";
9896
0
    }
9897
0
  }
9898
9899
0
  if (!names_copy.empty()) {
9900
0
    *out << "Orphaned tables (in by-name map, but not id map):\n";
9901
0
    for (const TableInfoByNameMap::value_type& e : names_copy) {
9902
0
      *out << e.second->id() << ":\n";
9903
0
      *out << "  namespace id: \"" << strings::CHexEscape(e.first.first) << "\"\n";
9904
0
      *out << "  name: \"" << CHexEscape(e.first.second) << "\"\n";
9905
0
    }
9906
0
  }
9907
9908
0
  master_->DumpMasterOptionsInfo(out);
9909
9910
0
  if (on_disk_dump) {
9911
0
    consensus::ConsensusStatePB cur_consensus_state;
9912
    // TODO: proper error handling below.
9913
0
    CHECK_OK(GetCurrentConfig(&cur_consensus_state));
9914
0
    *out << "Current raft config: " << cur_consensus_state.ShortDebugString() << "\n";
9915
0
  }
9916
0
}
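
DumpState copies the four maps while holding the shared lock and formats them afterwards, so a slow or blocking output stream never extends the critical section. The pattern in miniature:

#include <iostream>
#include <map>
#include <shared_mutex>
#include <string>

std::shared_mutex mutex_;
std::map<std::string, std::string> tables_;  // guarded by mutex_

void DumpTables(std::ostream* out) {
  std::map<std::string, std::string> copy;
  {
    std::shared_lock<std::shared_mutex> lock(mutex_);
    copy = tables_;  // the critical section is just the copy
  }
  // Formatting happens lock-free; a blocked stream cannot stall writers.
  for (const auto& [id, metadata] : copy) {
    *out << id << ": " << metadata << "\n";
  }
}

int main() {
  tables_ = {{"table-1", "RUNNING"}, {"table-2", "DELETING"}};
  DumpTables(&std::cout);
}
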
9917
9918
Status CatalogManager::PeerStateDump(const vector<RaftPeerPB>& peers,
9919
                                     const DumpMasterStateRequestPB* req,
9920
0
                                     DumpMasterStateResponsePB* resp) {
9921
0
  std::unique_ptr<MasterClusterProxy> peer_proxy;
9922
0
  Endpoint sockaddr;
9923
0
  MonoTime timeout = MonoTime::Now();
9924
0
  DumpMasterStateRequestPB peer_req;
9925
0
  rpc::RpcController rpc;
9926
9927
0
  timeout.AddDelta(MonoDelta::FromMilliseconds(FLAGS_master_ts_rpc_timeout_ms));
9928
0
  rpc.set_deadline(timeout);
9929
0
  peer_req.set_on_disk(req->on_disk());
9930
0
  peer_req.set_return_dump_as_string(req->return_dump_as_string());
9931
0
  string dump;
9932
9933
0
  for (const RaftPeerPB& peer : peers) {
9934
0
    HostPort hostport = HostPortFromPB(DesiredHostPort(peer, master_->MakeCloudInfoPB()));
9935
0
    peer_proxy = std::make_unique<MasterClusterProxy>(&master_->proxy_cache(), hostport);
9936
9937
0
    DumpMasterStateResponsePB peer_resp;
9938
0
    rpc.Reset();
9939
9940
0
    RETURN_NOT_OK(peer_proxy->DumpState(peer_req, &peer_resp, &rpc));
9941
9942
0
    if (peer_resp.has_error()) {
9943
0
      LOG(WARNING) << "Hit err " << peer_resp.ShortDebugString() << " during peer "
9944
0
        << peer.ShortDebugString() << " state dump.";
9945
0
      return StatusFromPB(peer_resp.error().status());
9946
0
    } else if (req->return_dump_as_string()) {
9947
0
      dump += peer_resp.dump();
9948
0
    }
9949
0
  }
9950
9951
0
  if (req->return_dump_as_string()) {
9952
0
    resp->set_dump(resp->dump() + dump);
9953
0
  }
9954
0
  return Status::OK();
9955
0
}
9956
9957
90.0k
void CatalogManager::ReportMetrics() {
9958
  // Report metrics on how many tservers are alive.
9959
90.0k
  TSDescriptorVector ts_descs;
9960
90.0k
  master_->ts_manager()->GetAllLiveDescriptors(&ts_descs);
9961
90.0k
  const auto num_live_servers = ts_descs.size();
9962
90.0k
  metric_num_tablet_servers_live_->set_value(narrow_cast<uint32_t>(num_live_servers));
9963
9964
90.0k
  master_->ts_manager()->GetAllDescriptors(&ts_descs);
9965
90.0k
  metric_num_tablet_servers_dead_->set_value(
9966
90.0k
      narrow_cast<uint32_t>(ts_descs.size() - num_live_servers));
9967
90.0k
}
9968
9969
150k
void CatalogManager::ResetMetrics() {
9970
150k
  metric_num_tablet_servers_live_->set_value(0);
9971
150k
  metric_num_tablet_servers_dead_->set_value(0);
9972
150k
}
9973
9974
9975
260k
std::string CatalogManager::LogPrefix() const {
9976
260k
  if (tablet_peer()) {
9977
260k
    return consensus::MakeTabletLogPrefix(
9978
260k
        tablet_peer()->tablet_id(), tablet_peer()->permanent_uuid());
9979
41
  } else {
9980
41
    return consensus::MakeTabletLogPrefix(
9981
41
        kSysCatalogTabletId, master_->fs_manager()->uuid());
9982
41
  }
9983
260k
}
9984
9985
0
void CatalogManager::SetLoadBalancerEnabled(bool is_enabled) {
9986
0
  load_balance_policy_->SetLoadBalancerEnabled(is_enabled);
9987
0
}
9988
9989
1
bool CatalogManager::IsLoadBalancerEnabled() {
9990
1
  return load_balance_policy_->IsLoadBalancerEnabled();
9991
1
}
9992
9993
78.7k
MonoDelta CatalogManager::TimeSinceElectedLeader() {
9994
78.7k
  return MonoTime::Now() - time_elected_leader_;
9995
78.7k
}
9996
9997
26
Status CatalogManager::GoIntoShellMode() {
9998
26
  if (master_->IsShellMode()) {
9999
0
    return STATUS(IllegalState, "Master is already in shell mode.");
10000
0
  }
10001
10002
26
  LOG(INFO) << "Starting going into shell mode.";
10003
26
  master_->SetShellMode(true);
10004
10005
26
  {
10006
26
    LockGuard lock(mutex_);
10007
26
    RETURN_NOT_OK(sys_catalog_->GoIntoShellMode());
10008
26
    background_tasks_->Shutdown();
10009
26
    background_tasks_.reset();
10010
26
  }
10011
26
  {
10012
26
    std::lock_guard<std::mutex> l(remote_bootstrap_mtx_);
10013
26
    tablet_exists_ = false;
10014
26
  }
10015
10016
26
  LOG(INFO) << "Done going into shell mode.";
10017
10018
26
  return Status::OK();
10019
26
}
10020
10021
206
Status CatalogManager::GetClusterConfig(GetMasterClusterConfigResponsePB* resp) {
10022
206
  return GetClusterConfig(resp->mutable_cluster_config());
10023
206
}
10024
10025
566k
Status CatalogManager::GetClusterConfig(SysClusterConfigEntryPB* config) {
10026
56
  DCHECK(cluster_config_) << "Missing cluster config for master!";
10027
566k
  auto l = cluster_config_->LockForRead();
10028
566k
  *config = l->pb;
10029
566k
  return Status::OK();
10030
566k
}
10031
10032
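// Validates and applies a cluster config update: records the initial load for any new server or
// leader blacklist, rejects version mismatches, cluster UUID changes, read replicas without a
// placement uuid, and invalid live placements, then bumps the version and persists the config.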
Status CatalogManager::SetClusterConfig(
10033
111
    const ChangeMasterClusterConfigRequestPB* req, ChangeMasterClusterConfigResponsePB* resp) {
10034
111
  SysClusterConfigEntryPB config(req->cluster_config());
10035
10036
111
  if (config.has_server_blacklist()) {
10037
20
    config.mutable_server_blacklist()->set_initial_replica_load(narrow_cast<int32_t>(
10038
20
        GetNumRelevantReplicas(config.server_blacklist(), false /* leaders_only */)));
10039
20
    LOG(INFO) << Format("Set blacklist of total tservers: $0, with initial load: $1",
10040
20
                    config.server_blacklist().hosts().size(),
10041
20
                    config.server_blacklist().initial_replica_load());
10042
20
  }
10043
111
  if (config.has_leader_blacklist()) {
10044
14
    config.mutable_leader_blacklist()->set_initial_leader_load(narrow_cast<int32_t>(
10045
14
        GetNumRelevantReplicas(config.leader_blacklist(), true /* leaders_only */)));
10046
14
    LOG(INFO) << Format("Set leader blacklist of total tservers: $0, with initial load: $1",
10047
14
                    config.leader_blacklist().hosts().size(),
10048
14
                    config.leader_blacklist().initial_leader_load());
10049
14
  }
10050
10051
111
  auto l = cluster_config_->LockForWrite();
10052
  // We should only set the config if the caller provided us with a valid update to the
10053
  // existing config.
10054
111
  if (l->pb.version() != config.version()) {
10055
0
    Status s = STATUS_SUBSTITUTE(IllegalState,
10056
0
      "Config version does not match, got $0, but most recent one is $1. Should call Get again",
10057
0
      config.version(), l->pb.version());
10058
0
    return SetupError(resp->mutable_error(), MasterErrorPB::CONFIG_VERSION_MISMATCH, s);
10059
0
  }
10060
10061
111
  if (config.cluster_uuid() != l->pb.cluster_uuid()) {
10062
1
    Status s = STATUS(InvalidArgument, "Config cluster UUID cannot be updated");
10063
1
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10064
1
  }
10065
10066
  // TODO(bogdan): should this live here?
10067
110
  const ReplicationInfoPB& replication_info = config.replication_info();
10068
118
  for (int i = 0; i < replication_info.read_replicas_size(); i++) {
10069
8
    if (!replication_info.read_replicas(i).has_placement_uuid()) {
10070
0
      Status s = STATUS(IllegalState,
10071
0
                        "All read-only clusters must have a placement uuid specified");
10072
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10073
0
    }
10074
8
  }
10075
10076
  // Validate placement information according to rules defined.
10077
110
  if (replication_info.has_live_replicas()) {
10078
81
    Status s = CatalogManagerUtil::IsPlacementInfoValid(replication_info.live_replicas());
10079
81
    if (!s.ok()) {
10080
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10081
0
    }
10082
110
  }
10083
10084
110
  l.mutable_data()->pb.CopyFrom(config);
10085
  // Bump the config version, to indicate an update.
10086
110
  l.mutable_data()->pb.set_version(config.version() + 1);
10087
10088
110
  LOG(INFO) << "Updating cluster config to " << config.version() + 1;
10089
10090
110
  RETURN_NOT_OK(sys_catalog_->Upsert(leader_ready_term(), cluster_config_));
10091
10092
110
  l.Commit();
10093
10094
110
  return Status::OK();
10095
110
}
10096
10097
Status CatalogManager::ValidateReplicationInfo(
10098
32.3k
    const ValidateReplicationInfoRequestPB* req, ValidateReplicationInfoResponsePB* resp) {
10099
32.3k
  TSDescriptorVector all_ts_descs;
10100
32.3k
  {
10101
32.3k
    BlacklistSet blacklist = BlacklistSetFromPB();
10102
32.3k
    master_->ts_manager()->GetAllLiveDescriptors(&all_ts_descs, blacklist);
10103
32.3k
  }
10104
  // We don't need any validation checks for read replica placements
10105
  // because they aren't part of any Raft quorum underneath.
10106
  // Technically, it is OK to have even zero read replica nodes upfront;
10107
  // validation is only needed for the primary cluster replicas.
10108
32.3k
  TSDescriptorVector ts_descs;
10109
32.3k
  GetTsDescsFromPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, &ts_descs);
10110
32.3k
  Status s = CheckValidPlacementInfo(req->replication_info().live_replicas(), all_ts_descs, resp);
10111
32.3k
  if (!s.ok()) {
10112
6
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_TABLE_REPLICATION_INFO, s);
10113
6
  }
10114
32.3k
  return Status::OK();
10115
32.3k
}
10116
10117
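// Rebuilds the affinitized-leaders list from the requested preferred zones, verifying that each
// zone appears in the live placement info, then bumps the config version and persists it.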
Status CatalogManager::SetPreferredZones(
10118
3
    const SetPreferredZonesRequestPB* req, SetPreferredZonesResponsePB* resp) {
10119
3
  auto l = cluster_config_->LockForWrite();
10120
3
  auto replication_info = l.mutable_data()->pb.mutable_replication_info();
10121
3
  replication_info->clear_affinitized_leaders();
10122
10123
5
  for (const auto& cloud_info : req->preferred_zones()) {
10124
5
    const auto& placement_info = replication_info->live_replicas();
10125
5
    if (!CatalogManagerUtil::DoesPlacementInfoContainCloudInfo(placement_info, cloud_info)) {
10126
0
      Status s = STATUS_FORMAT(InvalidArgument, "Placement info $0 does not contain cloud info $1",
10127
0
                               placement_info, TSDescriptor::generate_placement_id(cloud_info));
10128
0
      return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10129
0
    }
10130
5
    *replication_info->add_affinitized_leaders() = cloud_info;
10131
5
  }
10132
10133
3
  l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1);
10134
10135
3
  LOG(INFO) << "Updating cluster config to " << l.mutable_data()->pb.version();
10136
10137
3
  Status s = sys_catalog_->Upsert(leader_ready_term(), cluster_config_);
10138
3
  if (!s.ok()) {
10139
0
    return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
10140
0
  }
10141
10142
3
  l.Commit();
10143
10144
3
  return Status::OK();
10145
3
}
10146
10147
56.0k
Result<size_t> CatalogManager::GetReplicationFactor() {
10148
2
  DCHECK(cluster_config_) << "Missing cluster config for master!";
10149
56.0k
  auto l = cluster_config_->LockForRead();
10150
56.0k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10151
56.0k
  return GetNumReplicasFromPlacementInfo(replication_info.live_replicas());
10152
56.0k
}
10153
10154
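// For system tablets the replication factor is the size of the master Raft config; for user
// tablets it is the expected number of live plus read replicas from the cluster config.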
Result<size_t> CatalogManager::GetReplicationFactorForTablet(
10155
9.22k
    const scoped_refptr<TabletInfo>& tablet) {
10156
  // For system tables, the set of replicas is always the set of masters.
10157
9.22k
  if (system_tablets_.find(tablet->id()) != system_tablets_.end()) {
10158
90
    consensus::ConsensusStatePB master_consensus;
10159
90
    RETURN_NOT_OK(GetCurrentConfig(&master_consensus));
10160
90
    return master_consensus.config().peers().size();
10161
9.13k
  }
10162
9.13k
  int num_live_replicas = 0, num_read_replicas = 0;
10163
9.13k
  GetExpectedNumberOfReplicas(&num_live_replicas, &num_read_replicas);
10164
9.13k
  return num_live_replicas + num_read_replicas;
10165
9.13k
}
10166
10167
185k
void CatalogManager::GetExpectedNumberOfReplicas(int* num_live_replicas, int* num_read_replicas) {
10168
185k
  auto l = cluster_config_->LockForRead();
10169
185k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10170
185k
  *num_live_replicas = narrow_cast<int>(GetNumReplicasFromPlacementInfo(
10171
185k
      replication_info.live_replicas()));
10172
711
  for (const auto& read_replica_placement_info : replication_info.read_replicas()) {
10173
711
    *num_read_replicas += read_replica_placement_info.num_replicas();
10174
711
  }
10175
185k
}
10176
10177
2.95k
string CatalogManager::placement_uuid() const {
10178
0
  DCHECK(cluster_config_) << "Missing cluster config for master!";
10179
2.95k
  auto l = cluster_config_->LockForRead();
10180
2.95k
  const ReplicationInfoPB& replication_info = l->pb.replication_info();
10181
2.95k
  return replication_info.live_replicas().placement_uuid();
10182
2.95k
}
10183
10184
Status CatalogManager::IsLoadBalanced(const IsLoadBalancedRequestPB* req,
10185
203
                                      IsLoadBalancedResponsePB* resp) {
10186
203
  if (req->has_expected_num_servers()) {
10187
202
    TSDescriptorVector ts_descs;
10188
202
    master_->ts_manager()->GetAllLiveDescriptors(&ts_descs);
10189
10190
202
    if (implicit_cast<size_t>(req->expected_num_servers()) > ts_descs.size()) {
10191
9
      Status s = STATUS_SUBSTITUTE(IllegalState,
10192
9
          "Found $0, which is below the expected number of servers $1.",
10193
9
          ts_descs.size(), req->expected_num_servers());
10194
9
      return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s);
10195
9
    }
10196
194
  }
10197
10198
194
  Status s = load_balance_policy_->IsIdle();
10199
194
  if (!s.ok()) {
10200
174
    return SetupError(resp->mutable_error(), MasterErrorPB::CAN_RETRY_LOAD_BALANCE_CHECK, s);
10201
174
  }
10202
10203
20
  return Status::OK();
10204
20
}
10205
10206
Status CatalogManager::IsLoadBalancerIdle(const IsLoadBalancerIdleRequestPB* req,
10207
2.25k
                                          IsLoadBalancerIdleResponsePB* resp) {
10208
2.25k
  Status s = load_balance_policy_->IsIdle();
10209
2.25k
  if (!s.ok()) {
10210
1.72k
    return SetupError(resp->mutable_error(), MasterErrorPB::LOAD_BALANCER_RECENTLY_ACTIVE, s);
10211
1.72k
  }
10212
10213
531
  return Status::OK();
10214
531
}
10215
10216
Status CatalogManager::AreLeadersOnPreferredOnly(const AreLeadersOnPreferredOnlyRequestPB* req,
10217
153
                                                 AreLeadersOnPreferredOnlyResponsePB* resp) {
10218
  // If we have cluster replication info, then only fetch live tservers (ignore read replicas).
10219
153
  TSDescriptorVector ts_descs;
10220
153
  string live_replicas_placement_uuid = "";
10221
153
  {
10222
153
    auto l = cluster_config_->LockForRead();
10223
153
    const ReplicationInfoPB& cluster_replication_info = l->pb.replication_info();
10224
153
    if (cluster_replication_info.has_live_replicas()) {
10225
116
      live_replicas_placement_uuid = cluster_replication_info.live_replicas().placement_uuid();
10226
116
    }
10227
153
  }
10228
10229
153
  {
10230
153
    BlacklistSet blacklist = BlacklistSetFromPB();
10231
153
    if (live_replicas_placement_uuid.empty()) {
10232
152
      master_->ts_manager()->GetAllLiveDescriptors(&ts_descs, blacklist);
10233
1
    } else {
10234
1
      master_->ts_manager()->GetAllLiveDescriptorsInCluster(
10235
1
          &ts_descs, live_replicas_placement_uuid,
10236
1
          blacklist);
10237
1
    }
10238
153
  }
10239
10240
  // We only need to fetch the tables if transaction tables are not using preferred zones.
10241
153
  vector<TableInfoPtr> tables;
10242
153
  if (!FLAGS_transaction_tables_use_preferred_zones) {
10243
153
    tables = master_->catalog_manager()->GetTables(GetTablesMode::kRunning);
10244
153
  }
10245
10246
153
  auto l = cluster_config_->LockForRead();
10247
153
  Status s = CatalogManagerUtil::AreLeadersOnPreferredOnly(
10248
153
      ts_descs, l->pb.replication_info(), tables);
10249
153
  if (!s.ok()) {
10250
138
    return SetupError(
10251
138
        resp->mutable_error(), MasterErrorPB::CAN_RETRY_ARE_LEADERS_ON_PREFERRED_ONLY_CHECK, s);
10252
138
  }
10253
10254
15
  return Status::OK();
10255
15
}
10256
10257
1.23k
int64_t CatalogManager::GetNumRelevantReplicas(const BlacklistPB& blacklist, bool leaders_only) {
10258
1.23k
  int64_t res = 0;
10259
1.23k
  SharedLock lock(mutex_);
10260
43.0k
  for (const TabletInfoMap::value_type& entry : *tablet_map_) {
10261
43.0k
    scoped_refptr<TabletInfo> tablet = entry.second;
10262
43.0k
    auto l = tablet->LockForRead();
10263
    // We deliberately skip the being-created check so that the initial load is not under-accounted.
10264
43.0k
    if (!tablet->table() ||
10265
43.0k
        PREDICT_FALSE(l->is_deleted())) {
10266
0
      continue;
10267
0
    }
10268
10269
43.0k
    auto locs = tablet->GetReplicaLocations();
10270
66.2k
    for (const auto& replica : *locs) {
10271
66.2k
      if (leaders_only && replica.second.role != PeerRole::LEADER) {
10272
7.38k
        continue;
10273
7.38k
      }
10274
180k
      for (int i = 0; i < blacklist.hosts_size(); i++) {
10275
144k
        if (replica.second.ts_desc->IsRunningOn(blacklist.hosts(i))) {
10276
23.4k
          ++res;
10277
23.4k
          break;
10278
23.4k
        }
10279
144k
      }
10280
58.9k
    }
10281
43.0k
  }
10282
10283
1.23k
  return res;
10284
1.23k
}
10285
10286
Status CatalogManager::FillHeartbeatResponse(const TSHeartbeatRequestPB* req,
10287
0
                                             TSHeartbeatResponsePB* resp) {
10288
0
  return Status::OK();
10289
0
}
10290
10291
1.00k
Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp) {
10292
1.00k
  return GetLoadMoveCompletionPercent(resp, false);
10293
1.00k
}
10294
10295
194
Status CatalogManager::GetLeaderBlacklistCompletionPercent(GetLoadMovePercentResponsePB* resp) {
10296
194
  return GetLoadMoveCompletionPercent(resp, true);
10297
194
}
10298
10299
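// Reports blacklist (data or leader) move progress as 100 - 100 * remaining / initial load,
// treating the post-election warm-up window and a zero initial load as special cases.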
Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp,
10300
1.20k
                                                    bool blacklist_leader) {
10301
1.20k
  auto l = cluster_config_->LockForRead();
10302
10303
  // Fine to pass in empty defaults if server_blacklist or leader_blacklist is not filled.
10304
1.00k
  const BlacklistPB& state = blacklist_leader ? l->pb.leader_blacklist() : l->pb.server_blacklist();
10305
1.20k
  int64_t blacklist_replicas = GetNumRelevantReplicas(state, blacklist_leader);
10306
1.20k
  int64_t initial_load = (blacklist_leader) ?
10307
1.00k
                                state.initial_leader_load(): state.initial_replica_load();
10308
  // If we are starting up and don't find any load on the tservers, report progress as 0.
10309
  // We expect this state to clear within blacklist_progress_initial_delay_secs; if the
10310
  // blacklisted tservers still report zero load after that time, it means the transfer
10311
  // has completed successfully.
10312
1.20k
  if (blacklist_replicas == 0 &&
10313
580
  TimeSinceElectedLeader() <= MonoDelta::FromSeconds(FLAGS_blacklist_progress_initial_delay_secs)) {
10314
466
      LOG(INFO) << "Master leadership has changed. Reporting progress as 0 until the catalog " <<
10315
466
                   "manager gets the correct estimates of the remaining load on the blacklisted" <<
10316
466
                   "tservers.";
10317
466
      resp->set_percent(0);
10318
466
      resp->set_total(initial_load);
10319
466
      resp->set_remaining(initial_load);
10320
466
      return Status::OK();
10321
466
  }
10322
10323
  // On change of master leader, initial_load_ information may be lost temporarily. Reset to
10324
  // current value to avoid reporting progress percent as 100. Note that doing so will report
10325
  // progress percent as 0 instead.
10326
  // TODO(Sanket): This might no longer be relevant after we persist and load the initial load
10327
  // on failover. Need to investigate.
10328
737
  if (initial_load < blacklist_replicas) {
10329
0
    LOG(INFO) << Format("Initial load: $0, current load: $1."
10330
0
              " Initial load is less than the current load. Probably a master leader change."
10331
0
              " Reporting progress as 0", state.initial_replica_load(),
10332
0
              blacklist_replicas);
10333
0
    initial_load = blacklist_replicas;
10334
0
  }
10335
10336
737
  LOG(INFO) << "Blacklisted count " << blacklist_replicas
10337
737
            << " across " << state.hosts_size()
10338
737
            << " servers, with initial load " << initial_load;
10339
10340
  // Case when the blacklisted servers did not have any starting load.
10341
737
  if (initial_load == 0) {
10342
32
    resp->set_percent(100);
10343
32
    return Status::OK();
10344
32
  }
10345
10346
705
  resp->set_percent(
10347
705
      100 - (static_cast<double>(blacklist_replicas) * 100 / initial_load));
10348
705
  resp->set_remaining(blacklist_replicas);
10349
705
  resp->set_total(initial_load);
10350
10351
705
  return Status::OK();
10352
705
}
10353
10354
2.10k
void CatalogManager::AbortAndWaitForAllTasks(const vector<scoped_refptr<TableInfo>>& tables) {
10355
1.85k
  for (const auto& t : tables) {
10356
0
    VLOG(1) << "Aborting tasks for table " << t->ToString();
10357
1.85k
    t->AbortTasksAndClose();
10358
1.85k
  }
10359
1.85k
  for (const auto& t : tables) {
10360
0
    VLOG(1) << "Waiting on Aborting tasks for table " << t->ToString();
10361
1.85k
    t->WaitTasksCompletion();
10362
1.85k
  }
10363
0
  VLOG(1) << "Waiting on Aborting tasks done";
10364
2.10k
}
10365
10366
242k
void CatalogManager::HandleNewTableId(const TableId& table_id) {
10367
242k
  if (table_id == kPgProcTableId) {
10368
    // Needed to track whether initdb has started running.
10369
363
    pg_proc_exists_.store(true, std::memory_order_release);
10370
363
  }
10371
242k
}
10372
10373
243k
scoped_refptr<TableInfo> CatalogManager::NewTableInfo(TableId id) {
10374
243k
  return make_scoped_refptr<TableInfo>(id, tasks_tracker_);
10375
243k
}
10376
10377
212k
Status CatalogManager::ScheduleTask(std::shared_ptr<RetryingTSRpcTask> task) {
10378
212k
  Status s = async_task_pool_->SubmitFunc([task]() {
10379
212k
      WARN_NOT_OK(task->Run(), "Failed task");
10380
212k
  });
10381
  // If we are not able to enqueue, abort the task.
10382
212k
  if (!s.ok()) {
10383
0
    task->AbortAndReturnPrevState(s);
10384
0
  }
10385
212k
  return s;
10386
212k
}
10387
10388
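// Appends the given table to all_tables unless it is hidden or deleting. When the corresponding
// flags are set, also collects the parent colocated table (deduplicated across calls) and the
// table's indexes; index collection is not supported for YSQL tables.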
Status CatalogManager::CollectTable(
10389
    const TableDescription& table_description,
10390
    CollectFlags flags,
10391
    std::vector<TableDescription>* all_tables,
10392
7
    std::unordered_set<NamespaceId>* parent_colocated_table_ids) {
10393
7
  auto lock = table_description.table_info->LockForRead();
10394
7
  if (lock->started_hiding()) {
10395
0
    VLOG_WITH_PREFIX_AND_FUNC(4)
10396
0
        << "Rejected hidden table: " << AsString(table_description.table_info);
10397
0
    return Status::OK();
10398
0
  }
10399
7
  if (lock->started_deleting()) {
10400
0
    VLOG_WITH_PREFIX_AND_FUNC(4)
10401
0
        << "Rejected deleted table: " << AsString(table_description.table_info);
10402
0
    return Status::OK();
10403
0
  }
10404
7
  if (flags.Test(CollectFlag::kIncludeParentColocatedTable) && lock->pb.colocated()) {
10405
    // If a table is colocated, add its parent colocated table as well.
10406
0
    const auto parent_table_id =
10407
0
        table_description.namespace_info->id() + kColocatedParentTableIdSuffix;
10408
0
    auto result = parent_colocated_table_ids->insert(parent_table_id);
10409
0
    if (result.second) {
10410
      // We have not processed this parent table id yet, so do that now.
10411
0
      TableIdentifierPB parent_table_pb;
10412
0
      parent_table_pb.set_table_id(parent_table_id);
10413
0
      parent_table_pb.mutable_namespace_()->set_id(table_description.namespace_info->id());
10414
0
      all_tables->push_back(VERIFY_RESULT(DescribeTable(
10415
0
          parent_table_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress))));
10416
0
    }
10417
0
  }
10418
7
  all_tables->push_back(table_description);
10419
10420
7
  if (flags.Test(CollectFlag::kAddIndexes)) {
10421
0
    TRACE(Substitute("Locking object with id $0", table_description.table_info->id()));
10422
10423
0
    if (lock->is_index()) {
10424
0
      return STATUS(InvalidArgument, "Expected table, but found index",
10425
0
                    table_description.table_info->id(),
10426
0
                    MasterError(MasterErrorPB::INVALID_TABLE_TYPE));
10427
0
    }
10428
10429
0
    if (lock->table_type() == PGSQL_TABLE_TYPE) {
10430
0
      return STATUS(InvalidArgument, "Getting indexes for YSQL table is not supported",
10431
0
                    table_description.table_info->id(),
10432
0
                    MasterError(MasterErrorPB::INVALID_TABLE_TYPE));
10433
0
    }
10434
10435
0
    auto collect_index_flags = flags;
10436
    // No need to collect indexes for an index.
10437
0
    collect_index_flags.Reset(CollectFlag::kAddIndexes);
10438
0
    for (const auto& index_info : lock->pb.indexes()) {
10439
0
      LOG_IF(DFATAL, table_description.table_info->id() != index_info.indexed_table_id())
10440
0
              << "Wrong indexed table id in index descriptor";
10441
0
      TableIdentifierPB index_id_pb;
10442
0
      index_id_pb.set_table_id(index_info.table_id());
10443
0
      index_id_pb.mutable_namespace_()->set_id(table_description.namespace_info->id());
10444
0
      auto index_description = VERIFY_RESULT(DescribeTable(
10445
0
          index_id_pb, flags.Test(CollectFlag::kSucceedIfCreateInProgress)));
10446
0
      RETURN_NOT_OK(CollectTable(
10447
0
          index_description, collect_index_flags, all_tables, parent_colocated_table_ids));
10448
0
    }
10449
0
  }
10450
10451
7
  return Status::OK();
10452
7
}
10453
10454
Result<vector<TableDescription>> CatalogManager::CollectTables(
10455
    const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
10456
    CollectFlags flags,
10457
7
    std::unordered_set<NamespaceId>* namespaces) {
10458
7
  std::vector<std::pair<TableInfoPtr, CollectFlags>> table_with_flags;
10459
10460
7
  {
10461
7
    SharedLock lock(mutex_);
10462
7
    for (const auto& table_id_pb : table_identifiers) {
10463
7
      if (table_id_pb.table_name().empty() && table_id_pb.table_id().empty() &&
10464
0
          table_id_pb.has_namespace_()) {
10465
0
        auto namespace_info = FindNamespaceUnlocked(table_id_pb.namespace_());
10466
0
        if (!namespace_info.ok()) {
10467
0
          if (namespace_info.status().IsNotFound()) {
10468
0
            continue;
10469
0
          }
10470
0
          return namespace_info.status();
10471
0
        }
10472
0
        if (namespaces) {
10473
0
          namespaces->insert((**namespace_info).id());
10474
0
        }
10475
10476
10477
0
        auto ns_collect_flags = flags;
10478
        // Don't collect indexes, since they should be in the same namespace and will be collected
10479
        // as regular tables.
10480
        // This is necessary because we don't support kAddIndexes for YSQL tables.
10481
0
        ns_collect_flags.Reset(CollectFlag::kAddIndexes);
10482
0
        VLOG_WITH_PREFIX_AND_FUNC(1)
10483
0
            << "Collecting all tables from: " << (**namespace_info).ToString() << ", specified as: "
10484
0
            << table_id_pb.namespace_().ShortDebugString();
10485
0
        for (const auto& id_and_table : *table_ids_map_) {
10486
0
          if (id_and_table.second->is_system()) {
10487
0
            VLOG_WITH_PREFIX_AND_FUNC(4) << "Rejected system table: " << AsString(id_and_table);
10488
0
            continue;
10489
0
          }
10490
0
          auto lock = id_and_table.second->LockForRead();
10491
0
          if (lock->namespace_id() != (**namespace_info).id()) {
10492
0
            VLOG_WITH_PREFIX_AND_FUNC(4)
10493
0
                << "Rejected table from other namespace: " << AsString(id_and_table);
10494
0
            continue;
10495
0
          }
10496
0
          VLOG_WITH_PREFIX_AND_FUNC(4) << "Accepted: " << AsString(id_and_table);
10497
0
          table_with_flags.emplace_back(id_and_table.second, ns_collect_flags);
10498
0
        }
10499
7
      } else {
10500
7
        auto table = VERIFY_RESULT(FindTableUnlocked(table_id_pb));
10501
0
        VLOG_WITH_PREFIX_AND_FUNC(1) << "Collecting table: " << table->ToString();
10502
7
        table_with_flags.emplace_back(table, flags);
10503
7
      }
10504
7
    }
10505
7
  }
10506
10507
7
  std::sort(table_with_flags.begin(), table_with_flags.end(), [](const auto& p1, const auto& p2) {
10508
0
    return p1.first->id() < p2.first->id();
10509
0
  });
10510
7
  std::vector<TableDescription> all_tables;
10511
7
  std::unordered_set<NamespaceId> parent_colocated_table_ids;
10512
7
  const TableId* table_id = nullptr;
10513
7
  for (auto& table_and_flags : table_with_flags) {
10514
7
    if (table_id && *table_id == table_and_flags.first->id()) {
10515
0
      return STATUS_FORMAT(InternalError, "Table collected twice $0", *table_id);
10516
0
    }
10517
7
    auto description = VERIFY_RESULT(DescribeTable(
10518
7
        table_and_flags.first,
10519
7
        table_and_flags.second.Test(CollectFlag::kSucceedIfCreateInProgress)));
10520
7
    RETURN_NOT_OK(CollectTable(
10521
7
        description, table_and_flags.second, &all_tables, &parent_colocated_table_ids));
10522
7
    table_id = &table_and_flags.first->id();
10523
7
  }
10524
10525
7
  return all_tables;
10526
7
}
10527
10528
Result<std::vector<TableDescription>> CatalogManager::CollectTables(
10529
    const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
10530
    bool add_indexes,
10531
7
    bool include_parent_colocated_table) {
10532
7
  CollectFlags flags;
10533
7
  flags.SetIf(CollectFlag::kAddIndexes, add_indexes);
10534
7
  flags.SetIf(CollectFlag::kIncludeParentColocatedTable, include_parent_colocated_table);
10535
7
  return CollectTables(table_identifiers, flags);
10536
7
}
10537
10538
2.00k
Status CatalogManager::GetYQLPartitionsVTable(std::shared_ptr<SystemTablet>* tablet) {
10539
2.00k
  scoped_refptr<TableInfo> table = FindPtrOrNull(table_names_map_,
10540
2.00k
      std::make_pair(kSystemNamespaceId, kSystemPartitionsTableName));
10541
2.00k
  SCHECK(table != nullptr, NotFound, "YQL system.partitions table not found");
10542
10543
2.00k
  auto tablets = table->GetTablets();
10544
2.00k
  SCHECK(tablets.size() == 1, NotFound, "YQL system.partitions tablet not found");
10545
2.00k
  *tablet = std::dynamic_pointer_cast<SystemTablet>(
10546
2.00k
      VERIFY_RESULT(GetSystemTablet(tablets[0]->tablet_id())));
10547
2.00k
  return Status::OK();
10548
2.00k
}
10549
10550
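// Background task that refreshes the cached system.partitions vtable (full rebuild or
// incremental cache update, depending on flags) and then reschedules itself.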
15.9k
void CatalogManager::RebuildYQLSystemPartitions() {
10551
15.9k
  if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask() ||
10552
15.9k
      YQLPartitionsVTable::GeneratePartitionsVTableOnChanges()) {
10553
15.9k
    SCOPED_LEADER_SHARED_LOCK(l, this);
10554
15.9k
    if (l.catalog_status().ok() && l.leader_status().ok()) {
10555
5.03k
      if (system_partitions_tablet_ != nullptr) {
10556
5.03k
        Status s;
10557
5.03k
        if (YQLPartitionsVTable::GeneratePartitionsVTableWithBgTask()) {
10558
          // If we are not generating the vtable on changes, then we need to do a full refresh.
10559
2
          s = ResultToStatus(GetYqlPartitionsVtable().GenerateAndCacheData());
10560
5.02k
        } else {
10561
          // Otherwise, we can simply update the cached vtable with the internal map.
10562
5.02k
          s = GetYqlPartitionsVtable().UpdateCache();
10563
5.02k
        }
10564
5.03k
        if (!s.ok()) {
10565
0
          LOG(ERROR) << "Error rebuilding system.partitions: " << s.ToString();
10566
0
        }
10567
0
      } else {
10568
0
        LOG(ERROR) << "Error finding system.partitions vtable.";
10569
0
      }
10570
5.03k
    }
10571
15.9k
  }
10572
10573
15.9k
  auto wait_time = FLAGS_partitions_vtable_cache_refresh_secs * 1s;
10574
15.9k
  if (wait_time <= 0s) {
10575
15.8k
    wait_time = kDefaultYQLPartitionsRefreshBgTaskSleep;
10576
15.8k
  }
10577
10.6k
  refresh_yql_partitions_task_.Schedule([this](const Status& status) {
10578
10.6k
    WARN_NOT_OK(
10579
10.6k
        background_tasks_thread_pool_->SubmitFunc([this]() { RebuildYQLSystemPartitions(); }),
10580
10.6k
        "Failed to schedule: RebuildYQLSystemPartitions");
10581
10.6k
  }, wait_time);
10582
15.9k
}
10583
10584
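// If affinitized leader zones are configured and this master is not in one of them, tries to
// step down sys catalog leadership to a master that is.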
90.0k
Status CatalogManager::SysCatalogRespectLeaderAffinity() {
10585
90.0k
  auto l = cluster_config_->LockForRead();
10586
10587
90.0k
  const auto& affinitized_leaders = l->pb.replication_info().affinitized_leaders();
10588
90.0k
  if (affinitized_leaders.empty()) {
10589
89.8k
    return Status::OK();
10590
89.8k
  }
10591
10592
192
  for (const CloudInfoPB& cloud_info : affinitized_leaders) {
10593
    // Do nothing if already in an affinitized zone.
10594
192
    if (CatalogManagerUtil::IsCloudInfoEqual(cloud_info, server_registration_.cloud_info())) {
10595
86
      return Status::OK();
10596
86
    }
10597
192
  }
10598
10599
  // Not in an affinitized zone; try to find a master to send a step-down request to.
10600
80
  std::vector<ServerEntryPB> masters;
10601
80
  RETURN_NOT_OK(master_->ListMasters(&masters));
10602
10603
142
  for (const ServerEntryPB& master : masters) {
10604
142
    auto master_cloud_info = master.registration().cloud_info();
10605
10606
180
    for (const CloudInfoPB& config_cloud_info : affinitized_leaders) {
10607
180
      if (CatalogManagerUtil::IsCloudInfoEqual(config_cloud_info, master_cloud_info)) {
10608
0
        if (PREDICT_FALSE(
10609
0
            GetAtomicFlag(&FLAGS_TEST_crash_server_on_sys_catalog_leader_affinity_move))) {
10610
0
          LOG_WITH_PREFIX(FATAL) << "For test: Crashing the server instead of performing sys "
10611
0
                                    "catalog leader affinity move.";
10612
0
        }
10613
0
        YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10)
10614
0
            << "Sys catalog tablet is not in an affinitized zone, "
10615
0
            << "sending step down request to master uuid "
10616
0
            << master.instance_id().permanent_uuid()
10617
0
            << " in zone "
10618
0
            << TSDescriptor::generate_placement_id(master_cloud_info);
10619
0
        std::shared_ptr<TabletPeer> tablet_peer;
10620
0
        RETURN_NOT_OK(GetTabletPeer(sys_catalog_->tablet_id(), &tablet_peer));
10621
10622
0
        consensus::LeaderStepDownRequestPB req;
10623
0
        req.set_tablet_id(sys_catalog_->tablet_id());
10624
0
        req.set_dest_uuid(sys_catalog_->tablet_peer()->permanent_uuid());
10625
0
        req.set_new_leader_uuid(master.instance_id().permanent_uuid());
10626
10627
0
        consensus::LeaderStepDownResponsePB resp;
10628
0
        RETURN_NOT_OK(tablet_peer->consensus()->StepDown(&req, &resp));
10629
0
        if (resp.has_error()) {
10630
0
          YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 10) << "Step down failed: "
10631
0
                                                    << resp.error().status().message();
10632
0
          break;
10633
0
        }
10634
0
        LOG_WITH_PREFIX(INFO) << "Successfully stepped down to new master";
10635
0
        return Status::OK();
10636
0
      }
10637
180
    }
10638
142
  }
10639
10640
80
  return STATUS(NotFound, "Couldn't step down to a master in an affinitized zone");
10641
80
}
10642
10643
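// Converts the persisted server blacklist from the cluster config into an in-memory set of
// host-ports.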
149k
BlacklistSet CatalogManager::BlacklistSetFromPB() const {
10644
149k
  auto l = cluster_config_->LockForRead();
10645
10646
149k
  const auto& blacklist_pb = l->pb.server_blacklist();
10647
149k
  BlacklistSet blacklist_set;
10648
149k
  for (int i = 0; i < blacklist_pb.hosts_size(); i++) {
10649
104
    blacklist_set.insert(HostPortFromPB(blacklist_pb.hosts(i)));
10650
104
  }
10651
10652
149k
  return blacklist_set;
10653
149k
}
10654
10655
void CatalogManager::ProcessTabletStorageMetadata(
10656
    const std::string& ts_uuid,
10657
285k
    const TabletDriveStorageMetadataPB& storage_metadata) {
10658
285k
  const string& tablet_id = storage_metadata.tablet_id();
10659
285k
  scoped_refptr<TabletInfo> tablet;
10660
285k
  {
10661
285k
    SharedLock lock(mutex_);
10662
285k
    tablet = FindPtrOrNull(*tablet_map_, tablet_id);
10663
285k
  }
10664
285k
  if (!tablet) {
10665
0
    VLOG(1) << Format("Tablet $0 not found on ts $1", tablet_id, ts_uuid);
10666
0
    return;
10667
0
  }
10668
285k
  TabletReplicaDriveInfo drive_info{
10669
285k
        storage_metadata.sst_file_size(),
10670
285k
        storage_metadata.wal_file_size(),
10671
285k
        storage_metadata.uncompressed_sst_file_size(),
10672
285k
        storage_metadata.may_have_orphaned_post_split_data()};
10673
285k
  tablet->UpdateReplicaDriveInfo(ts_uuid, drive_info);
10674
285k
}
10675
10676
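// Called after each successful DeleteTablet response; tries to transition the table to
// DELETED/HIDDEN immediately and persists the change to the sys catalog.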
76.8k
void CatalogManager::CheckTableDeleted(const TableInfoPtr& table) {
10677
76.8k
  if (!FLAGS_master_drop_table_after_task_response) {
10678
0
    return;
10679
0
  }
10680
  // Since this is called after every successful async DeleteTablet, if all tasks complete we can
10681
  // mark the table as DELETED/HIDDEN right away. This is desirable, as clients wait for this
10682
  // before returning success to the user.
10683
  //
10684
  // However, if tasks fail, time out, or are aborted, we still have the background thread as a
10685
  // catch-all.
10686
76.8k
  auto lock = MaybeTransitionTableToDeleted(table);
10687
76.8k
  if (!lock.locked()) {
10688
74.1k
    return;
10689
74.1k
  }
10690
2.76k
  Status s = sys_catalog_->Upsert(leader_ready_term(), table);
10691
2.76k
  if (!s.ok()) {
10692
0
    LOG_WITH_PREFIX(WARNING)
10693
0
        << "Error marking table as "
10694
0
        << (table->LockForRead()->started_deleting() ? "DELETED" : "HIDDEN") << ": " << s;
10695
0
    return;
10696
0
  }
10697
2.76k
  lock.Commit();
10698
2.76k
}
10699
10700
288k
const YQLPartitionsVTable& CatalogManager::GetYqlPartitionsVtable() const {
10701
288k
  return down_cast<const YQLPartitionsVTable&>(system_partitions_tablet_->QLStorage());
10702
288k
}
10703
10704
void CatalogManager::InitializeTableLoadState(
10705
12.6k
    const TableId& table_id, TSDescriptorVector ts_descs, CMPerTableLoadState* state) {
10706
37.1k
  for (const auto& ts : ts_descs) {
10707
    // Touch every tserver with 0 load.
10708
37.1k
    state->per_ts_load_[ts->permanent_uuid()];
10709
    // Insert into the sorted list.
10710
37.1k
    state->sorted_load_.emplace_back(ts->permanent_uuid());
10711
37.1k
  }
10712
10713
12.6k
  auto table_info = GetTableInfo(table_id);
10714
10715
12.6k
  if (!table_info) {
10716
0
    return;
10717
0
  }
10718
12.6k
  CatalogManagerUtil::FillTableLoadState(table_info, state);
10719
12.6k
}
10720
10721
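// Seeds per-tserver load entries with zero for every known tserver, then accumulates tablet
// load from all tables except system, colocated, and deleting/deleted ones.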
void CatalogManager::InitializeGlobalLoadState(
10722
11.8k
    TSDescriptorVector ts_descs, CMGlobalLoadState* state) {
10723
34.6k
  for (const auto& ts : ts_descs) {
10724
    // Touch every tserver with 0 load.
10725
34.6k
    state->per_ts_load_[ts->permanent_uuid()];
10726
34.6k
  }
10727
10728
11.8k
  SharedLock l(mutex_);
10729
2.71M
  for (const auto& id_and_info : *table_ids_map_) {
10730
    // Ignore system, colocated and deleting/deleted tables.
10731
2.71M
    {
10732
2.71M
      auto l = id_and_info.second->LockForRead();
10733
2.71M
      if (IsSystemTable(*(id_and_info.second)) ||
10734
90.2k
          id_and_info.second->IsColocatedUserTable() ||
10735
2.67M
          l->started_deleting()) {
10736
2.67M
        continue;
10737
2.67M
      }
10738
42.0k
    }
10739
42.0k
    CatalogManagerUtil::FillTableLoadState(id_and_info.second, state);
10740
42.0k
  }
10741
11.8k
}
10742
10743
}  // namespace master
10744
}  // namespace yb