YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/master/catalog_manager.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#ifndef YB_MASTER_CATALOG_MANAGER_H
34
#define YB_MASTER_CATALOG_MANAGER_H
35
36
#include <list>
37
#include <map>
38
#include <set>
39
#include <string>
40
#include <unordered_map>
41
#include <unordered_set>
42
#include <vector>
43
44
#include <boost/optional/optional_fwd.hpp>
45
#include <boost/functional/hash.hpp>
46
#include <gtest/internal/gtest-internal.h>
47
48
#include "yb/common/constants.h"
49
#include "yb/common/entity_ids.h"
50
#include "yb/common/index.h"
51
#include "yb/common/partition.h"
52
#include "yb/common/transaction.h"
53
#include "yb/client/client_fwd.h"
54
#include "yb/gutil/macros.h"
55
#include "yb/gutil/ref_counted.h"
56
#include "yb/gutil/strings/substitute.h"
57
#include "yb/gutil/thread_annotations.h"
58
59
#include "yb/master/catalog_entity_info.h"
60
#include "yb/master/catalog_manager_if.h"
61
#include "yb/master/catalog_manager_util.h"
62
#include "yb/master/master_dcl.fwd.h"
63
#include "yb/master/master_encryption.fwd.h"
64
#include "yb/master/master_defaults.h"
65
#include "yb/master/sys_catalog_initialization.h"
66
#include "yb/master/scoped_leader_shared_lock.h"
67
#include "yb/master/system_tablet.h"
68
#include "yb/master/tablet_split_candidate_filter.h"
69
#include "yb/master/tablet_split_driver.h"
70
#include "yb/master/tablet_split_manager.h"
71
#include "yb/master/ts_descriptor.h"
72
#include "yb/master/ts_manager.h"
73
#include "yb/master/ysql_tablespace_manager.h"
74
#include "yb/master/xcluster_split_driver.h"
75
76
#include "yb/rpc/rpc.h"
77
#include "yb/rpc/scheduler.h"
78
#include "yb/server/monitored_task.h"
79
#include "yb/tserver/tablet_peer_lookup.h"
80
81
#include "yb/util/debug/lock_debug.h"
82
#include "yb/util/locks.h"
83
#include "yb/util/monotime.h"
84
#include "yb/util/net/net_util.h"
85
#include "yb/util/pb_util.h"
86
#include "yb/util/promise.h"
87
#include "yb/util/random.h"
88
#include "yb/util/rw_mutex.h"
89
#include "yb/util/status_fwd.h"
90
#include "yb/util/test_macros.h"
91
#include "yb/util/version_tracker.h"
92
93
namespace yb {
94
95
class Schema;
96
class ThreadPool;
97
98
template<class T>
99
class AtomicGauge;
100
101
#define CALL_GTEST_TEST_CLASS_NAME_(...) GTEST_TEST_CLASS_NAME_(__VA_ARGS__)
102
namespace pgwrapper {
103
class CALL_GTEST_TEST_CLASS_NAME_(PgMiniTest, YB_DISABLE_TEST_IN_TSAN(DropDBMarkDeleted));
104
class CALL_GTEST_TEST_CLASS_NAME_(PgMiniTest, YB_DISABLE_TEST_IN_TSAN(DropDBUpdateSysTablet));
105
class CALL_GTEST_TEST_CLASS_NAME_(PgMiniTest, YB_DISABLE_TEST_IN_TSAN(DropDBWithTables));
106
}
107
108
class CALL_GTEST_TEST_CLASS_NAME_(MasterPartitionedTest, VerifyOldLeaderStepsDown);
109
#undef CALL_GTEST_TEST_CLASS_NAME_
110
111
namespace tablet {
112
113
struct TableInfo;
114
enum RaftGroupStatePB;
115
116
}
117
118
namespace master {
119
120
struct DeferredAssignmentActions;
121
122
using PlacementId = std::string;
123
124
typedef std::unordered_map<TabletId, TabletServerId> TabletToTabletServerMap;
125
126
typedef std::unordered_set<TableId> TableIdSet;
127
128
typedef std::unordered_map<TablespaceId, boost::optional<ReplicationInfoPB>>
129
  TablespaceIdToReplicationInfoMap;
130
131
typedef std::unordered_map<TableId, boost::optional<TablespaceId>> TableToTablespaceIdMap;
132
133
YB_STRONGLY_TYPED_BOOL(HideOnly);
134
135
// Maps a table id to the list of tablets recorded for it.
typedef std::unordered_map<TableId, std::vector<scoped_refptr<TabletInfo>>> TableToTabletInfos;
136
137
// The component of the master which tracks the state and location
138
// of tables/tablets in the cluster.
139
//
140
// This is the master-side counterpart of TSTabletManager, which tracks
141
// the state of each tablet on a given tablet-server.
142
//
143
// Thread-safe.
144
class CatalogManager :
145
    public tserver::TabletPeerLookupIf,
146
    public TabletSplitCandidateFilterIf,
147
    public TabletSplitDriverIf,
148
    public CatalogManagerIf,
149
    public XClusterSplitDriverIf {
150
  typedef std::unordered_map<NamespaceName, scoped_refptr<NamespaceInfo> > NamespaceInfoMap;
151
152
  class NamespaceNameMapper {  // Maps a YQLDatabase value to its NamespaceInfoMap.
153
   public:
154
    NamespaceInfoMap& operator[](YQLDatabase db_type);  // Mutable map selected by 'db_type'.
155
    const NamespaceInfoMap& operator[](YQLDatabase db_type) const;  // Read-only overload.
156
    void clear();  // NOTE(review): presumably empties every map in typed_maps_ - confirm.
157
158
   private:
159
    std::array<NamespaceInfoMap, 4> typed_maps_;  // NOTE(review): <array> is not included here.
160
  };
161
162
 public:
163
  explicit CatalogManager(Master *master);
164
  virtual ~CatalogManager();
165
166
  CHECKED_STATUS Init();
167
168
  bool StartShutdown();
169
  void CompleteShutdown();
170
171
  // Create Postgres sys catalog table.
172
  CHECKED_STATUS CreateYsqlSysTable(const CreateTableRequestPB* req, CreateTableResponsePB* resp);
173
174
  CHECKED_STATUS ReplicatePgMetadataChange(const tablet::ChangeMetadataRequestPB* req);
175
176
  // Reserve Postgres oids for a Postgres database.
177
  CHECKED_STATUS ReservePgsqlOids(const ReservePgsqlOidsRequestPB* req,
178
                                  ReservePgsqlOidsResponsePB* resp,
179
                                  rpc::RpcContext* rpc);
180
181
  // Get the info (currently only the version) for the ysql system catalog.
182
  CHECKED_STATUS GetYsqlCatalogConfig(const GetYsqlCatalogConfigRequestPB* req,
183
                                      GetYsqlCatalogConfigResponsePB* resp,
184
                                      rpc::RpcContext* rpc);
185
186
  // Copy Postgres sys catalog tables into a new namespace.
187
  CHECKED_STATUS CopyPgsqlSysTables(const NamespaceId& namespace_id,
188
                                    const std::vector<scoped_refptr<TableInfo>>& tables);
189
190
  // Create a new Table with the specified attributes.
191
  //
192
  // The RPC context is provided for logging/tracing purposes,
193
  // but this function does not itself respond to the RPC.
194
  CHECKED_STATUS CreateTable(const CreateTableRequestPB* req,
195
                             CreateTableResponsePB* resp,
196
                             rpc::RpcContext* rpc) override;
197
198
  // Create a new transaction status table.
199
  CHECKED_STATUS CreateTransactionStatusTable(const CreateTransactionStatusTableRequestPB* req,
200
                                              CreateTransactionStatusTableResponsePB* resp,
201
                                              rpc::RpcContext *rpc);
202
203
  // Create a transaction status table with the given name.
204
  // 'tablespace_id' is passed by pointer; NOTE(review): presumably null means "no tablespace" -
  // confirm against the definition.
  CHECKED_STATUS CreateTransactionStatusTableInternal(rpc::RpcContext* rpc,
205
                                                      const std::string& table_name,
206
                                                      const TablespaceId* tablespace_id);
207
208
  // Check if there is a transaction table whose tablespace id matches the given tablespace id.
209
  bool DoesTransactionTableExistForTablespace(
210
      const TablespaceId& tablespace_id) EXCLUDES(mutex_);
211
212
  // Create a local transaction status table for a tablespace if needed
213
  // (i.e. if it does not exist already).
214
  //
215
  // This is called during CreateTable if the table has transactions enabled and is part
216
  // of a tablespace with a placement set.
217
  CHECKED_STATUS CreateLocalTransactionStatusTableIfNeeded(
218
      rpc::RpcContext *rpc, const TablespaceId& tablespace_id) EXCLUDES(mutex_);
219
220
  // Create the global transaction status table if needed (i.e. if it does not exist already).
221
  //
222
  // This is called at the end of CreateTable if the table has transactions enabled.
223
  CHECKED_STATUS CreateGlobalTransactionStatusTableIfNeeded(rpc::RpcContext *rpc);
224
225
  // Get tablet ids of the global transaction status table.
226
  CHECKED_STATUS GetGlobalTransactionStatusTablets(
227
      GetTransactionStatusTabletsResponsePB* resp) EXCLUDES(mutex_);
228
229
  // Get ids of transaction status tables matching a given placement.
230
  Result<std::vector<TableInfoPtr>> GetPlacementLocalTransactionStatusTables(
231
      const CloudInfoPB& placement) EXCLUDES(mutex_);
232
233
  // Get tablet ids of local transaction status tables matching a given placement.
234
  CHECKED_STATUS GetPlacementLocalTransactionStatusTablets(
235
      const std::vector<TableInfoPtr>& placement_local_tables,
236
      GetTransactionStatusTabletsResponsePB* resp) EXCLUDES(mutex_);
237
238
  // Get tablet ids of the global transaction status table and local transaction status tables
239
  // matching a given placement.
240
  CHECKED_STATUS GetTransactionStatusTablets(const GetTransactionStatusTabletsRequestPB* req,
241
                                             GetTransactionStatusTabletsResponsePB* resp,
242
                                             rpc::RpcContext *rpc) EXCLUDES(mutex_);
243
244
  // Create the metrics snapshots table if needed (i.e. if it does not exist already).
245
  //
246
  // This is called at the end of CreateTable.
247
  CHECKED_STATUS CreateMetricsSnapshotsTableIfNeeded(rpc::RpcContext *rpc);
248
249
  // Get the information about an in-progress create operation.
250
  CHECKED_STATUS IsCreateTableDone(const IsCreateTableDoneRequestPB* req,
251
                                   IsCreateTableDoneResponsePB* resp) override;
252
253
  CHECKED_STATUS IsCreateTableInProgress(const TableId& table_id,
254
                                         CoarseTimePoint deadline,
255
                                         bool* create_in_progress);
256
257
  CHECKED_STATUS WaitForCreateTableToFinish(const TableId& table_id, CoarseTimePoint deadline);
258
259
  // Check if the transaction status table creation is done.
260
  //
261
  // This is called at the end of IsCreateTableDone if the table has transactions enabled.
262
  Result<bool> IsTransactionStatusTableCreated();
263
264
  // Check if the metrics snapshots table creation is done.
265
  //
266
  // This is called at the end of IsCreateTableDone.
267
  Result<bool> IsMetricsSnapshotsTableCreated();
268
269
  // Called when transaction associated with table create finishes. Verifies postgres layer present.
270
  CHECKED_STATUS VerifyTablePgLayer(scoped_refptr<TableInfo> table, bool txn_query_succeeded);
271
272
  // Truncate the specified table.
273
  //
274
  // The RPC context is provided for logging/tracing purposes,
275
  // but this function does not itself respond to the RPC.
276
  CHECKED_STATUS TruncateTable(const TruncateTableRequestPB* req,
277
                               TruncateTableResponsePB* resp,
278
                               rpc::RpcContext* rpc);
279
280
  // Get the information about an in-progress truncate operation.
281
  CHECKED_STATUS IsTruncateTableDone(const IsTruncateTableDoneRequestPB* req,
282
                                     IsTruncateTableDoneResponsePB* resp);
283
284
  // Backfill the specified index.  Currently only supported for YSQL.  YCQL does not need this as
285
  // master automatically runs backfill according to the DocDB permissions.
286
  CHECKED_STATUS BackfillIndex(const BackfillIndexRequestPB* req,
287
                               BackfillIndexResponsePB* resp,
288
                               rpc::RpcContext* rpc);
289
290
  // Gets the backfill jobs state associated with the requested table.
291
  CHECKED_STATUS GetBackfillJobs(const GetBackfillJobsRequestPB* req,
292
                                      GetBackfillJobsResponsePB* resp,
293
                                      rpc::RpcContext* rpc);
294
295
  // Backfill the indexes for the specified table.
296
  // Used for backfilling YCQL deferred indexes when triggered from yb-admin.
297
  CHECKED_STATUS LaunchBackfillIndexForTable(const LaunchBackfillIndexForTableRequestPB* req,
298
                                             LaunchBackfillIndexForTableResponsePB* resp,
299
                                             rpc::RpcContext* rpc);
300
301
  // Delete the specified table.
302
  //
303
  // The RPC context is provided for logging/tracing purposes,
304
  // but this function does not itself respond to the RPC.
305
  CHECKED_STATUS DeleteTable(const DeleteTableRequestPB* req,
306
                             DeleteTableResponsePB* resp,
307
                             rpc::RpcContext* rpc);
308
  CHECKED_STATUS DeleteTableInternal(
309
      const DeleteTableRequestPB* req, DeleteTableResponsePB* resp, rpc::RpcContext* rpc);
310
311
  // Get the information about an in-progress delete operation.
312
  CHECKED_STATUS IsDeleteTableDone(const IsDeleteTableDoneRequestPB* req,
313
                                   IsDeleteTableDoneResponsePB* resp);
314
315
  // Alter the specified table.
316
  //
317
  // The RPC context is provided for logging/tracing purposes,
318
  // but this function does not itself respond to the RPC.
319
  CHECKED_STATUS AlterTable(const AlterTableRequestPB* req,
320
                            AlterTableResponsePB* resp,
321
                            rpc::RpcContext* rpc);
322
323
  // Get the information about an in-progress alter operation.
324
  CHECKED_STATUS IsAlterTableDone(const IsAlterTableDoneRequestPB* req,
325
                                  IsAlterTableDoneResponsePB* resp);
326
327
  // Get the information about the specified table.
328
  CHECKED_STATUS GetTableSchema(const GetTableSchemaRequestPB* req,
329
                                GetTableSchemaResponsePB* resp) override;
330
  CHECKED_STATUS GetTableSchemaInternal(const GetTableSchemaRequestPB* req,
331
                                        GetTableSchemaResponsePB* resp,
332
                                        bool get_fully_applied_indexes = false);
333
334
  // Get the information about the specified tablegroup.
335
  CHECKED_STATUS GetTablegroupSchema(const GetTablegroupSchemaRequestPB* req,
336
                                     GetTablegroupSchemaResponsePB* resp);
337
338
  // Get the information about the specified colocated database.
339
  CHECKED_STATUS GetColocatedTabletSchema(const GetColocatedTabletSchemaRequestPB* req,
340
                                          GetColocatedTabletSchemaResponsePB* resp);
341
342
  // List all the running tables.
343
  CHECKED_STATUS ListTables(const ListTablesRequestPB* req,
344
                            ListTablesResponsePB* resp) override;
345
346
  // Find the tablegroup associated with the given table.
347
  boost::optional<TablegroupId> FindTablegroupByTableId(const TableId& table_id);
348
349
  CHECKED_STATUS GetTableLocations(const GetTableLocationsRequestPB* req,
350
                                   GetTableLocationsResponsePB* resp) override;
351
352
  // Lookup tablet by ID, then call GetTabletLocations below.
353
  CHECKED_STATUS GetTabletLocations(
354
      const TabletId& tablet_id,
355
      TabletLocationsPB* locs_pb,
356
      IncludeInactive include_inactive) override;
357
358
  // Look up the locations of the given tablet. The locations
359
  // vector is overwritten (not appended to).
360
  // If the tablet is not found, returns Status::NotFound.
361
  // If the tablet is not running, returns Status::ServiceUnavailable.
362
  // Otherwise, returns Status::OK and puts the result in 'locs_pb'.
363
  // This only returns tablets which are in RUNNING state.
364
  CHECKED_STATUS GetTabletLocations(
365
      scoped_refptr<TabletInfo> tablet_info,
366
      TabletLocationsPB* locs_pb,
367
      IncludeInactive include_inactive) override;
368
369
  // Returns the system tablet in catalog manager by the id.
370
  Result<std::shared_ptr<tablet::AbstractTablet>> GetSystemTablet(const TabletId& id) override;
371
372
  // Handle a tablet report from the given tablet server.
373
  //
374
  // The RPC context is provided for logging/tracing purposes,
375
  // but this function does not itself respond to the RPC.
376
  CHECKED_STATUS ProcessTabletReport(TSDescriptor* ts_desc,
377
                                     const TabletReportPB& report,
378
                                     TabletReportUpdatesPB *report_update,
379
                                     rpc::RpcContext* rpc);
380
381
  // Create a new Namespace with the specified attributes.
382
  //
383
  // The RPC context is provided for logging/tracing purposes,
384
  // but this function does not itself respond to the RPC.
385
  CHECKED_STATUS CreateNamespace(const CreateNamespaceRequestPB* req,
386
                                 CreateNamespaceResponsePB* resp,
387
                                 rpc::RpcContext* rpc) override;
388
  // Get the information about an in-progress create operation.
389
  CHECKED_STATUS IsCreateNamespaceDone(const IsCreateNamespaceDoneRequestPB* req,
390
                                       IsCreateNamespaceDoneResponsePB* resp);
391
392
  // Delete the specified Namespace.
393
  //
394
  // The RPC context is provided for logging/tracing purposes,
395
  // but this function does not itself respond to the RPC.
396
  CHECKED_STATUS DeleteNamespace(const DeleteNamespaceRequestPB* req,
397
                                 DeleteNamespaceResponsePB* resp,
398
                                 rpc::RpcContext* rpc);
399
  // Get the information about an in-progress delete operation.
400
  CHECKED_STATUS IsDeleteNamespaceDone(const IsDeleteNamespaceDoneRequestPB* req,
401
                                       IsDeleteNamespaceDoneResponsePB* resp);
402
403
  // Alter the specified Namespace.
404
  CHECKED_STATUS AlterNamespace(const AlterNamespaceRequestPB* req,
405
                                AlterNamespaceResponsePB* resp,
406
                                rpc::RpcContext* rpc);
407
408
  // User API to Delete YSQL database tables.
409
  CHECKED_STATUS DeleteYsqlDatabase(const DeleteNamespaceRequestPB* req,
410
                                    DeleteNamespaceResponsePB* resp,
411
                                    rpc::RpcContext* rpc);
412
413
  // Work to delete YSQL database tables, handled asynchronously from the User API call.
414
  void DeleteYsqlDatabaseAsync(scoped_refptr<NamespaceInfo> database);
415
416
  // Work to delete YCQL database, handled asynchronously from the User API call.
417
  void DeleteYcqlDatabaseAsync(scoped_refptr<NamespaceInfo> database);
418
419
  // Delete all tables in YSQL database.
420
  CHECKED_STATUS DeleteYsqlDBTables(const scoped_refptr<NamespaceInfo>& database);
421
422
  // List all the current namespaces.
423
  CHECKED_STATUS ListNamespaces(const ListNamespacesRequestPB* req,
424
                                ListNamespacesResponsePB* resp);
425
426
  // Get information about a namespace.
427
  CHECKED_STATUS GetNamespaceInfo(const GetNamespaceInfoRequestPB* req,
428
                                  GetNamespaceInfoResponsePB* resp,
429
                                  rpc::RpcContext* rpc);
430
431
  // Set Redis Config
432
  CHECKED_STATUS RedisConfigSet(const RedisConfigSetRequestPB* req,
433
                                RedisConfigSetResponsePB* resp,
434
                                rpc::RpcContext* rpc);
435
436
  // Get Redis Config
437
  CHECKED_STATUS RedisConfigGet(const RedisConfigGetRequestPB* req,
438
                                RedisConfigGetResponsePB* resp,
439
                                rpc::RpcContext* rpc);
440
441
  CHECKED_STATUS CreateTablegroup(const CreateTablegroupRequestPB* req,
442
                                  CreateTablegroupResponsePB* resp,
443
                                  rpc::RpcContext* rpc);
444
445
  CHECKED_STATUS DeleteTablegroup(const DeleteTablegroupRequestPB* req,
446
                                  DeleteTablegroupResponsePB* resp,
447
                                  rpc::RpcContext* rpc);
448
449
  // List all the current tablegroups for a namespace.
450
  CHECKED_STATUS ListTablegroups(const ListTablegroupsRequestPB* req,
451
                                 ListTablegroupsResponsePB* resp,
452
                                 rpc::RpcContext* rpc);
453
454
  bool HasTablegroups() override;
455
456
  // Create a new User-Defined Type with the specified attributes.
457
  //
458
  // The RPC context is provided for logging/tracing purposes,
459
  // but this function does not itself respond to the RPC.
460
  CHECKED_STATUS CreateUDType(const CreateUDTypeRequestPB* req,
461
                              CreateUDTypeResponsePB* resp,
462
                              rpc::RpcContext* rpc);
463
464
  // Delete the specified UDType.
465
  //
466
  // The RPC context is provided for logging/tracing purposes,
467
  // but this function does not itself respond to the RPC.
468
  CHECKED_STATUS DeleteUDType(const DeleteUDTypeRequestPB* req,
469
                              DeleteUDTypeResponsePB* resp,
470
                              rpc::RpcContext* rpc);
471
472
  // List all user defined types in given namespaces.
473
  CHECKED_STATUS ListUDTypes(const ListUDTypesRequestPB* req,
474
                             ListUDTypesResponsePB* resp);
475
476
  // Get the info (id, name, namespace, fields names, field types) of a (user-defined) type.
477
  CHECKED_STATUS GetUDTypeInfo(const GetUDTypeInfoRequestPB* req,
478
                               GetUDTypeInfoResponsePB* resp,
479
                               rpc::RpcContext* rpc);
480
481
  // Disables tablet splitting for a specified amount of time.
482
  CHECKED_STATUS DisableTabletSplitting(
483
      const DisableTabletSplittingRequestPB* req, DisableTabletSplittingResponsePB* resp,
484
      rpc::RpcContext* rpc);
485
486
  // Returns true if there are no outstanding tablets and the tablet split manager is not currently
487
  // processing tablet splits.
488
  CHECKED_STATUS IsTabletSplittingComplete(
489
      const IsTabletSplittingCompleteRequestPB* req, IsTabletSplittingCompleteResponsePB* resp,
490
      rpc::RpcContext* rpc);
491
492
  // Delete CDC streams for a table.
493
  virtual CHECKED_STATUS DeleteCDCStreamsForTable(const TableId& table_id) EXCLUDES(mutex_);
494
  // Multi-table counterpart of DeleteCDCStreamsForTable; must be called without holding mutex_.
  virtual CHECKED_STATUS DeleteCDCStreamsForTables(const std::vector<TableId>& table_ids)
495
      EXCLUDES(mutex_);
496
497
  virtual CHECKED_STATUS ChangeEncryptionInfo(const ChangeEncryptionInfoRequestPB* req,
498
                                              ChangeEncryptionInfoResponsePB* resp);
499
500
  CHECKED_STATUS UpdateXClusterConsumerOnTabletSplit(
501
0
      const TableId& consumer_table_id, const SplitTabletIds& split_tablet_ids) override {
502
    // Default implementation: both arguments are ignored and success is reported.
503
0
    return Status::OK();
504
0
  }
505
506
  CHECKED_STATUS UpdateXClusterProducerOnTabletSplit(
507
0
      const TableId& producer_table_id, const SplitTabletIds& split_tablet_ids) override {
508
    // Default implementation: both arguments are ignored and success is reported.
509
0
    return Status::OK();
510
0
  }
511
512
  Result<uint64_t> IncrementYsqlCatalogVersion() override;
513
514
  // Records the fact that initdb has successfully completed.
515
  CHECKED_STATUS InitDbFinished(Status initdb_status, int64_t term);
516
517
  // Check if the initdb operation has been completed. This is intended for use by whoever wants
518
  // to wait for the cluster to be fully initialized, e.g. minicluster, YugaWare, etc.
519
  CHECKED_STATUS IsInitDbDone(
520
      const IsInitDbDoneRequestPB* req, IsInitDbDoneResponsePB* resp) override;
521
522
  CHECKED_STATUS GetYsqlCatalogVersion(
523
      uint64_t* catalog_version, uint64_t* last_breaking_version) override;
524
525
  CHECKED_STATUS InitializeTransactionTablesConfig(int64_t term);
526
527
  CHECKED_STATUS IncrementTransactionTablesVersion();
528
529
  uint64_t GetTransactionTablesVersion() override;
530
531
  virtual CHECKED_STATUS FillHeartbeatResponse(const TSHeartbeatRequestPB* req,
532
                                               TSHeartbeatResponsePB* resp);
533
534
5.54k
  // Returns the raw pointer held by sys_catalog_.
  SysCatalogTable* sys_catalog() override { return sys_catalog_.get(); }
535
536
  // Tablet peer for the sys catalog tablet.
537
  std::shared_ptr<tablet::TabletPeer> tablet_peer() const override;
538
539
0
  // Returns the raw pointer held by load_balance_policy_.
  ClusterLoadBalancer* load_balancer() override { return load_balance_policy_.get(); }
540
541
1.56M
  // Returns the address of the tablet_split_manager_ member.
  TabletSplitManager* tablet_split_manager() override { return &tablet_split_manager_; }
542
543
  // Dump all of the current state about tables and tablets to the
544
  // given output stream. This is verbose, meant for debugging.
545
  void DumpState(std::ostream* out, bool on_disk_dump = false) const override;
546
547
  void SetLoadBalancerEnabled(bool is_enabled);
548
549
  bool IsLoadBalancerEnabled() override;
550
551
  // Return the table info for the table with the specified UUID, if it exists.
552
  TableInfoPtr GetTableInfo(const TableId& table_id) override;
553
  TableInfoPtr GetTableInfoUnlocked(const TableId& table_id) REQUIRES_SHARED(mutex_);
554
555
  // Get Table info given namespace id and table name.
556
  // Does not work for YSQL tables because of possible ambiguity.
557
  scoped_refptr<TableInfo> GetTableInfoFromNamespaceNameAndTableName(
558
      YQLDatabase db_type, const NamespaceName& namespace_name,
559
      const TableName& table_name) override;
560
561
  // Return TableInfos according to specified mode.
562
  std::vector<TableInfoPtr> GetTables(GetTablesMode mode) override;
563
564
  // Return all the available NamespaceInfo. The flag 'include_only_running_namespaces' determines
565
  // whether to retrieve all Namespaces irrespective of their state or just 'RUNNING' namespaces.
566
  // To retrieve all live tables in the system, you should set this flag to true.
567
  void GetAllNamespaces(std::vector<scoped_refptr<NamespaceInfo> >* namespaces,
568
                        bool include_only_running_namespaces = false) override;
569
570
  // Return all the available (user-defined) types.
571
  void GetAllUDTypes(std::vector<scoped_refptr<UDTypeInfo>>* types) override;
572
573
  // Return the recent tasks.
574
  std::vector<std::shared_ptr<server::MonitoredTask>> GetRecentTasks() override;
575
576
  // Return the recent user-initiated jobs.
577
  std::vector<std::shared_ptr<server::MonitoredTask>> GetRecentJobs() override;
578
579
  NamespaceName GetNamespaceNameUnlocked(const NamespaceId& id) const REQUIRES_SHARED(mutex_);
580
  NamespaceName GetNamespaceName(const NamespaceId& id) const override;
581
582
  NamespaceName GetNamespaceNameUnlocked(const scoped_refptr<TableInfo>& table) const
583
      REQUIRES_SHARED(mutex_);
584
  NamespaceName GetNamespaceName(const scoped_refptr<TableInfo>& table) const;
585
586
  // Is the table a system table?
587
  bool IsSystemTable(const TableInfo& table) const override;
588
589
  // Is the table a user created table?
590
  bool IsUserTable(const TableInfo& table) const override;
591
  bool IsUserTableUnlocked(const TableInfo& table) const REQUIRES_SHARED(mutex_);
592
593
  // Is the table a user created index?
594
  bool IsUserIndex(const TableInfo& table) const override;
595
  bool IsUserIndexUnlocked(const TableInfo& table) const REQUIRES_SHARED(mutex_);
596
597
  // Is the table a special sequences system table?
598
  bool IsSequencesSystemTable(const TableInfo& table) const;
599
600
  // Is the table id from a tablegroup?
601
  bool IsTablegroupParentTableId(const TableId& table_id) const;
602
603
  // Is the table id from a table created for colocated database?
604
  bool IsColocatedParentTableId(const TableId& table_id) const;
605
606
  // Is the table created by user?
607
  // Note that table can be regular table or index in this case.
608
  bool IsUserCreatedTable(const TableInfo& table) const override;
609
  bool IsUserCreatedTableUnlocked(const TableInfo& table) const REQUIRES_SHARED(mutex_);
610
611
  // Let the catalog manager know that we have received a response for a delete tablet request,
612
  // and that we either deleted the tablet successfully, or we received a fatal error.
613
  //
614
  // Async tasks should call this when they finish. The last such tablet peer notification will
615
  // trigger trying to transition the table from DELETING to DELETED state.
616
  void NotifyTabletDeleteFinished(
617
      const TabletServerId& tserver_uuid, const TableId& table_id,
618
      const TableInfoPtr& table) override;
619
620
  // For a DeleteTable, we first mark tables as DELETING then move them to DELETED once all
621
  // outstanding tasks are complete and the TS side tablets are deleted.
622
  // For system tables or colocated tables, we just need outstanding tasks to be done.
623
  //
624
  // If all conditions are met, returns a locked write lock on this table.
625
  // Otherwise lock is default constructed, i.e. not locked.
626
  TableInfo::WriteLock MaybeTransitionTableToDeleted(const TableInfoPtr& table);
627
628
  // Used by ConsensusService to retrieve the TabletPeer for a system
629
  // table specified by 'tablet_id'.
630
  //
631
  // See also: TabletPeerLookupIf, ConsensusServiceImpl.
632
  CHECKED_STATUS GetTabletPeer(
633
      const TabletId& tablet_id,
634
      std::shared_ptr<tablet::TabletPeer>* tablet_peer) const override;
635
636
  const NodeInstancePB& NodeInstance() const override;
637
638
  CHECKED_STATUS GetRegistration(ServerRegistrationPB* reg) const override;
639
640
  bool IsInitialized() const;
641
642
  virtual CHECKED_STATUS StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req)
643
      override;
644
645
  // Checks that placement info can be accommodated by available ts_descs.
646
  CHECKED_STATUS CheckValidPlacementInfo(const PlacementInfoPB& placement_info,
647
                                         const TSDescriptorVector& ts_descs,
648
                                         ValidateReplicationInfoResponsePB* resp);
649
650
  // Loops through the table's placement infos and populates the corresponding config from
651
  // each placement.
652
  CHECKED_STATUS HandlePlacementUsingReplicationInfo(
653
      const ReplicationInfoPB& replication_info,
654
      const TSDescriptorVector& all_ts_descs,
655
      consensus::RaftConfigPB* config,
656
      CMPerTableLoadState* per_table_state,
657
      CMGlobalLoadState* global_state);
658
659
  // Handles the config creation for a given placement.
660
  CHECKED_STATUS HandlePlacementUsingPlacementInfo(const PlacementInfoPB& placement_info,
661
                                                   const TSDescriptorVector& ts_descs,
662
                                                   consensus::PeerMemberType member_type,
663
                                                   consensus::RaftConfigPB* config,
664
                                                   CMPerTableLoadState* per_table_state,
665
                                                   CMGlobalLoadState* global_state);
666
667
  // Populates ts_descs with all tservers belonging to a certain placement.
668
  void GetTsDescsFromPlacementInfo(const PlacementInfoPB& placement_info,
669
                                   const TSDescriptorVector& all_ts_descs,
670
                                   TSDescriptorVector* ts_descs);
671
672
  // Get the current committed config, returned through 'cpb'.
673
  CHECKED_STATUS GetCurrentConfig(consensus::ConsensusStatePB *cpb) const override;
674
675
  // Return OK if this CatalogManager is a leader in a consensus configuration and if
676
  // the required leader state (metadata for tables and tablets) has
677
  // been successfully loaded into memory. CatalogManager must be
678
  // initialized before calling this method.
679
  CHECKED_STATUS CheckIsLeaderAndReady() const override;
680
681
  // Returns this CatalogManager's role in a consensus configuration. CatalogManager
682
  // must be initialized before calling this method.
683
  PeerRole Role() const;
684
685
  CHECKED_STATUS PeerStateDump(const vector<consensus::RaftPeerPB>& masters_raft,
686
                               const DumpMasterStateRequestPB* req,
687
                               DumpMasterStateResponsePB* resp);
688
689
  // If we get removed from an existing cluster, leader might ask us to detach ourselves from the
690
  // cluster. So we enter a shell mode equivalent state, with no bg tasks and no tablet peer
691
  // nor consensus.
692
  CHECKED_STATUS GoIntoShellMode();
693
694
  // Setters and getters for the cluster config item.
695
  //
696
  // To change the cluster config, a client would need to do a client-side read-modify-write by
697
  // issuing a get for the latest config, obtaining the current valid config (together with its
698
  // respective version number), modify the values it wants of said config and issuing a write
699
  // afterwards, without changing the version number. In case the version number does not match
700
  // on the server, the change will fail and the client will have to retry the get, as someone
701
  // must have updated the config in the meantime.
702
  CHECKED_STATUS GetClusterConfig(GetMasterClusterConfigResponsePB* resp) override;
703
  CHECKED_STATUS GetClusterConfig(SysClusterConfigEntryPB* config) override;
704
705
  CHECKED_STATUS SetClusterConfig(
706
      const ChangeMasterClusterConfigRequestPB* req,
707
      ChangeMasterClusterConfigResponsePB* resp) override;
708
709
710
  // Validator for placement information with respect to cluster configuration
711
  CHECKED_STATUS ValidateReplicationInfo(
712
      const ValidateReplicationInfoRequestPB* req, ValidateReplicationInfoResponsePB* resp);
713
714
  CHECKED_STATUS SetPreferredZones(
715
      const SetPreferredZonesRequestPB* req, SetPreferredZonesResponsePB* resp);
716
717
  Result<size_t> GetReplicationFactor() override;
718
  Result<size_t> GetReplicationFactorForTablet(const scoped_refptr<TabletInfo>& tablet);
719
720
  void GetExpectedNumberOfReplicas(int* num_live_replicas, int* num_read_replicas);
721
722
  // Get the percentage of tablets that have been moved off of the black-listed tablet servers.
723
  CHECKED_STATUS GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp);
724
725
  // Get the percentage of leaders that have been moved off of the leader black-listed tablet
726
  // servers.
727
  CHECKED_STATUS GetLeaderBlacklistCompletionPercent(GetLoadMovePercentResponsePB* resp);
728
729
  // Get the percentage of leaders/tablets that have been moved off of the (leader) black-listed
730
  // tablet servers.
731
  CHECKED_STATUS GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp,
732
      bool blacklist_leader);
733
734
  // API to check if all the live tservers have similar tablet workload.
735
  CHECKED_STATUS IsLoadBalanced(const IsLoadBalancedRequestPB* req,
736
                                IsLoadBalancedResponsePB* resp) override;
737
738
  CHECKED_STATUS IsLoadBalancerIdle(const IsLoadBalancerIdleRequestPB* req,
739
                                    IsLoadBalancerIdleResponsePB* resp);
740
741
  // API to check that all tservers that shouldn't have leader load do not.
742
  CHECKED_STATUS AreLeadersOnPreferredOnly(const AreLeadersOnPreferredOnlyRequestPB* req,
743
                                           AreLeadersOnPreferredOnlyResponsePB* resp);
744
745
  // Return the placement uuid of the primary cluster containing this master.
746
  Result<string> placement_uuid() const;
747
748
  // Clears out the existing metadata ('table_names_map_', 'table_ids_map_',
749
  // and 'tablet_map_'), loads tables metadata into memory and if successful
750
  // loads the tablets metadata.
751
  CHECKED_STATUS VisitSysCatalog(int64_t term) override;
752
  virtual CHECKED_STATUS RunLoaders(int64_t term) REQUIRES(mutex_);
753
754
  // Waits for the worker queue to finish processing, returns OK if worker queue is idle before
755
  // the provided timeout, TimedOut Status otherwise.
756
  CHECKED_STATUS WaitForWorkerPoolTests(
757
      const MonoDelta& timeout = MonoDelta::FromSeconds(10)) const override;
758
759
  Result<scoped_refptr<NamespaceInfo>> FindNamespaceUnlocked(
760
      const NamespaceIdentifierPB& ns_identifier) const REQUIRES_SHARED(mutex_);
761
762
  Result<scoped_refptr<NamespaceInfo>> FindNamespace(
763
      const NamespaceIdentifierPB& ns_identifier) const EXCLUDES(mutex_);
764
765
  Result<scoped_refptr<NamespaceInfo>> FindNamespaceById(
766
      const NamespaceId& id) const override EXCLUDES(mutex_);
767
768
  Result<scoped_refptr<NamespaceInfo>> FindNamespaceByIdUnlocked(
769
      const NamespaceId& id) const REQUIRES_SHARED(mutex_);
770
771
  Result<scoped_refptr<TableInfo>> FindTableUnlocked(
772
      const TableIdentifierPB& table_identifier) const REQUIRES_SHARED(mutex_);
773
774
  Result<scoped_refptr<TableInfo>> FindTable(
775
      const TableIdentifierPB& table_identifier) const override EXCLUDES(mutex_);
776
777
  Result<scoped_refptr<TableInfo>> FindTableById(
778
      const TableId& table_id) const override EXCLUDES(mutex_);
779
780
  Result<scoped_refptr<TableInfo>> FindTableByIdUnlocked(
781
      const TableId& table_id) const REQUIRES_SHARED(mutex_);
782
783
  Result<bool> TableExists(
784
      const std::string& namespace_name, const std::string& table_name) const EXCLUDES(mutex_);
785
786
  Result<TableDescription> DescribeTable(
787
      const TableIdentifierPB& table_identifier, bool succeed_if_create_in_progress);
788
789
  Result<TableDescription> DescribeTable(
790
      const TableInfoPtr& table_info, bool succeed_if_create_in_progress);
791
792
  Result<std::string> GetPgSchemaName(const TableInfoPtr& table_info) REQUIRES_SHARED(mutex_);
793
794
8
  // Asserts that leader_lock_ is acquired for reading; simply forwards to
  // leader_lock_.AssertAcquiredForReading().
  void AssertLeaderLockAcquiredForReading() const override {
795
8
    leader_lock_.AssertAcquiredForReading();
796
8
  }
797
798
10
  // Generates an id without specifying an entity type: forwards to the
  // GenerateId(boost::optional<const SysRowEntryType>) overload with boost::none.
  std::string GenerateId() override {
799
10
    return GenerateId(boost::none);
800
10
  }
801
802
  std::string GenerateId(boost::optional<const SysRowEntryType> entity_type);
803
  std::string GenerateIdUnlocked(boost::optional<const SysRowEntryType> entity_type = boost::none)
804
      REQUIRES_SHARED(mutex_);
805
806
408k
  // Returns the raw ThreadPool pointer held by async_task_pool_.
  ThreadPool* AsyncTaskPool() override { return async_task_pool_.get(); }
807
808
142k
  // Returns the raw PermissionsManager pointer held by permissions_manager_.
  PermissionsManager* permissions_manager() override {
809
142k
    return permissions_manager_.get();
810
142k
  }
811
812
6.01k
  // Returns the sum of tablet_map_.Version() and table_ids_map_.Version().
  // Thread-safety analysis is disabled for this method (NO_THREAD_SAFETY_ANALYSIS) because the
  // two maps are read here without taking a lock; see the note in the body.
  intptr_t tablets_version() const override NO_THREAD_SAFETY_ANALYSIS {
813
    // This method should not hold the lock, because Version method is thread safe.
814
6.01k
    return tablet_map_.Version() + table_ids_map_.Version();
815
6.01k
  }
816
817
3.00k
  // Returns the current value of tablet_locations_version_, read with
  // std::memory_order_acquire.
  intptr_t tablet_locations_version() const override {
818
3.00k
    return tablet_locations_version_.load(std::memory_order_acquire);
819
3.00k
  }
820
821
12.9k
  // Returns a reference to the EncryptionManager that encryption_manager_ points to.
  // The pointer is dereferenced unconditionally, so encryption_manager_ must be non-null when
  // this is called.
  EncryptionManager& encryption_manager() {
822
12.9k
    return *encryption_manager_;
823
12.9k
  }
824
825
0
  // Returns a reference to the UniverseKeyClient that universe_key_client_ points to.
  // The pointer is dereferenced unconditionally, so universe_key_client_ must be non-null when
  // this is called.
  client::UniverseKeyClient& universe_key_client() {
826
0
    return *universe_key_client_;
827
0
  }
828
829
  CHECKED_STATUS FlushSysCatalog(const FlushSysCatalogRequestPB* req,
830
                                 FlushSysCatalogResponsePB* resp,
831
                                 rpc::RpcContext* rpc);
832
833
  CHECKED_STATUS CompactSysCatalog(const CompactSysCatalogRequestPB* req,
834
                                   CompactSysCatalogResponsePB* resp,
835
                                   rpc::RpcContext* rpc);
836
837
  CHECKED_STATUS SplitTablet(const TabletId& tablet_id, bool select_all_tablets_for_split) override;
838
839
  // Splits tablet specified in the request using middle of the partition as a split point.
840
  CHECKED_STATUS SplitTablet(
841
      const SplitTabletRequestPB* req, SplitTabletResponsePB* resp, rpc::RpcContext* rpc);
842
843
  // Deletes a tablet that is no longer serving user requests. This would require that the tablet
844
  // has been split and both of its children are now in RUNNING state and serving user requests
845
  // instead.
846
  CHECKED_STATUS DeleteNotServingTablet(
847
      const DeleteNotServingTabletRequestPB* req, DeleteNotServingTabletResponsePB* resp,
848
      rpc::RpcContext* rpc);
849
850
  CHECKED_STATUS DdlLog(
851
      const DdlLogRequestPB* req, DdlLogResponsePB* resp, rpc::RpcContext* rpc);
852
853
  // Test wrapper around protected DoSplitTablet method.
854
  CHECKED_STATUS TEST_SplitTablet(
855
      const scoped_refptr<TabletInfo>& source_tablet_info,
856
      docdb::DocKeyHash split_hash_code) override;
857
858
  CHECKED_STATUS TEST_SplitTablet(
859
      const TabletId& tablet_id, const std::string& split_encoded_key,
860
      const std::string& split_partition_key) override;
861
862
  CHECKED_STATUS TEST_IncrementTablePartitionListVersion(const TableId& table_id) override;
863
864
  // Schedule a task to run on the async task thread pool.
865
  CHECKED_STATUS ScheduleTask(std::shared_ptr<RetryingTSRpcTask> task) override;
866
867
  // Time since this peer became master leader. Caller should verify that it is leader before.
868
  MonoDelta TimeSinceElectedLeader();
869
870
  Result<std::vector<TableDescription>> CollectTables(
871
      const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
872
      bool add_indexes,
873
      bool include_parent_colocated_table = false) override;
874
875
  Result<std::vector<TableDescription>> CollectTables(
876
      const google::protobuf::RepeatedPtrField<TableIdentifierPB>& table_identifiers,
877
      CollectFlags flags,
878
      std::unordered_set<NamespaceId>* namespaces = nullptr);
879
880
  // Returns 'table_replication_info' itself if set. Else looks up placement info for its
881
  // 'tablespace_id'. If neither is set, returns the cluster level replication info.
882
  Result<ReplicationInfoPB> GetTableReplicationInfo(
883
      const ReplicationInfoPB& table_replication_info,
884
      const TablespaceId& tablespace_id) override;
885
886
  Result<boost::optional<TablespaceId>> GetTablespaceForTable(
887
      const scoped_refptr<TableInfo>& table) override;
888
889
  void ProcessTabletStorageMetadata(
890
      const std::string& ts_uuid,
891
      const TabletDriveStorageMetadataPB& storage_metadata);
892
893
  void CheckTableDeleted(const TableInfoPtr& table) override;
894
895
  bool ShouldSplitValidCandidate(
896
      const TabletInfo& tablet_info, const TabletReplicaDriveInfo& drive_info) const override;
897
898
  Result<BlacklistSet> BlacklistSetFromPB() const override;
899
900
  std::vector<std::string> GetMasterAddresses();
901
902
 protected:
903
  // TODO Get rid of these friend classes and introduce formal interface.
904
  friend class TableLoader;
905
  friend class TabletLoader;
906
  friend class NamespaceLoader;
907
  friend class UDTypeLoader;
908
  friend class ClusterConfigLoader;
909
  friend class RoleLoader;
910
  friend class RedisConfigLoader;
911
  friend class SysConfigLoader;
912
  friend class ::yb::master::ScopedLeaderSharedLock;
913
  friend class PermissionsManager;
914
  friend class MultiStageAlterTable;
915
  friend class BackfillTable;
916
  friend class BackfillTablet;
917
918
  FRIEND_TEST(SysCatalogTest, TestCatalogManagerTasksTracker);
919
  FRIEND_TEST(SysCatalogTest, TestPrepareDefaultClusterConfig);
920
  FRIEND_TEST(SysCatalogTest, TestSysCatalogTablesOperations);
921
  FRIEND_TEST(SysCatalogTest, TestSysCatalogTabletsOperations);
922
  FRIEND_TEST(SysCatalogTest, TestTableInfoCommit);
923
924
  FRIEND_TEST(MasterTest, TestTabletsDeletedWhenTableInDeletingState);
925
  FRIEND_TEST(yb::MasterPartitionedTest, VerifyOldLeaderStepsDown);
926
927
  // Called by SysCatalog::SysCatalogStateChanged when this node
928
  // becomes the leader of a consensus configuration.
929
  //
930
  // Executes LoadSysCatalogDataTask below and marks the current time as time since leader.
931
  CHECKED_STATUS ElectedAsLeaderCb();
932
933
  // Loops and sleeps until one of the following conditions occurs:
934
  // 1. The current node is the leader master in the current term
935
  //    and at least one op from the current term is committed. Returns OK.
936
  // 2. The current node is not the leader master.
937
  //    Returns IllegalState.
938
  // 3. The provided timeout expires. Returns TimedOut.
939
  //
940
  // This method is intended to ensure that all operations replicated by
941
  // previous masters are committed and visible to the local node before
942
  // reading that data, to ensure consistency across failovers.
943
  CHECKED_STATUS WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) override;
944
945
  // This method is submitted to 'leader_initialization_pool_' by
946
  // ElectedAsLeaderCb above. It:
947
  // 1) Acquires 'lock_'
948
  // 2) Runs the various Visitors defined below
949
  // 3) Releases 'lock_' and if successful, updates 'leader_ready_term_'
950
  // to true (under state_lock_).
951
  void LoadSysCatalogDataTask();
952
953
  // This method checks that resource such as keyspace is available for GrantRevokePermission
954
  // request.
955
  // Since this method takes lock on mutex_, it is separated out of permissions manager
956
  // so that the thread safety relationship between the two managers is easy to reason about.
957
  CHECKED_STATUS CheckResource(const GrantRevokePermissionRequestPB* req,
958
                               GrantRevokePermissionResponsePB* resp);
959
960
  // Generates the default entry for the cluster config, which is written into sys_catalog on the
961
  // very first leader election of the cluster.
962
  //
963
  // Sets the version field of the SysClusterConfigEntryPB to 0.
964
  CHECKED_STATUS PrepareDefaultClusterConfig(int64_t term) REQUIRES(mutex_);
965
966
  // Sets up various system configs.
967
  CHECKED_STATUS PrepareDefaultSysConfig(int64_t term) REQUIRES(mutex_);
968
969
  // Starts an asynchronous run of initdb. Errors are handled in the callback. Returns true
970
  // if started running initdb, false if decided that it is not needed.
971
  bool StartRunningInitDbIfNeeded(int64_t term) REQUIRES_SHARED(mutex_);
972
973
  CHECKED_STATUS PrepareDefaultNamespaces(int64_t term) REQUIRES(mutex_);
974
975
  CHECKED_STATUS PrepareSystemTables(int64_t term) REQUIRES(mutex_);
976
977
  CHECKED_STATUS PrepareSysCatalogTable(int64_t term) REQUIRES(mutex_);
978
979
  template <class T>
980
  CHECKED_STATUS PrepareSystemTableTemplate(const TableName& table_name,
981
                                            const NamespaceName& namespace_name,
982
                                            const NamespaceId& namespace_id,
983
                                            int64_t term) REQUIRES(mutex_);
984
985
  CHECKED_STATUS PrepareSystemTable(const TableName& table_name,
986
                                    const NamespaceName& namespace_name,
987
                                    const NamespaceId& namespace_id,
988
                                    const Schema& schema,
989
                                    int64_t term,
990
                                    YQLVirtualTable* vtable) REQUIRES(mutex_);
991
992
  CHECKED_STATUS PrepareNamespace(YQLDatabase db_type,
993
                                  const NamespaceName& name,
994
                                  const NamespaceId& id,
995
                                  int64_t term) REQUIRES(mutex_);
996
997
  void ProcessPendingNamespace(NamespaceId id,
998
                               std::vector<scoped_refptr<TableInfo>> template_tables,
999
                               TransactionMetadata txn);
1000
1001
  // Called when transaction associated with NS create finishes. Verifies that the postgres layer
  // is present.
1002
  CHECKED_STATUS VerifyNamespacePgLayer(scoped_refptr<NamespaceInfo> ns, bool txn_query_succeeded);
1003
1004
  CHECKED_STATUS ConsensusStateToTabletLocations(const consensus::ConsensusStatePB& cstate,
1005
                                                 TabletLocationsPB* locs_pb);
1006
1007
  // Creates the table and associated tablet objects in-memory and updates the appropriate
1008
  // catalog manager maps.
1009
  CHECKED_STATUS CreateTableInMemory(const CreateTableRequestPB& req,
1010
                                     const Schema& schema,
1011
                                     const PartitionSchema& partition_schema,
1012
                                     const NamespaceId& namespace_id,
1013
                                     const NamespaceName& namespace_name,
1014
                                     const vector<Partition>& partitions,
1015
                                     IndexInfoPB* index_info,
1016
                                     TabletInfos* tablets,
1017
                                     CreateTableResponsePB* resp,
1018
                                     scoped_refptr<TableInfo>* table) REQUIRES(mutex_);
1019
1020
  Result<TabletInfos> CreateTabletsFromTable(const vector<Partition>& partitions,
1021
                                             const TableInfoPtr& table) REQUIRES(mutex_);
1022
1023
  // Helper for creating copartitioned table.
1024
  CHECKED_STATUS CreateCopartitionedTable(const CreateTableRequestPB& req,
1025
                                          CreateTableResponsePB* resp,
1026
                                          rpc::RpcContext* rpc,
1027
                                          Schema schema,
1028
                                          scoped_refptr<NamespaceInfo> ns);
1029
1030
  // Check that local host is present in master addresses for normal master process start.
1031
  // On error, it could imply that master_addresses is incorrectly set for shell master startup
1032
  // or that this master host info was missed in the master addresses and it should be
1033
  // participating in the very first quorum setup.
1034
  CHECKED_STATUS CheckLocalHostInMasterAddresses();
1035
1036
  // Helper for initializing 'sys_catalog_'. After calling this
1037
  // method, the caller should call WaitUntilRunning() on sys_catalog_
1038
  // WITHOUT holding 'lock_' to wait for consensus to start for
1039
  // sys_catalog_.
1040
  //
1041
  // This method is thread-safe.
1042
  CHECKED_STATUS InitSysCatalogAsync();
1043
1044
  Result<ReplicationInfoPB> GetTableReplicationInfo(const TabletInfo& tablet_info) const;
1045
1046
  // Helper for creating the initial TableInfo state
1047
  // Leaves the table "write locked" with the new info in the
1048
  // "dirty" state field.
1049
  scoped_refptr<TableInfo> CreateTableInfo(const CreateTableRequestPB& req,
1050
                                           const Schema& schema,
1051
                                           const PartitionSchema& partition_schema,
1052
                                           const NamespaceId& namespace_id,
1053
                                           const NamespaceName& namespace_name,
1054
                                           IndexInfoPB* index_info) REQUIRES(mutex_);
1055
1056
  // Helper for creating the initial TabletInfo state.
1057
  // Leaves the tablet "write locked" with the new info in the
1058
  // "dirty" state field.
1059
  TabletInfoPtr CreateTabletInfo(TableInfo* table,
1060
                                 const PartitionPB& partition) REQUIRES(mutex_);
1061
1062
  // Remove the specified entries from the protobuf field table_ids of a TabletInfo.
1063
  Status RemoveTableIdsFromTabletInfo(
1064
      TabletInfoPtr tablet_info, std::unordered_set<TableId> tables_to_remove);
1065
1066
  // Add index info to the indexed table.
1067
  CHECKED_STATUS AddIndexInfoToTable(const scoped_refptr<TableInfo>& indexed_table,
1068
                                     const IndexInfoPB& index_info,
1069
                                     CreateTableResponsePB* resp);
1070
1071
  // Mark index info from the indexed table for deletion.
1072
  CHECKED_STATUS MarkIndexInfoFromTableForDeletion(
1073
      const TableId& indexed_table_id, const TableId& index_table_id, bool multi_stage,
1074
      DeleteTableResponsePB* resp);
1075
1076
  // Delete index info from the indexed table.
1077
  CHECKED_STATUS DeleteIndexInfoFromTable(
1078
      const TableId& indexed_table_id, const TableId& index_table_id);
1079
1080
  // Builds the TabletLocationsPB for a tablet based on the provided TabletInfo.
1081
  // Populates locs_pb and returns OK on success.
1082
  // Returns Status::ServiceUnavailable if tablet is not running.
1083
  // Set include_inactive to true in order to also get information about hidden tablets.
1084
  CHECKED_STATUS BuildLocationsForTablet(
1085
      const scoped_refptr<TabletInfo>& tablet,
1086
      TabletLocationsPB* locs_pb,
1087
      IncludeInactive include_inactive = IncludeInactive::kFalse);
1088
1089
  // Check whether the tservers in the current replica map differs from those in the cstate when
1090
  // processing a tablet report. Ignore the roles reported by the cstate, just compare the
1091
  // tservers.
1092
  bool ReplicaMapDiffersFromConsensusState(const scoped_refptr<TabletInfo>& tablet,
1093
                                           const consensus::ConsensusStatePB& consensus_state);
1094
1095
  void ReconcileTabletReplicasInLocalMemoryWithReport(
1096
      const scoped_refptr<TabletInfo>& tablet,
1097
      const std::string& sender_uuid,
1098
      const consensus::ConsensusStatePB& consensus_state,
1099
      const ReportedTabletPB& report);
1100
1101
  // Register a tablet server whenever it heartbeats with a consensus configuration. This is
1102
  // needed because we have logic in the Master that states that if a tablet
1103
  // server that is part of a consensus configuration has not heartbeated to the Master yet, we
1104
  // leave it out of the consensus configuration reported to clients.
1105
  // TODO: See if we can remove this logic, as it seems confusing.
1106
  void UpdateTabletReplicaInLocalMemory(TSDescriptor* ts_desc,
1107
                                        const consensus::ConsensusStatePB* consensus_state,
1108
                                        const ReportedTabletPB& report,
1109
                                        const scoped_refptr<TabletInfo>& tablet_to_update);
1110
1111
  static void CreateNewReplicaForLocalMemory(TSDescriptor* ts_desc,
1112
                                             const consensus::ConsensusStatePB* consensus_state,
1113
                                             const ReportedTabletPB& report,
1114
                                             TabletReplica* new_replica);
1115
1116
  // Extract the set of tablets that can be deleted and the set of tablets
1117
  // that must be processed because not running yet.
1118
  // Fills 'tablets_to_process' with a map of table_id -> {tablet_info1, tablet_info2, etc.}.
1119
  void ExtractTabletsToProcess(TabletInfos *tablets_to_delete,
1120
                               TableToTabletInfos *tablets_to_process);
1121
1122
  // Determine whether any tables are in the DELETING state.
1123
  bool AreTablesDeleting() override;
1124
1125
  // Task that takes care of the tablet assignments/creations.
1126
  // Loops through the "not created" tablets and sends a CreateTablet() request.
1127
  CHECKED_STATUS ProcessPendingAssignmentsPerTable(
1128
      const TableId& table_id, const TabletInfos& tablets, CMGlobalLoadState* global_load_state);
1129
1130
  // Select a tablet server from 'ts_descs' on which to place a new replica.
1131
  // Any tablet servers in 'excluded' are not considered.
1132
  // REQUIRES: 'ts_descs' must include at least one non-excluded server.
1133
  std::shared_ptr<TSDescriptor> SelectReplica(
1134
      const TSDescriptorVector& ts_descs,
1135
      std::set<TabletServerId>* excluded,
1136
      CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state);
1137
1138
  // Select N Replicas from online tablet servers (as specified by
1139
  // 'ts_descs') for the specified tablet and populate the consensus configuration
1140
  // object. If 'ts_descs' does not specify enough online tablet
1141
  // servers to select the N replicas, return Status::InvalidArgument.
1142
  //
1143
  // This method is called by "ProcessPendingAssignmentsPerTable()".
1144
  CHECKED_STATUS SelectReplicasForTablet(
1145
      const TSDescriptorVector& ts_descs, TabletInfo* tablet,
1146
      CMPerTableLoadState* per_table_state, CMGlobalLoadState* global_state);
1147
1148
  // Select N Replicas from the online tablet servers that have been chosen to respect the
1149
  // placement information provided. Populate the consensus configuration object with choices and
1150
  // also update the set of selected tablet servers, to not place several replicas on the same TS.
1151
  // member_type indicates what type of replica to select.
1152
  //
1153
  // This method is called by "SelectReplicasForTablet".
1154
  void SelectReplicas(
1155
      const TSDescriptorVector& ts_descs,
1156
      size_t nreplicas, consensus::RaftConfigPB* config,
1157
      std::set<TabletServerId>* already_selected_ts,
1158
      consensus::PeerMemberType member_type,
1159
      CMPerTableLoadState* per_table_state,
1160
      CMGlobalLoadState* global_state);
1161
1162
  void HandleAssignPreparingTablet(TabletInfo* tablet,
1163
                                   DeferredAssignmentActions* deferred);
1164
1165
  // Assign tablets and send CreateTablet RPCs to tablet servers.
1166
  // The out param 'new_tablets' should have any newly-created TabletInfo
1167
  // objects appended to it.
1168
  void HandleAssignCreatingTablet(TabletInfo* tablet,
1169
                                  DeferredAssignmentActions* deferred,
1170
                                  TabletInfos* new_tablets);
1171
1172
  CHECKED_STATUS HandleTabletSchemaVersionReport(
1173
      TabletInfo *tablet, uint32_t version,
1174
      const scoped_refptr<TableInfo>& table = nullptr) override;
1175
1176
  // Send the create tablet requests to the selected peers of the consensus configurations.
1177
  // The creation is async, and at the moment there is no error checking on the
1178
  // caller side. We rely on the assignment timeout. If we don't see the tablet
1179
  // after the timeout, we regenerate a new one and proceed with a new
1180
  // assignment/creation.
1181
  //
1182
  // This method is part of the "ProcessPendingAssignmentsPerTable()"
1183
  //
1184
  // This must be called after persisting the tablet state as
1185
  // CREATING to ensure coherent state after Master failover.
1186
  CHECKED_STATUS SendCreateTabletRequests(const std::vector<TabletInfo*>& tablets);
1187
1188
  // Send the "alter table request" to all tablets of the specified table.
1189
  //
1190
  // Also, initiates the required AlterTable requests to backfill the Index.
1191
  // Initially the index is set to be in an INDEX_PERM_DELETE_ONLY state, then
1192
  // updated to INDEX_PERM_WRITE_AND_DELETE state; followed by backfilling. Once
1193
  // all the tablets have completed backfilling, the index will be updated
1194
  // to be in INDEX_PERM_READ_WRITE_AND_DELETE state.
1195
  CHECKED_STATUS SendAlterTableRequest(const scoped_refptr<TableInfo>& table,
1196
                                       const AlterTableRequestPB* req = nullptr);
1197
1198
  // Start the background task to send the CopartitionTable() RPC to the leader for this
1199
  // tablet.
1200
  void SendCopartitionTabletRequest(const scoped_refptr<TabletInfo>& tablet,
1201
                                    const scoped_refptr<TableInfo>& table);
1202
1203
  // Starts the background task to send the SplitTablet RPC to the leader for the specified tablet.
1204
  CHECKED_STATUS SendSplitTabletRequest(
1205
      const scoped_refptr<TabletInfo>& tablet, std::array<TabletId, kNumSplitParts> new_tablet_ids,
1206
      const std::string& split_encoded_key, const std::string& split_partition_key);
1207
1208
  // Send the "truncate table request" to all tablets of the specified table.
1209
  void SendTruncateTableRequest(const scoped_refptr<TableInfo>& table);
1210
1211
  // Start the background task to send the TruncateTable() RPC to the leader for this tablet.
1212
  void SendTruncateTabletRequest(const scoped_refptr<TabletInfo>& tablet);
1213
1214
  // Truncate the specified table/index.
1215
  CHECKED_STATUS TruncateTable(const TableId& table_id,
1216
                               TruncateTableResponsePB* resp,
1217
                               rpc::RpcContext* rpc);
1218
1219
  // Per-table bundle used while deleting tables: DeleteTableInMemory takes a
  // std::vector<DeletingTableData>* as its 'tables' parameter.
  struct DeletingTableData {
1220
    // NOTE(review): presumably the table being deleted -- confirm against DeleteTableInMemory.
    TableInfoPtr info;
1221
    // NOTE(review): presumably the write lock taken on 'info' -- confirm.
    TableInfo::WriteLock write_lock;
1222
    // NOTE(review): looks like the snapshot schedules that still retain this table -- confirm.
    RepeatedBytes retained_by_snapshot_schedules;
1223
    // NOTE(review): presumably whether the table must also be removed from the name map -- confirm.
    bool remove_from_name_map;
1224
  };
1225
1226
  // Delete the specified table in memory. A DeletingTableData entry (including the TableInfo and
1227
  // its write lock) for the deleted table is appended to 'tables'. The caller will be responsible
1228
  // for committing the change and deleting the actual table and tablets.
1229
  CHECKED_STATUS DeleteTableInMemory(
1230
      const TableIdentifierPB& table_identifier,
1231
      bool is_index_table,
1232
      bool update_indexed_table,
1233
      const SnapshotSchedulesToObjectIdsMap& schedules_to_tables_map,
1234
      std::vector<DeletingTableData>* tables,
1235
      DeleteTableResponsePB* resp,
1236
      rpc::RpcContext* rpc);
1237
1238
  // Request tablet servers to delete all replicas of the tablet.
1239
  void DeleteTabletReplicas(TabletInfo* tablet, const std::string& msg, HideOnly hide_only);
1240
1241
  // Returns error if and only if it is forbidden to both:
1242
  // 1) Delete single tablet from table.
1243
  // 2) Delete the whole table.
1244
  // This is used for pre-checks in both `DeleteTablet` and `DeleteTabletsAndSendRequests`.
1245
  CHECKED_STATUS CheckIfForbiddenToDeleteTabletOf(const scoped_refptr<TableInfo>& table);
1246
1247
  // Marks each of the tablets in the given table as deleted and triggers requests to the tablet
1248
  // servers to delete them. The table parameter is expected to be given "write locked".
1249
  CHECKED_STATUS DeleteTabletsAndSendRequests(
1250
      const TableInfoPtr& table, const RepeatedBytes& retained_by_snapshot_schedules);
1251
1252
  // Marks each tablet as deleted and triggers requests to the tablet servers to delete them.
1253
  CHECKED_STATUS DeleteTabletListAndSendRequests(
1254
      const std::vector<scoped_refptr<TabletInfo>>& tablets, const std::string& deletion_msg,
1255
      const RepeatedBytes& retained_by_snapshot_schedules);
1256
1257
  // Send the "delete tablet request" to the specified TS/tablet.
1258
  // The specified 'reason' will be logged on the TS.
1259
  void SendDeleteTabletRequest(const TabletId& tablet_id,
1260
                               tablet::TabletDataState delete_type,
1261
                               const boost::optional<int64_t>& cas_config_opid_index_less_or_equal,
1262
                               const scoped_refptr<TableInfo>& table,
1263
                               TSDescriptor* ts_desc,
1264
                               const std::string& reason,
1265
                               bool hide_only = false);
1266
1267
  // Start a task to request the specified tablet leader to step down and optionally to remove
1268
  // the server that is over-replicated. A new tablet server can be specified to start an election
1269
  // immediately to become the new leader. If new_leader_ts_uuid is empty, the election will be run
1270
  // following the protocol's default mechanism.
1271
  void SendLeaderStepDownRequest(
1272
      const scoped_refptr<TabletInfo>& tablet, const consensus::ConsensusStatePB& cstate,
1273
      const string& change_config_ts_uuid, bool should_remove,
1274
      const string& new_leader_ts_uuid = "");
1275
1276
  // Start a task to change the config to remove a certain voter because the specified tablet is
1277
  // over-replicated.
1278
  void SendRemoveServerRequest(
1279
      const scoped_refptr<TabletInfo>& tablet, const consensus::ConsensusStatePB& cstate,
1280
      const string& change_config_ts_uuid);
1281
1282
  // Start a task to change the config to add an additional voter because the
1283
  // specified tablet is under-replicated.
1284
  void SendAddServerRequest(
1285
      const scoped_refptr<TabletInfo>& tablet, consensus::PeerMemberType member_type,
1286
      const consensus::ConsensusStatePB& cstate, const string& change_config_ts_uuid);
1287
1288
  void GetPendingServerTasksUnlocked(const TableId &table_uuid,
1289
                                     TabletToTabletServerMap *add_replica_tasks_map,
1290
                                     TabletToTabletServerMap *remove_replica_tasks_map,
1291
                                     TabletToTabletServerMap *stepdown_leader_tasks)
1292
      REQUIRES_SHARED(mutex_);
1293
1294
  // Abort creation of 'table': abort all mutation for TabletInfo and
1295
  // TableInfo objects (releasing all COW locks), abort all pending
1296
  // tasks associated with the table, and erase any state related to
1297
  // the table we failed to create from the in-memory maps
1298
  // ('table_names_map_', 'table_ids_map_', 'tablet_map_' below).
1299
  CHECKED_STATUS AbortTableCreation(TableInfo* table,
1300
                                    const TabletInfos& tablets,
1301
                                    const Status& s,
1302
                                    CreateTableResponsePB* resp);
1303
1304
  CHECKED_STATUS CreateTransactionStatusTablesForTablespaces(
1305
      const TablespaceIdToReplicationInfoMap& tablespace_info,
1306
      const TableToTablespaceIdMap& table_to_tablespace_map);
1307
1308
  void StartTablespaceBgTaskIfStopped();
1309
1310
  std::shared_ptr<YsqlTablespaceManager> GetTablespaceManager() const;
1311
1312
  Result<boost::optional<ReplicationInfoPB>> GetTablespaceReplicationInfoWithRetry(
1313
      const TablespaceId& tablespace_id);
1314
1315
  // Report metrics.
1316
  void ReportMetrics();
1317
1318
  // Reset metrics.
1319
  void ResetMetrics();
1320
1321
  // Conventional "T xxx P yyy: " prefix for logging.
1322
  std::string LogPrefix() const;
1323
1324
  // Aborts all tasks belonging to 'tables' and waits for them to finish.
1325
  void AbortAndWaitForAllTasks(const std::vector<scoped_refptr<TableInfo>>& tables);
1326
1327
  // Can be used to create background_tasks_ field for this master.
1328
  // Used on normal master startup or when master comes out of the shell mode.
1329
  CHECKED_STATUS EnableBgTasks();
1330
1331
  // Helper function for RebuildYQLSystemPartitions to get the system.partitions tablet.
1332
  Status GetYQLPartitionsVTable(std::shared_ptr<SystemTablet>* tablet);
1333
  // Background task for automatically rebuilding system.partitions every
1334
  // partitions_vtable_cache_refresh_secs seconds.
1335
  void RebuildYQLSystemPartitions();
1336
1337
  // Registers new split tablet with `partition` for the same table as `source_tablet_info` tablet.
1338
  // Does not change any other tablets and their partitions.
1339
  // Returns TabletInfo for registered tablet.
1340
  Result<TabletInfoPtr> RegisterNewTabletForSplit(
1341
      TabletInfo* source_tablet_info, const PartitionPB& partition,
1342
      TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock);
1343
1344
  Result<scoped_refptr<TabletInfo>> GetTabletInfo(const TabletId& tablet_id) override;
1345
1346
  CHECKED_STATUS DoSplitTablet(
1347
      const scoped_refptr<TabletInfo>& source_tablet_info, std::string split_encoded_key,
1348
      std::string split_partition_key, bool select_all_tablets_for_split);
1349
1350
  // Splits tablet using specified split_hash_code as a split point.
1351
  CHECKED_STATUS DoSplitTablet(
1352
      const scoped_refptr<TabletInfo>& source_tablet_info, docdb::DocKeyHash split_hash_code,
1353
      bool select_all_tablets_for_split);
1354
1355
  // Calculate the total number of replicas which are being handled by servers in state.
1356
  int64_t GetNumRelevantReplicas(const BlacklistPB& state, bool leaders_only);
1357
1358
351k
  // Returns the current value of leader_ready_term_, read while holding state_lock_.
  int64_t leader_ready_term() override EXCLUDES(state_lock_) {
1359
351k
    std::lock_guard<simple_spinlock> l(state_lock_);
1360
351k
    return leader_ready_term_;
1361
351k
  }
1362
1363
  // Delete tables from the internal map by id once they have no more active tasks and tablets.
1364
  // This function should only be called from the bg_tasks thread, in a single threaded fashion!
1365
  void CleanUpDeletedTables();
1366
1367
  // Called when a new table id is added to table_ids_map_.
1368
  void HandleNewTableId(const TableId& id);
1369
1370
  // Creates a new TableInfo object.
1371
  scoped_refptr<TableInfo> NewTableInfo(TableId id) override;
1372
1373
  // Register the tablet server with the ts manager using the Raft config. This is called for
1374
  // servers that are part of the Raft config but haven't registered as yet.
1375
  CHECKED_STATUS RegisterTsFromRaftConfig(const consensus::RaftPeerPB& peer);
1376
1377
  template <class Loader>
1378
  CHECKED_STATUS Load(const std::string& title, const int64_t term);
1379
1380
0
  // Hook with an empty default body; virtual so that a subclass can override it.
  virtual void Started() {}
1381
1382
0
  // Hook taking a term; the default body is empty and ignores it. Virtual, so overridable.
  virtual void SysCatalogLoaded(int64_t term) {}
1383
1384
  // Respect leader affinity with master sys catalog tablet by stepping down if we don't match
1385
  // the cluster config affinity specification.
1386
  CHECKED_STATUS SysCatalogRespectLeaderAffinity();
1387
1388
0
  virtual Result<bool> IsTablePartOfSomeSnapshotSchedule(const TableInfo& table_info) override {
1389
    // Default value: this implementation ignores table_info and always returns false.
1390
0
    return false;
1391
0
  }
1392
1393
0
  virtual bool IsCdcEnabled(const TableInfo& table_info) const override {
1394
    // Default value: this implementation ignores table_info and always returns false.
1395
0
    return false;
1396
0
  }
1397
1398
0
  virtual bool IsTableCdcProducer(const TableInfo& table_info) const REQUIRES_SHARED(mutex_) {
1399
    // Default value: this implementation ignores table_info and always returns false.
1400
0
    return false;
1401
0
  }
1402
1403
  // Default implementation: ignores type and returns a default-constructed
  // SnapshotSchedulesToObjectIdsMap.
  virtual Result<SnapshotSchedulesToObjectIdsMap> MakeSnapshotSchedulesToObjectIdsMap(
1404
0
      SysRowEntryType type) {
1405
0
    return SnapshotSchedulesToObjectIdsMap();
1406
0
  }
1407
1408
  Status DoDeleteNamespace(const DeleteNamespaceRequestPB* req,
1409
                           DeleteNamespaceResponsePB* resp,
1410
                           rpc::RpcContext* rpc);
1411
1412
  std::shared_ptr<ClusterConfigInfo> ClusterConfig() const;
1413
1414
  Result<TableInfoPtr> GetGlobalTransactionStatusTable();
1415
1416
  Result<bool> IsCreateTableDone(const TableInfoPtr& table);
1417
1418
  // TODO: the maps are a little wasteful of RAM, since the TableInfo/TabletInfo
1419
  // objects have a copy of the string key. But STL doesn't make it
1420
  // easy to make a "gettable set".
1421
1422
  // Lock protecting the various in memory storage structures.
1423
  using MutexType = rw_spinlock;
1424
  using SharedLock = NonRecursiveSharedLock<MutexType>;
1425
  using LockGuard = std::lock_guard<MutexType>;
1426
  mutable MutexType mutex_;
1427
1428
  // Note: Namespaces and tables for YSQL databases are identified by their ids only and therefore
1429
  // are not saved in the name maps below.
1430
1431
  // Table map: table-id -> TableInfo
1432
  VersionTracker<TableInfoMap> table_ids_map_ GUARDED_BY(mutex_);
1433
1434
  // Table map: [namespace-id, table-name] -> TableInfo
1435
  // Don't have to use VersionTracker for it, since table_ids_map_ is updated at the same time.
1436
  // Note that this map isn't used for YSQL tables.
1437
  TableInfoByNameMap table_names_map_ GUARDED_BY(mutex_);
1438
1439
  // Set of table ids that are transaction status tables.
1440
  // Don't have to use VersionTracker for it, since table_ids_map_ is updated at the same time.
1441
  TableIdSet transaction_table_ids_set_ GUARDED_BY(mutex_);
1442
1443
  // NOTE(review): tablet_map_ below IS a VersionTracker, so the next line looks stale; confirm.
  // Don't have to use VersionTracker for it, since table_ids_map_ is updated at the same time.
1444
  // Tablet maps: tablet-id -> TabletInfo
1445
  VersionTracker<TabletInfoMap> tablet_map_ GUARDED_BY(mutex_);
1446
1447
  // Tablets that were hidden instead of deleted; used to clean them up when the time comes.
1448
  std::vector<TabletInfoPtr> hidden_tablets_ GUARDED_BY(mutex_);
1449
1450
  // Namespace maps: namespace-id -> NamespaceInfo and namespace-name -> NamespaceInfo
1451
  NamespaceInfoMap namespace_ids_map_ GUARDED_BY(mutex_);
1452
  NamespaceNameMapper namespace_names_mapper_ GUARDED_BY(mutex_);
1453
1454
  // User-Defined type maps: udtype-id -> UDTypeInfo and udtype-name -> UDTypeInfo
1455
  UDTypeInfoMap udtype_ids_map_ GUARDED_BY(mutex_);
1456
  UDTypeInfoByNameMap udtype_names_map_ GUARDED_BY(mutex_);
1457
1458
  // RedisConfig map: RedisConfigKey -> RedisConfigInfo
1459
  typedef std::unordered_map<RedisConfigKey, scoped_refptr<RedisConfigInfo>> RedisConfigInfoMap;
1460
  RedisConfigInfoMap redis_config_map_ GUARDED_BY(mutex_);
1461
1462
  // Config information.
1463
  mutable rw_spinlock config_mutex_;
1464
  std::shared_ptr<ClusterConfigInfo> cluster_config_ GUARDED_BY(config_mutex_) = nullptr;
1465
1466
  // YSQL Catalog information.
1467
  scoped_refptr<SysConfigInfo> ysql_catalog_config_ = nullptr; // No GUARD, only write on Load.
1468
1469
  // Transaction tables information.
1470
  scoped_refptr<SysConfigInfo> transaction_tables_config_ =
1471
      nullptr; // No GUARD, only write on Load.
1472
1473
  Master *master_;
1474
  Atomic32 closing_;
1475
1476
  std::unique_ptr<SysCatalogTable> sys_catalog_;
1477
1478
  // Mutex to avoid concurrent remote bootstrap sessions.
1479
  std::mutex remote_bootstrap_mtx_;
1480
1481
  // Set to true if this master has received at least the superblock from a remote master.
1482
  bool tablet_exists_;
1483
1484
  // Background thread, used to execute the catalog manager tasks
1485
  // like the assignment and cleaner.
1486
  friend class CatalogManagerBgTasks;
1487
  std::unique_ptr<CatalogManagerBgTasks> background_tasks_;
1488
1489
  // Background threadpool, newer features use this (instead of the Background thread)
1490
  // to execute time-lenient catalog manager tasks.
1491
  std::unique_ptr<yb::ThreadPool> background_tasks_thread_pool_;
1492
1493
  // Values held by the state_ member (declared below, guarded by state_lock_).
  // TODO: convert this to YB_DEFINE_ENUM for automatic pretty-printing.
1494
  enum State {
1495
    kConstructed,
1496
    kStarting,
1497
    kRunning,
1498
    kClosing
1499
  };
1500
1501
  // Lock protecting state_, leader_ready_term_
1502
  mutable simple_spinlock state_lock_;
1503
  State state_ GUARDED_BY(state_lock_);
1504
1505
  // Used to defer Master<->TabletServer work from reactor threads onto a thread where
1506
  // blocking behavior is permissible.
1507
  //
1508
  // NOTE: Presently, this thread pool must contain only a single
1509
  // thread (to correctly serialize invocations of ElectedAsLeaderCb
1510
  // upon closely timed consecutive elections).
1511
  std::unique_ptr<ThreadPool> leader_initialization_pool_;
1512
1513
  // Thread pool to do the async RPC task work.
1514
  std::unique_ptr<ThreadPool> async_task_pool_;
1515
1516
  // This field is updated when a node becomes leader master,
1517
  // waits for all outstanding uncommitted metadata (table and tablet metadata)
1518
  // in the sys catalog to commit, and then reads that metadata into in-memory
1519
  // data structures. This is used to "fence" client and tablet server requests
1520
  // that depend on the in-memory state until this master can respond
1521
  // correctly.
1522
  int64_t leader_ready_term_ GUARDED_BY(state_lock_);
1523
1524
  // Lock used to fence operations and leader elections. All logical operations
1525
  // (i.e. create table, alter table, etc.) should acquire this lock for
1526
  // reading. Following an election where this master is elected leader, it
1527
  // should acquire this lock for writing before reloading the metadata.
1528
  //
1529
  // Readers should not acquire this lock directly; use ScopedLeadershipLock
1530
  // instead.
1531
  //
1532
  // Always acquire this lock before state_lock_.
1533
  RWMutex leader_lock_;
1534
1535
  // Async operations are accessing some private methods
1536
  // (TODO: this stuff should be deferred and done in the background thread)
1537
  friend class AsyncAlterTable;
1538
1539
  // Number of live tservers metric.
1540
  scoped_refptr<AtomicGauge<uint32_t>> metric_num_tablet_servers_live_;
1541
1542
  // Number of dead tservers metric.
1543
  scoped_refptr<AtomicGauge<uint32_t>> metric_num_tablet_servers_dead_;
1544
1545
  friend class ClusterLoadBalancer;
1546
1547
  // Policy for load balancing tablets on tablet servers.
1548
  std::unique_ptr<ClusterLoadBalancer> load_balance_policy_;
1549
1550
  // Use the Raft config that has been bootstrapped to update the in-memory state of master options
1551
  // and also the on-disk state of the consensus meta object.
1552
  CHECKED_STATUS UpdateMastersListInMemoryAndDisk();
1553
1554
  // Tablets of system tables on the master indexed by the tablet id.
1555
  std::unordered_map<std::string, std::shared_ptr<tablet::AbstractTablet>> system_tablets_;
1556
1557
  // Tablet of colocated namespaces indexed by the namespace id.
1558
  std::unordered_map<NamespaceId, scoped_refptr<TabletInfo>> colocated_tablet_ids_map_
1559
      GUARDED_BY(mutex_);
1560
1561
  typedef std::unordered_map<TablegroupId, scoped_refptr<TabletInfo>> TablegroupTabletMap;
1562
1563
  std::unordered_map<NamespaceId, TablegroupTabletMap> tablegroup_tablet_ids_map_
1564
      GUARDED_BY(mutex_);
1565
1566
  std::unordered_map<TablegroupId, scoped_refptr<TablegroupInfo>> tablegroup_ids_map_
1567
      GUARDED_BY(mutex_);
1568
1569
  std::unordered_map<TableId, TableId> matview_pg_table_ids_map_
1570
      GUARDED_BY(mutex_);
1571
1572
  std::unordered_map<TableId, TablegroupId> table_tablegroup_ids_map_
1573
      GUARDED_BY(mutex_);
1574
1575
  boost::optional<std::future<Status>> initdb_future_;
1576
  boost::optional<InitialSysCatalogSnapshotWriter> initial_snapshot_writer_;
1577
1578
  std::unique_ptr<PermissionsManager> permissions_manager_;
1579
1580
  // This is used for tracking that initdb has started running previously.
1581
  std::atomic<bool> pg_proc_exists_{false};
1582
1583
  // Tracks most recent async tasks.
1584
  scoped_refptr<TasksTracker> tasks_tracker_;
1585
1586
  // Tracks most recent user initiated jobs.
1587
  scoped_refptr<TasksTracker> jobs_tracker_;
1588
1589
  std::unique_ptr<EncryptionManager> encryption_manager_;
1590
1591
  std::unique_ptr<client::UniverseKeyClient> universe_key_client_;
1592
1593
  // A pointer to the system.partitions tablet for the RebuildYQLSystemPartitions bg task.
1594
  std::shared_ptr<SystemTablet> system_partitions_tablet_ = nullptr;
1595
1596
  // Handles querying and processing YSQL DDL Transactions as a catalog manager background task.
1597
  std::unique_ptr<YsqlTransactionDdl> ysql_transaction_;
1598
1599
  std::atomic<MonoTime> time_elected_leader_;
1600
1601
  std::unique_ptr<client::YBClient> cdc_state_client_;
1602
1603
  // Mutex to avoid simultaneous creation of transaction tables for a tablespace.
1604
  std::mutex tablespace_transaction_table_creation_mutex_;
1605
1606
  void StartElectionIfReady(
1607
      const consensus::ConsensusStatePB& cstate, TabletInfo* tablet);
1608
1609
 private:
1610
  // Performs the provided action with the sys catalog shared tablet instance, or sets up an error
1611
  // if the tablet is not found.
1612
  template <class Req, class Resp, class F>
1613
  CHECKED_STATUS PerformOnSysCatalogTablet(
1614
      const Req& req, Resp* resp, const F& f);
1615
1616
  virtual bool CDCStreamExistsUnlocked(const CDCStreamId& id) REQUIRES_SHARED(mutex_);
1617
1618
  CHECKED_STATUS CollectTable(
1619
      const TableDescription& table_description,
1620
      CollectFlags flags,
1621
      std::vector<TableDescription>* all_tables,
1622
      std::unordered_set<NamespaceId>* parent_colocated_table_ids);
1623
1624
  void SplitTabletWithKey(
1625
      const scoped_refptr<TabletInfo>& tablet, const std::string& split_encoded_key,
1626
      const std::string& split_partition_key, bool select_all_tablets_for_split);
1627
1628
  // From the list of TServers in 'ts_descs', return the ones that match any placement policy
1629
  // in 'placement_info'. Returns error if there are insufficient TServers to match the
1630
  // required replication factor in placement_info.
1631
  // NOTE: This function will only check whether the total replication factor can be
1632
  // satisfied, and not the individual min_num_replicas in each placement block.
1633
  Result<TSDescriptorVector> FindTServersForPlacementInfo(
1634
      const PlacementInfoPB& placement_info,
1635
      const TSDescriptorVector& ts_descs) const;
1636
1637
  // Using the TServer info in 'ts_descs', return the TServers that match 'placement_block'.
1638
  // Returns error if there aren't enough TServers to fulfill the min_num_replicas requirement
1639
  // outlined in 'placement_block'.
1640
  Result<TSDescriptorVector> FindTServersForPlacementBlock(
1641
      const PlacementBlockPB& placement_block,
1642
      const TSDescriptorVector& ts_descs);
1643
1644
  bool IsReplicationInfoSet(const ReplicationInfoPB& replication_info);
1645
1646
  CHECKED_STATUS ValidateTableReplicationInfo(const ReplicationInfoPB& replication_info);
1647
1648
  // Return the id of the tablespace associated with a transaction status table, if any.
1649
  boost::optional<TablespaceId> GetTransactionStatusTableTablespace(
1650
      const scoped_refptr<TableInfo>& table) REQUIRES_SHARED(mutex_);
1651
1652
  // Clears tablespace id for a transaction status table, reverting it back to cluster default
1653
  // if no placement has been set explicitly.
1654
  void ClearTransactionStatusTableTablespace(
1655
      const scoped_refptr<TableInfo>& table) REQUIRES(mutex_);
1656
1657
  // Checks if there are any transaction tables with tablespace id set for a tablespace not in
1658
  // the given tablespace info map.
1659
  bool CheckTransactionStatusTablesWithMissingTablespaces(
1660
      const TablespaceIdToReplicationInfoMap& tablespace_info) EXCLUDES(mutex_);
1661
1662
  // Updates transaction tables' tablespace ids for tablespaces that don't exist.
1663
  CHECKED_STATUS UpdateTransactionStatusTableTablespaces(
1664
      const TablespaceIdToReplicationInfoMap& tablespace_info) EXCLUDES(mutex_);
1665
1666
  // Return the tablespaces in the system and their associated replication info from
1667
  // pg catalog tables.
1668
  Result<std::shared_ptr<TablespaceIdToReplicationInfoMap>> GetYsqlTablespaceInfo();
1669
1670
  // Return the table->tablespace mapping by reading the pg catalog tables.
1671
  Result<std::shared_ptr<TableToTablespaceIdMap>> GetYsqlTableToTablespaceMap(
1672
      const TablespaceIdToReplicationInfoMap& tablespace_info) EXCLUDES(mutex_);
1673
1674
  // Background task that refreshes the in-memory state for YSQL tables with their associated
1675
  // tablespace info.
1676
  // Note: This function should only ever be called by StartTablespaceBgTaskIfStopped().
1677
  void RefreshTablespaceInfoPeriodically();
1678
1679
  // Helper function to schedule the next iteration of the tablespace info task.
1680
  void ScheduleRefreshTablespaceInfoTask(const bool schedule_now = false);
1681
1682
  // Helper function to refresh the tablespace info.
1683
  CHECKED_STATUS DoRefreshTablespaceInfo();
1684
1685
  // Processes committed consensus state for specified tablet from ts_desc.
1686
  // Returns true if tablet was mutated.
1687
  bool ProcessCommittedConsensusState(
1688
      TSDescriptor* ts_desc,
1689
      bool is_incremental,
1690
      const ReportedTabletPB& report,
1691
      const TableInfo::WriteLock& table_lock,
1692
      const TabletInfoPtr& tablet,
1693
      const TabletInfo::WriteLock& tablet_lock,
1694
      std::vector<RetryingTSRpcTaskPtr>* rpcs);
1695
1696
  // Element type of ReportedTablets (declared below): a tablet id, its TabletInfoPtr and a
  // pointer to a ReportedTabletPB.
  struct ReportedTablet {
1697
    TabletId tablet_id;
1698
    TabletInfoPtr info;
1699
    const ReportedTabletPB* report;
1700
  };
1701
  using ReportedTablets = std::vector<ReportedTablet>;
1702
1703
  // Process tablets batch while processing tablet report.
1704
  CHECKED_STATUS ProcessTabletReportBatch(
1705
      TSDescriptor* ts_desc,
1706
      bool is_incremental,
1707
      ReportedTablets::const_iterator begin,
1708
      ReportedTablets::const_iterator end,
1709
      TabletReportUpdatesPB* full_report_update,
1710
      std::vector<RetryingTSRpcTaskPtr>* rpcs);
1711
1712
  size_t GetNumLiveTServersForPlacement(const PlacementId& placement_id);
1713
1714
  TSDescriptorVector GetAllLiveNotBlacklistedTServers() const;
1715
1716
  const YQLPartitionsVTable& GetYqlPartitionsVtable() const;
1717
1718
  void InitializeTableLoadState(
1719
      const TableId& table_id, TSDescriptorVector ts_descs, CMPerTableLoadState* state);
1720
1721
  void InitializeGlobalLoadState(
1722
      TSDescriptorVector ts_descs, CMGlobalLoadState* state);
1723
1724
  // Should be bumped up when tablet locations are changed.
1725
  std::atomic<uintptr_t> tablet_locations_version_{0};
1726
1727
  rpc::ScheduledTaskTracker refresh_yql_partitions_task_;
1728
1729
  mutable MutexType tablespace_mutex_;
1730
1731
  // The tablespace_manager_ encapsulates two maps that are periodically updated by a background
1732
  // task that reads tablespace information from the PG catalog tables. The task creates a new
1733
  // manager instance, populates it with the information read from the catalog tables and updates
1734
  // this shared_ptr. The maps themselves are thus never updated (no inserts/deletes/updates)
1735
  // once populated and are garbage collected once all references to them go out of scope.
1736
  // No clients are expected to update the manager, they take a lock merely to copy the
1737
  // shared_ptr and read from it.
1738
  std::shared_ptr<YsqlTablespaceManager> tablespace_manager_ GUARDED_BY(tablespace_mutex_);
1739
1740
  // Whether the periodic job to update tablespace info is running.
1741
  std::atomic<bool> tablespace_bg_task_running_;
1742
1743
  rpc::ScheduledTaskTracker refresh_ysql_tablespace_info_task_;
1744
1745
  ServerRegistrationPB server_registration_;
1746
1747
  TabletSplitManager tablet_split_manager_;
1748
1749
  DISALLOW_COPY_AND_ASSIGN(CatalogManager);
1750
};
1751
1752
}  // namespace master
1753
}  // namespace yb
1754
1755
#endif // YB_MASTER_CATALOG_MANAGER_H