YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/tablet/tablet_metadata.cc
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#include "yb/tablet/tablet_metadata.h"
34
35
#include <algorithm>
36
#include <mutex>
37
#include <string>
38
39
#include <boost/optional.hpp>
40
#include <gflags/gflags.h>
41
42
#include "yb/common/entity_ids.h"
43
#include "yb/common/index.h"
44
#include "yb/common/schema.h"
45
#include "yb/common/transaction.h"
46
#include "yb/common/wire_protocol.h"
47
48
#include "yb/consensus/consensus_util.h"
49
#include "yb/consensus/opid_util.h"
50
51
#include "yb/docdb/docdb_rocksdb_util.h"
52
53
#include "yb/gutil/atomicops.h"
54
#include "yb/gutil/dynamic_annotations.h"
55
#include "yb/gutil/map-util.h"
56
#include "yb/gutil/stl_util.h"
57
#include "yb/gutil/strings/substitute.h"
58
59
#include "yb/rocksdb/db.h"
60
#include "yb/rocksdb/options.h"
61
62
#include "yb/tablet/metadata.pb.h"
63
#include "yb/tablet/tablet_options.h"
64
65
#include "yb/util/debug/trace_event.h"
66
#include "yb/util/flag_tags.h"
67
#include "yb/util/logging.h"
68
#include "yb/util/pb_util.h"
69
#include "yb/util/random.h"
70
#include "yb/util/result.h"
71
#include "yb/util/status.h"
72
#include "yb/util/status_log.h"
73
#include "yb/util/trace.h"
74
75
// Boolean gflag (default: true) controlling deletion of orphaned blocks from disk; per its own
// help text it is exposed for debugging only. It is tagged advanced, hidden and runtime.
// NOTE(review): no reader of this flag is visible in this part of the file - confirm it is
// still consumed somewhere.
DEFINE_bool(enable_tablet_orphaned_block_deletion, true,
76
            "Whether to enable deletion of orphaned blocks from disk. "
77
            "Note: This is only exposed for debugging purposes!");
78
TAG_FLAG(enable_tablet_orphaned_block_deletion, advanced);
79
TAG_FLAG(enable_tablet_orphaned_block_deletion, hidden);
80
TAG_FLAG(enable_tablet_orphaned_block_deletion, runtime);
81
82
using std::shared_ptr;
83
84
using base::subtle::Barrier_AtomicIncrement;
85
using strings::Substitute;
86
87
using yb::consensus::MinimumOpId;
88
89
namespace yb {
90
namespace tablet {
91
92
// File-level constants.
// kIntentsDBSuffix is appended to the regular RocksDB directory to form the intents DB path
// (see IsTombstonedWithNoRocksDBData below).
// NOTE(review): the remaining constants are not used in the visible part of this file; their
// meaning is presumably what their names suggest - confirm against the header/callers.
const int64 kNoDurableMemStore = -1;
93
const std::string kIntentsSubdir = "intents";
94
const std::string kIntentsDBSuffix = ".intents";
95
const std::string kSnapshotsDirSuffix = ".snapshots";
96
97
// ============================================================================
98
//  Raft group metadata
99
// ============================================================================
100
101
// Creates a TableInfo holding a default-constructed Schema and IndexMap. All other members keep
// their default values until they are populated (for example by LoadFromPB).
TableInfo::TableInfo()
    : schema(std::make_unique<Schema>()),
      index_map(std::make_unique<IndexMap>()) {}
104
105
// Creates a TableInfo from explicit attributes.
//
// String arguments and the partition schema are taken by value and moved into the members.
// The schema and index map are deep-copied into freshly allocated objects. The index info is
// copied only when the optional is engaged; otherwise the member stays null.
TableInfo::TableInfo(std::string table_id,
                     std::string namespace_name,
                     std::string table_name,
                     TableType table_type,
                     const Schema& schema,
                     const IndexMap& index_map,
                     const boost::optional<IndexInfo>& index_info,
                     const uint32_t schema_version,
                     PartitionSchema partition_schema)
    : table_id(std::move(table_id)),
      namespace_name(std::move(namespace_name)),
      table_name(std::move(table_name)),
      table_type(table_type),
      schema(std::make_unique<Schema>(schema)),
      index_map(std::make_unique<IndexMap>(index_map)),
      index_info(index_info ? std::make_unique<IndexInfo>(*index_info) : nullptr),
      schema_version(schema_version),
      partition_schema(std::move(partition_schema)) {}
124
125
// Creates a TableInfo derived from `other` with a new schema, index map and schema version.
//
// Identity fields (ids, names, type), the index info and the partition schema are copied from
// `other`. The resulting deleted-column list is `other`'s list followed by `deleted_cols`.
TableInfo::TableInfo(const TableInfo& other,
                     const Schema& schema,
                     const IndexMap& index_map,
                     const std::vector<DeletedColumn>& deleted_cols,
                     const uint32_t schema_version)
    : table_id(other.table_id),
      namespace_name(other.namespace_name),
      table_name(other.table_name),
      table_type(other.table_type),
      schema(std::make_unique<Schema>(schema)),
      index_map(std::make_unique<IndexMap>(index_map)),
      index_info(other.index_info ? std::make_unique<IndexInfo>(*other.index_info) : nullptr),
      schema_version(schema_version),
      partition_schema(other.partition_schema),
      deleted_cols(other.deleted_cols) {
  // Append the newly deleted columns after the ones inherited from `other`.
  auto& all_deleted_cols = this->deleted_cols;
  all_deleted_cols.insert(all_deleted_cols.end(), deleted_cols.begin(), deleted_cols.end());
}
142
143
3.81k
// Populates this TableInfo from its protobuf form.
//
// Returns a non-OK status if the schema, the partition schema or any deleted column fails to
// parse. Fields assigned before the failure keep their new values. Deleted columns are appended
// to the existing list, which is not cleared first.
Status TableInfo::LoadFromPB(const TableInfoPB& pb) {
  table_id = pb.table_id();
  namespace_name = pb.namespace_name();
  table_name = pb.table_name();
  table_type = pb.table_type();

  RETURN_NOT_OK(SchemaFromPB(pb.schema(), schema.get()));

  if (pb.has_index_info()) {
    index_info = std::make_unique<IndexInfo>(pb.index_info());
  }
  index_map->FromPB(pb.indexes());
  schema_version = pb.schema_version();

  // The partition schema is decoded against the schema loaded above, so this must come after it.
  RETURN_NOT_OK(PartitionSchema::FromPB(pb.partition_schema(), *schema, &partition_schema));

  for (const DeletedColumnPB& deleted_col_pb : pb.deleted_cols()) {
    DeletedColumn parsed_col;
    RETURN_NOT_OK(DeletedColumn::FromPB(deleted_col_pb, &parsed_col));
    deleted_cols.push_back(parsed_col);
  }

  return Status::OK();
}
166
167
301M
// Serializes this TableInfo into `pb`.
//
// The index info is written only when present. Deleted columns are appended to the repeated
// field of `pb`. In debug builds the schema is required to have column ids.
void TableInfo::ToPB(TableInfoPB* pb) const {
  DCHECK(schema->has_column_ids());

  pb->set_table_id(table_id);
  pb->set_namespace_name(namespace_name);
  pb->set_table_name(table_name);
  pb->set_table_type(table_type);

  SchemaToPB(*schema, pb->mutable_schema());
  if (index_info) {
    index_info->ToPB(pb->mutable_index_info());
  }
  index_map->ToPB(pb->mutable_indexes());
  pb->set_schema_version(schema_version);

  partition_schema.ToPB(pb->mutable_partition_schema());

  for (const auto& col : deleted_cols) {
    col.CopyToPB(pb->mutable_deleted_cols()->Add());
  }
}
187
188
// Replaces the contents of `tables` with the tables described by `pbs`.
//
// Each entry is parsed into a new TableInfo and stored under its table id. For non-primary
// tables flagged as YSQL catalog tables, the schema's cotable id is set from the table id
// (parsed as a hex UUID; a parse failure is fatal). Returns the first parse error, in which case
// `tables` holds only the entries loaded so far.
Status KvStoreInfo::LoadTablesFromPB(
    const google::protobuf::RepeatedPtrField<TableInfoPB>& pbs, const TableId& primary_table_id) {
  tables.clear();
  for (const auto& table_pb : pbs) {
    auto loaded_table = std::make_shared<TableInfo>();
    RETURN_NOT_OK(loaded_table->LoadFromPB(table_pb));

    const bool is_secondary_table = loaded_table->table_id != primary_table_id;
    // TODO(alex): cotable_id should be loaded from PB schema, do we need this section?
    if (is_secondary_table && table_pb.schema().table_properties().is_ysql_catalog_table()) {
      Uuid cotable_id;
      CHECK_OK(cotable_id.FromHexString(loaded_table->table_id));
      // TODO(#79): when adding for multiple KV-stores per Raft group support - check if we need
      // to set cotable ID.
      loaded_table->schema->set_cotable_id(cotable_id);
    }
    // Colocation ID is already set in schema.
    // TODO(alex): We don't have this info when master starts up?

    auto& slot = tables[loaded_table->table_id];
    slot = std::move(loaded_table);
  }
  return Status::OK();
}
210
211
// Populates this KvStoreInfo from its protobuf form.
//
// `rocksdb_dir` is taken from `pb` only when `local_superblock` is true; otherwise the current
// value is kept. Snapshot schedule ids are decoded and inserted into the existing set (the set
// is not cleared here). Finally the table list is reloaded via LoadTablesFromPB.
// Returns the first decode/parse error encountered.
Status KvStoreInfo::LoadFromPB(const KvStoreInfoPB& pb,
                               const TableId& primary_table_id,
                               bool local_superblock) {
  kv_store_id = KvStoreId(pb.kv_store_id());
  if (local_superblock) {
    rocksdb_dir = pb.rocksdb_dir();
  }
  lower_bound_key = pb.lower_bound_key();
  upper_bound_key = pb.upper_bound_key();
  has_been_fully_compacted = pb.has_been_fully_compacted();

  for (const auto& encoded_schedule_id : pb.snapshot_schedules()) {
    snapshot_schedules.insert(
        VERIFY_RESULT(FullyDecodeSnapshotScheduleId(encoded_schedule_id)));
  }

  return LoadTablesFromPB(pb.tables(), primary_table_id);
}
228
229
1.56M
void KvStoreInfo::ToPB(const TableId& primary_table_id, KvStoreInfoPB* pb) const {
230
1.56M
  pb->set_kv_store_id(kv_store_id.ToString());
231
1.56M
  pb->set_rocksdb_dir(rocksdb_dir);
232
1.56M
  if (lower_bound_key.empty()) {
233
1.55M
    pb->clear_lower_bound_key();
234
1.55M
  } else {
235
1.02k
    pb->set_lower_bound_key(lower_bound_key);
236
1.02k
  }
237
1.56M
  if (upper_bound_key.empty()) {
238
1.55M
    pb->clear_upper_bound_key();
239
1.55M
  } else {
240
869
    pb->set_upper_bound_key(upper_bound_key);
241
869
  }
242
1.56M
  pb->set_has_been_fully_compacted(has_been_fully_compacted);
243
244
  // Putting primary table first, then all other tables.
245
1.56M
  const auto& it = tables.find(primary_table_id);
246
1.56M
  if (it != tables.end()) {
247
1.55M
    it->second->ToPB(pb->add_tables());
248
1.55M
  }
249
301M
  for (const auto& it : tables) {
250
301M
    if (it.first != primary_table_id) {
251
299M
      it.second->ToPB(pb->add_tables());
252
299M
    }
253
301M
  }
254
255
1.56M
  for (const auto& schedule_id : snapshot_schedules) {
256
326
    pb->add_snapshot_schedules(schedule_id.data(), schedule_id.size());
257
326
  }
258
1.56M
}
259
260
namespace {
261
262
150k
std::string MakeTabletDirName(const TabletId& tablet_id) {
263
150k
  return Format("tablet-$0", tablet_id);
264
150k
}
265
266
} // namespace
267
268
// ============================================================================
269
270
// Creates metadata for a new Raft group and flushes it to disk.
//
// Fails with AlreadyPresent if a metadata file for `data.raft_group_id` already exists. When
// `data_root_dir` / `wal_root_dir` is empty, a random entry is picked from the FsManager's
// data / WAL root directories (it is fatal for that list to be empty). The RocksDB and WAL
// directories are then derived as <root>/.../table-<table id>/tablet-<raft group id>.
// Returns the new metadata object, or the error from the initial Flush().
Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateNew(
    const RaftGroupMetadataData& data, const std::string& data_root_dir,
    const std::string& wal_root_dir) {
  auto* fs_manager = data.fs_manager;

  // Verify that no existing Raft group exists with the same ID.
  const auto existing_path = fs_manager->GetRaftGroupMetadataPath(data.raft_group_id);
  if (fs_manager->env()->FileExists(existing_path)) {
    return STATUS(AlreadyPresent, "Raft group already exists", data.raft_group_id);
  }

  // Use the original randomized logic if the directories are not explicitly passed in.
  // The data directory is chosen before the WAL directory.
  auto data_top_dir = data_root_dir;
  if (data_root_dir.empty()) {
    auto candidate_data_dirs = fs_manager->GetDataRootDirs();
    CHECK(!candidate_data_dirs.empty()) << "No data root directories found";
    data_top_dir = RandomElement(candidate_data_dirs);
  }

  auto wal_top_dir = wal_root_dir;
  if (wal_root_dir.empty()) {
    auto candidate_wal_dirs = fs_manager->GetWalRootDirs();
    CHECK(!candidate_wal_dirs.empty()) << "No wal root directories found";
    wal_top_dir = RandomElement(candidate_wal_dirs);
  }

  const auto table_dir_name = Substitute("table-$0", data.table_info->table_id);
  const auto tablet_dir_name = MakeTabletDirName(data.raft_group_id);
  const auto wal_dir = JoinPathSegments(wal_top_dir, table_dir_name, tablet_dir_name);
  const auto rocksdb_dir = JoinPathSegments(
      data_top_dir, FsManager::kRocksDBDirName, table_dir_name, tablet_dir_name);

  RaftGroupMetadataPtr new_metadata(new RaftGroupMetadata(data, rocksdb_dir, wal_dir));
  RETURN_NOT_OK(new_metadata->Flush());
  return new_metadata;
}
304
305
// Loads the metadata of an existing Raft group from disk.
// Returns the loaded object, or the error reported by LoadFromDisk().
Result<RaftGroupMetadataPtr> RaftGroupMetadata::Load(
    FsManager* fs_manager, const RaftGroupId& raft_group_id) {
  RaftGroupMetadataPtr loaded(new RaftGroupMetadata(fs_manager, raft_group_id));
  RETURN_NOT_OK(loaded->LoadFromDisk());
  return loaded;
}
311
312
484
// Loads the Raft group metadata if it exists on disk, otherwise creates it.
//
// - Load succeeded: the on-disk schema must equal the expected one, else Corruption is returned.
// - Load failed with NotFound: new metadata is created via CreateNew().
// - Any other load failure is returned unchanged.
Result<RaftGroupMetadataPtr> RaftGroupMetadata::LoadOrCreate(const RaftGroupMetadataData& data) {
  auto load_result = Load(data.fs_manager, data.raft_group_id);

  if (!load_result.ok()) {
    if (load_result.status().IsNotFound()) {
      return CreateNew(data);
    }
    return load_result.status();
  }

  if (!(**load_result).schema()->Equals(*data.table_info->schema)) {
    return STATUS(Corruption, Substitute("Schema on disk ($0) does not "
      "match expected schema ($1)", (*load_result)->schema()->ToString(),
      data.table_info->schema->ToString()));
  }
  return *load_result;
}
329
330
template <class TablesMap>
331
CHECKED_STATUS MakeTableNotFound(const TableId& table_id, const RaftGroupId& raft_group_id,
332
16
                                 const TablesMap& tables) {
333
16
  std::string table_name = "<unknown_table_name>";
334
16
  if (!table_id.empty()) {
335
16
    const auto iter = tables.find(table_id);
336
16
    if (iter != tables.end()) {
337
0
      table_name = iter->second->table_name;
338
0
    }
339
16
  }
340
16
  std::ostringstream string_stream;
341
16
  string_stream << "Table " << table_name << " (" << table_id << ") not found in Raft group "
342
16
      << raft_group_id;
343
16
  std::string msg = string_stream.str();
344
16
#ifndef NDEBUG
345
  // This very large message should be logged instead of being appended to STATUS.
346
16
  std::string suffix = Format(". Tables: $0.", tables);
347
16
  VLOG
(1) << msg << suffix0
;
348
16
#endif
349
16
  return STATUS(NotFound, msg);
350
16
}
351
352
24.0M
// Thread-safe lookup of a table's info: takes data_mutex_ and delegates to
// GetTableInfoUnlocked(). An empty `table_id` selects the primary table.
Result<TableInfoPtr> RaftGroupMetadata::GetTableInfo(const std::string& table_id) const {
  std::lock_guard<MutexType> guard(data_mutex_);
  auto table_info = GetTableInfoUnlocked(table_id);
  return table_info;
}
356
357
24.1M
// Looks up the TableInfo for `table_id` without taking data_mutex_ (the callers visible in this
// file, GetTableInfo and SetSchema, hold it around the call).
//
// An empty `table_id` selects the primary table. Returns NotFound (built by MakeTableNotFound
// from the id as passed by the caller) when the table is not registered in this Raft group.
Result<TableInfoPtr> RaftGroupMetadata::GetTableInfoUnlocked(const std::string& table_id) const {
  const auto& tables = kv_store_.tables;
  // Bind by reference: both operands are lvalues that outlive this call, so there is no need to
  // copy the id string on every lookup of this frequently called function.
  const auto& effective_id = table_id.empty() ? primary_table_id_ : table_id;
  const auto iter = tables.find(effective_id);
  if (iter == tables.end()) {
    return MakeTableNotFound(table_id, raft_group_id_, tables);
  }
  return iter->second;
}
366
367
// Deletes the on-disk data of this Raft group and persists the new data state.
//
// `delete_type` must be TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED (anything else is fatal).
// If `last_logged_opid` is non-empty it is recorded as the tombstone's last logged op id.
//
// Steps:
//   1. Record the new state in memory under data_mutex_.
//   2. Destroy the regular RocksDB and remove its directory if it is still there.
//   3. Destroy the intents RocksDB (if its directory exists) and remove what remains of it.
//   4. Remove the snapshots directory if it exists.
//   5. Flush the metadata to disk (twice, as the original code does).
//
// Failures in steps 2-4 are logged and otherwise ignored; only a Flush() failure is returned.
Status RaftGroupMetadata::DeleteTabletData(TabletDataState delete_type,
                                           const OpId& last_logged_opid) {
  CHECK(delete_type == TABLET_DATA_DELETED ||
        delete_type == TABLET_DATA_TOMBSTONED)
      << "DeleteTabletData() called with unsupported delete_type on tablet "
      << raft_group_id_ << ": " << TabletDataState_Name(delete_type)
      << " (" << delete_type << ")";

  // Set the state that will be persisted by the Flush() calls below to indicate that we have
  // been deleted/tombstoned.
  {
    std::lock_guard<MutexType> lock(data_mutex_);
    tablet_data_state_ = delete_type;
    if (!last_logged_opid.empty()) {
      tombstone_last_logged_opid_ = last_logged_opid;
    }
  }

  rocksdb::Options rocksdb_options;
  TabletOptions tablet_options;
  std::string log_prefix = consensus::MakeTabletLogPrefix(raft_group_id_, fs_manager_->uuid());
  docdb::InitRocksDBOptions(
      &rocksdb_options, log_prefix, nullptr /* statistics */, tablet_options);

  const auto& rocksdb_dir = this->rocksdb_dir();
  LOG(INFO) << "Destroying regular db at: " << rocksdb_dir;
  rocksdb::Status status = rocksdb::DestroyDB(rocksdb_dir, rocksdb_options);

  if (!status.ok()) {
    LOG(ERROR) << "Failed to destroy regular DB at: " << rocksdb_dir << ": " << status;
  } else {
    LOG(INFO) << "Successfully destroyed regular DB at: " << rocksdb_dir;
  }

  // DestroyDB may leave the directory (or part of it) behind; remove whatever is left.
  if (fs_manager_->env()->FileExists(rocksdb_dir)) {
    auto s = fs_manager_->env()->DeleteRecursively(rocksdb_dir);
    // Include the status so the cause of the failure is visible in the log.
    LOG_IF(WARNING, !s.ok()) << "Unable to delete rocksdb data directory " << rocksdb_dir
                             << ": " << s;
  }

  const auto intents_dir = this->intents_rocksdb_dir();
  if (fs_manager_->env()->FileExists(intents_dir)) {
    status = rocksdb::DestroyDB(intents_dir, rocksdb_options);

    if (!status.ok()) {
      LOG(ERROR) << "Failed to destroy provisional records DB at: " << intents_dir << ": "
                 << status;
    } else {
      LOG(INFO) << "Successfully destroyed provisional records DB at: " << intents_dir;
    }
  }

  // Re-check: the directory may still exist after DestroyDB.
  if (fs_manager_->env()->FileExists(intents_dir)) {
    auto s = fs_manager_->env()->DeleteRecursively(intents_dir);
    LOG_IF(WARNING, !s.ok()) << "Unable to delete intents directory " << intents_dir
                             << ": " << s;
  }

  // TODO(tsplit): decide what to do with snapshots for split tablets that we delete after split.
  // As for now, snapshots will be deleted as well.
  const auto snapshots_dir = this->snapshots_dir();
  if (fs_manager_->env()->FileExists(snapshots_dir)) {
    auto s = fs_manager_->env()->DeleteRecursively(snapshots_dir);
    LOG_IF(WARNING, !s.ok()) << "Unable to delete snapshots directory " << snapshots_dir
                             << ": " << s;
  }

  // Flushing syncs the new tablet_data_state_ to disk.
  RETURN_NOT_OK(Flush());

  // Re-sync to disk one more time.
  // NOTE(review): the original comment justified this second flush with an orphaned-blocks list
  // that is not visible anywhere in this file; it is kept to preserve behavior - confirm
  // whether it is still needed.
  return Flush();
}
444
445
8
bool RaftGroupMetadata::IsTombstonedWithNoRocksDBData() const {
446
8
  std::lock_guard<MutexType> lock(data_mutex_);
447
8
  const auto& rocksdb_dir = kv_store_.rocksdb_dir;
448
8
  const auto intents_dir = rocksdb_dir + kIntentsDBSuffix;
449
8
  return tablet_data_state_ == TABLET_DATA_TOMBSTONED &&
450
8
      
!fs_manager_->env()->FileExists(rocksdb_dir)2
&&
451
8
      
!fs_manager_->env()->FileExists(intents_dir)2
;
452
8
}
453
454
42
// Deletes the superblock (metadata file) of this Raft group from disk.
//
// Only permitted once the data state is TABLET_DATA_DELETED; otherwise IllegalState is returned
// and nothing is removed. A file deletion error is returned with a descriptive prefix.
Status RaftGroupMetadata::DeleteSuperBlock() {
  std::lock_guard<MutexType> lock(data_mutex_);

  const bool is_deleted = tablet_data_state_ == TABLET_DATA_DELETED;
  if (!is_deleted) {
    return STATUS(IllegalState,
        Substitute("Tablet $0 is not in TABLET_DATA_DELETED state. "
                   "Call DeleteTabletData(TABLET_DATA_DELETED) first. "
                   "Tablet data state: $1 ($2)",
                   raft_group_id_,
                   TabletDataState_Name(tablet_data_state_),
                   tablet_data_state_));
  }

  const auto superblock_path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
  RETURN_NOT_OK_PREPEND(fs_manager_->env()->DeleteFile(superblock_path),
                        "Unable to delete superblock for Raft group " + raft_group_id_);
  return Status::OK();
}
471
472
// Constructs metadata for a brand-new Raft group (state kNotWrittenYet).
//
// `data_dir` becomes the KV-store's RocksDB directory and `wal_dir` the WAL directory. The
// table described by `data.table_info` is registered as the primary table; its schema must have
// column ids and at least one key column (both enforced with CHECK).
RaftGroupMetadata::RaftGroupMetadata(
    const RaftGroupMetadataData& data, const std::string& data_dir, const std::string& wal_dir)
    : state_(kNotWrittenYet),
      raft_group_id_(data.raft_group_id),
      partition_(std::make_shared<Partition>(data.partition)),
      primary_table_id_(data.table_info->table_id),
      kv_store_(KvStoreId(raft_group_id_), data_dir, data.snapshot_schedules),
      fs_manager_(data.fs_manager),
      wal_dir_(wal_dir),
      tablet_data_state_(data.tablet_data_state),
      colocated_(data.colocated),
      cdc_min_replicated_index_(std::numeric_limits<int64_t>::max()) {
  const auto& primary_schema = data.table_info->schema;
  CHECK(primary_schema->has_column_ids());
  CHECK_GT(primary_schema->num_key_columns(), 0);
  kv_store_.tables.emplace(primary_table_id_, data.table_info);
}
488
489
83.1k
// Nothing to release explicitly; members clean up after themselves.
RaftGroupMetadata::~RaftGroupMetadata() = default;
491
492
// Constructs an empty shell (state kNotLoadedYet) for a Raft group whose metadata is about to be
// loaded; the KV-store id is derived from the Raft group id.
RaftGroupMetadata::RaftGroupMetadata(FsManager* fs_manager, RaftGroupId raft_group_id)
    : state_(kNotLoadedYet),
      raft_group_id_(std::move(raft_group_id)),
      kv_store_(KvStoreId(raft_group_id_)),
      fs_manager_(fs_manager) {}
498
499
8.87k
// Reads the superblock from disk and initializes this object from it.
//
// Must be called while the object is still in the kNotLoadedYet state (enforced with CHECK).
// On success the state becomes kInitialized; on failure the error is returned and the state is
// left unchanged.
Status RaftGroupMetadata::LoadFromDisk() {
  TRACE_EVENT1("raft_group", "RaftGroupMetadata::LoadFromDisk",
               "raft_group_id", raft_group_id_);

  CHECK_EQ(state_, kNotLoadedYet);

  RaftGroupReplicaSuperBlockPB superblock_pb;
  RETURN_NOT_OK(ReadSuperBlockFromDisk(&superblock_pb));

  const bool local_superblock = true;
  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(superblock_pb, local_superblock),
                        "Failed to load data from superblock protobuf");

  state_ = kInitialized;
  return Status::OK();
}
512
513
// Initializes the in-memory state from `superblock`.
//
// A superblock without a kv_store section is first migrated (on a copy), loaded recursively and
// then flushed back to disk. Otherwise all fields are copied under data_mutex_.
// `local_superblock` is forwarded to KvStoreInfo::LoadFromPB.
//
// Returns Corruption if the Raft group id does not match or the number of split child tablet
// ids is unexpected, or the first error from loading the KV-store / decoding restoration ids.
// Fields assigned before a failure keep their new values.
Status RaftGroupMetadata::LoadFromSuperBlock(const RaftGroupReplicaSuperBlockPB& superblock,
                                             bool local_superblock) {
  if (!superblock.has_kv_store()) {
    // Backward compatibility for tablet=KV-store=raft-group.
    RaftGroupReplicaSuperBlockPB migrated_superblock(superblock);
    RETURN_NOT_OK(MigrateSuperblock(&migrated_superblock));
    RETURN_NOT_OK(LoadFromSuperBlock(migrated_superblock, local_superblock));
    return Flush();
  }

  VLOG(2) << "Loading RaftGroupMetadata from SuperBlockPB:" << std::endl
          << superblock.DebugString();

  {
    std::lock_guard<MutexType> lock(data_mutex_);

    // Verify that the Raft group id matches with the one in the protobuf.
    if (superblock.raft_group_id() != raft_group_id_) {
      return STATUS(Corruption, "Expected id=" + raft_group_id_ +
                                " found " + superblock.raft_group_id(),
                                superblock.DebugString());
    }

    Partition loaded_partition;
    Partition::FromPB(superblock.partition(), &loaded_partition);
    partition_ = std::make_shared<Partition>(loaded_partition);
    primary_table_id_ = superblock.primary_table_id();
    colocated_ = superblock.colocated();

    RETURN_NOT_OK(kv_store_.LoadFromPB(superblock.kv_store(),
                                       primary_table_id_,
                                       local_superblock));

    wal_dir_ = superblock.wal_dir();
    tablet_data_state_ = superblock.tablet_data_state();

    // An absent tombstone op id resets the member to a default-constructed OpId.
    if (!superblock.has_tombstone_last_logged_opid()) {
      tombstone_last_logged_opid_ = OpId();
    } else {
      tombstone_last_logged_opid_ = OpId::FromPB(superblock.tombstone_last_logged_opid());
    }

    cdc_min_replicated_index_ = superblock.cdc_min_replicated_index();
    is_under_twodc_replication_ = superblock.is_under_twodc_replication();
    hidden_ = superblock.hidden();

    // The restoration time is only overwritten when the stored value converts to true.
    auto loaded_restoration_time = HybridTime::FromPB(superblock.restoration_hybrid_time());
    if (loaded_restoration_time) {
      restoration_hybrid_time_ = loaded_restoration_time;
    }

    if (superblock.has_split_op_id()) {
      split_op_id_ = OpId::FromPB(superblock.split_op_id());

      SCHECK_EQ(implicit_cast<size_t>(superblock.split_child_tablet_ids().size()),
                split_child_tablet_ids_.size(),
                Corruption, "Expected exact number of child tablet ids");
      for (size_t child_idx = 0; child_idx != split_child_tablet_ids_.size(); ++child_idx) {
        split_child_tablet_ids_[child_idx] =
            superblock.split_child_tablet_ids(narrow_cast<int>(child_idx));
      }
    }

    // Restoration ids are appended; the existing list is not cleared here.
    if (!superblock.active_restorations().empty()) {
      active_restorations_.reserve(superblock.active_restorations().size());
      for (const auto& encoded_id : superblock.active_restorations()) {
        active_restorations_.push_back(
            VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(encoded_id)));
      }
    }
  }

  return Status::OK();
}
582
583
1.56M
// Persists the current in-memory metadata to disk.
//
// flush_lock_ is held for the whole operation. data_mutex_ is held only while the in-memory
// state is converted to a protobuf, so the disk write happens without it.
Status RaftGroupMetadata::Flush() {
  TRACE_EVENT1("raft_group", "RaftGroupMetadata::Flush",
               "raft_group_id", raft_group_id_);

  MutexLock flush_guard(flush_lock_);

  RaftGroupReplicaSuperBlockPB superblock_pb;
  {
    std::lock_guard<MutexType> data_guard(data_mutex_);
    ToSuperBlockUnlocked(&superblock_pb);
  }

  RETURN_NOT_OK(SaveToDiskUnlocked(superblock_pb));
  TRACE("Metadata flushed");

  return Status::OK();
}
598
599
2.17k
// Overwrites the on-disk superblock with `pb` and then reloads the in-memory state from it.
//
// The write happens under flush_lock_, which is released before reloading. The reload passes
// local_superblock = false. Errors from either step are returned with a descriptive prefix.
Status RaftGroupMetadata::ReplaceSuperBlock(const RaftGroupReplicaSuperBlockPB &pb) {
  {
    MutexLock flush_guard(flush_lock_);
    RETURN_NOT_OK_PREPEND(SaveToDiskUnlocked(pb), "Unable to replace superblock");
  }

  const bool local_superblock = false;
  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(pb, local_superblock),
                        "Failed to load data from superblock protobuf");

  return Status::OK();
}
610
611
1.56M
// Writes `pb` to this Raft group's metadata path, overwriting any existing file and syncing.
// The caller must hold flush_lock_ (asserted). Write errors are returned with a prefix naming
// the Raft group.
Status RaftGroupMetadata::SaveToDiskUnlocked(const RaftGroupReplicaSuperBlockPB &pb) {
  flush_lock_.AssertAcquired();

  const auto metadata_path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
  RETURN_NOT_OK_PREPEND(pb_util::WritePBContainerToPath(
                            fs_manager_->env(), metadata_path, pb,
                            pb_util::OVERWRITE, pb_util::SYNC),
                        Substitute("Failed to write Raft group metadata $0", raft_group_id_));

  return Status::OK();
}
622
623
10.9k
// Reads this Raft group's superblock from its metadata path into `superblock`.
//
// After a successful read, a legacy record whose obsolete table type is REDIS_TABLE_TYPE and
// whose obsolete table name equals kGlobalTransactionsTableName is rewritten (in memory only)
// to TRANSACTION_STATUS_TABLE_TYPE. Read errors are returned with the path in the prefix.
Status RaftGroupMetadata::ReadSuperBlockFromDisk(RaftGroupReplicaSuperBlockPB* superblock) const {
  const auto metadata_path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
  RETURN_NOT_OK_PREPEND(
      pb_util::ReadPBContainerFromPath(fs_manager_->env(), metadata_path, superblock),
      Substitute("Could not load Raft group metadata from $0", metadata_path));

  // Migration for backward compatibility with versions which don't have separate
  // TableType::TRANSACTION_STATUS_TABLE_TYPE.
  const bool is_legacy_txn_status_table =
      superblock->obsolete_table_type() == TableType::REDIS_TABLE_TYPE &&
      superblock->obsolete_table_name() == kGlobalTransactionsTableName;
  if (is_legacy_txn_status_table) {
    superblock->set_obsolete_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE);
  }
  return Status::OK();
}
636
637
149
// Thread-safe serialization of the in-memory metadata into `superblock`.
void RaftGroupMetadata::ToSuperBlock(RaftGroupReplicaSuperBlockPB* superblock) const {
  // Hold data_mutex_ so the members being serialized do not change until we are finished.
  std::lock_guard<MutexType> data_guard(data_mutex_);
  ToSuperBlockUnlocked(superblock);
}
642
643
1.56M
// Serializes the in-memory metadata into `superblock` without taking data_mutex_ (the callers
// visible in this file hold it).
//
// The protobuf is built in a local object and swapped into `superblock` at the end, so the
// previous contents of `superblock` are fully replaced. Optional parts (tombstone op id,
// restoration time, split info, active restorations) are written only when set / non-empty.
void RaftGroupMetadata::ToSuperBlockUnlocked(RaftGroupReplicaSuperBlockPB* superblock) const {
  RaftGroupReplicaSuperBlockPB result_pb;

  result_pb.set_raft_group_id(raft_group_id_);
  partition_->ToPB(result_pb.mutable_partition());

  kv_store_.ToPB(primary_table_id_, result_pb.mutable_kv_store());

  result_pb.set_wal_dir(wal_dir_);
  result_pb.set_tablet_data_state(tablet_data_state_);
  if (!tombstone_last_logged_opid_.empty()) {
    tombstone_last_logged_opid_.ToPB(result_pb.mutable_tombstone_last_logged_opid());
  }

  result_pb.set_primary_table_id(primary_table_id_);
  result_pb.set_colocated(colocated_);
  result_pb.set_cdc_min_replicated_index(cdc_min_replicated_index_);
  result_pb.set_is_under_twodc_replication(is_under_twodc_replication_);
  result_pb.set_hidden(hidden_);
  if (restoration_hybrid_time_) {
    result_pb.set_restoration_hybrid_time(restoration_hybrid_time_.ToUint64());
  }

  if (!split_op_id_.empty()) {
    split_op_id_.ToPB(result_pb.mutable_split_op_id());
    auto& child_tablet_ids_pb = *result_pb.mutable_split_child_tablet_ids();
    child_tablet_ids_pb.Reserve(narrow_cast<int>(split_child_tablet_ids_.size()));
    for (const auto& child_tablet_id : split_child_tablet_ids_) {
      *child_tablet_ids_pb.Add() = child_tablet_id;
    }
  }

  if (!active_restorations_.empty()) {
    auto& restorations_pb = *result_pb.mutable_active_restorations();
    restorations_pb.Reserve(narrow_cast<int>(active_restorations_.size()));
    for (const auto& restoration_id : active_restorations_) {
      restorations_pb.Add()->assign(restoration_id.AsSlice().cdata(), restoration_id.size());
    }
  }

  superblock->Swap(&result_pb);
}
685
686
void RaftGroupMetadata::SetSchema(const Schema& schema,
687
                                  const IndexMap& index_map,
688
                                  const std::vector<DeletedColumn>& deleted_cols,
689
                                  const uint32_t version,
690
79.9k
                                  const TableId& table_id) {
691
79.9k
  DCHECK(schema.has_column_ids());
692
79.9k
  std::lock_guard<MutexType> lock(data_mutex_);
693
79.9k
  TableId target_table_id = table_id.empty() ? 
primary_table_id_1.90k
:
table_id78.0k
;
694
79.9k
  auto result = GetTableInfoUnlocked(target_table_id);
695
79.9k
  DCHECK(result.ok());
696
79.9k
  TableInfoPtr new_table_info = std::make_shared<TableInfo>(*result.get(),
697
79.9k
                                                            schema,
698
79.9k
                                                            index_map,
699
79.9k
                                                            deleted_cols,
700
79.9k
                                                            version);
701
79.9k
  if (target_table_id != primary_table_id_) {
702
348
    if (schema.table_properties().is_ysql_catalog_table()) {
703
      // TODO(alex): cotable_id should be copied from original schema, do we need this section?
704
      //             Might be related to #5017, #6107
705
0
      Uuid cotable_id;
706
0
      CHECK_OK(cotable_id.FromHexString(target_table_id));
707
0
      new_table_info->schema->set_cotable_id(cotable_id);
708
0
    }
709
    // Ensure colocation ID remains unchanged.
710
348
    const auto& old_schema = (*result)->schema;
711
348
    CHECK(old_schema->has_colocation_id() == schema.has_colocation_id())
712
0
            << "Attempted to change colocation state for table " << table_id
713
0
            << " from " << old_schema->has_colocation_id()
714
0
            << " to " << schema.has_colocation_id();
715
348
    CHECK(!old_schema->has_colocation_id() ||
716
0
          old_schema->colocation_id() == schema.colocation_id())
717
0
            << "Attempted to change colocation ID for table " << table_id
718
0
            << " from " << old_schema->colocation_id()
719
0
            << " to " << schema.colocation_id();
720
348
  }
721
79.9k
  
VLOG_WITH_PREFIX143
(1) << raft_group_id_ << " Updating table " << target_table_id
722
143
                      << " to Schema version " << version
723
143
                      << " from \n" << yb::ToString(kv_store_.tables[target_table_id])
724
143
                      << " to \n" << yb::ToString(new_table_info);
725
79.9k
  kv_store_.tables[target_table_id].swap(new_table_info);
726
79.9k
}
727
728
0
void RaftGroupMetadata::SetPartitionSchema(const PartitionSchema& partition_schema) {
729
0
  std::lock_guard<MutexType> lock(data_mutex_);
730
0
  auto& tables = kv_store_.tables;
731
0
  DCHECK(tables.find(primary_table_id_) != tables.end());
732
0
  tables[primary_table_id_]->partition_schema = partition_schema;
733
0
}
734
735
void RaftGroupMetadata::SetTableName(
736
66.7k
    const string& namespace_name, const string& table_name, const TableId& table_id) {
737
66.7k
  std::lock_guard<MutexType> lock(data_mutex_);
738
66.7k
  auto& tables = kv_store_.tables;
739
18.4E
  auto& id = 
table_id.empty()66.7k
?
primary_table_id_66.8k
: table_id;
740
66.7k
  DCHECK(tables.find(id) != tables.end());
741
66.7k
  tables[id]->namespace_name = namespace_name;
742
66.7k
  tables[id]->table_name = table_name;
743
66.7k
}
744
745
void RaftGroupMetadata::AddTable(const std::string& table_id,
746
                                 const std::string& namespace_name,
747
                                 const std::string& table_name,
748
                                 const TableType table_type,
749
                                 const Schema& schema,
750
                                 const IndexMap& index_map,
751
                                 const PartitionSchema& partition_schema,
752
                                 const boost::optional<IndexInfo>& index_info,
753
1.02M
                                 const uint32_t schema_version) {
754
1.02M
  DCHECK(schema.has_column_ids());
755
1.02M
  TableInfoPtr new_table_info = std::make_shared<TableInfo>(table_id,
756
1.02M
                                                            namespace_name,
757
1.02M
                                                            table_name,
758
1.02M
                                                            table_type,
759
1.02M
                                                            schema,
760
1.02M
                                                            index_map,
761
1.02M
                                                            index_info,
762
1.02M
                                                            schema_version,
763
1.02M
                                                            partition_schema);
764
1.02M
  if (table_id != primary_table_id_) {
765
1.02M
    if (schema.table_properties().is_ysql_catalog_table()) {
766
      // TODO(alex): cotable_id seems to be properly copied from schema, do we need this section?
767
      //             Might be related to #5017, #6107
768
1.02M
      Uuid cotable_id;
769
1.02M
      CHECK_OK(cotable_id.FromHexString(table_id));
770
1.02M
      new_table_info->schema->set_cotable_id(cotable_id);
771
1.02M
    }
772
1.02M
  }
773
1.02M
  std::lock_guard<MutexType> lock(data_mutex_);
774
1.02M
  auto& tables = kv_store_.tables;
775
1.02M
  auto existing_table_iter = tables.find(table_id);
776
1.02M
  if (existing_table_iter != tables.end()) {
777
0
    const auto& existing_table = *existing_table_iter->second.get();
778
0
    if (!existing_table.schema->table_properties().is_ysql_catalog_table() &&
779
0
        schema.table_properties().is_ysql_catalog_table()) {
780
      // This must be the one-time migration with transactional DDL being turned on for the first
781
      // time on this cluster.
782
0
    } else {
783
0
      LOG(DFATAL) << "Table " << table_id << " already exists. New table info: "
784
0
          << new_table_info->ToString() << ", old table info: " << existing_table.ToString();
785
786
      // We never expect colocation IDs to mismatch.
787
0
      const auto& existing_schema = existing_table.schema;
788
0
      CHECK(existing_schema->has_colocation_id() == schema.has_colocation_id())
789
0
              << "Attempted to change colocation state for table " << table_id
790
0
              << " from " << existing_schema->has_colocation_id()
791
0
              << " to " << schema.has_colocation_id();
792
793
0
      CHECK(!existing_schema->has_colocation_id() ||
794
0
            existing_schema->colocation_id() == schema.colocation_id())
795
0
              << "Attempted to change colocation ID for table " << table_id
796
0
              << " from " << existing_schema->colocation_id()
797
0
              << " to " << schema.colocation_id();
798
0
    }
799
0
  }
800
1.02M
  
VLOG_WITH_PREFIX0
(1) << "Updating to Schema version " << schema_version
801
0
                      << " from\n" << yb::ToString(tables[table_id])
802
0
                      << "\nto\n" << yb::ToString(new_table_info);
803
1.02M
  tables[table_id].swap(new_table_info);
804
1.02M
}
805
806
8.23k
void RaftGroupMetadata::RemoveTable(const TableId& table_id) {
807
8.23k
  std::lock_guard<MutexType> lock(data_mutex_);
808
8.23k
  auto& tables = kv_store_.tables;
809
8.23k
  tables.erase(table_id);
810
8.23k
}
811
812
534k
// Derives the data root directory from kv_store_.rocksdb_dir.
//
// Returns an empty string when no RocksDB directory is set. Otherwise strips the last two path
// components of the RocksDB directory, and strips one more if the remaining path still ends with
// FsManager::kRocksDBDirName.
string RaftGroupMetadata::data_root_dir() const {
  const auto& db_dir = kv_store_.rocksdb_dir;
  if (db_dir.empty()) {
    return "";
  }

  auto root = DirName(DirName(db_dir));
  const bool ends_with_rocksdb_dir =
      strcmp(BaseName(root).c_str(), FsManager::kRocksDBDirName) == 0;
  if (ends_with_rocksdb_dir) {
    root = DirName(root);
  }
  return root;
}
824
825
75.8k
// Derives the WAL root directory from wal_dir().
//
// Returns an empty string when no WAL directory is set. Otherwise takes the parent of the WAL
// directory, and goes up one more level when that parent is not named FsManager::kWalDirName.
string RaftGroupMetadata::wal_root_dir() const {
  const std::string tablet_wal_dir = wal_dir();
  if (tablet_wal_dir.empty()) {
    return "";
  }

  auto root = DirName(tablet_wal_dir);
  const bool parent_is_wal_dir =
      strcmp(BaseName(root).c_str(), FsManager::kWalDirName) == 0;
  if (!parent_is_wal_dir) {
    root = DirName(root);
  }
  return root;
}
838
839
5.22k
// Stores the WAL retention time (in seconds) on the primary table's TableInfo.
//
// Runs under data_mutex_. Logs at INFO on success; when the primary table is not registered,
// reports DFATAL and changes nothing.
void RaftGroupMetadata::set_wal_retention_secs(uint32 wal_retention_secs) {
  std::lock_guard<MutexType> guard(data_mutex_);
  const auto primary = kv_store_.tables.find(primary_table_id_);
  if (primary != kv_store_.tables.end()) {
    primary->second->wal_retention_secs = wal_retention_secs;
    LOG_WITH_PREFIX(INFO) << "Set RaftGroupMetadata wal retention time to "
                          << wal_retention_secs << " seconds";
    return;
  }
  LOG_WITH_PREFIX(DFATAL) << "Unable to set WAL retention time for primary table "
                          << primary_table_id_;
}
851
852
306k
uint32_t RaftGroupMetadata::wal_retention_secs() const {
853
306k
  std::lock_guard<MutexType> lock(data_mutex_);
854
306k
  auto it = kv_store_.tables.find(primary_table_id_);
855
306k
  if (it == kv_store_.tables.end()) {
856
0
    return 0;
857
0
  }
858
306k
  return it->second->wal_retention_secs;
859
306k
}
860
861
150k
// Stores the new CDC minimum replicated index and then flushes the metadata.
//
// The field is written under data_mutex_; the lock is released before Flush() is called.
// Returns the status of Flush().
Status RaftGroupMetadata::set_cdc_min_replicated_index(int64 cdc_min_replicated_index) {
  {
    std::lock_guard<MutexType> guard(data_mutex_);
    cdc_min_replicated_index_ = cdc_min_replicated_index;
  }  // data_mutex_ is released here, before flushing.
  return Flush();
}
868
869
451k
int64_t RaftGroupMetadata::cdc_min_replicated_index() const {
870
451k
  std::lock_guard<MutexType> lock(data_mutex_);
871
451k
  return cdc_min_replicated_index_;
872
451k
}
873
874
0
// Records whether this Raft group is under 2DC replication and then flushes the metadata.
//
// The flag is written under data_mutex_; the lock is released before Flush() is called.
// Returns the status of Flush().
Status RaftGroupMetadata::SetIsUnderTwodcReplicationAndFlush(bool is_under_twodc_replication) {
  {
    std::lock_guard<MutexType> guard(data_mutex_);
    is_under_twodc_replication_ = is_under_twodc_replication;
  }  // data_mutex_ is released here, before flushing.
  return Flush();
}
881
882
88.7k
bool RaftGroupMetadata::is_under_twodc_replication() const {
883
88.7k
  std::lock_guard<MutexType> lock(data_mutex_);
884
88.7k
  return is_under_twodc_replication_;
885
88.7k
}
886
887
66
void RaftGroupMetadata::SetHidden(bool value) {
888
66
  std::lock_guard<MutexType> lock(data_mutex_);
889
66
  hidden_ = value;
890
66
}
891
892
12.3M
bool RaftGroupMetadata::hidden() const {
893
12.3M
  std::lock_guard<MutexType> lock(data_mutex_);
894
12.3M
  return hidden_;
895
12.3M
}
896
897
29
// Advances the stored restoration hybrid time to value if value is greater than the current one.
// The stored time never moves backwards. Runs under data_mutex_ and does not flush.
void RaftGroupMetadata::SetRestorationHybridTime(HybridTime value) {
  std::lock_guard<MutexType> guard(data_mutex_);
  restoration_hybrid_time_ = std::max(restoration_hybrid_time_, value);
}
901
902
150k
// Returns the stored restoration hybrid time, read under data_mutex_.
HybridTime RaftGroupMetadata::restoration_hybrid_time() const {
  std::lock_guard<MutexType> guard(data_mutex_);
  return restoration_hybrid_time_;
}
906
907
135
// Overwrites the in-memory tablet data state under data_mutex_. This method does not flush.
void RaftGroupMetadata::set_tablet_data_state(TabletDataState state) {
  std::lock_guard<MutexType> guard(data_mutex_);
  tablet_data_state_ = state;
}
911
912
155k
// Builds the prefix used by the *_WITH_PREFIX logging macros from this Raft group's id and the
// uuid reported by the FsManager.
string RaftGroupMetadata::LogPrefix() const {
  const auto& fs_uuid = fs_manager_->uuid();
  return consensus::MakeTabletLogPrefix(raft_group_id_, fs_uuid);
}
915
916
75.9k
// Returns a copy of tombstone_last_logged_opid_, read under data_mutex_.
OpId RaftGroupMetadata::tombstone_last_logged_opid() const {
  std::lock_guard<MutexType> guard(data_mutex_);
  return tombstone_last_logged_opid_;
}
920
921
790k
bool RaftGroupMetadata::colocated() const {
922
790k
  std::lock_guard<MutexType> lock(data_mutex_);
923
790k
  return colocated_;
924
790k
}
925
926
48.8M
// Returns the current tablet data state, read under data_mutex_.
TabletDataState RaftGroupMetadata::tablet_data_state() const {
  std::lock_guard<MutexType> guard(data_mutex_);
  return tablet_data_state_;
}
930
931
33
std::array<TabletId, kNumSplitParts> RaftGroupMetadata::split_child_tablet_ids() const {
932
33
  std::lock_guard<MutexType> lock(data_mutex_);
933
33
  return split_child_tablet_ids_;
934
33
}
935
936
68
// Returns a copy of the recorded split operation id, read under data_mutex_.
OpId RaftGroupMetadata::split_op_id() const {
  std::lock_guard<MutexType> guard(data_mutex_);
  return split_op_id_;
}
940
941
131k
// Returns the split operation id when the tablet is in TABLET_DATA_SPLIT_COMPLETED state and is
// not hidden; otherwise returns OpId::Invalid(). Reads under data_mutex_.
OpId RaftGroupMetadata::GetOpIdToDeleteAfterAllApplied() const {
  std::lock_guard<MutexType> guard(data_mutex_);
  const bool split_completed_and_visible =
      tablet_data_state_ == TabletDataState::TABLET_DATA_SPLIT_COMPLETED && !hidden_;
  if (split_completed_and_visible) {
    return split_op_id_;
  }
  return OpId::Invalid();
}
948
949
void RaftGroupMetadata::SetSplitDone(
950
67
    const OpId& op_id, const TabletId& child1, const TabletId& child2) {
951
67
  std::lock_guard<MutexType> lock(data_mutex_);
952
67
  tablet_data_state_ = TabletDataState::TABLET_DATA_SPLIT_COMPLETED;
953
67
  split_op_id_ = op_id;
954
67
  split_child_tablet_ids_[0] = child1;
955
67
  split_child_tablet_ids_[1] = child2;
956
67
}
957
958
150k
bool RaftGroupMetadata::has_active_restoration() const {
959
150k
  std::lock_guard<MutexType> lock(data_mutex_);
960
150k
  return !active_restorations_.empty();
961
150k
}
962
963
30
void RaftGroupMetadata::RegisterRestoration(const TxnSnapshotRestorationId& restoration_id) {
964
30
  std::lock_guard<MutexType> lock(data_mutex_);
965
30
  if (tablet_data_state_ == TabletDataState::TABLET_DATA_SPLIT_COMPLETED) {
966
0
    tablet_data_state_ = TabletDataState::TABLET_DATA_READY;
967
0
    split_op_id_ = OpId();
968
0
    split_child_tablet_ids_[0] = std::string();
969
0
    split_child_tablet_ids_[1] = std::string();
970
0
  }
971
30
  active_restorations_.push_back(restoration_id);
972
30
}
973
974
29
void RaftGroupMetadata::UnregisterRestoration(const TxnSnapshotRestorationId& restoration_id) {
975
29
  std::lock_guard<MutexType> lock(data_mutex_);
976
29
  Erase(restoration_id, &active_restorations_);
977
29
}
978
979
// Returns the greatest completion time found in restoration_complete_time among the currently
// active restorations.
//
// Active restorations that are missing from the map, or whose mapped time converts to false, are
// ignored. Returns HybridTime::kMin when no active restoration contributes a time.
// Reads under data_mutex_.
HybridTime RaftGroupMetadata::CheckCompleteRestorations(
    const RestorationCompleteTimeMap& restoration_complete_time) {
  std::lock_guard<MutexType> guard(data_mutex_);
  auto latest_complete_time = HybridTime::kMin;
  for (const auto& active_id : active_restorations_) {
    const auto known = restoration_complete_time.find(active_id);
    if (known == restoration_complete_time.end() || !known->second) {
      continue;
    }
    latest_complete_time = std::max(latest_complete_time, known->second);
  }
  return latest_complete_time;
}
991
992
bool RaftGroupMetadata::CleanupRestorations(
993
2.71k
    const RestorationCompleteTimeMap& restoration_complete_time) {
994
2.71k
  bool result = false;
995
2.71k
  std::lock_guard<MutexType> lock(data_mutex_);
996
2.71k
  for (auto it = active_restorations_.begin(); it != active_restorations_.end();) {
997
3
    auto known_restoration_it = restoration_complete_time.find(*it);
998
3
    if (known_restoration_it == restoration_complete_time.end() || known_restoration_it->second) {
999
3
      it = active_restorations_.erase(it);
1000
3
      result = true;
1001
3
    } else {
1002
0
      ++it;
1003
0
    }
1004
3
  }
1005
2.71k
  return result;
1006
2.71k
}
1007
1008
277
std::string RaftGroupMetadata::GetSubRaftGroupWalDir(const RaftGroupId& raft_group_id) const {
1009
277
  return JoinPathSegments(DirName(wal_dir_), MakeTabletDirName(raft_group_id));
1010
277
}
1011
1012
277
std::string RaftGroupMetadata::GetSubRaftGroupDataDir(const RaftGroupId& raft_group_id) const {
1013
277
  return JoinPathSegments(DirName(kv_store_.rocksdb_dir), MakeTabletDirName(raft_group_id));
1014
277
}
1015
1016
// We directly init fields of a new metadata, so have to use NO_THREAD_SAFETY_ANALYSIS here.
1017
// Creates, flushes and returns the metadata of a new Raft group derived from this one.
//
// The new metadata starts as a copy of this metadata (serialized to a superblock and loaded
// back). The following fields are then overridden: Raft group id, WAL and RocksDB directories,
// kv-store id and key bounds, the "fully compacted" flag (reset to false), the partition, and
// the state (kInitialized / TABLET_DATA_INIT_STARTED).
//
// raft_group_id: id of the new Raft group.
// partition: partition assigned to the new Raft group.
// lower_bound_key / upper_bound_key: key bounds stored in the new kv-store metadata.
//
// Returns the new metadata, or the error from loading the superblock or from Flush().
Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateSubtabletMetadata(
    const RaftGroupId& raft_group_id, const Partition& partition,
    const std::string& lower_bound_key, const std::string& upper_bound_key)
    const NO_THREAD_SAFETY_ANALYSIS {
  // Snapshot this metadata and load it into a fresh object.
  RaftGroupReplicaSuperBlockPB parent_superblock;
  ToSuperBlock(&parent_superblock);

  RaftGroupMetadataPtr subtablet(new RaftGroupMetadata(fs_manager_, raft_group_id_));
  RETURN_NOT_OK(subtablet->LoadFromSuperBlock(parent_superblock, /* local_superblock = */ true));

  // Identity and WAL location of the new Raft group.
  subtablet->raft_group_id_ = raft_group_id;
  subtablet->wal_dir_ = GetSubRaftGroupWalDir(raft_group_id);

  // Kv-store settings of the new Raft group.
  auto& sub_kv_store = subtablet->kv_store_;
  sub_kv_store.kv_store_id = KvStoreId(raft_group_id);
  sub_kv_store.lower_bound_key = lower_bound_key;
  sub_kv_store.upper_bound_key = upper_bound_key;
  sub_kv_store.rocksdb_dir = GetSubRaftGroupDataDir(raft_group_id);
  sub_kv_store.has_been_fully_compacted = false;

  // Partition and lifecycle state.
  *subtablet->partition_ = partition;
  subtablet->state_ = kInitialized;
  subtablet->tablet_data_state_ = TABLET_DATA_INIT_STARTED;

  RETURN_NOT_OK(subtablet->Flush());
  return subtablet;
}
1039
1040
177
// Returns the snapshots directory path after making sure the directory exists.
//
// The directory is created (and synced) through the FsManager when missing. On failure the
// returned error is prefixed with a message naming the directory.
Result<std::string> RaftGroupMetadata::TopSnapshotsDir() const {
  auto top_dir = snapshots_dir();
  RETURN_NOT_OK_PREPEND(
      fs_manager()->CreateDirIfMissingAndSync(top_dir),
      Format("Unable to create snapshots directory $0", top_dir));
  return top_dir;
}
1047
1048
namespace {
1049
// MigrateSuperblockForDXXXX functions are only needed for backward compatibility with
1050
// YugabyteDB versions which don't have changes from DXXXX revision.
1051
// Each MigrateSuperblockForDXXXX could be removed after all YugabyteDB installations are
1052
// upgraded to have revision DXXXX.
1053
1054
2.02k
CHECKED_STATUS MigrateSuperblockForD5900(RaftGroupReplicaSuperBlockPB* superblock) {
1055
  // In previous version of superblock format we stored primary table metadata in superblock's
1056
  // top-level fields (deprecated table_* and other). TableInfo objects were stored inside
1057
  // RaftGroupReplicaSuperBlockPB.tables.
1058
  //
1059
  // In new format TableInfo objects and some other top-level fields are moved from superblock's
1060
  // top-level fields into RaftGroupReplicaSuperBlockPB.kv_store. Primary table (see
1061
  // RaftGroupMetadata::primary_table_id_ field description) metadata is stored inside one of
1062
  // RaftGroupReplicaSuperBlockPB.kv_store.tables objects and is referenced by
1063
  // RaftGroupReplicaSuperBlockPB.primary_table_id.
1064
2.02k
  if (superblock->has_kv_store()) {
1065
2.02k
    return Status::OK();
1066
2.02k
  }
1067
1068
0
  LOG(INFO) << "Migrating superblock for raft group " << superblock->raft_group_id();
1069
1070
0
  KvStoreInfoPB* kv_store_pb = superblock->mutable_kv_store();
1071
0
  kv_store_pb->set_kv_store_id(superblock->raft_group_id());
1072
0
  kv_store_pb->set_rocksdb_dir(superblock->obsolete_rocksdb_dir());
1073
0
  kv_store_pb->mutable_rocksdb_files()->CopyFrom(superblock->obsolete_rocksdb_files());
1074
0
  kv_store_pb->mutable_snapshot_files()->CopyFrom(superblock->obsolete_snapshot_files());
1075
1076
0
  TableInfoPB* primary_table = kv_store_pb->add_tables();
1077
0
  primary_table->set_table_id(superblock->primary_table_id());
1078
0
  primary_table->set_table_name(superblock->obsolete_table_name());
1079
0
  primary_table->set_table_type(superblock->obsolete_table_type());
1080
0
  primary_table->mutable_schema()->CopyFrom(superblock->obsolete_schema());
1081
0
  primary_table->set_schema_version(superblock->obsolete_schema_version());
1082
0
  primary_table->mutable_partition_schema()->CopyFrom(superblock->obsolete_partition_schema());
1083
0
  primary_table->mutable_indexes()->CopyFrom(superblock->obsolete_indexes());
1084
0
  primary_table->mutable_index_info()->CopyFrom(superblock->obsolete_index_info());
1085
0
  primary_table->mutable_deleted_cols()->CopyFrom(superblock->obsolete_deleted_cols());
1086
1087
0
  kv_store_pb->mutable_tables()->MergeFrom(superblock->obsolete_tables());
1088
1089
0
  return Status::OK();
1090
2.02k
}
1091
1092
} // namespace
1093
1094
2.02k
// Brings a superblock loaded from disk up to the current format, in place.
// Currently the only migration applied is MigrateSuperblockForD5900; its status is returned.
Status MigrateSuperblock(RaftGroupReplicaSuperBlockPB* superblock) {
  const Status migration_status = MigrateSuperblockForD5900(superblock);
  return migration_status;
}
1097
1098
std::shared_ptr<std::vector<DeletedColumn>> RaftGroupMetadata::deleted_cols(
1099
494
    const TableId& table_id) const {
1100
494
  DCHECK_NE(state_, kNotLoadedYet);
1101
494
  const TableInfoPtr table_info =
1102
494
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
1103
494
  return std::shared_ptr<std::vector<DeletedColumn>>(table_info, &table_info->deleted_cols);
1104
494
}
1105
1106
158k
std::string RaftGroupMetadata::namespace_name(const TableId& table_id) const {
1107
158k
  DCHECK_NE(state_, kNotLoadedYet);
1108
158k
  if (table_id.empty()) {
1109
158k
    return primary_table_info()->namespace_name;
1110
158k
  }
1111
23
  const auto& table_info = CHECK_RESULT(GetTableInfo(table_id));
1112
23
  return table_info->namespace_name;
1113
158k
}
1114
1115
3.46M
std::string RaftGroupMetadata::table_name(const TableId& table_id) const {
1116
3.46M
  DCHECK_NE(state_, kNotLoadedYet);
1117
3.46M
  if (table_id.empty()) {
1118
3.46M
    return primary_table_info()->table_name;
1119
3.46M
  }
1120
503
  const auto& table_info = CHECK_RESULT(GetTableInfo(table_id));
1121
503
  return table_info->table_name;
1122
3.46M
}
1123
1124
6.02M
// Returns the type of a table; an empty table_id selects the primary table.
// CHECK-fails if a non-empty table_id cannot be resolved.
TableType RaftGroupMetadata::table_type(const TableId& table_id) const {
  DCHECK_NE(state_, kNotLoadedYet);
  const TableInfoPtr info =
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
  return info->table_type;
}
1132
1133
40.3M
// Returns the schema of a table; an empty table_id selects the primary table.
//
// The returned pointer shares ownership with the table's TableInfo (aliasing constructor), so the
// schema stays valid for as long as the caller holds the pointer.
// CHECK-fails if a non-empty table_id cannot be resolved.
yb::SchemaPtr RaftGroupMetadata::schema(const TableId& table_id) const {
  DCHECK_NE(state_, kNotLoadedYet);
  TableInfoPtr info;
  if (table_id.empty()) {
    info = primary_table_info();
  } else {
    info = CHECK_RESULT(GetTableInfo(table_id));
  }
  return yb::SchemaPtr(info, info->schema.get());
}
1139
1140
4.43k
std::shared_ptr<IndexMap> RaftGroupMetadata::index_map(const TableId& table_id) const {
1141
4.43k
  DCHECK_NE(state_, kNotLoadedYet);
1142
4.43k
  const TableInfoPtr table_info =
1143
4.43k
      table_id.empty() ? 
primary_table_info()30
: CHECK_RESULT(GetTableInfo(table_id));
1144
4.43k
  return std::shared_ptr<IndexMap>(table_info, table_info->index_map.get());
1145
4.43k
}
1146
1147
8.23M
uint32_t RaftGroupMetadata::schema_version(const TableId& table_id) const {
1148
8.23M
  DCHECK_NE(state_, kNotLoadedYet);
1149
8.23M
  const TableInfoPtr table_info =
1150
8.23M
      table_id.empty() ? 
primary_table_info()8.23M
: CHECK_RESULT(GetTableInfo(table_id));
1151
8.23M
  return table_info->schema_version;
1152
8.23M
}
1153
1154
0
const std::string& RaftGroupMetadata::indexed_table_id(const TableId& table_id) const {
1155
0
  DCHECK_NE(state_, kNotLoadedYet);
1156
0
  static const std::string kEmptyString = "";
1157
0
  std::lock_guard<MutexType> lock(data_mutex_);
1158
0
  const TableInfoPtr table_info = table_id.empty() ?
1159
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1160
0
  const auto* index_info = table_info->index_info.get();
1161
0
  return index_info ? index_info->indexed_table_id() : kEmptyString;
1162
0
}
1163
1164
0
bool RaftGroupMetadata::is_local_index(const TableId& table_id) const {
1165
0
  DCHECK_NE(state_, kNotLoadedYet);
1166
0
  std::lock_guard<MutexType> lock(data_mutex_);
1167
0
  const TableInfoPtr table_info = table_id.empty() ?
1168
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1169
0
  const auto* index_info = table_info->index_info.get();
1170
0
  return index_info && index_info->is_local();
1171
0
}
1172
1173
2.08M
bool RaftGroupMetadata::is_unique_index(const TableId& table_id) const {
1174
2.08M
  DCHECK_NE(state_, kNotLoadedYet);
1175
2.08M
  std::lock_guard<MutexType> lock(data_mutex_);
1176
2.08M
  const TableInfoPtr table_info = table_id.empty() ?
1177
2.08M
      primary_table_info_unlocked() : 
CHECK_RESULT2.08M
(GetTableInfoUnlocked(table_id));
1178
2.08M
  const auto* index_info = table_info->index_info.get();
1179
2.08M
  return index_info && 
index_info->is_unique()35.1k
;
1180
2.08M
}
1181
1182
0
std::vector<ColumnId> RaftGroupMetadata::index_key_column_ids(const TableId& table_id) const {
1183
0
  DCHECK_NE(state_, kNotLoadedYet);
1184
0
  std::lock_guard<MutexType> lock(data_mutex_);
1185
0
  const TableInfoPtr table_info = table_id.empty() ?
1186
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1187
0
  const auto* index_info = table_info->index_info.get();
1188
0
  return index_info ? index_info->index_key_column_ids() : std::vector<ColumnId>();
1189
0
}
1190
1191
5.55M
bool RaftGroupMetadata::UsePartialRangeKeyIntents() const {
1192
5.55M
  return table_type() == TableType::PGSQL_TABLE_TYPE;
1193
5.55M
}
1194
1195
} // namespace tablet
1196
} // namespace yb