YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/tablet/tablet_metadata.cc
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
//
18
// The following only applies to changes made to this file as part of YugaByte development.
19
//
20
// Portions Copyright (c) YugaByte, Inc.
21
//
22
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
23
// in compliance with the License.  You may obtain a copy of the License at
24
//
25
// http://www.apache.org/licenses/LICENSE-2.0
26
//
27
// Unless required by applicable law or agreed to in writing, software distributed under the License
28
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
29
// or implied.  See the License for the specific language governing permissions and limitations
30
// under the License.
31
//
32
33
#include "yb/tablet/tablet_metadata.h"
34
35
#include <algorithm>
36
#include <mutex>
37
#include <string>
38
39
#include <boost/optional.hpp>
40
#include <gflags/gflags.h>
41
42
#include "yb/common/entity_ids.h"
43
#include "yb/common/index.h"
44
#include "yb/common/schema.h"
45
#include "yb/common/transaction.h"
46
#include "yb/common/wire_protocol.h"
47
48
#include "yb/consensus/consensus_util.h"
49
#include "yb/consensus/opid_util.h"
50
51
#include "yb/docdb/docdb_rocksdb_util.h"
52
53
#include "yb/gutil/atomicops.h"
54
#include "yb/gutil/dynamic_annotations.h"
55
#include "yb/gutil/map-util.h"
56
#include "yb/gutil/stl_util.h"
57
#include "yb/gutil/strings/substitute.h"
58
59
#include "yb/rocksdb/db.h"
60
#include "yb/rocksdb/options.h"
61
62
#include "yb/tablet/metadata.pb.h"
63
#include "yb/tablet/tablet_options.h"
64
65
#include "yb/util/debug/trace_event.h"
66
#include "yb/util/flag_tags.h"
67
#include "yb/util/logging.h"
68
#include "yb/util/pb_util.h"
69
#include "yb/util/random.h"
70
#include "yb/util/result.h"
71
#include "yb/util/status.h"
72
#include "yb/util/status_log.h"
73
#include "yb/util/trace.h"
74
75
DEFINE_bool(enable_tablet_orphaned_block_deletion, true,
76
            "Whether to enable deletion of orphaned blocks from disk. "
77
            "Note: This is only exposed for debugging purposes!");
78
TAG_FLAG(enable_tablet_orphaned_block_deletion, advanced);
79
TAG_FLAG(enable_tablet_orphaned_block_deletion, hidden);
80
TAG_FLAG(enable_tablet_orphaned_block_deletion, runtime);
81
82
using std::shared_ptr;
83
84
using base::subtle::Barrier_AtomicIncrement;
85
using strings::Substitute;
86
87
using yb::consensus::MinimumOpId;
88
89
namespace yb {
90
namespace tablet {
91
92
const int64 kNoDurableMemStore = -1;
93
const std::string kIntentsSubdir = "intents";
94
const std::string kIntentsDBSuffix = ".intents";
95
const std::string kSnapshotsDirSuffix = ".snapshots";
96
97
// ============================================================================
98
//  Raft group metadata
99
// ============================================================================
100
101
TableInfo::TableInfo()
102
1.46k
    : schema(std::make_unique<Schema>()), index_map(std::make_unique<IndexMap>()) {
103
1.46k
}
104
105
TableInfo::TableInfo(std::string table_id,
106
                     std::string namespace_name,
107
                     std::string table_name,
108
                     TableType table_type,
109
                     const Schema& schema,
110
                     const IndexMap& index_map,
111
                     const boost::optional<IndexInfo>& index_info,
112
                     const uint32_t schema_version,
113
                     PartitionSchema partition_schema)
114
    : table_id(std::move(table_id)),
115
      namespace_name(std::move(namespace_name)),
116
      table_name(std::move(table_name)),
117
      table_type(table_type),
118
      schema(std::make_unique<Schema>(schema)),
119
      index_map(std::make_unique<IndexMap>(index_map)),
120
      index_info(index_info ? new IndexInfo(*index_info) : nullptr),
121
      schema_version(schema_version),
122
578k
      partition_schema(std::move(partition_schema)) {
123
578k
}
124
125
TableInfo::TableInfo(const TableInfo& other,
126
                     const Schema& schema,
127
                     const IndexMap& index_map,
128
                     const std::vector<DeletedColumn>& deleted_cols,
129
                     const uint32_t schema_version)
130
    : table_id(other.table_id),
131
      namespace_name(other.namespace_name),
132
      table_name(other.table_name),
133
      table_type(other.table_type),
134
      schema(std::make_unique<Schema>(schema)),
135
      index_map(std::make_unique<IndexMap>(index_map)),
136
      index_info(other.index_info ? new IndexInfo(*other.index_info) : nullptr),
137
      schema_version(schema_version),
138
      partition_schema(other.partition_schema),
139
54.9k
      deleted_cols(other.deleted_cols) {
140
54.9k
  this->deleted_cols.insert(this->deleted_cols.end(), deleted_cols.begin(), deleted_cols.end());
141
54.9k
}
142
143
1.46k
Status TableInfo::LoadFromPB(const TableInfoPB& pb) {
144
1.46k
  table_id = pb.table_id();
145
1.46k
  namespace_name = pb.namespace_name();
146
1.46k
  table_name = pb.table_name();
147
1.46k
  table_type = pb.table_type();
148
149
1.46k
  RETURN_NOT_OK(SchemaFromPB(pb.schema(), schema.get()));
150
1.46k
  if (pb.has_index_info()) {
151
0
    index_info.reset(new IndexInfo(pb.index_info()));
152
0
  }
153
1.46k
  index_map->FromPB(pb.indexes());
154
1.46k
  schema_version = pb.schema_version();
155
156
1.46k
  RETURN_NOT_OK(PartitionSchema::FromPB(pb.partition_schema(), *schema, &partition_schema));
157
158
1.46k
  for (const DeletedColumnPB& deleted_col : pb.deleted_cols()) {
159
657
    DeletedColumn col;
160
657
    RETURN_NOT_OK(DeletedColumn::FromPB(deleted_col, &col));
161
657
    deleted_cols.push_back(col);
162
657
  }
163
164
1.46k
  return Status::OK();
165
1.46k
}
166
167
140M
void TableInfo::ToPB(TableInfoPB* pb) const {
168
140M
  pb->set_table_id(table_id);
169
140M
  pb->set_namespace_name(namespace_name);
170
140M
  pb->set_table_name(table_name);
171
140M
  pb->set_table_type(table_type);
172
173
140M
  DCHECK(schema->has_column_ids());
174
140M
  SchemaToPB(*schema, pb->mutable_schema());
175
140M
  if (index_info) {
176
44.3k
    index_info->ToPB(pb->mutable_index_info());
177
44.3k
  }
178
140M
  index_map->ToPB(pb->mutable_indexes());
179
140M
  pb->set_schema_version(schema_version);
180
181
140M
  partition_schema.ToPB(pb->mutable_partition_schema());
182
183
12.5k
  for (const DeletedColumn& deleted_col : deleted_cols) {
184
12.5k
    deleted_col.CopyToPB(pb->mutable_deleted_cols()->Add());
185
12.5k
  }
186
140M
}
187
188
Status KvStoreInfo::LoadTablesFromPB(
189
1.45k
    const google::protobuf::RepeatedPtrField<TableInfoPB>& pbs, const TableId& primary_table_id) {
190
1.45k
  tables.clear();
191
1.46k
  for (const auto& table_pb : pbs) {
192
1.46k
    auto table_info = std::make_shared<TableInfo>();
193
1.46k
    RETURN_NOT_OK(table_info->LoadFromPB(table_pb));
194
1.46k
    if (table_info->table_id != primary_table_id) {
195
6
      if (table_pb.schema().table_properties().is_ysql_catalog_table()) {
196
0
        Uuid cotable_id;
197
0
        CHECK_OK(cotable_id.FromHexString(table_info->table_id));
198
        // TODO(#79): when adding for multiple KV-stores per Raft group support - check if we need
199
        // to set cotable ID.
200
0
        table_info->schema->set_cotable_id(cotable_id);
201
6
      } else {
202
6
        auto pgtable_id = VERIFY_RESULT(GetPgsqlTableOid(table_info->table_id));
203
6
        table_info->schema->set_pgtable_id(pgtable_id);
204
6
      }
205
6
    }
206
1.46k
    tables[table_info->table_id] = std::move(table_info);
207
1.46k
  }
208
1.45k
  return Status::OK();
209
1.45k
}
210
211
Status KvStoreInfo::LoadFromPB(const KvStoreInfoPB& pb,
212
                               const TableId& primary_table_id,
213
1.45k
                               bool local_superblock) {
214
1.45k
  kv_store_id = KvStoreId(pb.kv_store_id());
215
1.45k
  if (local_superblock) {
216
359
    rocksdb_dir = pb.rocksdb_dir();
217
359
  }
218
1.45k
  lower_bound_key = pb.lower_bound_key();
219
1.45k
  upper_bound_key = pb.upper_bound_key();
220
1.45k
  has_been_fully_compacted = pb.has_been_fully_compacted();
221
222
0
  for (const auto& schedule_id : pb.snapshot_schedules()) {
223
0
    snapshot_schedules.insert(VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule_id)));
224
0
  }
225
226
1.45k
  return LoadTablesFromPB(pb.tables(), primary_table_id);
227
1.45k
}
228
229
819k
void KvStoreInfo::ToPB(const TableId& primary_table_id, KvStoreInfoPB* pb) const {
230
819k
  pb->set_kv_store_id(kv_store_id.ToString());
231
819k
  pb->set_rocksdb_dir(rocksdb_dir);
232
819k
  if (lower_bound_key.empty()) {
233
818k
    pb->clear_lower_bound_key();
234
575
  } else {
235
575
    pb->set_lower_bound_key(lower_bound_key);
236
575
  }
237
819k
  if (upper_bound_key.empty()) {
238
818k
    pb->clear_upper_bound_key();
239
417
  } else {
240
417
    pb->set_upper_bound_key(upper_bound_key);
241
417
  }
242
819k
  pb->set_has_been_fully_compacted(has_been_fully_compacted);
243
244
  // Putting primary table first, then all other tables.
245
819k
  const auto& it = tables.find(primary_table_id);
246
819k
  if (it != tables.end()) {
247
819k
    it->second->ToPB(pb->add_tables());
248
819k
  }
249
140M
  for (const auto& it : tables) {
250
140M
    if (it.first != primary_table_id) {
251
139M
      it.second->ToPB(pb->add_tables());
252
139M
    }
253
140M
  }
254
255
0
  for (const auto& schedule_id : snapshot_schedules) {
256
0
    pb->add_snapshot_schedules(schedule_id.data(), schedule_id.size());
257
0
  }
258
819k
}
259
260
namespace {
261
262
89.2k
std::string MakeTabletDirName(const TabletId& tablet_id) {
263
89.2k
  return Format("tablet-$0", tablet_id);
264
89.2k
}
265
266
} // namespace
267
268
// ============================================================================
269
270
Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateNew(
271
    const RaftGroupMetadataData& data, const std::string& data_root_dir,
272
89.3k
    const std::string& wal_root_dir) {
273
89.3k
  auto* fs_manager = data.fs_manager;
274
  // Verify that no existing Raft group exists with the same ID.
275
89.3k
  if (fs_manager->env()->FileExists(fs_manager->GetRaftGroupMetadataPath(data.raft_group_id))) {
276
432
    return STATUS(AlreadyPresent, "Raft group already exists", data.raft_group_id);
277
432
  }
278
279
88.9k
  auto wal_top_dir = wal_root_dir;
280
88.9k
  auto data_top_dir = data_root_dir;
281
  // Use the original randomized logic if the indices are not explicitly passed in
282
88.9k
  if (data_root_dir.empty()) {
283
5.88k
    auto data_root_dirs = fs_manager->GetDataRootDirs();
284
0
    CHECK(!data_root_dirs.empty()) << "No data root directories found";
285
5.88k
    data_top_dir = RandomElement(data_root_dirs);
286
5.88k
  }
287
288
88.9k
  if (wal_root_dir.empty()) {
289
5.88k
    auto wal_root_dirs = fs_manager->GetWalRootDirs();
290
0
    CHECK(!wal_root_dirs.empty()) << "No wal root directories found";
291
5.88k
    wal_top_dir = RandomElement(wal_root_dirs);
292
5.88k
  }
293
294
88.9k
  const string table_dir_name = Substitute("table-$0", data.table_info->table_id);
295
88.9k
  const string tablet_dir_name = MakeTabletDirName(data.raft_group_id);
296
88.9k
  const string wal_dir = JoinPathSegments(wal_top_dir, table_dir_name, tablet_dir_name);
297
88.9k
  const string rocksdb_dir = JoinPathSegments(
298
88.9k
      data_top_dir, FsManager::kRocksDBDirName, table_dir_name, tablet_dir_name);
299
300
88.9k
  RaftGroupMetadataPtr ret(new RaftGroupMetadata(data, rocksdb_dir, wal_dir));
301
88.9k
  RETURN_NOT_OK(ret->Flush());
302
88.9k
  return ret;
303
88.9k
}
304
305
Result<RaftGroupMetadataPtr> RaftGroupMetadata::Load(
306
6.22k
    FsManager* fs_manager, const RaftGroupId& raft_group_id) {
307
6.22k
  RaftGroupMetadataPtr ret(new RaftGroupMetadata(fs_manager, raft_group_id));
308
6.22k
  RETURN_NOT_OK(ret->LoadFromDisk());
309
266
  return ret;
310
6.22k
}
311
312
484
Result<RaftGroupMetadataPtr> RaftGroupMetadata::LoadOrCreate(const RaftGroupMetadataData& data) {
313
484
  auto metadata = Load(data.fs_manager, data.raft_group_id);
314
484
  if (metadata.ok()) {
315
7
    if (!(**metadata).schema()->Equals(*data.table_info->schema)) {
316
0
      return STATUS(Corruption, Substitute("Schema on disk ($0) does not "
317
0
        "match expected schema ($1)", (*metadata)->schema()->ToString(),
318
0
        data.table_info->schema->ToString()));
319
0
    }
320
7
    return *metadata;
321
7
  }
322
323
477
  if (metadata.status().IsNotFound()) {
324
477
    return CreateNew(data);
325
477
  }
326
327
0
  return metadata.status();
328
0
}
329
330
template <class TablesMap>
331
CHECKED_STATUS MakeTableNotFound(const TableId& table_id, const RaftGroupId& raft_group_id,
332
0
                                 const TablesMap& tables) {
333
0
  std::string table_name = "<unknown_table_name>";
334
0
  if (!table_id.empty()) {
335
0
    const auto iter = tables.find(table_id);
336
0
    if (iter != tables.end()) {
337
0
      table_name = iter->second->table_name;
338
0
    }
339
0
  }
340
0
  std::ostringstream string_stream;
341
0
  string_stream << "Table " << table_name << " (" << table_id << ") not found in Raft group "
342
0
      << raft_group_id;
343
0
  std::string msg = string_stream.str();
344
0
#ifndef NDEBUG
345
  // This very large message should be logged instead of being appended to STATUS.
346
0
  std::string suffix = Format(". Tables: $0.", tables);
347
0
  VLOG(1) << msg << suffix;
348
0
#endif
349
0
  return STATUS(NotFound, msg);
350
0
}
351
352
7.96M
Result<TableInfoPtr> RaftGroupMetadata::GetTableInfo(const std::string& table_id) const {
353
7.96M
  std::lock_guard<MutexType> lock(data_mutex_);
354
7.96M
  return GetTableInfoUnlocked(table_id);
355
7.96M
}
356
357
8.02M
Result<TableInfoPtr> RaftGroupMetadata::GetTableInfoUnlocked(const std::string& table_id) const {
358
8.02M
  const auto& tables = kv_store_.tables;
359
7.98M
  const auto id = !table_id.empty() ? table_id : primary_table_id_;
360
8.02M
  const auto iter = tables.find(id);
361
8.02M
  if (iter == tables.end()) {
362
0
    return MakeTableNotFound(table_id, raft_group_id_, tables);
363
0
  }
364
8.02M
  return iter->second;
365
8.02M
}
366
367
Status RaftGroupMetadata::DeleteTabletData(TabletDataState delete_type,
368
47.6k
                                           const OpId& last_logged_opid) {
369
0
  CHECK(delete_type == TABLET_DATA_DELETED ||
370
0
        delete_type == TABLET_DATA_TOMBSTONED)
371
0
      << "DeleteTabletData() called with unsupported delete_type on tablet "
372
0
      << raft_group_id_ << ": " << TabletDataState_Name(delete_type)
373
0
      << " (" << delete_type << ")";
374
375
  // First add all of our blocks to the orphan list
376
  // and clear our rowsets. This serves to erase all the data.
377
  //
378
  // We also set the state in our persisted metadata to indicate that
379
  // we have been deleted.
380
47.6k
  {
381
47.6k
    std::lock_guard<MutexType> lock(data_mutex_);
382
47.6k
    tablet_data_state_ = delete_type;
383
47.6k
    if (!last_logged_opid.empty()) {
384
47.4k
      tombstone_last_logged_opid_ = last_logged_opid;
385
47.4k
    }
386
47.6k
  }
387
388
47.6k
  rocksdb::Options rocksdb_options;
389
47.6k
  TabletOptions tablet_options;
390
47.6k
  std::string log_prefix = consensus::MakeTabletLogPrefix(raft_group_id_, fs_manager_->uuid());
391
47.6k
  docdb::InitRocksDBOptions(
392
47.6k
      &rocksdb_options, log_prefix, nullptr /* statistics */, tablet_options);
393
394
47.6k
  const auto& rocksdb_dir = this->rocksdb_dir();
395
47.6k
  LOG(INFO) << "Destroying regular db at: " << rocksdb_dir;
396
47.6k
  rocksdb::Status status = rocksdb::DestroyDB(rocksdb_dir, rocksdb_options);
397
398
47.6k
  if (!status.ok()) {
399
159
    LOG(ERROR) << "Failed to destroy regular DB at: " << rocksdb_dir << ": " << status;
400
47.4k
  } else {
401
47.4k
    LOG(INFO) << "Successfully destroyed regular DB at: " << rocksdb_dir;
402
47.4k
  }
403
404
47.6k
  if (fs_manager_->env()->FileExists(rocksdb_dir)) {
405
441
    auto s = fs_manager_->env()->DeleteRecursively(rocksdb_dir);
406
0
    LOG_IF(WARNING, !s.ok()) << "Unable to delete rocksdb data directory " << rocksdb_dir;
407
441
  }
408
409
47.6k
  const auto intents_dir = this->intents_rocksdb_dir();
410
47.6k
  if (fs_manager_->env()->FileExists(intents_dir)) {
411
47.4k
    status = rocksdb::DestroyDB(intents_dir, rocksdb_options);
412
413
47.4k
    if (!status.ok()) {
414
0
      LOG(ERROR) << "Failed to destroy provisional records DB at: " << intents_dir << ": "
415
0
                 << status;
416
47.4k
    } else {
417
47.4k
      LOG(INFO) << "Successfully destroyed provisional records DB at: " << intents_dir;
418
47.4k
    }
419
47.4k
  }
420
421
47.6k
  if (fs_manager_->env()->FileExists(intents_dir)) {
422
0
    auto s = fs_manager_->env()->DeleteRecursively(intents_dir);
423
0
    LOG_IF(WARNING, !s.ok()) << "Unable to delete intents directory " << intents_dir;
424
0
  }
425
426
  // TODO(tsplit): decide what to do with snapshots for split tablets that we delete after split.
427
  // As for now, snapshots will be deleted as well.
428
47.6k
  const auto snapshots_dir = this->snapshots_dir();
429
47.6k
  if (fs_manager_->env()->FileExists(snapshots_dir)) {
430
47.4k
    auto s = fs_manager_->env()->DeleteRecursively(snapshots_dir);
431
18.4E
    LOG_IF(WARNING, !s.ok()) << "Unable to delete snapshots directory " << snapshots_dir;
432
47.4k
  }
433
434
  // Flushing will sync the new tablet_data_state_ to disk and will now also
435
  // delete all the data.
436
47.6k
  RETURN_NOT_OK(Flush());
437
438
  // Re-sync to disk one more time.
439
  // This call will typically re-sync with an empty orphaned blocks list
440
  // (unless deleting any orphans failed during the last Flush()), so that we
441
  // don't try to re-delete the deleted orphaned blocks on every startup.
442
47.6k
  return Flush();
443
47.6k
}
444
445
9
bool RaftGroupMetadata::IsTombstonedWithNoRocksDBData() const {
446
9
  std::lock_guard<MutexType> lock(data_mutex_);
447
9
  const auto& rocksdb_dir = kv_store_.rocksdb_dir;
448
9
  const auto intents_dir = rocksdb_dir + kIntentsDBSuffix;
449
9
  return tablet_data_state_ == TABLET_DATA_TOMBSTONED &&
450
2
      !fs_manager_->env()->FileExists(rocksdb_dir) &&
451
2
      !fs_manager_->env()->FileExists(intents_dir);
452
9
}
453
454
33
Status RaftGroupMetadata::DeleteSuperBlock() {
455
33
  std::lock_guard<MutexType> lock(data_mutex_);
456
33
  if (tablet_data_state_ != TABLET_DATA_DELETED) {
457
0
    return STATUS(IllegalState,
458
0
        Substitute("Tablet $0 is not in TABLET_DATA_DELETED state. "
459
0
                   "Call DeleteTabletData(TABLET_DATA_DELETED) first. "
460
0
                   "Tablet data state: $1 ($2)",
461
0
                   raft_group_id_,
462
0
                   TabletDataState_Name(tablet_data_state_),
463
0
                   tablet_data_state_));
464
0
  }
465
466
33
  string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
467
33
  RETURN_NOT_OK_PREPEND(fs_manager_->env()->DeleteFile(path),
468
33
                        "Unable to delete superblock for Raft group " + raft_group_id_);
469
33
  return Status::OK();
470
33
}
471
472
RaftGroupMetadata::RaftGroupMetadata(
473
    const RaftGroupMetadataData& data, const std::string& data_dir, const std::string& wal_dir)
474
    : state_(kNotWrittenYet),
475
      raft_group_id_(data.raft_group_id),
476
      partition_(std::make_shared<Partition>(data.partition)),
477
      primary_table_id_(data.table_info->table_id),
478
      kv_store_(KvStoreId(raft_group_id_), data_dir, data.snapshot_schedules),
479
      fs_manager_(data.fs_manager),
480
      wal_dir_(wal_dir),
481
      tablet_data_state_(data.tablet_data_state),
482
      colocated_(data.colocated),
483
88.9k
      cdc_min_replicated_index_(std::numeric_limits<int64_t>::max()) {
484
88.9k
  CHECK(data.table_info->schema->has_column_ids());
485
88.9k
  CHECK_GT(data.table_info->schema->num_key_columns(), 0);
486
88.9k
  kv_store_.tables.emplace(primary_table_id_, data.table_info);
487
88.9k
}
488
489
53.4k
RaftGroupMetadata::~RaftGroupMetadata() {
490
53.4k
}
491
492
RaftGroupMetadata::RaftGroupMetadata(FsManager* fs_manager, RaftGroupId raft_group_id)
493
    : state_(kNotLoadedYet),
494
      raft_group_id_(std::move(raft_group_id)),
495
      kv_store_(KvStoreId(raft_group_id_)),
496
6.32k
      fs_manager_(fs_manager) {
497
6.32k
}
498
499
6.22k
Status RaftGroupMetadata::LoadFromDisk() {
500
6.22k
  TRACE_EVENT1("raft_group", "RaftGroupMetadata::LoadFromDisk",
501
6.22k
               "raft_group_id", raft_group_id_);
502
503
6.22k
  CHECK_EQ(state_, kNotLoadedYet);
504
505
6.22k
  RaftGroupReplicaSuperBlockPB superblock;
506
6.22k
  RETURN_NOT_OK(ReadSuperBlockFromDisk(&superblock));
507
266
  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(superblock, /* local_superblock = */ true),
508
266
                        "Failed to load data from superblock protobuf");
509
266
  state_ = kInitialized;
510
266
  return Status::OK();
511
266
}
512
513
Status RaftGroupMetadata::LoadFromSuperBlock(const RaftGroupReplicaSuperBlockPB& superblock,
514
1.45k
                                             bool local_superblock) {
515
1.45k
  if (!superblock.has_kv_store()) {
516
    // Backward compatibility for tablet=KV-store=raft-group.
517
0
    RaftGroupReplicaSuperBlockPB superblock_migrated(superblock);
518
0
    RETURN_NOT_OK(MigrateSuperblock(&superblock_migrated));
519
0
    RETURN_NOT_OK(LoadFromSuperBlock(superblock_migrated, local_superblock));
520
0
    return Flush();
521
1.45k
  }
522
523
0
  VLOG(2) << "Loading RaftGroupMetadata from SuperBlockPB:" << std::endl
524
0
          << superblock.DebugString();
525
526
1.45k
  {
527
1.45k
    std::lock_guard<MutexType> lock(data_mutex_);
528
529
    // Verify that the Raft group id matches with the one in the protobuf.
530
1.45k
    if (superblock.raft_group_id() != raft_group_id_) {
531
0
      return STATUS(Corruption, "Expected id=" + raft_group_id_ +
532
0
                                " found " + superblock.raft_group_id(),
533
0
                                superblock.DebugString());
534
0
    }
535
1.45k
    Partition partition;
536
1.45k
    Partition::FromPB(superblock.partition(), &partition);
537
1.45k
    partition_ = std::make_shared<Partition>(partition);
538
1.45k
    primary_table_id_ = superblock.primary_table_id();
539
1.45k
    colocated_ = superblock.colocated();
540
541
1.45k
    RETURN_NOT_OK(kv_store_.LoadFromPB(superblock.kv_store(),
542
1.45k
                                       primary_table_id_,
543
1.45k
                                       local_superblock));
544
545
1.45k
    wal_dir_ = superblock.wal_dir();
546
1.45k
    tablet_data_state_ = superblock.tablet_data_state();
547
548
1.45k
    if (superblock.has_tombstone_last_logged_opid()) {
549
8
      tombstone_last_logged_opid_ = OpId::FromPB(superblock.tombstone_last_logged_opid());
550
1.44k
    } else {
551
1.44k
      tombstone_last_logged_opid_ = OpId();
552
1.44k
    }
553
1.45k
    cdc_min_replicated_index_ = superblock.cdc_min_replicated_index();
554
1.45k
    is_under_twodc_replication_ = superblock.is_under_twodc_replication();
555
1.45k
    hidden_ = superblock.hidden();
556
1.45k
    auto restoration_hybrid_time = HybridTime::FromPB(superblock.restoration_hybrid_time());
557
1.45k
    if (restoration_hybrid_time) {
558
0
      restoration_hybrid_time_ = restoration_hybrid_time;
559
0
    }
560
561
1.45k
    if (superblock.has_split_op_id()) {
562
4
      split_op_id_ = OpId::FromPB(superblock.split_op_id());
563
564
4
      SCHECK_EQ(implicit_cast<size_t>(superblock.split_child_tablet_ids().size()),
565
4
                split_child_tablet_ids_.size(),
566
4
                Corruption, "Expected exact number of child tablet ids");
567
12
      for (size_t i = 0; i != split_child_tablet_ids_.size(); ++i) {
568
8
        split_child_tablet_ids_[i] = superblock.split_child_tablet_ids(narrow_cast<int>(i));
569
8
      }
570
4
    }
571
572
1.45k
    if (!superblock.active_restorations().empty()) {
573
0
      active_restorations_.reserve(superblock.active_restorations().size());
574
0
      for (const auto& id : superblock.active_restorations()) {
575
0
        active_restorations_.push_back(VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(id)));
576
0
      }
577
0
    }
578
1.45k
  }
579
580
1.45k
  return Status::OK();
581
1.45k
}
582
583
819k
Status RaftGroupMetadata::Flush() {
584
819k
  TRACE_EVENT1("raft_group", "RaftGroupMetadata::Flush",
585
819k
               "raft_group_id", raft_group_id_);
586
587
819k
  MutexLock l_flush(flush_lock_);
588
819k
  RaftGroupReplicaSuperBlockPB pb;
589
819k
  {
590
819k
    std::lock_guard<MutexType> lock(data_mutex_);
591
819k
    ToSuperBlockUnlocked(&pb);
592
819k
  }
593
819k
  RETURN_NOT_OK(SaveToDiskUnlocked(pb));
594
819k
  TRACE("Metadata flushed");
595
596
819k
  return Status::OK();
597
819k
}
598
599
1.09k
Status RaftGroupMetadata::ReplaceSuperBlock(const RaftGroupReplicaSuperBlockPB &pb) {
600
1.09k
  {
601
1.09k
    MutexLock l(flush_lock_);
602
1.09k
    RETURN_NOT_OK_PREPEND(SaveToDiskUnlocked(pb), "Unable to replace superblock");
603
1.09k
  }
604
605
1.09k
  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(pb, /* local_superblock = */ false),
606
1.09k
                        "Failed to load data from superblock protobuf");
607
608
1.09k
  return Status::OK();
609
1.09k
}
610
611
820k
Status RaftGroupMetadata::SaveToDiskUnlocked(const RaftGroupReplicaSuperBlockPB &pb) {
612
820k
  flush_lock_.AssertAcquired();
613
614
820k
  string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
615
820k
  RETURN_NOT_OK_PREPEND(pb_util::WritePBContainerToPath(
616
820k
                            fs_manager_->env(), path, pb,
617
820k
                            pb_util::OVERWRITE, pb_util::SYNC),
618
820k
                        Substitute("Failed to write Raft group metadata $0", raft_group_id_));
619
620
820k
  return Status::OK();
621
820k
}
622
623
7.67k
Status RaftGroupMetadata::ReadSuperBlockFromDisk(RaftGroupReplicaSuperBlockPB* superblock) const {
624
7.67k
  string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_);
625
7.67k
  RETURN_NOT_OK_PREPEND(
626
7.67k
      pb_util::ReadPBContainerFromPath(fs_manager_->env(), path, superblock),
627
1.71k
      Substitute("Could not load Raft group metadata from $0", path));
628
  // Migration for backward compatibility with versions which don't have separate
629
  // TableType::TRANSACTION_STATUS_TABLE_TYPE.
630
1.71k
  if (superblock->obsolete_table_type() == TableType::REDIS_TABLE_TYPE &&
631
0
      superblock->obsolete_table_name() == kGlobalTransactionsTableName) {
632
0
    superblock->set_obsolete_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE);
633
0
  }
634
1.71k
  return Status::OK();
635
7.67k
}
636
637
101
void RaftGroupMetadata::ToSuperBlock(RaftGroupReplicaSuperBlockPB* superblock) const {
638
  // acquire the lock so that rowsets_ doesn't get changed until we're finished.
639
101
  std::lock_guard<MutexType> lock(data_mutex_);
640
101
  ToSuperBlockUnlocked(superblock);
641
101
}
642
643
819k
void RaftGroupMetadata::ToSuperBlockUnlocked(RaftGroupReplicaSuperBlockPB* superblock) const {
644
  // Convert to protobuf.
645
819k
  RaftGroupReplicaSuperBlockPB pb;
646
819k
  pb.set_raft_group_id(raft_group_id_);
647
819k
  partition_->ToPB(pb.mutable_partition());
648
649
819k
  kv_store_.ToPB(primary_table_id_, pb.mutable_kv_store());
650
651
819k
  pb.set_wal_dir(wal_dir_);
652
819k
  pb.set_tablet_data_state(tablet_data_state_);
653
819k
  if (!tombstone_last_logged_opid_.empty()) {
654
94.7k
    tombstone_last_logged_opid_.ToPB(pb.mutable_tombstone_last_logged_opid());
655
94.7k
  }
656
657
819k
  pb.set_primary_table_id(primary_table_id_);
658
819k
  pb.set_colocated(colocated_);
659
819k
  pb.set_cdc_min_replicated_index(cdc_min_replicated_index_);
660
819k
  pb.set_is_under_twodc_replication(is_under_twodc_replication_);
661
819k
  pb.set_hidden(hidden_);
662
819k
  if (restoration_hybrid_time_) {
663
819k
    pb.set_restoration_hybrid_time(restoration_hybrid_time_.ToUint64());
664
819k
  }
665
666
819k
  if (!split_op_id_.empty()) {
667
89
    split_op_id_.ToPB(pb.mutable_split_op_id());
668
89
    auto& split_child_table_ids = *pb.mutable_split_child_tablet_ids();
669
89
    split_child_table_ids.Reserve(narrow_cast<int>(split_child_tablet_ids_.size()));
670
178
    for (const auto& split_child_tablet_id : split_child_tablet_ids_) {
671
178
      *split_child_table_ids.Add() = split_child_tablet_id;
672
178
    }
673
89
  }
674
675
819k
  if (!active_restorations_.empty()) {
676
0
    auto& active_restorations = *pb.mutable_active_restorations();
677
0
    active_restorations.Reserve(narrow_cast<int>(active_restorations_.size()));
678
0
    for (const auto& id : active_restorations_) {
679
0
      active_restorations.Add()->assign(id.AsSlice().cdata(), id.size());
680
0
    }
681
0
  }
682
683
819k
  superblock->Swap(&pb);
684
819k
}
685
686
void RaftGroupMetadata::SetSchema(const Schema& schema,
687
                                  const IndexMap& index_map,
688
                                  const std::vector<DeletedColumn>& deleted_cols,
689
                                  const uint32_t version,
690
54.9k
                                  const TableId& table_id) {
691
54.9k
  DCHECK(schema.has_column_ids());
692
54.9k
  std::lock_guard<MutexType> lock(data_mutex_);
693
54.0k
  TableId target_table_id = table_id.empty() ? primary_table_id_ : table_id;
694
54.9k
  auto result = GetTableInfoUnlocked(target_table_id);
695
54.9k
  DCHECK(result.ok());
696
54.9k
  TableInfoPtr new_table_info = std::make_shared<TableInfo>(*result.get(),
697
54.9k
                                                            schema,
698
54.9k
                                                            index_map,
699
54.9k
                                                            deleted_cols,
700
54.9k
                                                            version);
701
54.9k
  if (target_table_id != primary_table_id_) {
702
69
    if (schema.table_properties().is_ysql_catalog_table()) {
703
0
      Uuid cotable_id;
704
0
      CHECK_OK(cotable_id.FromHexString(target_table_id));
705
0
      new_table_info->schema->set_cotable_id(cotable_id);
706
69
    } else {
707
69
      auto result = CHECK_RESULT(GetPgsqlTableOid(target_table_id));
708
69
      new_table_info->schema->set_pgtable_id(result);
709
69
    }
710
69
  }
711
103
  VLOG_WITH_PREFIX(1) << raft_group_id_ << " Updating table " << target_table_id
712
103
                      << " to Schema version " << version
713
103
                      << " from \n" << yb::ToString(kv_store_.tables[target_table_id])
714
103
                      << " to \n" << yb::ToString(new_table_info);
715
54.9k
  kv_store_.tables[target_table_id].swap(new_table_info);
716
54.9k
}
717
718
0
void RaftGroupMetadata::SetPartitionSchema(const PartitionSchema& partition_schema) {
719
0
  std::lock_guard<MutexType> lock(data_mutex_);
720
0
  auto& tables = kv_store_.tables;
721
0
  DCHECK(tables.find(primary_table_id_) != tables.end());
722
0
  tables[primary_table_id_]->partition_schema = partition_schema;
723
0
}
724
725
void RaftGroupMetadata::SetTableName(
726
46.7k
    const string& namespace_name, const string& table_name, const TableId& table_id) {
727
46.7k
  std::lock_guard<MutexType> lock(data_mutex_);
728
46.7k
  auto& tables = kv_store_.tables;
729
18.4E
  auto& id = table_id.empty() ? primary_table_id_ : table_id;
730
46.7k
  DCHECK(tables.find(id) != tables.end());
731
46.7k
  tables[id]->namespace_name = namespace_name;
732
46.7k
  tables[id]->table_name = table_name;
733
46.7k
}
734
735
void RaftGroupMetadata::AddTable(const std::string& table_id,
736
                              const std::string& namespace_name,
737
                              const std::string& table_name,
738
                              const TableType table_type,
739
                              const Schema& schema,
740
                              const IndexMap& index_map,
741
                              const PartitionSchema& partition_schema,
742
                              const boost::optional<IndexInfo>& index_info,
743
489k
                              const uint32_t schema_version) {
744
489k
  DCHECK(schema.has_column_ids());
745
489k
  TableInfoPtr new_table_info = std::make_shared<TableInfo>(table_id,
746
489k
                                                            namespace_name,
747
489k
                                                            table_name,
748
489k
                                                            table_type,
749
489k
                                                            schema,
750
489k
                                                            index_map,
751
489k
                                                            index_info,
752
489k
                                                            schema_version,
753
489k
                                                            partition_schema);
754
489k
  if (table_id != primary_table_id_) {
755
489k
    if (schema.table_properties().is_ysql_catalog_table()) {
756
488k
      Uuid cotable_id;
757
488k
      CHECK_OK(cotable_id.FromHexString(table_id));
758
488k
      new_table_info->schema->set_cotable_id(cotable_id);
759
81
    } else {
760
81
      auto result = CHECK_RESULT(GetPgsqlTableOid(table_id));
761
81
      new_table_info->schema->set_pgtable_id(result);
762
81
    }
763
489k
  }
764
489k
  std::lock_guard<MutexType> lock(data_mutex_);
765
489k
  auto& tables = kv_store_.tables;
766
489k
  auto existing_table_iter = tables.find(table_id);
767
489k
  if (existing_table_iter != tables.end()) {
768
0
    const auto& existing_table = *existing_table_iter->second.get();
769
0
    if (!existing_table.schema->table_properties().is_ysql_catalog_table() &&
770
0
        schema.table_properties().is_ysql_catalog_table()) {
771
      // This must be the one-time migration with transactional DDL being turned on for the first
772
      // time on this cluster.
773
0
    } else {
774
0
      LOG(DFATAL) << "Table " << table_id << " already exists. New table info: "
775
0
          << new_table_info->ToString() << ", old table info: " << existing_table.ToString();
776
0
    }
777
0
  }
778
0
  VLOG_WITH_PREFIX(1) << "Updating to Schema version " << schema_version
779
0
                      << " from\n" << yb::ToString(tables[table_id])
780
0
                      << "\nto\n" << yb::ToString(new_table_info);
781
489k
  tables[table_id].swap(new_table_info);
782
489k
}
783
784
2.31k
void RaftGroupMetadata::RemoveTable(const TableId& table_id) {
785
2.31k
  std::lock_guard<MutexType> lock(data_mutex_);
786
2.31k
  auto& tables = kv_store_.tables;
787
2.31k
  tables.erase(table_id);
788
2.31k
}
789
790
310k
string RaftGroupMetadata::data_root_dir() const {
791
310k
  const auto& rocksdb_dir = kv_store_.rocksdb_dir;
792
310k
  if (rocksdb_dir.empty()) {
793
0
    return "";
794
310k
  } else {
795
310k
    auto data_root_dir = DirName(DirName(rocksdb_dir));
796
310k
    if (strcmp(BaseName(data_root_dir).c_str(), FsManager::kRocksDBDirName) == 0) {
797
310k
      data_root_dir = DirName(data_root_dir);
798
310k
    }
799
310k
    return data_root_dir;
800
310k
  }
801
310k
}
802
803
47.9k
string RaftGroupMetadata::wal_root_dir() const {
804
47.9k
  std::string wal_dir = this->wal_dir();
805
806
47.9k
  if (wal_dir.empty()) {
807
0
    return "";
808
0
  }
809
810
47.9k
  auto wal_root_dir = DirName(wal_dir);
811
47.9k
  if (strcmp(BaseName(wal_root_dir).c_str(), FsManager::kWalDirName) != 0) {
812
47.9k
    wal_root_dir = DirName(wal_root_dir);
813
47.9k
  }
814
47.9k
  return wal_root_dir;
815
47.9k
}
816
817
2.55k
void RaftGroupMetadata::set_wal_retention_secs(uint32 wal_retention_secs) {
818
2.55k
  std::lock_guard<MutexType> lock(data_mutex_);
819
2.55k
  auto it = kv_store_.tables.find(primary_table_id_);
820
2.55k
  if (it == kv_store_.tables.end()) {
821
0
    LOG_WITH_PREFIX(DFATAL) << "Unable to set WAL retention time for primary table "
822
0
                            << primary_table_id_;
823
0
    return;
824
0
  }
825
2.55k
  it->second->wal_retention_secs = wal_retention_secs;
826
2.55k
  LOG_WITH_PREFIX(INFO) << "Set RaftGroupMetadata wal retention time to "
827
2.55k
                        << wal_retention_secs << " seconds";
828
2.55k
}
829
830
180k
uint32_t RaftGroupMetadata::wal_retention_secs() const {
831
180k
  std::lock_guard<MutexType> lock(data_mutex_);
832
180k
  auto it = kv_store_.tables.find(primary_table_id_);
833
180k
  if (it == kv_store_.tables.end()) {
834
0
    return 0;
835
0
  }
836
180k
  return it->second->wal_retention_secs;
837
180k
}
838
839
89.0k
Status RaftGroupMetadata::set_cdc_min_replicated_index(int64 cdc_min_replicated_index) {
840
89.0k
  {
841
89.0k
    std::lock_guard<MutexType> lock(data_mutex_);
842
89.0k
    cdc_min_replicated_index_ = cdc_min_replicated_index;
843
89.0k
  }
844
89.0k
  return Flush();
845
89.0k
}
846
847
267k
int64_t RaftGroupMetadata::cdc_min_replicated_index() const {
848
267k
  std::lock_guard<MutexType> lock(data_mutex_);
849
267k
  return cdc_min_replicated_index_;
850
267k
}
851
852
0
Status RaftGroupMetadata::SetIsUnderTwodcReplicationAndFlush(bool is_under_twodc_replication) {
853
0
  {
854
0
    std::lock_guard<MutexType> lock(data_mutex_);
855
0
    is_under_twodc_replication_ = is_under_twodc_replication;
856
0
  }
857
0
  return Flush();
858
0
}
859
860
32.8k
bool RaftGroupMetadata::is_under_twodc_replication() const {
861
32.8k
  std::lock_guard<MutexType> lock(data_mutex_);
862
32.8k
  return is_under_twodc_replication_;
863
32.8k
}
864
865
0
void RaftGroupMetadata::SetHidden(bool value) {
866
0
  std::lock_guard<MutexType> lock(data_mutex_);
867
0
  hidden_ = value;
868
0
}
869
870
6.04M
bool RaftGroupMetadata::hidden() const {
871
6.04M
  std::lock_guard<MutexType> lock(data_mutex_);
872
6.04M
  return hidden_;
873
6.04M
}
874
875
0
void RaftGroupMetadata::SetRestorationHybridTime(HybridTime value) {
876
0
  std::lock_guard<MutexType> lock(data_mutex_);
877
0
  restoration_hybrid_time_ = std::max(restoration_hybrid_time_, value);
878
0
}
879
880
89.2k
HybridTime RaftGroupMetadata::restoration_hybrid_time() const {
881
89.2k
  std::lock_guard<MutexType> lock(data_mutex_);
882
89.2k
  return restoration_hybrid_time_;
883
89.2k
}
884
885
87
void RaftGroupMetadata::set_tablet_data_state(TabletDataState state) {
886
87
  std::lock_guard<MutexType> lock(data_mutex_);
887
87
  tablet_data_state_ = state;
888
87
}
889
890
91.8k
string RaftGroupMetadata::LogPrefix() const {
891
91.8k
  return consensus::MakeTabletLogPrefix(raft_group_id_, fs_manager_->uuid());
892
91.8k
}
893
894
47.9k
OpId RaftGroupMetadata::tombstone_last_logged_opid() const {
895
47.9k
  std::lock_guard<MutexType> lock(data_mutex_);
896
47.9k
  return tombstone_last_logged_opid_;
897
47.9k
}
898
899
350k
bool RaftGroupMetadata::colocated() const {
900
350k
  std::lock_guard<MutexType> lock(data_mutex_);
901
350k
  return colocated_;
902
350k
}
903
904
21.3M
TabletDataState RaftGroupMetadata::tablet_data_state() const {
905
21.3M
  std::lock_guard<MutexType> lock(data_mutex_);
906
21.3M
  return tablet_data_state_;
907
21.3M
}
908
909
17
std::array<TabletId, kNumSplitParts> RaftGroupMetadata::split_child_tablet_ids() const {
910
17
  std::lock_guard<MutexType> lock(data_mutex_);
911
17
  return split_child_tablet_ids_;
912
17
}
913
914
44
OpId RaftGroupMetadata::split_op_id() const {
915
44
  std::lock_guard<MutexType> lock(data_mutex_);
916
44
  return split_op_id_;
917
44
}
918
919
7.51k
OpId RaftGroupMetadata::GetOpIdToDeleteAfterAllApplied() const {
920
7.51k
  std::lock_guard<MutexType> lock(data_mutex_);
921
7.51k
  if (tablet_data_state_ != TabletDataState::TABLET_DATA_SPLIT_COMPLETED || hidden_) {
922
7.45k
    return OpId::Invalid();
923
7.45k
  }
924
62
  return split_op_id_;
925
62
}
926
927
void RaftGroupMetadata::SetSplitDone(
928
43
    const OpId& op_id, const TabletId& child1, const TabletId& child2) {
929
43
  std::lock_guard<MutexType> lock(data_mutex_);
930
43
  tablet_data_state_ = TabletDataState::TABLET_DATA_SPLIT_COMPLETED;
931
43
  split_op_id_ = op_id;
932
43
  split_child_tablet_ids_[0] = child1;
933
43
  split_child_tablet_ids_[1] = child2;
934
43
}
935
936
89.2k
bool RaftGroupMetadata::has_active_restoration() const {
937
89.2k
  std::lock_guard<MutexType> lock(data_mutex_);
938
89.2k
  return !active_restorations_.empty();
939
89.2k
}
940
941
0
void RaftGroupMetadata::RegisterRestoration(const TxnSnapshotRestorationId& restoration_id) {
942
0
  std::lock_guard<MutexType> lock(data_mutex_);
943
0
  if (tablet_data_state_ == TabletDataState::TABLET_DATA_SPLIT_COMPLETED) {
944
0
    tablet_data_state_ = TabletDataState::TABLET_DATA_READY;
945
0
    split_op_id_ = OpId();
946
0
    split_child_tablet_ids_[0] = std::string();
947
0
    split_child_tablet_ids_[1] = std::string();
948
0
  }
949
0
  active_restorations_.push_back(restoration_id);
950
0
}
951
952
0
void RaftGroupMetadata::UnregisterRestoration(const TxnSnapshotRestorationId& restoration_id) {
953
0
  std::lock_guard<MutexType> lock(data_mutex_);
954
0
  Erase(restoration_id, &active_restorations_);
955
0
}
956
957
HybridTime RaftGroupMetadata::CheckCompleteRestorations(
958
1.41k
    const RestorationCompleteTimeMap& restoration_complete_time) {
959
1.41k
  std::lock_guard<MutexType> lock(data_mutex_);
960
1.41k
  auto result = HybridTime::kMin;
961
0
  for (const auto& restoration_id : active_restorations_) {
962
0
    auto it = restoration_complete_time.find(restoration_id);
963
0
    if (it != restoration_complete_time.end() && it->second) {
964
0
      result = std::max(result, it->second);
965
0
    }
966
0
  }
967
1.41k
  return result;
968
1.41k
}
969
970
bool RaftGroupMetadata::CleanupRestorations(
971
1.41k
    const RestorationCompleteTimeMap& restoration_complete_time) {
972
1.41k
  bool result = false;
973
1.41k
  std::lock_guard<MutexType> lock(data_mutex_);
974
1.41k
  for (auto it = active_restorations_.begin(); it != active_restorations_.end();) {
975
0
    auto known_restoration_it = restoration_complete_time.find(*it);
976
0
    if (known_restoration_it == restoration_complete_time.end() || known_restoration_it->second) {
977
0
      it = active_restorations_.erase(it);
978
0
      result = true;
979
0
    } else {
980
0
      ++it;
981
0
    }
982
0
  }
983
1.41k
  return result;
984
1.41k
}
985
986
181
std::string RaftGroupMetadata::GetSubRaftGroupWalDir(const RaftGroupId& raft_group_id) const {
987
181
  return JoinPathSegments(DirName(wal_dir_), MakeTabletDirName(raft_group_id));
988
181
}
989
990
181
std::string RaftGroupMetadata::GetSubRaftGroupDataDir(const RaftGroupId& raft_group_id) const {
991
181
  return JoinPathSegments(DirName(kv_store_.rocksdb_dir), MakeTabletDirName(raft_group_id));
992
181
}
993
994
// We directly init fields of a new metadata, so have to use NO_THREAD_SAFETY_ANALYSIS here.
995
Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateSubtabletMetadata(
996
    const RaftGroupId& raft_group_id, const Partition& partition,
997
    const std::string& lower_bound_key, const std::string& upper_bound_key)
998
93
    const NO_THREAD_SAFETY_ANALYSIS {
999
93
  RaftGroupReplicaSuperBlockPB superblock;
1000
93
  ToSuperBlock(&superblock);
1001
1002
93
  RaftGroupMetadataPtr metadata(new RaftGroupMetadata(fs_manager_, raft_group_id_));
1003
93
  RETURN_NOT_OK(metadata->LoadFromSuperBlock(superblock, /* local_superblock = */ true));
1004
93
  metadata->raft_group_id_ = raft_group_id;
1005
93
  metadata->wal_dir_ = GetSubRaftGroupWalDir(raft_group_id);
1006
93
  metadata->kv_store_.kv_store_id = KvStoreId(raft_group_id);
1007
93
  metadata->kv_store_.lower_bound_key = lower_bound_key;
1008
93
  metadata->kv_store_.upper_bound_key = upper_bound_key;
1009
93
  metadata->kv_store_.rocksdb_dir = GetSubRaftGroupDataDir(raft_group_id);
1010
93
  metadata->kv_store_.has_been_fully_compacted = false;
1011
93
  *metadata->partition_ = partition;
1012
93
  metadata->state_ = kInitialized;
1013
93
  metadata->tablet_data_state_ = TABLET_DATA_INIT_STARTED;
1014
93
  RETURN_NOT_OK(metadata->Flush());
1015
93
  return metadata;
1016
93
}
1017
1018
16
Result<std::string> RaftGroupMetadata::TopSnapshotsDir() const {
1019
16
  auto result = snapshots_dir();
1020
16
  RETURN_NOT_OK_PREPEND(
1021
16
      fs_manager()->CreateDirIfMissingAndSync(result),
1022
16
      Format("Unable to create snapshots directory $0", result));
1023
16
  return result;
1024
16
}
1025
1026
namespace {
1027
// MigrateSuperblockForDXXXX functions are only needed for backward compatibility with
1028
// YugabyteDB versions which don't have changes from DXXXX revision.
1029
// Each MigrateSuperblockForDXXXX could be removed after all YugabyteDB installations are
1030
// upgraded to have revision DXXXX.
1031
1032
1.43k
CHECKED_STATUS MigrateSuperblockForD5900(RaftGroupReplicaSuperBlockPB* superblock) {
1033
  // In previous version of superblock format we stored primary table metadata in superblock's
1034
  // top-level fields (deprecated table_* and other). TableInfo objects were stored inside
1035
  // RaftGroupReplicaSuperBlockPB.tables.
1036
  //
1037
  // In new format TableInfo objects and some other top-level fields are moved from superblock's
1038
  // top-level fields into RaftGroupReplicaSuperBlockPB.kv_store. Primary table (see
1039
  // RaftGroupMetadata::primary_table_id_ field description) metadata is stored inside one of
1040
  // RaftGroupReplicaSuperBlockPB.kv_store.tables objects and is referenced by
1041
  // RaftGroupReplicaSuperBlockPB.primary_table_id.
1042
1.43k
  if (superblock->has_kv_store()) {
1043
1.43k
    return Status::OK();
1044
1.43k
  }
1045
1046
0
  LOG(INFO) << "Migrating superblock for raft group " << superblock->raft_group_id();
1047
1048
0
  KvStoreInfoPB* kv_store_pb = superblock->mutable_kv_store();
1049
0
  kv_store_pb->set_kv_store_id(superblock->raft_group_id());
1050
0
  kv_store_pb->set_rocksdb_dir(superblock->obsolete_rocksdb_dir());
1051
0
  kv_store_pb->mutable_rocksdb_files()->CopyFrom(superblock->obsolete_rocksdb_files());
1052
0
  kv_store_pb->mutable_snapshot_files()->CopyFrom(superblock->obsolete_snapshot_files());
1053
1054
0
  TableInfoPB* primary_table = kv_store_pb->add_tables();
1055
0
  primary_table->set_table_id(superblock->primary_table_id());
1056
0
  primary_table->set_table_name(superblock->obsolete_table_name());
1057
0
  primary_table->set_table_type(superblock->obsolete_table_type());
1058
0
  primary_table->mutable_schema()->CopyFrom(superblock->obsolete_schema());
1059
0
  primary_table->set_schema_version(superblock->obsolete_schema_version());
1060
0
  primary_table->mutable_partition_schema()->CopyFrom(superblock->obsolete_partition_schema());
1061
0
  primary_table->mutable_indexes()->CopyFrom(superblock->obsolete_indexes());
1062
0
  primary_table->mutable_index_info()->CopyFrom(superblock->obsolete_index_info());
1063
0
  primary_table->mutable_deleted_cols()->CopyFrom(superblock->obsolete_deleted_cols());
1064
1065
0
  kv_store_pb->mutable_tables()->MergeFrom(superblock->obsolete_tables());
1066
1067
0
  return Status::OK();
1068
0
}
1069
1070
} // namespace
1071
1072
1.43k
Status MigrateSuperblock(RaftGroupReplicaSuperBlockPB* superblock) {
1073
1.43k
  return MigrateSuperblockForD5900(superblock);
1074
1.43k
}
1075
1076
std::shared_ptr<std::vector<DeletedColumn>> RaftGroupMetadata::deleted_cols(
1077
436
    const TableId& table_id) const {
1078
436
  DCHECK_NE(state_, kNotLoadedYet);
1079
436
  const TableInfoPtr table_info =
1080
436
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
1081
436
  return std::shared_ptr<std::vector<DeletedColumn>>(table_info, &table_info->deleted_cols);
1082
436
}
1083
1084
96.4k
std::string RaftGroupMetadata::namespace_name(const TableId& table_id) const {
1085
96.4k
  DCHECK_NE(state_, kNotLoadedYet);
1086
96.4k
  if (table_id.empty()) {
1087
96.4k
    return primary_table_info()->namespace_name;
1088
96.4k
  }
1089
13
  const auto& table_info = CHECK_RESULT(GetTableInfo(table_id));
1090
13
  return table_info->namespace_name;
1091
13
}
1092
1093
1.84M
std::string RaftGroupMetadata::table_name(const TableId& table_id) const {
1094
1.84M
  DCHECK_NE(state_, kNotLoadedYet);
1095
1.84M
  if (table_id.empty()) {
1096
1.84M
    return primary_table_info()->table_name;
1097
1.84M
  }
1098
646
  const auto& table_info = CHECK_RESULT(GetTableInfo(table_id));
1099
646
  return table_info->table_name;
1100
646
}
1101
1102
3.34M
TableType RaftGroupMetadata::table_type(const TableId& table_id) const {
1103
3.34M
  DCHECK_NE(state_, kNotLoadedYet);
1104
3.34M
  if (table_id.empty()) {
1105
3.33M
    return primary_table_info()->table_type;
1106
3.33M
  }
1107
7.06k
  const auto& table_info = CHECK_RESULT(GetTableInfo(table_id));
1108
7.06k
  return table_info->table_type;
1109
7.06k
}
1110
1111
19.7M
yb::SchemaPtr RaftGroupMetadata::schema(const TableId& table_id) const {
1112
19.7M
  DCHECK_NE(state_, kNotLoadedYet);
1113
19.7M
  const TableInfoPtr table_info =
1114
19.7M
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
1115
19.7M
  return yb::SchemaPtr(table_info, table_info->schema.get());
1116
19.7M
}
1117
1118
2.62k
std::shared_ptr<IndexMap> RaftGroupMetadata::index_map(const TableId& table_id) const {
1119
2.62k
  DCHECK_NE(state_, kNotLoadedYet);
1120
2.62k
  const TableInfoPtr table_info =
1121
2.62k
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
1122
2.62k
  return std::shared_ptr<IndexMap>(table_info, table_info->index_map.get());
1123
2.62k
}
1124
1125
4.29M
uint32_t RaftGroupMetadata::schema_version(const TableId& table_id) const {
1126
4.29M
  DCHECK_NE(state_, kNotLoadedYet);
1127
4.29M
  const TableInfoPtr table_info =
1128
4.29M
      table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id));
1129
4.29M
  return table_info->schema_version;
1130
4.29M
}
1131
1132
0
const std::string& RaftGroupMetadata::indexed_table_id(const TableId& table_id) const {
1133
0
  DCHECK_NE(state_, kNotLoadedYet);
1134
0
  static const std::string kEmptyString = "";
1135
0
  std::lock_guard<MutexType> lock(data_mutex_);
1136
0
  const TableInfoPtr table_info = table_id.empty() ?
1137
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1138
0
  const auto* index_info = table_info->index_info.get();
1139
0
  return index_info ? index_info->indexed_table_id() : kEmptyString;
1140
0
}
1141
1142
0
bool RaftGroupMetadata::is_local_index(const TableId& table_id) const {
1143
0
  DCHECK_NE(state_, kNotLoadedYet);
1144
0
  std::lock_guard<MutexType> lock(data_mutex_);
1145
0
  const TableInfoPtr table_info = table_id.empty() ?
1146
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1147
0
  const auto* index_info = table_info->index_info.get();
1148
0
  return index_info && index_info->is_local();
1149
0
}
1150
1151
1.22M
bool RaftGroupMetadata::is_unique_index(const TableId& table_id) const {
1152
1.22M
  DCHECK_NE(state_, kNotLoadedYet);
1153
1.22M
  std::lock_guard<MutexType> lock(data_mutex_);
1154
1.22M
  const TableInfoPtr table_info = table_id.empty() ?
1155
1.22M
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1156
1.22M
  const auto* index_info = table_info->index_info.get();
1157
1.22M
  return index_info && index_info->is_unique();
1158
1.22M
}
1159
1160
0
std::vector<ColumnId> RaftGroupMetadata::index_key_column_ids(const TableId& table_id) const {
1161
0
  DCHECK_NE(state_, kNotLoadedYet);
1162
0
  std::lock_guard<MutexType> lock(data_mutex_);
1163
0
  const TableInfoPtr table_info = table_id.empty() ?
1164
0
      primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id));
1165
0
  const auto* index_info = table_info->index_info.get();
1166
0
  return index_info ? index_info->index_key_column_ids() : std::vector<ColumnId>();
1167
0
}
1168
1169
3.00M
bool RaftGroupMetadata::UsePartialRangeKeyIntents() const {
1170
3.00M
  return table_type() == TableType::PGSQL_TABLE_TYPE;
1171
3.00M
}
1172
1173
} // namespace tablet
1174
} // namespace yb