/Users/deen/code/yugabyte-db/src/yb/tablet/tablet_metadata.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include "yb/tablet/tablet_metadata.h" |
34 | | |
35 | | #include <algorithm> |
36 | | #include <mutex> |
37 | | #include <string> |
38 | | |
39 | | #include <boost/optional.hpp> |
40 | | #include <gflags/gflags.h> |
41 | | |
42 | | #include "yb/common/entity_ids.h" |
43 | | #include "yb/common/index.h" |
44 | | #include "yb/common/schema.h" |
45 | | #include "yb/common/transaction.h" |
46 | | #include "yb/common/wire_protocol.h" |
47 | | |
48 | | #include "yb/consensus/consensus_util.h" |
49 | | #include "yb/consensus/opid_util.h" |
50 | | |
51 | | #include "yb/docdb/docdb_rocksdb_util.h" |
52 | | |
53 | | #include "yb/gutil/atomicops.h" |
54 | | #include "yb/gutil/dynamic_annotations.h" |
55 | | #include "yb/gutil/map-util.h" |
56 | | #include "yb/gutil/stl_util.h" |
57 | | #include "yb/gutil/strings/substitute.h" |
58 | | |
59 | | #include "yb/rocksdb/db.h" |
60 | | #include "yb/rocksdb/options.h" |
61 | | |
62 | | #include "yb/tablet/metadata.pb.h" |
63 | | #include "yb/tablet/tablet_options.h" |
64 | | |
65 | | #include "yb/util/debug/trace_event.h" |
66 | | #include "yb/util/flag_tags.h" |
67 | | #include "yb/util/logging.h" |
68 | | #include "yb/util/pb_util.h" |
69 | | #include "yb/util/random.h" |
70 | | #include "yb/util/result.h" |
71 | | #include "yb/util/status.h" |
72 | | #include "yb/util/status_log.h" |
73 | | #include "yb/util/trace.h" |
74 | | |
// Boolean gflag, default true, described as a debugging-only switch for deleting orphaned
// blocks from disk. It is tagged advanced, hidden and runtime below.
// NOTE(review): nothing in the visible part of this file reads the flag -- confirm it is still
// consumed somewhere before relying on it.
75 | | DEFINE_bool(enable_tablet_orphaned_block_deletion, true, |
76 | | "Whether to enable deletion of orphaned blocks from disk. " |
77 | | "Note: This is only exposed for debugging purposes!"); |
78 | | TAG_FLAG(enable_tablet_orphaned_block_deletion, advanced); |
79 | | TAG_FLAG(enable_tablet_orphaned_block_deletion, hidden); |
80 | | TAG_FLAG(enable_tablet_orphaned_block_deletion, runtime); |
81 | | |
82 | | using std::shared_ptr; |
83 | | |
84 | | using base::subtle::Barrier_AtomicIncrement; |
85 | | using strings::Substitute; |
86 | | |
87 | | using yb::consensus::MinimumOpId; |
88 | | |
89 | | namespace yb { |
90 | | namespace tablet { |
91 | | |
// Namespace-level constants.
//  - kNoDurableMemStore: -1 sentinel; not referenced in the visible part of this file.
//  - kIntentsDBSuffix: appended to the regular RocksDB directory path to obtain the intents DB
//    path (see IsTombstonedWithNoRocksDBData).
//  - kIntentsSubdir / kSnapshotsDirSuffix: presumably used to derive the intents and snapshots
//    locations elsewhere -- TODO confirm against the header.
92 | | const int64 kNoDurableMemStore = -1; |
93 | | const std::string kIntentsSubdir = "intents"; |
94 | | const std::string kIntentsDBSuffix = ".intents"; |
95 | | const std::string kSnapshotsDirSuffix = ".snapshots"; |
96 | | |
97 | | // ============================================================================ |
98 | | // Raft group metadata |
99 | | // ============================================================================ |
100 | | |
101 | | TableInfo::TableInfo() |
102 | 3.81k | : schema(std::make_unique<Schema>()), index_map(std::make_unique<IndexMap>()) { |
103 | 3.81k | } |
104 | | |
105 | | TableInfo::TableInfo(std::string table_id, |
106 | | std::string namespace_name, |
107 | | std::string table_name, |
108 | | TableType table_type, |
109 | | const Schema& schema, |
110 | | const IndexMap& index_map, |
111 | | const boost::optional<IndexInfo>& index_info, |
112 | | const uint32_t schema_version, |
113 | | PartitionSchema partition_schema) |
114 | | : table_id(std::move(table_id)), |
115 | | namespace_name(std::move(namespace_name)), |
116 | | table_name(std::move(table_name)), |
117 | | table_type(table_type), |
118 | | schema(std::make_unique<Schema>(schema)), |
119 | | index_map(std::make_unique<IndexMap>(index_map)), |
120 | | index_info(index_info ? new IndexInfo(*index_info) : nullptr), |
121 | | schema_version(schema_version), |
122 | 1.17M | partition_schema(std::move(partition_schema)) { |
123 | 1.17M | } |
124 | | |
125 | | TableInfo::TableInfo(const TableInfo& other, |
126 | | const Schema& schema, |
127 | | const IndexMap& index_map, |
128 | | const std::vector<DeletedColumn>& deleted_cols, |
129 | | const uint32_t schema_version) |
130 | | : table_id(other.table_id), |
131 | | namespace_name(other.namespace_name), |
132 | | table_name(other.table_name), |
133 | | table_type(other.table_type), |
134 | | schema(std::make_unique<Schema>(schema)), |
135 | | index_map(std::make_unique<IndexMap>(index_map)), |
136 | | index_info(other.index_info ? new IndexInfo(*other.index_info) : nullptr), |
137 | | schema_version(schema_version), |
138 | | partition_schema(other.partition_schema), |
139 | 79.9k | deleted_cols(other.deleted_cols) { |
140 | 79.9k | this->deleted_cols.insert(this->deleted_cols.end(), deleted_cols.begin(), deleted_cols.end()); |
141 | 79.9k | } |
142 | | |
143 | 3.81k | Status TableInfo::LoadFromPB(const TableInfoPB& pb) { |
144 | 3.81k | table_id = pb.table_id(); |
145 | 3.81k | namespace_name = pb.namespace_name(); |
146 | 3.81k | table_name = pb.table_name(); |
147 | 3.81k | table_type = pb.table_type(); |
148 | | |
149 | 3.81k | RETURN_NOT_OK(SchemaFromPB(pb.schema(), schema.get())); |
150 | 3.81k | if (pb.has_index_info()) { |
151 | 18 | index_info.reset(new IndexInfo(pb.index_info())); |
152 | 18 | } |
153 | 3.81k | index_map->FromPB(pb.indexes()); |
154 | 3.81k | schema_version = pb.schema_version(); |
155 | | |
156 | 3.81k | RETURN_NOT_OK(PartitionSchema::FromPB(pb.partition_schema(), *schema, &partition_schema)); |
157 | | |
158 | 3.81k | for (const DeletedColumnPB& deleted_col : pb.deleted_cols()) { |
159 | 636 | DeletedColumn col; |
160 | 636 | RETURN_NOT_OK(DeletedColumn::FromPB(deleted_col, &col)); |
161 | 636 | deleted_cols.push_back(col); |
162 | 636 | } |
163 | | |
164 | 3.81k | return Status::OK(); |
165 | 3.81k | } |
166 | | |
167 | 301M | void TableInfo::ToPB(TableInfoPB* pb) const { |
168 | 301M | pb->set_table_id(table_id); |
169 | 301M | pb->set_namespace_name(namespace_name); |
170 | 301M | pb->set_table_name(table_name); |
171 | 301M | pb->set_table_type(table_type); |
172 | | |
173 | 301M | DCHECK(schema->has_column_ids()); |
174 | 301M | SchemaToPB(*schema, pb->mutable_schema()); |
175 | 301M | if (index_info) { |
176 | 68.4k | index_info->ToPB(pb->mutable_index_info()); |
177 | 68.4k | } |
178 | 301M | index_map->ToPB(pb->mutable_indexes()); |
179 | 301M | pb->set_schema_version(schema_version); |
180 | | |
181 | 301M | partition_schema.ToPB(pb->mutable_partition_schema()); |
182 | | |
183 | 301M | for (const DeletedColumn& deleted_col : deleted_cols) { |
184 | 21.4k | deleted_col.CopyToPB(pb->mutable_deleted_cols()->Add()); |
185 | 21.4k | } |
186 | 301M | } |
187 | | |
188 | | Status KvStoreInfo::LoadTablesFromPB( |
189 | 2.57k | const google::protobuf::RepeatedPtrField<TableInfoPB>& pbs, const TableId& primary_table_id) { |
190 | 2.57k | tables.clear(); |
191 | 3.81k | for (const auto& table_pb : pbs) { |
192 | 3.81k | auto table_info = std::make_shared<TableInfo>(); |
193 | 3.81k | RETURN_NOT_OK(table_info->LoadFromPB(table_pb)); |
194 | 3.81k | if (table_info->table_id != primary_table_id) { |
195 | | // TODO(alex): cotable_id should be loaded from PB schema, do we need this section? |
196 | 1.24k | if (table_pb.schema().table_properties().is_ysql_catalog_table()) { |
197 | 1.22k | Uuid cotable_id; |
198 | 1.22k | CHECK_OK(cotable_id.FromHexString(table_info->table_id)); |
199 | | // TODO(#79): when adding for multiple KV-stores per Raft group support - check if we need |
200 | | // to set cotable ID. |
201 | 1.22k | table_info->schema->set_cotable_id(cotable_id); |
202 | 1.22k | } |
203 | | // Colocation ID is already set in schema. |
204 | | // TODO(alex): We don't have this info when master starts up? |
205 | 1.24k | } |
206 | 3.81k | tables[table_info->table_id] = std::move(table_info); |
207 | 3.81k | } |
208 | 2.57k | return Status::OK(); |
209 | 2.57k | } |
210 | | |
211 | | Status KvStoreInfo::LoadFromPB(const KvStoreInfoPB& pb, |
212 | | const TableId& primary_table_id, |
213 | 2.57k | bool local_superblock) { |
214 | 2.57k | kv_store_id = KvStoreId(pb.kv_store_id()); |
215 | 2.57k | if (local_superblock) { |
216 | 404 | rocksdb_dir = pb.rocksdb_dir(); |
217 | 404 | } |
218 | 2.57k | lower_bound_key = pb.lower_bound_key(); |
219 | 2.57k | upper_bound_key = pb.upper_bound_key(); |
220 | 2.57k | has_been_fully_compacted = pb.has_been_fully_compacted(); |
221 | | |
222 | 2.57k | for (const auto& schedule_id : pb.snapshot_schedules()) { |
223 | 0 | snapshot_schedules.insert(VERIFY_RESULT(FullyDecodeSnapshotScheduleId(schedule_id))); |
224 | 0 | } |
225 | | |
226 | 2.57k | return LoadTablesFromPB(pb.tables(), primary_table_id); |
227 | 2.57k | } |
228 | | |
229 | 1.56M | void KvStoreInfo::ToPB(const TableId& primary_table_id, KvStoreInfoPB* pb) const { |
230 | 1.56M | pb->set_kv_store_id(kv_store_id.ToString()); |
231 | 1.56M | pb->set_rocksdb_dir(rocksdb_dir); |
232 | 1.56M | if (lower_bound_key.empty()) { |
233 | 1.55M | pb->clear_lower_bound_key(); |
234 | 1.55M | } else { |
235 | 1.02k | pb->set_lower_bound_key(lower_bound_key); |
236 | 1.02k | } |
237 | 1.56M | if (upper_bound_key.empty()) { |
238 | 1.55M | pb->clear_upper_bound_key(); |
239 | 1.55M | } else { |
240 | 869 | pb->set_upper_bound_key(upper_bound_key); |
241 | 869 | } |
242 | 1.56M | pb->set_has_been_fully_compacted(has_been_fully_compacted); |
243 | | |
244 | | // Putting primary table first, then all other tables. |
245 | 1.56M | const auto& it = tables.find(primary_table_id); |
246 | 1.56M | if (it != tables.end()) { |
247 | 1.55M | it->second->ToPB(pb->add_tables()); |
248 | 1.55M | } |
249 | 301M | for (const auto& it : tables) { |
250 | 301M | if (it.first != primary_table_id) { |
251 | 299M | it.second->ToPB(pb->add_tables()); |
252 | 299M | } |
253 | 301M | } |
254 | | |
255 | 1.56M | for (const auto& schedule_id : snapshot_schedules) { |
256 | 326 | pb->add_snapshot_schedules(schedule_id.data(), schedule_id.size()); |
257 | 326 | } |
258 | 1.56M | } |
259 | | |
260 | | namespace { |
261 | | |
262 | 150k | std::string MakeTabletDirName(const TabletId& tablet_id) { |
263 | 150k | return Format("tablet-$0", tablet_id); |
264 | 150k | } |
265 | | |
266 | | } // namespace |
267 | | |
268 | | // ============================================================================ |
269 | | |
270 | | Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateNew( |
271 | | const RaftGroupMetadataData& data, const std::string& data_root_dir, |
272 | 150k | const std::string& wal_root_dir) { |
273 | 150k | auto* fs_manager = data.fs_manager; |
274 | | // Verify that no existing Raft group exists with the same ID. |
275 | 150k | if (fs_manager->env()->FileExists(fs_manager->GetRaftGroupMetadataPath(data.raft_group_id))) { |
276 | 0 | return STATUS(AlreadyPresent, "Raft group already exists", data.raft_group_id); |
277 | 0 | } |
278 | | |
279 | 150k | auto wal_top_dir = wal_root_dir; |
280 | 150k | auto data_top_dir = data_root_dir; |
281 | | // Use the original randomized logic if the indices are not explicitly passed in |
282 | 150k | if (data_root_dir.empty()) { |
283 | 8.48k | auto data_root_dirs = fs_manager->GetDataRootDirs(); |
284 | 8.48k | CHECK(!data_root_dirs.empty()) << "No data root directories found"0 ; |
285 | 8.48k | data_top_dir = RandomElement(data_root_dirs); |
286 | 8.48k | } |
287 | | |
288 | 150k | if (wal_root_dir.empty()) { |
289 | 8.48k | auto wal_root_dirs = fs_manager->GetWalRootDirs(); |
290 | 8.48k | CHECK(!wal_root_dirs.empty()) << "No wal root directories found"0 ; |
291 | 8.48k | wal_top_dir = RandomElement(wal_root_dirs); |
292 | 8.48k | } |
293 | | |
294 | 150k | const string table_dir_name = Substitute("table-$0", data.table_info->table_id); |
295 | 150k | const string tablet_dir_name = MakeTabletDirName(data.raft_group_id); |
296 | 150k | const string wal_dir = JoinPathSegments(wal_top_dir, table_dir_name, tablet_dir_name); |
297 | 150k | const string rocksdb_dir = JoinPathSegments( |
298 | 150k | data_top_dir, FsManager::kRocksDBDirName, table_dir_name, tablet_dir_name); |
299 | | |
300 | 150k | RaftGroupMetadataPtr ret(new RaftGroupMetadata(data, rocksdb_dir, wal_dir)); |
301 | 150k | RETURN_NOT_OK(ret->Flush()); |
302 | 150k | return ret; |
303 | 150k | } |
304 | | |
305 | | Result<RaftGroupMetadataPtr> RaftGroupMetadata::Load( |
306 | 8.87k | FsManager* fs_manager, const RaftGroupId& raft_group_id) { |
307 | 8.87k | RaftGroupMetadataPtr ret(new RaftGroupMetadata(fs_manager, raft_group_id)); |
308 | 8.87k | RETURN_NOT_OK(ret->LoadFromDisk()); |
309 | 263 | return ret; |
310 | 8.87k | } |
311 | | |
312 | 484 | Result<RaftGroupMetadataPtr> RaftGroupMetadata::LoadOrCreate(const RaftGroupMetadataData& data) { |
313 | 484 | auto metadata = Load(data.fs_manager, data.raft_group_id); |
314 | 484 | if (metadata.ok()) { |
315 | 7 | if (!(**metadata).schema()->Equals(*data.table_info->schema)) { |
316 | 0 | return STATUS(Corruption, Substitute("Schema on disk ($0) does not " |
317 | 0 | "match expected schema ($1)", (*metadata)->schema()->ToString(), |
318 | 0 | data.table_info->schema->ToString())); |
319 | 0 | } |
320 | 7 | return *metadata; |
321 | 7 | } |
322 | | |
323 | 477 | if (metadata.status().IsNotFound()) { |
324 | 477 | return CreateNew(data); |
325 | 477 | } |
326 | | |
327 | 0 | return metadata.status(); |
328 | 477 | } |
329 | | |
330 | | template <class TablesMap> |
331 | | CHECKED_STATUS MakeTableNotFound(const TableId& table_id, const RaftGroupId& raft_group_id, |
332 | 16 | const TablesMap& tables) { |
333 | 16 | std::string table_name = "<unknown_table_name>"; |
334 | 16 | if (!table_id.empty()) { |
335 | 16 | const auto iter = tables.find(table_id); |
336 | 16 | if (iter != tables.end()) { |
337 | 0 | table_name = iter->second->table_name; |
338 | 0 | } |
339 | 16 | } |
340 | 16 | std::ostringstream string_stream; |
341 | 16 | string_stream << "Table " << table_name << " (" << table_id << ") not found in Raft group " |
342 | 16 | << raft_group_id; |
343 | 16 | std::string msg = string_stream.str(); |
344 | 16 | #ifndef NDEBUG |
345 | | // This very large message should be logged instead of being appended to STATUS. |
346 | 16 | std::string suffix = Format(". Tables: $0.", tables); |
347 | 16 | VLOG(1) << msg << suffix0 ; |
348 | 16 | #endif |
349 | 16 | return STATUS(NotFound, msg); |
350 | 16 | } |
351 | | |
352 | 24.0M | Result<TableInfoPtr> RaftGroupMetadata::GetTableInfo(const std::string& table_id) const { |
353 | 24.0M | std::lock_guard<MutexType> lock(data_mutex_); |
354 | 24.0M | return GetTableInfoUnlocked(table_id); |
355 | 24.0M | } |
356 | | |
357 | 24.1M | Result<TableInfoPtr> RaftGroupMetadata::GetTableInfoUnlocked(const std::string& table_id) const { |
358 | 24.1M | const auto& tables = kv_store_.tables; |
359 | 24.1M | const auto id = !table_id.empty() ? table_id24.0M : primary_table_id_53.5k ; |
360 | 24.1M | const auto iter = tables.find(id); |
361 | 24.1M | if (iter == tables.end()) { |
362 | 16 | return MakeTableNotFound(table_id, raft_group_id_, tables); |
363 | 16 | } |
364 | 24.1M | return iter->second; |
365 | 24.1M | } |
366 | | |
367 | | Status RaftGroupMetadata::DeleteTabletData(TabletDataState delete_type, |
368 | 75.5k | const OpId& last_logged_opid) { |
369 | 75.5k | CHECK(delete_type == TABLET_DATA_DELETED || |
370 | 0 | delete_type == TABLET_DATA_TOMBSTONED) |
371 | 0 | << "DeleteTabletData() called with unsupported delete_type on tablet " |
372 | 0 | << raft_group_id_ << ": " << TabletDataState_Name(delete_type) |
373 | 0 | << " (" << delete_type << ")"; |
374 | | |
375 | | // First add all of our blocks to the orphan list |
376 | | // and clear our rowsets. This serves to erase all the data. |
377 | | // |
378 | | // We also set the state in our persisted metadata to indicate that |
379 | | // we have been deleted. |
380 | 75.5k | { |
381 | 75.5k | std::lock_guard<MutexType> lock(data_mutex_); |
382 | 75.5k | tablet_data_state_ = delete_type; |
383 | 75.5k | if (!last_logged_opid.empty()) { |
384 | 75.3k | tombstone_last_logged_opid_ = last_logged_opid; |
385 | 75.3k | } |
386 | 75.5k | } |
387 | | |
388 | 75.5k | rocksdb::Options rocksdb_options; |
389 | 75.5k | TabletOptions tablet_options; |
390 | 75.5k | std::string log_prefix = consensus::MakeTabletLogPrefix(raft_group_id_, fs_manager_->uuid()); |
391 | 75.5k | docdb::InitRocksDBOptions( |
392 | 75.5k | &rocksdb_options, log_prefix, nullptr /* statistics */, tablet_options); |
393 | | |
394 | 75.5k | const auto& rocksdb_dir = this->rocksdb_dir(); |
395 | 75.5k | LOG(INFO) << "Destroying regular db at: " << rocksdb_dir; |
396 | 75.5k | rocksdb::Status status = rocksdb::DestroyDB(rocksdb_dir, rocksdb_options); |
397 | | |
398 | 75.5k | if (!status.ok()) { |
399 | 473 | LOG(ERROR) << "Failed to destroy regular DB at: " << rocksdb_dir << ": " << status; |
400 | 75.0k | } else { |
401 | 75.0k | LOG(INFO) << "Successfully destroyed regular DB at: " << rocksdb_dir; |
402 | 75.0k | } |
403 | | |
404 | 75.5k | if (fs_manager_->env()->FileExists(rocksdb_dir)) { |
405 | 819 | auto s = fs_manager_->env()->DeleteRecursively(rocksdb_dir); |
406 | 819 | LOG_IF(WARNING, !s.ok()) << "Unable to delete rocksdb data directory " << rocksdb_dir0 ; |
407 | 819 | } |
408 | | |
409 | 75.5k | const auto intents_dir = this->intents_rocksdb_dir(); |
410 | 75.5k | if (fs_manager_->env()->FileExists(intents_dir)) { |
411 | 75.0k | status = rocksdb::DestroyDB(intents_dir, rocksdb_options); |
412 | | |
413 | 75.0k | if (!status.ok()) { |
414 | 0 | LOG(ERROR) << "Failed to destroy provisional records DB at: " << intents_dir << ": " |
415 | 0 | << status; |
416 | 75.0k | } else { |
417 | 75.0k | LOG(INFO) << "Successfully destroyed provisional records DB at: " << intents_dir; |
418 | 75.0k | } |
419 | 75.0k | } |
420 | | |
421 | 75.5k | if (fs_manager_->env()->FileExists(intents_dir)) { |
422 | 0 | auto s = fs_manager_->env()->DeleteRecursively(intents_dir); |
423 | 0 | LOG_IF(WARNING, !s.ok()) << "Unable to delete intents directory " << intents_dir; |
424 | 0 | } |
425 | | |
426 | | // TODO(tsplit): decide what to do with snapshots for split tablets that we delete after split. |
427 | | // As for now, snapshots will be deleted as well. |
428 | 75.5k | const auto snapshots_dir = this->snapshots_dir(); |
429 | 75.5k | if (fs_manager_->env()->FileExists(snapshots_dir)) { |
430 | 75.0k | auto s = fs_manager_->env()->DeleteRecursively(snapshots_dir); |
431 | 18.4E | LOG_IF(WARNING, !s.ok()) << "Unable to delete snapshots directory " << snapshots_dir; |
432 | 75.0k | } |
433 | | |
434 | | // Flushing will sync the new tablet_data_state_ to disk and will now also |
435 | | // delete all the data. |
436 | 75.5k | RETURN_NOT_OK(Flush()); |
437 | | |
438 | | // Re-sync to disk one more time. |
439 | | // This call will typically re-sync with an empty orphaned blocks list |
440 | | // (unless deleting any orphans failed during the last Flush()), so that we |
441 | | // don't try to re-delete the deleted orphaned blocks on every startup. |
442 | 75.5k | return Flush(); |
443 | 75.5k | } |
444 | | |
445 | 8 | bool RaftGroupMetadata::IsTombstonedWithNoRocksDBData() const { |
446 | 8 | std::lock_guard<MutexType> lock(data_mutex_); |
447 | 8 | const auto& rocksdb_dir = kv_store_.rocksdb_dir; |
448 | 8 | const auto intents_dir = rocksdb_dir + kIntentsDBSuffix; |
449 | 8 | return tablet_data_state_ == TABLET_DATA_TOMBSTONED && |
450 | 8 | !fs_manager_->env()->FileExists(rocksdb_dir)2 && |
451 | 8 | !fs_manager_->env()->FileExists(intents_dir)2 ; |
452 | 8 | } |
453 | | |
454 | 42 | Status RaftGroupMetadata::DeleteSuperBlock() { |
455 | 42 | std::lock_guard<MutexType> lock(data_mutex_); |
456 | 42 | if (tablet_data_state_ != TABLET_DATA_DELETED) { |
457 | 0 | return STATUS(IllegalState, |
458 | 0 | Substitute("Tablet $0 is not in TABLET_DATA_DELETED state. " |
459 | 0 | "Call DeleteTabletData(TABLET_DATA_DELETED) first. " |
460 | 0 | "Tablet data state: $1 ($2)", |
461 | 0 | raft_group_id_, |
462 | 0 | TabletDataState_Name(tablet_data_state_), |
463 | 0 | tablet_data_state_)); |
464 | 0 | } |
465 | | |
466 | 42 | string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_); |
467 | 42 | RETURN_NOT_OK_PREPEND(fs_manager_->env()->DeleteFile(path), |
468 | 42 | "Unable to delete superblock for Raft group " + raft_group_id_); |
469 | 42 | return Status::OK(); |
470 | 42 | } |
471 | | |
472 | | RaftGroupMetadata::RaftGroupMetadata( |
473 | | const RaftGroupMetadataData& data, const std::string& data_dir, const std::string& wal_dir) |
474 | | : state_(kNotWrittenYet), |
475 | | raft_group_id_(data.raft_group_id), |
476 | | partition_(std::make_shared<Partition>(data.partition)), |
477 | | primary_table_id_(data.table_info->table_id), |
478 | | kv_store_(KvStoreId(raft_group_id_), data_dir, data.snapshot_schedules), |
479 | | fs_manager_(data.fs_manager), |
480 | | wal_dir_(wal_dir), |
481 | | tablet_data_state_(data.tablet_data_state), |
482 | | colocated_(data.colocated), |
483 | 150k | cdc_min_replicated_index_(std::numeric_limits<int64_t>::max()) { |
484 | 150k | CHECK(data.table_info->schema->has_column_ids()); |
485 | 150k | CHECK_GT(data.table_info->schema->num_key_columns(), 0); |
486 | 150k | kv_store_.tables.emplace(primary_table_id_, data.table_info); |
487 | 150k | } |
488 | | |
489 | 83.1k | RaftGroupMetadata::~RaftGroupMetadata() { |
490 | 83.1k | } |
491 | | |
492 | | RaftGroupMetadata::RaftGroupMetadata(FsManager* fs_manager, RaftGroupId raft_group_id) |
493 | | : state_(kNotLoadedYet), |
494 | | raft_group_id_(std::move(raft_group_id)), |
495 | | kv_store_(KvStoreId(raft_group_id_)), |
496 | 9.01k | fs_manager_(fs_manager) { |
497 | 9.01k | } |
498 | | |
499 | 8.87k | Status RaftGroupMetadata::LoadFromDisk() { |
500 | 8.87k | TRACE_EVENT1("raft_group", "RaftGroupMetadata::LoadFromDisk", |
501 | 8.87k | "raft_group_id", raft_group_id_); |
502 | | |
503 | 8.87k | CHECK_EQ(state_, kNotLoadedYet); |
504 | | |
505 | 8.87k | RaftGroupReplicaSuperBlockPB superblock; |
506 | 8.87k | RETURN_NOT_OK(ReadSuperBlockFromDisk(&superblock)); |
507 | 263 | RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(superblock, /* local_superblock = */ true), |
508 | 263 | "Failed to load data from superblock protobuf"); |
509 | 263 | state_ = kInitialized; |
510 | 263 | return Status::OK(); |
511 | 263 | } |
512 | | |
513 | | Status RaftGroupMetadata::LoadFromSuperBlock(const RaftGroupReplicaSuperBlockPB& superblock, |
514 | 2.57k | bool local_superblock) { |
515 | 2.57k | if (!superblock.has_kv_store()) { |
516 | | // Backward compatibility for tablet=KV-store=raft-group. |
517 | 0 | RaftGroupReplicaSuperBlockPB superblock_migrated(superblock); |
518 | 0 | RETURN_NOT_OK(MigrateSuperblock(&superblock_migrated)); |
519 | 0 | RETURN_NOT_OK(LoadFromSuperBlock(superblock_migrated, local_superblock)); |
520 | 0 | return Flush(); |
521 | 0 | } |
522 | | |
523 | 2.57k | VLOG(2) << "Loading RaftGroupMetadata from SuperBlockPB:" << std::endl |
524 | 0 | << superblock.DebugString(); |
525 | | |
526 | 2.57k | { |
527 | 2.57k | std::lock_guard<MutexType> lock(data_mutex_); |
528 | | |
529 | | // Verify that the Raft group id matches with the one in the protobuf. |
530 | 2.57k | if (superblock.raft_group_id() != raft_group_id_) { |
531 | 0 | return STATUS(Corruption, "Expected id=" + raft_group_id_ + |
532 | 0 | " found " + superblock.raft_group_id(), |
533 | 0 | superblock.DebugString()); |
534 | 0 | } |
535 | 2.57k | Partition partition; |
536 | 2.57k | Partition::FromPB(superblock.partition(), &partition); |
537 | 2.57k | partition_ = std::make_shared<Partition>(partition); |
538 | 2.57k | primary_table_id_ = superblock.primary_table_id(); |
539 | 2.57k | colocated_ = superblock.colocated(); |
540 | | |
541 | 2.57k | RETURN_NOT_OK(kv_store_.LoadFromPB(superblock.kv_store(), |
542 | 2.57k | primary_table_id_, |
543 | 2.57k | local_superblock)); |
544 | | |
545 | 2.57k | wal_dir_ = superblock.wal_dir(); |
546 | 2.57k | tablet_data_state_ = superblock.tablet_data_state(); |
547 | | |
548 | 2.57k | if (superblock.has_tombstone_last_logged_opid()) { |
549 | 7 | tombstone_last_logged_opid_ = OpId::FromPB(superblock.tombstone_last_logged_opid()); |
550 | 2.56k | } else { |
551 | 2.56k | tombstone_last_logged_opid_ = OpId(); |
552 | 2.56k | } |
553 | 2.57k | cdc_min_replicated_index_ = superblock.cdc_min_replicated_index(); |
554 | 2.57k | is_under_twodc_replication_ = superblock.is_under_twodc_replication(); |
555 | 2.57k | hidden_ = superblock.hidden(); |
556 | 2.57k | auto restoration_hybrid_time = HybridTime::FromPB(superblock.restoration_hybrid_time()); |
557 | 2.57k | if (restoration_hybrid_time) { |
558 | 0 | restoration_hybrid_time_ = restoration_hybrid_time; |
559 | 0 | } |
560 | | |
561 | 2.57k | if (superblock.has_split_op_id()) { |
562 | 3 | split_op_id_ = OpId::FromPB(superblock.split_op_id()); |
563 | | |
564 | 3 | SCHECK_EQ(implicit_cast<size_t>(superblock.split_child_tablet_ids().size()), |
565 | 3 | split_child_tablet_ids_.size(), |
566 | 3 | Corruption, "Expected exact number of child tablet ids"); |
567 | 9 | for (size_t i = 0; 3 i != split_child_tablet_ids_.size(); ++i6 ) { |
568 | 6 | split_child_tablet_ids_[i] = superblock.split_child_tablet_ids(narrow_cast<int>(i)); |
569 | 6 | } |
570 | 3 | } |
571 | | |
572 | 2.57k | if (!superblock.active_restorations().empty()) { |
573 | 0 | active_restorations_.reserve(superblock.active_restorations().size()); |
574 | 0 | for (const auto& id : superblock.active_restorations()) { |
575 | 0 | active_restorations_.push_back(VERIFY_RESULT(FullyDecodeTxnSnapshotRestorationId(id))); |
576 | 0 | } |
577 | 0 | } |
578 | 2.57k | } |
579 | | |
580 | 2.57k | return Status::OK(); |
581 | 2.57k | } |
582 | | |
583 | 1.56M | Status RaftGroupMetadata::Flush() { |
584 | 1.56M | TRACE_EVENT1("raft_group", "RaftGroupMetadata::Flush", |
585 | 1.56M | "raft_group_id", raft_group_id_); |
586 | | |
587 | 1.56M | MutexLock l_flush(flush_lock_); |
588 | 1.56M | RaftGroupReplicaSuperBlockPB pb; |
589 | 1.56M | { |
590 | 1.56M | std::lock_guard<MutexType> lock(data_mutex_); |
591 | 1.56M | ToSuperBlockUnlocked(&pb); |
592 | 1.56M | } |
593 | 1.56M | RETURN_NOT_OK(SaveToDiskUnlocked(pb)); |
594 | 1.56M | TRACE("Metadata flushed"); |
595 | | |
596 | 1.56M | return Status::OK(); |
597 | 1.56M | } |
598 | | |
599 | 2.17k | Status RaftGroupMetadata::ReplaceSuperBlock(const RaftGroupReplicaSuperBlockPB &pb) { |
600 | 2.17k | { |
601 | 2.17k | MutexLock l(flush_lock_); |
602 | 2.17k | RETURN_NOT_OK_PREPEND(SaveToDiskUnlocked(pb), "Unable to replace superblock"); |
603 | 2.17k | } |
604 | | |
605 | 2.17k | RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(pb, /* local_superblock = */ false), |
606 | 2.17k | "Failed to load data from superblock protobuf"); |
607 | | |
608 | 2.17k | return Status::OK(); |
609 | 2.17k | } |
610 | | |
611 | 1.56M | Status RaftGroupMetadata::SaveToDiskUnlocked(const RaftGroupReplicaSuperBlockPB &pb) { |
612 | 1.56M | flush_lock_.AssertAcquired(); |
613 | | |
614 | 1.56M | string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_); |
615 | 1.56M | RETURN_NOT_OK_PREPEND(pb_util::WritePBContainerToPath( |
616 | 1.56M | fs_manager_->env(), path, pb, |
617 | 1.56M | pb_util::OVERWRITE, pb_util::SYNC), |
618 | 1.56M | Substitute("Failed to write Raft group metadata $0", raft_group_id_)); |
619 | | |
620 | 1.56M | return Status::OK(); |
621 | 1.56M | } |
622 | | |
623 | 10.9k | Status RaftGroupMetadata::ReadSuperBlockFromDisk(RaftGroupReplicaSuperBlockPB* superblock) const { |
624 | 10.9k | string path = fs_manager_->GetRaftGroupMetadataPath(raft_group_id_); |
625 | 10.9k | RETURN_NOT_OK_PREPEND( |
626 | 10.9k | pb_util::ReadPBContainerFromPath(fs_manager_->env(), path, superblock), |
627 | 10.9k | Substitute("Could not load Raft group metadata from $0", path)); |
628 | | // Migration for backward compatibility with versions which don't have separate |
629 | | // TableType::TRANSACTION_STATUS_TABLE_TYPE. |
630 | 2.30k | if (superblock->obsolete_table_type() == TableType::REDIS_TABLE_TYPE && |
631 | 2.30k | superblock->obsolete_table_name() == kGlobalTransactionsTableName0 ) { |
632 | 0 | superblock->set_obsolete_table_type(TableType::TRANSACTION_STATUS_TABLE_TYPE); |
633 | 0 | } |
634 | 2.30k | return Status::OK(); |
635 | 10.9k | } |
636 | | |
637 | 149 | void RaftGroupMetadata::ToSuperBlock(RaftGroupReplicaSuperBlockPB* superblock) const { |
638 | | // acquire the lock so that rowsets_ doesn't get changed until we're finished. |
639 | 149 | std::lock_guard<MutexType> lock(data_mutex_); |
640 | 149 | ToSuperBlockUnlocked(superblock); |
641 | 149 | } |
642 | | |
643 | 1.56M | void RaftGroupMetadata::ToSuperBlockUnlocked(RaftGroupReplicaSuperBlockPB* superblock) const { |
644 | | // Convert to protobuf. |
645 | 1.56M | RaftGroupReplicaSuperBlockPB pb; |
646 | 1.56M | pb.set_raft_group_id(raft_group_id_); |
647 | 1.56M | partition_->ToPB(pb.mutable_partition()); |
648 | | |
649 | 1.56M | kv_store_.ToPB(primary_table_id_, pb.mutable_kv_store()); |
650 | | |
651 | 1.56M | pb.set_wal_dir(wal_dir_); |
652 | 1.56M | pb.set_tablet_data_state(tablet_data_state_); |
653 | 1.56M | if (!tombstone_last_logged_opid_.empty()) { |
654 | 150k | tombstone_last_logged_opid_.ToPB(pb.mutable_tombstone_last_logged_opid()); |
655 | 150k | } |
656 | | |
657 | 1.56M | pb.set_primary_table_id(primary_table_id_); |
658 | 1.56M | pb.set_colocated(colocated_); |
659 | 1.56M | pb.set_cdc_min_replicated_index(cdc_min_replicated_index_); |
660 | 1.56M | pb.set_is_under_twodc_replication(is_under_twodc_replication_); |
661 | 1.56M | pb.set_hidden(hidden_); |
662 | 1.56M | if (restoration_hybrid_time_) { |
663 | 1.55M | pb.set_restoration_hybrid_time(restoration_hybrid_time_.ToUint64()); |
664 | 1.55M | } |
665 | | |
666 | 1.56M | if (!split_op_id_.empty()) { |
667 | 160 | split_op_id_.ToPB(pb.mutable_split_op_id()); |
668 | 160 | auto& split_child_table_ids = *pb.mutable_split_child_tablet_ids(); |
669 | 160 | split_child_table_ids.Reserve(narrow_cast<int>(split_child_tablet_ids_.size())); |
670 | 320 | for (const auto& split_child_tablet_id : split_child_tablet_ids_) { |
671 | 320 | *split_child_table_ids.Add() = split_child_tablet_id; |
672 | 320 | } |
673 | 160 | } |
674 | | |
675 | 1.56M | if (!active_restorations_.empty()) { |
676 | 30 | auto& active_restorations = *pb.mutable_active_restorations(); |
677 | 30 | active_restorations.Reserve(narrow_cast<int>(active_restorations_.size())); |
678 | 30 | for (const auto& id : active_restorations_) { |
679 | 30 | active_restorations.Add()->assign(id.AsSlice().cdata(), id.size()); |
680 | 30 | } |
681 | 30 | } |
682 | | |
683 | 1.56M | superblock->Swap(&pb); |
684 | 1.56M | } |
685 | | |
686 | | void RaftGroupMetadata::SetSchema(const Schema& schema, |
687 | | const IndexMap& index_map, |
688 | | const std::vector<DeletedColumn>& deleted_cols, |
689 | | const uint32_t version, |
690 | 79.9k | const TableId& table_id) { |
691 | 79.9k | DCHECK(schema.has_column_ids()); |
692 | 79.9k | std::lock_guard<MutexType> lock(data_mutex_); |
693 | 79.9k | TableId target_table_id = table_id.empty() ? primary_table_id_1.90k : table_id78.0k ; |
694 | 79.9k | auto result = GetTableInfoUnlocked(target_table_id); |
695 | 79.9k | DCHECK(result.ok()); |
696 | 79.9k | TableInfoPtr new_table_info = std::make_shared<TableInfo>(*result.get(), |
697 | 79.9k | schema, |
698 | 79.9k | index_map, |
699 | 79.9k | deleted_cols, |
700 | 79.9k | version); |
701 | 79.9k | if (target_table_id != primary_table_id_) { |
702 | 348 | if (schema.table_properties().is_ysql_catalog_table()) { |
703 | | // TODO(alex): cotable_id should be copied from original schema, do we need this section? |
704 | | // Might be related to #5017, #6107 |
705 | 0 | Uuid cotable_id; |
706 | 0 | CHECK_OK(cotable_id.FromHexString(target_table_id)); |
707 | 0 | new_table_info->schema->set_cotable_id(cotable_id); |
708 | 0 | } |
709 | | // Ensure colocation ID remains unchanged. |
710 | 348 | const auto& old_schema = (*result)->schema; |
711 | 348 | CHECK(old_schema->has_colocation_id() == schema.has_colocation_id()) |
712 | 0 | << "Attempted to change colocation state for table " << table_id |
713 | 0 | << " from " << old_schema->has_colocation_id() |
714 | 0 | << " to " << schema.has_colocation_id(); |
715 | 348 | CHECK(!old_schema->has_colocation_id() || |
716 | 0 | old_schema->colocation_id() == schema.colocation_id()) |
717 | 0 | << "Attempted to change colocation ID for table " << table_id |
718 | 0 | << " from " << old_schema->colocation_id() |
719 | 0 | << " to " << schema.colocation_id(); |
720 | 348 | } |
721 | 79.9k | VLOG_WITH_PREFIX143 (1) << raft_group_id_ << " Updating table " << target_table_id |
722 | 143 | << " to Schema version " << version |
723 | 143 | << " from \n" << yb::ToString(kv_store_.tables[target_table_id]) |
724 | 143 | << " to \n" << yb::ToString(new_table_info); |
725 | 79.9k | kv_store_.tables[target_table_id].swap(new_table_info); |
726 | 79.9k | } |
727 | | |
728 | 0 | void RaftGroupMetadata::SetPartitionSchema(const PartitionSchema& partition_schema) { |
729 | 0 | std::lock_guard<MutexType> lock(data_mutex_); |
730 | 0 | auto& tables = kv_store_.tables; |
731 | 0 | DCHECK(tables.find(primary_table_id_) != tables.end()); |
732 | 0 | tables[primary_table_id_]->partition_schema = partition_schema; |
733 | 0 | } |
734 | | |
735 | | void RaftGroupMetadata::SetTableName( |
736 | 66.7k | const string& namespace_name, const string& table_name, const TableId& table_id) { |
737 | 66.7k | std::lock_guard<MutexType> lock(data_mutex_); |
738 | 66.7k | auto& tables = kv_store_.tables; |
739 | 18.4E | auto& id = table_id.empty()66.7k ? primary_table_id_66.8k : table_id; |
740 | 66.7k | DCHECK(tables.find(id) != tables.end()); |
741 | 66.7k | tables[id]->namespace_name = namespace_name; |
742 | 66.7k | tables[id]->table_name = table_name; |
743 | 66.7k | } |
744 | | |
745 | | void RaftGroupMetadata::AddTable(const std::string& table_id, |
746 | | const std::string& namespace_name, |
747 | | const std::string& table_name, |
748 | | const TableType table_type, |
749 | | const Schema& schema, |
750 | | const IndexMap& index_map, |
751 | | const PartitionSchema& partition_schema, |
752 | | const boost::optional<IndexInfo>& index_info, |
753 | 1.02M | const uint32_t schema_version) { |
754 | 1.02M | DCHECK(schema.has_column_ids()); |
755 | 1.02M | TableInfoPtr new_table_info = std::make_shared<TableInfo>(table_id, |
756 | 1.02M | namespace_name, |
757 | 1.02M | table_name, |
758 | 1.02M | table_type, |
759 | 1.02M | schema, |
760 | 1.02M | index_map, |
761 | 1.02M | index_info, |
762 | 1.02M | schema_version, |
763 | 1.02M | partition_schema); |
764 | 1.02M | if (table_id != primary_table_id_) { |
765 | 1.02M | if (schema.table_properties().is_ysql_catalog_table()) { |
766 | | // TODO(alex): cotable_id seems to be properly copied from schema, do we need this section? |
767 | | // Might be related to #5017, #6107 |
768 | 1.02M | Uuid cotable_id; |
769 | 1.02M | CHECK_OK(cotable_id.FromHexString(table_id)); |
770 | 1.02M | new_table_info->schema->set_cotable_id(cotable_id); |
771 | 1.02M | } |
772 | 1.02M | } |
773 | 1.02M | std::lock_guard<MutexType> lock(data_mutex_); |
774 | 1.02M | auto& tables = kv_store_.tables; |
775 | 1.02M | auto existing_table_iter = tables.find(table_id); |
776 | 1.02M | if (existing_table_iter != tables.end()) { |
777 | 0 | const auto& existing_table = *existing_table_iter->second.get(); |
778 | 0 | if (!existing_table.schema->table_properties().is_ysql_catalog_table() && |
779 | 0 | schema.table_properties().is_ysql_catalog_table()) { |
780 | | // This must be the one-time migration with transactional DDL being turned on for the first |
781 | | // time on this cluster. |
782 | 0 | } else { |
783 | 0 | LOG(DFATAL) << "Table " << table_id << " already exists. New table info: " |
784 | 0 | << new_table_info->ToString() << ", old table info: " << existing_table.ToString(); |
785 | | |
786 | | // We never expect colocation IDs to mismatch. |
787 | 0 | const auto& existing_schema = existing_table.schema; |
788 | 0 | CHECK(existing_schema->has_colocation_id() == schema.has_colocation_id()) |
789 | 0 | << "Attempted to change colocation state for table " << table_id |
790 | 0 | << " from " << existing_schema->has_colocation_id() |
791 | 0 | << " to " << schema.has_colocation_id(); |
792 | |
|
793 | 0 | CHECK(!existing_schema->has_colocation_id() || |
794 | 0 | existing_schema->colocation_id() == schema.colocation_id()) |
795 | 0 | << "Attempted to change colocation ID for table " << table_id |
796 | 0 | << " from " << existing_schema->colocation_id() |
797 | 0 | << " to " << schema.colocation_id(); |
798 | 0 | } |
799 | 0 | } |
800 | 1.02M | VLOG_WITH_PREFIX0 (1) << "Updating to Schema version " << schema_version |
801 | 0 | << " from\n" << yb::ToString(tables[table_id]) |
802 | 0 | << "\nto\n" << yb::ToString(new_table_info); |
803 | 1.02M | tables[table_id].swap(new_table_info); |
804 | 1.02M | } |
805 | | |
806 | 8.23k | void RaftGroupMetadata::RemoveTable(const TableId& table_id) { |
807 | 8.23k | std::lock_guard<MutexType> lock(data_mutex_); |
808 | 8.23k | auto& tables = kv_store_.tables; |
809 | 8.23k | tables.erase(table_id); |
810 | 8.23k | } |
811 | | |
812 | 534k | string RaftGroupMetadata::data_root_dir() const { |
813 | 534k | const auto& rocksdb_dir = kv_store_.rocksdb_dir; |
814 | 534k | if (rocksdb_dir.empty()) { |
815 | 0 | return ""; |
816 | 534k | } else { |
817 | 534k | auto data_root_dir = DirName(DirName(rocksdb_dir)); |
818 | 534k | if (strcmp(BaseName(data_root_dir).c_str(), FsManager::kRocksDBDirName) == 0534k ) { |
819 | 534k | data_root_dir = DirName(data_root_dir); |
820 | 534k | } |
821 | 534k | return data_root_dir; |
822 | 534k | } |
823 | 534k | } |
824 | | |
825 | 75.8k | string RaftGroupMetadata::wal_root_dir() const { |
826 | 75.8k | std::string wal_dir = this->wal_dir(); |
827 | | |
828 | 75.8k | if (wal_dir.empty()) { |
829 | 0 | return ""; |
830 | 0 | } |
831 | | |
832 | 75.8k | auto wal_root_dir = DirName(wal_dir); |
833 | 75.8k | if (strcmp(BaseName(wal_root_dir).c_str(), FsManager::kWalDirName) != 075.8k ) { |
834 | 75.8k | wal_root_dir = DirName(wal_root_dir); |
835 | 75.8k | } |
836 | 75.8k | return wal_root_dir; |
837 | 75.8k | } |
838 | | |
839 | 5.22k | void RaftGroupMetadata::set_wal_retention_secs(uint32 wal_retention_secs) { |
840 | 5.22k | std::lock_guard<MutexType> lock(data_mutex_); |
841 | 5.22k | auto it = kv_store_.tables.find(primary_table_id_); |
842 | 5.22k | if (it == kv_store_.tables.end()) { |
843 | 0 | LOG_WITH_PREFIX(DFATAL) << "Unable to set WAL retention time for primary table " |
844 | 0 | << primary_table_id_; |
845 | 0 | return; |
846 | 0 | } |
847 | 5.22k | it->second->wal_retention_secs = wal_retention_secs; |
848 | 5.22k | LOG_WITH_PREFIX(INFO) << "Set RaftGroupMetadata wal retention time to " |
849 | 5.22k | << wal_retention_secs << " seconds"; |
850 | 5.22k | } |
851 | | |
852 | 306k | uint32_t RaftGroupMetadata::wal_retention_secs() const { |
853 | 306k | std::lock_guard<MutexType> lock(data_mutex_); |
854 | 306k | auto it = kv_store_.tables.find(primary_table_id_); |
855 | 306k | if (it == kv_store_.tables.end()) { |
856 | 0 | return 0; |
857 | 0 | } |
858 | 306k | return it->second->wal_retention_secs; |
859 | 306k | } |
860 | | |
861 | 150k | Status RaftGroupMetadata::set_cdc_min_replicated_index(int64 cdc_min_replicated_index) { |
862 | 150k | { |
863 | 150k | std::lock_guard<MutexType> lock(data_mutex_); |
864 | 150k | cdc_min_replicated_index_ = cdc_min_replicated_index; |
865 | 150k | } |
866 | 150k | return Flush(); |
867 | 150k | } |
868 | | |
869 | 451k | int64_t RaftGroupMetadata::cdc_min_replicated_index() const { |
870 | 451k | std::lock_guard<MutexType> lock(data_mutex_); |
871 | 451k | return cdc_min_replicated_index_; |
872 | 451k | } |
873 | | |
874 | 0 | Status RaftGroupMetadata::SetIsUnderTwodcReplicationAndFlush(bool is_under_twodc_replication) { |
875 | 0 | { |
876 | 0 | std::lock_guard<MutexType> lock(data_mutex_); |
877 | 0 | is_under_twodc_replication_ = is_under_twodc_replication; |
878 | 0 | } |
879 | 0 | return Flush(); |
880 | 0 | } |
881 | | |
882 | 88.7k | bool RaftGroupMetadata::is_under_twodc_replication() const { |
883 | 88.7k | std::lock_guard<MutexType> lock(data_mutex_); |
884 | 88.7k | return is_under_twodc_replication_; |
885 | 88.7k | } |
886 | | |
887 | 66 | void RaftGroupMetadata::SetHidden(bool value) { |
888 | 66 | std::lock_guard<MutexType> lock(data_mutex_); |
889 | 66 | hidden_ = value; |
890 | 66 | } |
891 | | |
892 | 12.3M | bool RaftGroupMetadata::hidden() const { |
893 | 12.3M | std::lock_guard<MutexType> lock(data_mutex_); |
894 | 12.3M | return hidden_; |
895 | 12.3M | } |
896 | | |
897 | 29 | void RaftGroupMetadata::SetRestorationHybridTime(HybridTime value) { |
898 | 29 | std::lock_guard<MutexType> lock(data_mutex_); |
899 | 29 | restoration_hybrid_time_ = std::max(restoration_hybrid_time_, value); |
900 | 29 | } |
901 | | |
902 | 150k | HybridTime RaftGroupMetadata::restoration_hybrid_time() const { |
903 | 150k | std::lock_guard<MutexType> lock(data_mutex_); |
904 | 150k | return restoration_hybrid_time_; |
905 | 150k | } |
906 | | |
907 | 135 | void RaftGroupMetadata::set_tablet_data_state(TabletDataState state) { |
908 | 135 | std::lock_guard<MutexType> lock(data_mutex_); |
909 | 135 | tablet_data_state_ = state; |
910 | 135 | } |
911 | | |
912 | 155k | string RaftGroupMetadata::LogPrefix() const { |
913 | 155k | return consensus::MakeTabletLogPrefix(raft_group_id_, fs_manager_->uuid()); |
914 | 155k | } |
915 | | |
916 | 75.9k | OpId RaftGroupMetadata::tombstone_last_logged_opid() const { |
917 | 75.9k | std::lock_guard<MutexType> lock(data_mutex_); |
918 | 75.9k | return tombstone_last_logged_opid_; |
919 | 75.9k | } |
920 | | |
921 | 790k | bool RaftGroupMetadata::colocated() const { |
922 | 790k | std::lock_guard<MutexType> lock(data_mutex_); |
923 | 790k | return colocated_; |
924 | 790k | } |
925 | | |
926 | 48.8M | TabletDataState RaftGroupMetadata::tablet_data_state() const { |
927 | 48.8M | std::lock_guard<MutexType> lock(data_mutex_); |
928 | 48.8M | return tablet_data_state_; |
929 | 48.8M | } |
930 | | |
931 | 33 | std::array<TabletId, kNumSplitParts> RaftGroupMetadata::split_child_tablet_ids() const { |
932 | 33 | std::lock_guard<MutexType> lock(data_mutex_); |
933 | 33 | return split_child_tablet_ids_; |
934 | 33 | } |
935 | | |
936 | 68 | OpId RaftGroupMetadata::split_op_id() const { |
937 | 68 | std::lock_guard<MutexType> lock(data_mutex_); |
938 | 68 | return split_op_id_; |
939 | 68 | } |
940 | | |
941 | 131k | OpId RaftGroupMetadata::GetOpIdToDeleteAfterAllApplied() const { |
942 | 131k | std::lock_guard<MutexType> lock(data_mutex_); |
943 | 131k | if (tablet_data_state_ != TabletDataState::TABLET_DATA_SPLIT_COMPLETED || hidden_59 ) { |
944 | 131k | return OpId::Invalid(); |
945 | 131k | } |
946 | 59 | return split_op_id_; |
947 | 131k | } |
948 | | |
949 | | void RaftGroupMetadata::SetSplitDone( |
950 | 67 | const OpId& op_id, const TabletId& child1, const TabletId& child2) { |
951 | 67 | std::lock_guard<MutexType> lock(data_mutex_); |
952 | 67 | tablet_data_state_ = TabletDataState::TABLET_DATA_SPLIT_COMPLETED; |
953 | 67 | split_op_id_ = op_id; |
954 | 67 | split_child_tablet_ids_[0] = child1; |
955 | 67 | split_child_tablet_ids_[1] = child2; |
956 | 67 | } |
957 | | |
958 | 150k | bool RaftGroupMetadata::has_active_restoration() const { |
959 | 150k | std::lock_guard<MutexType> lock(data_mutex_); |
960 | 150k | return !active_restorations_.empty(); |
961 | 150k | } |
962 | | |
963 | 30 | void RaftGroupMetadata::RegisterRestoration(const TxnSnapshotRestorationId& restoration_id) { |
964 | 30 | std::lock_guard<MutexType> lock(data_mutex_); |
965 | 30 | if (tablet_data_state_ == TabletDataState::TABLET_DATA_SPLIT_COMPLETED) { |
966 | 0 | tablet_data_state_ = TabletDataState::TABLET_DATA_READY; |
967 | 0 | split_op_id_ = OpId(); |
968 | 0 | split_child_tablet_ids_[0] = std::string(); |
969 | 0 | split_child_tablet_ids_[1] = std::string(); |
970 | 0 | } |
971 | 30 | active_restorations_.push_back(restoration_id); |
972 | 30 | } |
973 | | |
974 | 29 | void RaftGroupMetadata::UnregisterRestoration(const TxnSnapshotRestorationId& restoration_id) { |
975 | 29 | std::lock_guard<MutexType> lock(data_mutex_); |
976 | 29 | Erase(restoration_id, &active_restorations_); |
977 | 29 | } |
978 | | |
979 | | HybridTime RaftGroupMetadata::CheckCompleteRestorations( |
980 | 2.71k | const RestorationCompleteTimeMap& restoration_complete_time) { |
981 | 2.71k | std::lock_guard<MutexType> lock(data_mutex_); |
982 | 2.71k | auto result = HybridTime::kMin; |
983 | 2.71k | for (const auto& restoration_id : active_restorations_) { |
984 | 3 | auto it = restoration_complete_time.find(restoration_id); |
985 | 3 | if (it != restoration_complete_time.end() && it->second) { |
986 | 3 | result = std::max(result, it->second); |
987 | 3 | } |
988 | 3 | } |
989 | 2.71k | return result; |
990 | 2.71k | } |
991 | | |
992 | | bool RaftGroupMetadata::CleanupRestorations( |
993 | 2.71k | const RestorationCompleteTimeMap& restoration_complete_time) { |
994 | 2.71k | bool result = false; |
995 | 2.71k | std::lock_guard<MutexType> lock(data_mutex_); |
996 | 2.71k | for (auto it = active_restorations_.begin(); it != active_restorations_.end();) { |
997 | 3 | auto known_restoration_it = restoration_complete_time.find(*it); |
998 | 3 | if (known_restoration_it == restoration_complete_time.end() || known_restoration_it->second) { |
999 | 3 | it = active_restorations_.erase(it); |
1000 | 3 | result = true; |
1001 | 3 | } else { |
1002 | 0 | ++it; |
1003 | 0 | } |
1004 | 3 | } |
1005 | 2.71k | return result; |
1006 | 2.71k | } |
1007 | | |
1008 | 277 | std::string RaftGroupMetadata::GetSubRaftGroupWalDir(const RaftGroupId& raft_group_id) const { |
1009 | 277 | return JoinPathSegments(DirName(wal_dir_), MakeTabletDirName(raft_group_id)); |
1010 | 277 | } |
1011 | | |
1012 | 277 | std::string RaftGroupMetadata::GetSubRaftGroupDataDir(const RaftGroupId& raft_group_id) const { |
1013 | 277 | return JoinPathSegments(DirName(kv_store_.rocksdb_dir), MakeTabletDirName(raft_group_id)); |
1014 | 277 | } |
1015 | | |
1016 | | // We directly init fields of a new metadata, so have to use NO_THREAD_SAFETY_ANALYSIS here. |
1017 | | Result<RaftGroupMetadataPtr> RaftGroupMetadata::CreateSubtabletMetadata( |
1018 | | const RaftGroupId& raft_group_id, const Partition& partition, |
1019 | | const std::string& lower_bound_key, const std::string& upper_bound_key) |
1020 | 141 | const NO_THREAD_SAFETY_ANALYSIS { |
1021 | 141 | RaftGroupReplicaSuperBlockPB superblock; |
1022 | 141 | ToSuperBlock(&superblock); |
1023 | | |
1024 | 141 | RaftGroupMetadataPtr metadata(new RaftGroupMetadata(fs_manager_, raft_group_id_)); |
1025 | 141 | RETURN_NOT_OK(metadata->LoadFromSuperBlock(superblock, /* local_superblock = */ true)); |
1026 | 141 | metadata->raft_group_id_ = raft_group_id; |
1027 | 141 | metadata->wal_dir_ = GetSubRaftGroupWalDir(raft_group_id); |
1028 | 141 | metadata->kv_store_.kv_store_id = KvStoreId(raft_group_id); |
1029 | 141 | metadata->kv_store_.lower_bound_key = lower_bound_key; |
1030 | 141 | metadata->kv_store_.upper_bound_key = upper_bound_key; |
1031 | 141 | metadata->kv_store_.rocksdb_dir = GetSubRaftGroupDataDir(raft_group_id); |
1032 | 141 | metadata->kv_store_.has_been_fully_compacted = false; |
1033 | 141 | *metadata->partition_ = partition; |
1034 | 141 | metadata->state_ = kInitialized; |
1035 | 141 | metadata->tablet_data_state_ = TABLET_DATA_INIT_STARTED; |
1036 | 141 | RETURN_NOT_OK(metadata->Flush()); |
1037 | 141 | return metadata; |
1038 | 141 | } |
1039 | | |
1040 | 177 | Result<std::string> RaftGroupMetadata::TopSnapshotsDir() const { |
1041 | 177 | auto result = snapshots_dir(); |
1042 | 177 | RETURN_NOT_OK_PREPEND( |
1043 | 177 | fs_manager()->CreateDirIfMissingAndSync(result), |
1044 | 177 | Format("Unable to create snapshots directory $0", result)); |
1045 | 173 | return result; |
1046 | 177 | } |
1047 | | |
1048 | | namespace { |
1049 | | // MigrateSuperblockForDXXXX functions are only needed for backward compatibility with |
1050 | | // YugabyteDB versions which don't have changes from DXXXX revision. |
1051 | | // Each MigrateSuperblockForDXXXX could be removed after all YugabyteDB installations are |
1052 | | // upgraded to have revision DXXXX. |
1053 | | |
1054 | 2.02k | CHECKED_STATUS MigrateSuperblockForD5900(RaftGroupReplicaSuperBlockPB* superblock) { |
1055 | | // In previous version of superblock format we stored primary table metadata in superblock's |
1056 | | // top-level fields (deprecated table_* and other). TableInfo objects were stored inside |
1057 | | // RaftGroupReplicaSuperBlockPB.tables. |
1058 | | // |
1059 | | // In new format TableInfo objects and some other top-level fields are moved from superblock's |
1060 | | // top-level fields into RaftGroupReplicaSuperBlockPB.kv_store. Primary table (see |
1061 | | // RaftGroupMetadata::primary_table_id_ field description) metadata is stored inside one of |
1062 | | // RaftGroupReplicaSuperBlockPB.kv_store.tables objects and is referenced by |
1063 | | // RaftGroupReplicaSuperBlockPB.primary_table_id. |
1064 | 2.02k | if (superblock->has_kv_store()) { |
1065 | 2.02k | return Status::OK(); |
1066 | 2.02k | } |
1067 | | |
1068 | 0 | LOG(INFO) << "Migrating superblock for raft group " << superblock->raft_group_id(); |
1069 | |
|
1070 | 0 | KvStoreInfoPB* kv_store_pb = superblock->mutable_kv_store(); |
1071 | 0 | kv_store_pb->set_kv_store_id(superblock->raft_group_id()); |
1072 | 0 | kv_store_pb->set_rocksdb_dir(superblock->obsolete_rocksdb_dir()); |
1073 | 0 | kv_store_pb->mutable_rocksdb_files()->CopyFrom(superblock->obsolete_rocksdb_files()); |
1074 | 0 | kv_store_pb->mutable_snapshot_files()->CopyFrom(superblock->obsolete_snapshot_files()); |
1075 | |
|
1076 | 0 | TableInfoPB* primary_table = kv_store_pb->add_tables(); |
1077 | 0 | primary_table->set_table_id(superblock->primary_table_id()); |
1078 | 0 | primary_table->set_table_name(superblock->obsolete_table_name()); |
1079 | 0 | primary_table->set_table_type(superblock->obsolete_table_type()); |
1080 | 0 | primary_table->mutable_schema()->CopyFrom(superblock->obsolete_schema()); |
1081 | 0 | primary_table->set_schema_version(superblock->obsolete_schema_version()); |
1082 | 0 | primary_table->mutable_partition_schema()->CopyFrom(superblock->obsolete_partition_schema()); |
1083 | 0 | primary_table->mutable_indexes()->CopyFrom(superblock->obsolete_indexes()); |
1084 | 0 | primary_table->mutable_index_info()->CopyFrom(superblock->obsolete_index_info()); |
1085 | 0 | primary_table->mutable_deleted_cols()->CopyFrom(superblock->obsolete_deleted_cols()); |
1086 | |
|
1087 | 0 | kv_store_pb->mutable_tables()->MergeFrom(superblock->obsolete_tables()); |
1088 | |
|
1089 | 0 | return Status::OK(); |
1090 | 2.02k | } |
1091 | | |
1092 | | } // namespace |
1093 | | |
1094 | 2.02k | Status MigrateSuperblock(RaftGroupReplicaSuperBlockPB* superblock) { |
1095 | 2.02k | return MigrateSuperblockForD5900(superblock); |
1096 | 2.02k | } |
1097 | | |
1098 | | std::shared_ptr<std::vector<DeletedColumn>> RaftGroupMetadata::deleted_cols( |
1099 | 494 | const TableId& table_id) const { |
1100 | 494 | DCHECK_NE(state_, kNotLoadedYet); |
1101 | 494 | const TableInfoPtr table_info = |
1102 | 494 | table_id.empty() ? primary_table_info() : CHECK_RESULT(GetTableInfo(table_id)); |
1103 | 494 | return std::shared_ptr<std::vector<DeletedColumn>>(table_info, &table_info->deleted_cols); |
1104 | 494 | } |
1105 | | |
1106 | 158k | std::string RaftGroupMetadata::namespace_name(const TableId& table_id) const { |
1107 | 158k | DCHECK_NE(state_, kNotLoadedYet); |
1108 | 158k | if (table_id.empty()) { |
1109 | 158k | return primary_table_info()->namespace_name; |
1110 | 158k | } |
1111 | 23 | const auto& table_info = CHECK_RESULT(GetTableInfo(table_id)); |
1112 | 23 | return table_info->namespace_name; |
1113 | 158k | } |
1114 | | |
1115 | 3.46M | std::string RaftGroupMetadata::table_name(const TableId& table_id) const { |
1116 | 3.46M | DCHECK_NE(state_, kNotLoadedYet); |
1117 | 3.46M | if (table_id.empty()) { |
1118 | 3.46M | return primary_table_info()->table_name; |
1119 | 3.46M | } |
1120 | 503 | const auto& table_info = CHECK_RESULT(GetTableInfo(table_id)); |
1121 | 503 | return table_info->table_name; |
1122 | 3.46M | } |
1123 | | |
1124 | 6.02M | TableType RaftGroupMetadata::table_type(const TableId& table_id) const { |
1125 | 6.02M | DCHECK_NE(state_, kNotLoadedYet); |
1126 | 6.02M | if (table_id.empty()) { |
1127 | 6.02M | return primary_table_info()->table_type; |
1128 | 6.02M | } |
1129 | 1.49k | const auto& table_info = CHECK_RESULT(GetTableInfo(table_id)); |
1130 | 1.49k | return table_info->table_type; |
1131 | 6.02M | } |
1132 | | |
1133 | 40.3M | yb::SchemaPtr RaftGroupMetadata::schema(const TableId& table_id) const { |
1134 | 40.3M | DCHECK_NE(state_, kNotLoadedYet); |
1135 | 40.3M | const TableInfoPtr table_info = |
1136 | 40.3M | table_id.empty() ? primary_table_info()40.2M : CHECK_RESULT(GetTableInfo(table_id)); |
1137 | 40.3M | return yb::SchemaPtr(table_info, table_info->schema.get()); |
1138 | 40.3M | } |
1139 | | |
1140 | 4.43k | std::shared_ptr<IndexMap> RaftGroupMetadata::index_map(const TableId& table_id) const { |
1141 | 4.43k | DCHECK_NE(state_, kNotLoadedYet); |
1142 | 4.43k | const TableInfoPtr table_info = |
1143 | 4.43k | table_id.empty() ? primary_table_info()30 : CHECK_RESULT(GetTableInfo(table_id)); |
1144 | 4.43k | return std::shared_ptr<IndexMap>(table_info, table_info->index_map.get()); |
1145 | 4.43k | } |
1146 | | |
1147 | 8.23M | uint32_t RaftGroupMetadata::schema_version(const TableId& table_id) const { |
1148 | 8.23M | DCHECK_NE(state_, kNotLoadedYet); |
1149 | 8.23M | const TableInfoPtr table_info = |
1150 | 8.23M | table_id.empty() ? primary_table_info()8.23M : CHECK_RESULT(GetTableInfo(table_id)); |
1151 | 8.23M | return table_info->schema_version; |
1152 | 8.23M | } |
1153 | | |
1154 | 0 | const std::string& RaftGroupMetadata::indexed_table_id(const TableId& table_id) const { |
1155 | 0 | DCHECK_NE(state_, kNotLoadedYet); |
1156 | 0 | static const std::string kEmptyString = ""; |
1157 | 0 | std::lock_guard<MutexType> lock(data_mutex_); |
1158 | 0 | const TableInfoPtr table_info = table_id.empty() ? |
1159 | 0 | primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id)); |
1160 | 0 | const auto* index_info = table_info->index_info.get(); |
1161 | 0 | return index_info ? index_info->indexed_table_id() : kEmptyString; |
1162 | 0 | } |
1163 | | |
1164 | 0 | bool RaftGroupMetadata::is_local_index(const TableId& table_id) const { |
1165 | 0 | DCHECK_NE(state_, kNotLoadedYet); |
1166 | 0 | std::lock_guard<MutexType> lock(data_mutex_); |
1167 | 0 | const TableInfoPtr table_info = table_id.empty() ? |
1168 | 0 | primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id)); |
1169 | 0 | const auto* index_info = table_info->index_info.get(); |
1170 | 0 | return index_info && index_info->is_local(); |
1171 | 0 | } |
1172 | | |
1173 | 2.08M | bool RaftGroupMetadata::is_unique_index(const TableId& table_id) const { |
1174 | 2.08M | DCHECK_NE(state_, kNotLoadedYet); |
1175 | 2.08M | std::lock_guard<MutexType> lock(data_mutex_); |
1176 | 2.08M | const TableInfoPtr table_info = table_id.empty() ? |
1177 | 2.08M | primary_table_info_unlocked() : CHECK_RESULT2.08M (GetTableInfoUnlocked(table_id)); |
1178 | 2.08M | const auto* index_info = table_info->index_info.get(); |
1179 | 2.08M | return index_info && index_info->is_unique()35.1k ; |
1180 | 2.08M | } |
1181 | | |
1182 | 0 | std::vector<ColumnId> RaftGroupMetadata::index_key_column_ids(const TableId& table_id) const { |
1183 | 0 | DCHECK_NE(state_, kNotLoadedYet); |
1184 | 0 | std::lock_guard<MutexType> lock(data_mutex_); |
1185 | 0 | const TableInfoPtr table_info = table_id.empty() ? |
1186 | 0 | primary_table_info_unlocked() : CHECK_RESULT(GetTableInfoUnlocked(table_id)); |
1187 | 0 | const auto* index_info = table_info->index_info.get(); |
1188 | 0 | return index_info ? index_info->index_key_column_ids() : std::vector<ColumnId>(); |
1189 | 0 | } |
1190 | | |
1191 | 5.55M | bool RaftGroupMetadata::UsePartialRangeKeyIntents() const { |
1192 | 5.55M | return table_type() == TableType::PGSQL_TABLE_TYPE; |
1193 | 5.55M | } |
1194 | | |
1195 | | } // namespace tablet |
1196 | | } // namespace yb |