/Users/deen/code/yugabyte-db/src/yb/tserver/remote_bootstrap_client.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // |
18 | | // The following only applies to changes made to this file as part of YugaByte development. |
19 | | // |
20 | | // Portions Copyright (c) YugaByte, Inc. |
21 | | // |
22 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
23 | | // in compliance with the License. You may obtain a copy of the License at |
24 | | // |
25 | | // http://www.apache.org/licenses/LICENSE-2.0 |
26 | | // |
27 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
28 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
29 | | // or implied. See the License for the specific language governing permissions and limitations |
30 | | // under the License. |
31 | | // |
32 | | |
33 | | #include "yb/tserver/remote_bootstrap_client.h" |
34 | | |
35 | | #include <glog/logging.h> |
36 | | |
37 | | #include "yb/common/index.h" |
38 | | #include "yb/common/schema.h" |
39 | | #include "yb/common/wire_protocol.h" |
40 | | |
41 | | #include "yb/consensus/consensus.h" |
42 | | #include "yb/consensus/consensus_meta.h" |
43 | | #include "yb/consensus/metadata.pb.h" |
44 | | |
45 | | #include "yb/fs/fs_manager.h" |
46 | | |
47 | | #include "yb/gutil/strings/substitute.h" |
48 | | #include "yb/gutil/walltime.h" |
49 | | |
50 | | #include "yb/rpc/rpc_controller.h" |
51 | | |
52 | | #include "yb/tablet/tablet.pb.h" |
53 | | #include "yb/tablet/tablet_bootstrap_if.h" |
54 | | #include "yb/tablet/tablet_metadata.h" |
55 | | |
56 | | #include "yb/tserver/remote_bootstrap.pb.h" |
57 | | #include "yb/tserver/remote_bootstrap.proxy.h" |
58 | | #include "yb/tserver/remote_bootstrap_snapshots.h" |
59 | | #include "yb/tserver/ts_tablet_manager.h" |
60 | | |
61 | | #include "yb/util/env.h" |
62 | | #include "yb/util/env_util.h" |
63 | | #include "yb/util/fault_injection.h" |
64 | | #include "yb/util/flag_tags.h" |
65 | | #include "yb/util/logging.h" |
66 | | #include "yb/util/net/net_util.h" |
67 | | #include "yb/util/result.h" |
68 | | #include "yb/util/scope_exit.h" |
69 | | #include "yb/util/size_literals.h" |
70 | | #include "yb/util/status_log.h" |
71 | | |
72 | | using namespace yb::size_literals; |
73 | | |
74 | | DEFINE_int32(remote_bootstrap_begin_session_timeout_ms, 5000, |
75 | | "Tablet server RPC client timeout for BeginRemoteBootstrapSession calls."); |
76 | | TAG_FLAG(remote_bootstrap_begin_session_timeout_ms, hidden); |
77 | | |
78 | | DEFINE_int32(remote_bootstrap_end_session_timeout_sec, 15, |
79 | | "Tablet server RPC client timeout for EndRemoteBootstrapSession calls. " |
80 | | "The timeout is usually a large value because we have to wait for the remote server " |
81 | | "to get a CHANGE_ROLE config change accepted."); |
82 | | TAG_FLAG(remote_bootstrap_end_session_timeout_sec, hidden); |
83 | | |
84 | | DEFINE_bool(remote_bootstrap_save_downloaded_metadata, false, |
85 | | "Save copies of the downloaded remote bootstrap files for debugging purposes. " |
86 | | "Note: This is only intended for debugging and should not be normally used!"); |
87 | | TAG_FLAG(remote_bootstrap_save_downloaded_metadata, advanced); |
88 | | TAG_FLAG(remote_bootstrap_save_downloaded_metadata, hidden); |
89 | | TAG_FLAG(remote_bootstrap_save_downloaded_metadata, runtime); |
90 | | |
91 | | DEFINE_int32(committed_config_change_role_timeout_sec, 30, |
92 | | "Number of seconds to wait for the CHANGE_ROLE to be in the committed config before " |
93 | | "timing out. "); |
94 | | TAG_FLAG(committed_config_change_role_timeout_sec, hidden); |
95 | | |
96 | | DEFINE_test_flag(double, fault_crash_bootstrap_client_before_changing_role, 0.0, |
97 | | "The remote bootstrap client will crash before closing the session with the " |
98 | | "leader. Because the session won't be closed successfully, the leader won't issue " |
99 | | "a ChangeConfig request to change this tserver role *(from PRE_VOTER or " |
100 | | "PRE_OBSERVER to VOTER or OBSERVER respectively)."); |
101 | | |
102 | | DEFINE_test_flag(int32, simulate_long_remote_bootstrap_sec, 0, |
103 | | "The remote bootstrap client will take at least this number of seconds to finish. " |
104 | | "We use this for testing a scenario where a remote bootstrap takes longer than " |
105 | | "follower_unavailable_considered_failed_sec seconds."); |
106 | | |
107 | | DEFINE_test_flag(bool, download_partial_wal_segments, false, ""); |
108 | | |
109 | | DECLARE_int32(bytes_remote_bootstrap_durable_write_mb); |
110 | | |
111 | | namespace yb { |
112 | | namespace tserver { |
113 | | |
114 | | using consensus::ConsensusMetadata; |
115 | | using consensus::ConsensusStatePB; |
116 | | using consensus::PeerMemberType; |
117 | | using consensus::RaftConfigPB; |
118 | | using consensus::RaftPeerPB; |
119 | | using env_util::CopyFile; |
120 | | using rpc::Messenger; |
121 | | using std::shared_ptr; |
122 | | using std::string; |
123 | | using std::vector; |
124 | | using strings::Substitute; |
125 | | using tablet::TabletDataState; |
126 | | using tablet::TabletDataState_Name; |
127 | | using tablet::RaftGroupMetadata; |
128 | | using tablet::RaftGroupMetadataPtr; |
129 | | using tablet::TabletStatusListener; |
130 | | using tablet::RaftGroupReplicaSuperBlockPB; |
131 | | |
// Count of RemoteBootstrapClient objects whose Start() completed successfully and that have not
// been destroyed yet: Start() increments it once started_ is set, and the destructor decrements
// it for started clients. Both sites log DFATAL if the observed count is out of range.
132 | | std::atomic<int32_t> remote_bootstrap_clients_started_{0}; |
133 | | |
134 | | RemoteBootstrapClient::RemoteBootstrapClient(std::string tablet_id, FsManager* fs_manager) |
135 | | : tablet_id_(std::move(tablet_id)), |
136 | | log_prefix_(Format("T $0 P $1: Remote bootstrap client: ", tablet_id_, fs_manager->uuid())), |
137 | 1.44k | downloader_(&log_prefix_, fs_manager) { |
138 | 1.44k | AddComponent<RemoteBootstrapSnapshotsComponent>(); |
139 | 1.44k | } |
140 | | |
141 | 1.43k | RemoteBootstrapClient::~RemoteBootstrapClient() { |
142 | | // Note: Ending the remote bootstrap session releases anchors on the remote. |
143 | | // This assumes that succeeded_ only gets set to true in Finish() just before calling |
144 | | // EndRemoteSession. If this didn't happen, then close the session here. |
145 | 1.43k | if (!succeeded_) { |
146 | 452 | LOG_WITH_PREFIX(INFO) << "Closing remote bootstrap session " << session_id() |
147 | 452 | << " in RemoteBootstrapClient destructor."; |
148 | 452 | WARN_NOT_OK(EndRemoteSession(), |
149 | 452 | LogPrefix() + "Unable to close remote bootstrap session " + session_id()); |
150 | 452 | } |
151 | 1.43k | if (started_) { |
152 | 985 | auto old_count = remote_bootstrap_clients_started_.fetch_sub(1, std::memory_order_acq_rel); |
153 | 985 | if (old_count < 1) { |
154 | 0 | LOG_WITH_PREFIX(DFATAL) << "Invalid number of remote bootstrap sessions: " << old_count; |
155 | 0 | } |
156 | 985 | } |
157 | 1.43k | } |
158 | | |
159 | | Status RemoteBootstrapClient::SetTabletToReplace(const RaftGroupMetadataPtr& meta, |
160 | 102 | int64_t caller_term) { |
161 | 102 | CHECK_EQ(tablet_id_, meta->raft_group_id()); |
162 | 102 | TabletDataState data_state = meta->tablet_data_state(); |
163 | 102 | if (data_state != tablet::TABLET_DATA_TOMBSTONED) { |
164 | 0 | return STATUS(IllegalState, Substitute("Tablet $0 not in tombstoned state: $1 ($2)", |
165 | 0 | tablet_id_, |
166 | 0 | TabletDataState_Name(data_state), |
167 | 0 | data_state)); |
168 | 0 | } |
169 | | |
170 | 102 | replace_tombstoned_tablet_ = true; |
171 | 102 | meta_ = meta; |
172 | | |
173 | 102 | int64_t last_logged_term = meta->tombstone_last_logged_opid().term; |
174 | 102 | if (last_logged_term > caller_term) { |
175 | 0 | return STATUS(InvalidArgument, |
176 | 0 | Substitute("Leader has term $0 but the last log entry written by the tombstoned replica " |
177 | 0 | "for tablet $1 has higher term $2. Refusing remote bootstrap from leader", |
178 | 0 | caller_term, tablet_id_, last_logged_term)); |
179 | 0 | } |
180 | | |
181 | | // Load the old consensus metadata, if it exists. |
182 | 102 | std::unique_ptr<ConsensusMetadata> cmeta; |
183 | 102 | Status s = ConsensusMetadata::Load( |
184 | 102 | &fs_manager(), tablet_id_, permanent_uuid(), &cmeta); |
185 | 102 | if (s.IsNotFound()) { |
186 | | // The consensus metadata was not written to disk, possibly due to a failed |
187 | | // remote bootstrap. |
188 | 0 | return Status::OK(); |
189 | 0 | } |
190 | 102 | RETURN_NOT_OK(s); |
191 | 102 | cmeta_ = std::move(cmeta); |
192 | 102 | return Status::OK(); |
193 | 102 | } |
194 | | |
195 | | Status RemoteBootstrapClient::Start(const string& bootstrap_peer_uuid, |
196 | | rpc::ProxyCache* proxy_cache, |
197 | | const HostPort& bootstrap_peer_addr, |
198 | | RaftGroupMetadataPtr* meta, |
199 | 1.44k | TSTabletManager* ts_manager) { |
200 | 1.44k | CHECK(!started_); |
201 | 1.44k | start_time_micros_ = GetCurrentTimeMicros(); |
202 | | |
203 | 1.44k | LOG_WITH_PREFIX(INFO) << "Beginning remote bootstrap session" |
204 | 1.44k | << " from remote peer at address " << bootstrap_peer_addr.ToString(); |
205 | | |
206 | | // Set up an RPC proxy for the RemoteBootstrapService. |
207 | 1.44k | proxy_.reset(new RemoteBootstrapServiceProxy(proxy_cache, bootstrap_peer_addr)); |
208 | | |
209 | 1.44k | BeginRemoteBootstrapSessionRequestPB req; |
210 | 1.44k | req.set_requestor_uuid(permanent_uuid()); |
211 | 1.44k | req.set_tablet_id(tablet_id_); |
212 | | |
213 | 1.44k | rpc::RpcController controller; |
214 | 1.44k | controller.set_timeout(MonoDelta::FromMilliseconds( |
215 | 1.44k | FLAGS_remote_bootstrap_begin_session_timeout_ms)); |
216 | | |
217 | | // Begin the remote bootstrap session with the remote peer. |
218 | 1.44k | BeginRemoteBootstrapSessionResponsePB resp; |
219 | 1.44k | auto status = |
220 | 1.44k | UnwindRemoteError(proxy_->BeginRemoteBootstrapSession(req, &resp, &controller), controller); |
221 | | |
222 | 1.44k | if (!status.ok()) { |
223 | 14 | status = status.CloneAndPrepend("Unable to begin remote bootstrap session"); |
224 | 14 | LOG_WITH_PREFIX(WARNING) << status; |
225 | 14 | return status; |
226 | 14 | } |
227 | | |
228 | 1.43k | remote_tablet_data_state_ = resp.superblock().tablet_data_state(); |
229 | 1.43k | if (!CanServeTabletData(remote_tablet_data_state_)) { |
230 | 0 | Status s = STATUS(IllegalState, "Remote peer (" + bootstrap_peer_uuid + ")" + |
231 | 0 | " is currently remotely bootstrapping itself!", |
232 | 0 | resp.superblock().ShortDebugString()); |
233 | 0 | LOG_WITH_PREFIX(WARNING) << s.ToString(); |
234 | 0 | return s; |
235 | 0 | } |
236 | | |
237 | 1.43k | LOG_WITH_PREFIX(INFO) << "Received superblock: " << resp.superblock().ShortDebugString(); |
238 | 1.43k | RETURN_NOT_OK(MigrateSuperblock(resp.mutable_superblock())); |
239 | | |
240 | 1.43k | auto* kv_store = resp.mutable_superblock()->mutable_kv_store(); |
241 | 1.43k | LOG_WITH_PREFIX(INFO) << "RocksDB files: " << yb::ToString(kv_store->rocksdb_files()); |
242 | 1.43k | LOG_WITH_PREFIX(INFO) << "Snapshot files: " << yb::ToString(kv_store->snapshot_files()); |
243 | 1.43k | if (first_wal_seqno_) { |
244 | 0 | LOG_WITH_PREFIX(INFO) << "First WAL segment: " << first_wal_seqno_; |
245 | 1.43k | } else { |
246 | 1.43k | LOG_WITH_PREFIX(INFO) << "Log files: " << yb::ToString(resp.deprecated_wal_segment_seqnos()); |
247 | 1.43k | } |
248 | | |
249 | 1.43k | const TableId table_id = resp.superblock().primary_table_id(); |
250 | 1.43k | const bool colocated = resp.superblock().colocated(); |
251 | 1.43k | const tablet::TableInfoPB* table_ptr = nullptr; |
252 | 1.43k | for (auto& table_pb : kv_store->tables()) { |
253 | 1.43k | if (table_pb.table_id() == table_id) { |
254 | 1.43k | table_ptr = &table_pb; |
255 | 1.43k | break; |
256 | 1.43k | } |
257 | 1.43k | } |
258 | 1.43k | if (!table_ptr) { |
259 | 0 | return STATUS(InvalidArgument, Format( |
260 | 0 | "Tablet $0: Superblock's KV-store doesn't contain primary table $1", tablet_id_, |
261 | 0 | table_id)); |
262 | 0 | } |
263 | 1.43k | const auto& table = *table_ptr; |
264 | | |
265 | 1.43k | downloader_.Start( |
266 | 1.43k | proxy_, resp.session_id(), MonoDelta::FromMilliseconds(resp.session_idle_timeout_millis())); |
267 | 1.43k | LOG_WITH_PREFIX(INFO) << "Began remote bootstrap session " << session_id(); |
268 | | |
269 | 1.43k | superblock_.reset(resp.release_superblock()); |
270 | | |
271 | | // Clear fields rocksdb_dir and wal_dir so we get an error if we try to use them without setting |
272 | | // them to the right path. |
273 | 1.43k | kv_store->clear_rocksdb_dir(); |
274 | 1.43k | superblock_->clear_wal_dir(); |
275 | | |
276 | 1.43k | superblock_->set_tablet_data_state(tablet::TABLET_DATA_COPYING); |
277 | 1.43k | wal_seqnos_.assign(resp.deprecated_wal_segment_seqnos().begin(), |
278 | 1.43k | resp.deprecated_wal_segment_seqnos().end()); |
279 | 1.43k | if (resp.has_first_wal_segment_seqno()) { |
280 | 1.43k | first_wal_seqno_ = resp.first_wal_segment_seqno(); |
281 | 2 | } else { |
282 | 2 | first_wal_seqno_ = 0; |
283 | 2 | } |
284 | 1.43k | remote_committed_cstate_.reset(resp.release_initial_committed_cstate()); |
285 | | |
286 | 1.43k | Schema schema; |
287 | 1.43k | RETURN_NOT_OK_PREPEND(SchemaFromPB( |
288 | 1.43k | table.schema(), &schema), "Cannot deserialize schema from remote superblock"); |
289 | 1.43k | string data_root_dir; |
290 | 1.43k | string wal_root_dir; |
291 | 1.43k | if (replace_tombstoned_tablet_) { |
292 | | // Also validate the term of the bootstrap source peer, in case they are |
293 | | // different. This is a sanity check that protects us in case a bug or |
294 | | // misconfiguration causes us to attempt to bootstrap from an out-of-date |
295 | | // source peer, even after passing the term check from the caller in |
296 | | // SetTabletToReplace(). |
297 | 100 | int64_t last_logged_term = meta_->tombstone_last_logged_opid().term; |
298 | 100 | if (last_logged_term > remote_committed_cstate_->current_term()) { |
299 | 0 | return STATUS(InvalidArgument, |
300 | 0 | Substitute("Tablet $0: Bootstrap source has term $1 but " |
301 | 0 | "tombstoned replica has last-logged opid with higher term $2. " |
302 | 0 | "Refusing remote bootstrap from source peer $3", |
303 | 0 | tablet_id_, |
304 | 0 | remote_committed_cstate_->current_term(), |
305 | 0 | last_logged_term, |
306 | 0 | bootstrap_peer_uuid)); |
307 | 0 | } |
308 | | // Replace rocksdb_dir in the received superblock with our rocksdb_dir. |
309 | 100 | kv_store->set_rocksdb_dir(meta_->rocksdb_dir()); |
310 | | |
311 | | // Replace wal_dir in the received superblock with our assigned wal_dir. |
312 | 100 | superblock_->set_wal_dir(meta_->wal_dir()); |
313 | | |
314 | | // This will flush to disk, but we set the data state to COPYING above. |
315 | 100 | RETURN_NOT_OK_PREPEND(meta_->ReplaceSuperBlock(*superblock_), |
316 | 100 | "Remote bootstrap unable to replace superblock on tablet " + |
317 | 100 | tablet_id_); |
318 | | // Update the directory assignment mapping. |
319 | 100 | data_root_dir = meta_->data_root_dir(); |
320 | 100 | wal_root_dir = meta_->wal_root_dir(); |
321 | 100 | if (ts_manager != nullptr) { |
322 | 100 | ts_manager->RegisterDataAndWalDir(&fs_manager(), |
323 | 100 | table_id, |
324 | 100 | meta_->raft_group_id(), |
325 | 100 | data_root_dir, |
326 | 100 | wal_root_dir); |
327 | 100 | } |
328 | 1.33k | } else { |
329 | 1.33k | Partition partition; |
330 | 1.33k | Partition::FromPB(superblock_->partition(), &partition); |
331 | 1.33k | PartitionSchema partition_schema; |
332 | 1.33k | RETURN_NOT_OK(PartitionSchema::FromPB(table.partition_schema(), schema, &partition_schema)); |
333 | | // Create the superblock on disk. |
334 | 1.33k | if (ts_manager != nullptr) { |
335 | 1.28k | ts_manager->GetAndRegisterDataAndWalDir(&fs_manager(), |
336 | 1.28k | table_id, |
337 | 1.28k | tablet_id_, |
338 | 1.28k | &data_root_dir, |
339 | 1.28k | &wal_root_dir); |
340 | 1.28k | } |
341 | 1.33k | auto table_info = std::make_shared<tablet::TableInfo>( |
342 | 1.33k | table_id, table.namespace_name(), table.table_name(), table.table_type(), schema, |
343 | 1.33k | IndexMap(table.indexes()), |
344 | 1.12k | table.has_index_info() ? boost::optional<IndexInfo>(table.index_info()) : boost::none, |
345 | 1.33k | table.schema_version(), partition_schema); |
346 | 1.33k | auto create_result = RaftGroupMetadata::CreateNew(tablet::RaftGroupMetadataData { |
347 | 1.33k | .fs_manager = &fs_manager(), |
348 | 1.33k | .table_info = table_info, |
349 | 1.33k | .raft_group_id = tablet_id_, |
350 | 1.33k | .partition = partition, |
351 | 1.33k | .tablet_data_state = tablet::TABLET_DATA_COPYING, |
352 | 1.33k | .colocated = colocated }, data_root_dir, wal_root_dir); |
353 | 1.33k | if (ts_manager != nullptr && !create_result.ok()) { |
354 | 429 | ts_manager->UnregisterDataWalDir(table_id, tablet_id_, data_root_dir, wal_root_dir); |
355 | 429 | } |
356 | 1.33k | RETURN_NOT_OK(create_result); |
357 | 904 | meta_ = std::move(*create_result); |
358 | | |
359 | 904 | vector<DeletedColumn> deleted_cols; |
360 | 0 | for (const DeletedColumnPB& col_pb : table.deleted_cols()) { |
361 | 0 | DeletedColumn col; |
362 | 0 | RETURN_NOT_OK(DeletedColumn::FromPB(col_pb, &col)); |
363 | 0 | deleted_cols.push_back(col); |
364 | 0 | } |
365 | 904 | meta_->SetSchema(schema, |
366 | 904 | IndexMap(table.indexes()), |
367 | 904 | deleted_cols, |
368 | 904 | table.schema_version()); |
369 | | |
370 | | // Replace rocksdb_dir in the received superblock with our rocksdb_dir. |
371 | 904 | kv_store->set_rocksdb_dir(meta_->rocksdb_dir()); |
372 | | |
373 | | // Replace wal_dir in the received superblock with our assigned wal_dir. |
374 | 904 | superblock_->set_wal_dir(meta_->wal_dir()); |
375 | 904 | } |
376 | | |
377 | 1.00k | started_ = true; |
378 | 1.00k | auto old_count = remote_bootstrap_clients_started_.fetch_add(1, std::memory_order_acq_rel); |
379 | 1.00k | if (old_count < 0) { |
380 | 0 | LOG_WITH_PREFIX(DFATAL) << "Invalid number of remote bootstrap sessions: " << old_count; |
381 | 0 | remote_bootstrap_clients_started_.store(0, std::memory_order_release); |
382 | 0 | } |
383 | | |
384 | 1.00k | if (meta) { |
385 | 1.00k | *meta = meta_; |
386 | 1.00k | } |
387 | 1.00k | return Status::OK(); |
388 | 1.43k | } |
389 | | |
390 | 1.00k | Status RemoteBootstrapClient::FetchAll(TabletStatusListener* status_listener) { |
391 | 1.00k | CHECK(started_); |
392 | 1.00k | status_listener_ = CHECK_NOTNULL(status_listener); |
393 | | |
394 | 0 | VLOG_WITH_PREFIX(2) << "Fetching table_type: " << TableType_Name(meta_->table_type()); |
395 | | |
396 | 1.00k | new_superblock_ = *superblock_; |
397 | | // Replace rocksdb_dir with our rocksdb_dir |
398 | 1.00k | new_superblock_.mutable_kv_store()->set_rocksdb_dir(meta_->rocksdb_dir()); |
399 | | |
400 | 1.00k | RETURN_NOT_OK(DownloadRocksDBFiles()); |
401 | 999 | RETURN_NOT_OK(DownloadWALs()); |
402 | 999 | for (const auto& component : components_) { |
403 | 999 | RETURN_NOT_OK(component->Download()); |
404 | 999 | } |
405 | | |
406 | | // We sleep here to simulate the transfer of very large files. |
407 | 999 | if (PREDICT_FALSE(FLAGS_TEST_simulate_long_remote_bootstrap_sec > 0)) { |
408 | 38 | LOG_WITH_PREFIX(INFO) << "Sleeping " << FLAGS_TEST_simulate_long_remote_bootstrap_sec |
409 | 38 | << " seconds to simulate the transfer of very large files"; |
410 | 38 | SleepFor(MonoDelta::FromSeconds(FLAGS_TEST_simulate_long_remote_bootstrap_sec)); |
411 | 38 | } |
412 | 999 | return Status::OK(); |
413 | 999 | } |
414 | | |
415 | 995 | Status RemoteBootstrapClient::Finish() { |
416 | 995 | CHECK(meta_); |
417 | 995 | CHECK(started_); |
418 | | |
419 | 995 | CHECK(downloaded_wal_); |
420 | 0 | CHECK(downloaded_rocksdb_files_) << "files not downloaded"; |
421 | | |
422 | 995 | RETURN_NOT_OK(WriteConsensusMetadata()); |
423 | | |
424 | | // Replace tablet metadata superblock. This will set the tablet metadata state |
425 | | // to remote_tablet_data_state_. |
426 | 995 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap complete. Replacing tablet superblock."; |
427 | 995 | UpdateStatusMessage("Replacing tablet superblock"); |
428 | 995 | new_superblock_.set_tablet_data_state(remote_tablet_data_state_); |
429 | 995 | RETURN_NOT_OK(meta_->ReplaceSuperBlock(new_superblock_)); |
430 | | |
431 | 995 | if (FLAGS_remote_bootstrap_save_downloaded_metadata) { |
432 | 0 | string meta_path = fs_manager().GetRaftGroupMetadataPath(tablet_id_); |
433 | 0 | string meta_copy_path = Substitute("$0.copy.$1.tmp", meta_path, start_time_micros_); |
434 | 0 | RETURN_NOT_OK_PREPEND(CopyFile(Env::Default(), meta_path, meta_copy_path, |
435 | 0 | WritableFileOptions()), |
436 | 0 | "Unable to make copy of tablet metadata"); |
437 | 0 | } |
438 | | |
439 | 995 | succeeded_ = true; |
440 | | |
441 | 995 | MAYBE_FAULT(FLAGS_TEST_fault_crash_bootstrap_client_before_changing_role); |
442 | | |
443 | 995 | RETURN_NOT_OK_PREPEND( |
444 | 995 | EndRemoteSession(), "Error closing remote bootstrap session " + session_id()); |
445 | | |
446 | 995 | return Status::OK(); |
447 | 995 | } |
448 | | |
449 | | Status RemoteBootstrapClient::VerifyChangeRoleSucceeded( |
450 | 994 | const shared_ptr<consensus::Consensus>& shared_consensus) { |
451 | | |
452 | 994 | if (!shared_consensus) { |
453 | 0 | return STATUS(InvalidArgument, "Invalid consensus object"); |
454 | 0 | } |
455 | | |
456 | 994 | auto start = MonoTime::Now(); |
457 | 994 | auto timeout = MonoDelta::FromSeconds(FLAGS_committed_config_change_role_timeout_sec); |
458 | 994 | int backoff_ms = 1; |
459 | 994 | const int kMaxBackoffMs = 256; |
460 | 994 | RaftConfigPB committed_config; |
461 | | |
462 | 50.5M | do { |
463 | 50.5M | committed_config = shared_consensus->CommittedConfig(); |
464 | 139M | for (const auto &peer : committed_config.peers()) { |
465 | 139M | if (peer.permanent_uuid() != permanent_uuid()) { |
466 | 139M | continue; |
467 | 139M | } |
468 | | |
469 | 10.2k | if (peer.member_type() == PeerMemberType::VOTER || |
470 | 9.30k | peer.member_type() == PeerMemberType::OBSERVER) { |
471 | 973 | return Status::OK(); |
472 | 9.25k | } else { |
473 | 9.25k | SleepFor(MonoDelta::FromMilliseconds(backoff_ms)); |
474 | 9.25k | backoff_ms = min(backoff_ms << 1, kMaxBackoffMs); |
475 | 9.25k | break; |
476 | 9.25k | } |
477 | 10.2k | } |
478 | 50.5M | } while (MonoTime::Now().GetDeltaSince(start).LessThan(timeout)); |
479 | | |
480 | 21 | return STATUS(TimedOut, |
481 | 994 | Substitute("Timed out waiting member type of peer $0 to change in the committed " |
482 | 994 | "config $1", permanent_uuid(), |
483 | 994 | committed_config.ShortDebugString())); |
484 | 994 | } |
485 | | |
486 | 3.23k | void RemoteBootstrapClient::UpdateStatusMessage(const string& message) { |
487 | 3.23k | if (status_listener_ != nullptr) { |
488 | 3.23k | status_listener_->StatusMessage("RemoteBootstrap: " + message); |
489 | 3.23k | } |
490 | 3.23k | } |
491 | | |
492 | 1.44k | Status RemoteBootstrapClient::EndRemoteSession() { |
493 | 1.44k | if (!started_) { |
494 | 445 | return Status::OK(); |
495 | 445 | } |
496 | | |
497 | 1.00k | rpc::RpcController controller; |
498 | 1.00k | controller.set_timeout(MonoDelta::FromSeconds(FLAGS_remote_bootstrap_end_session_timeout_sec)); |
499 | | |
500 | 1.00k | EndRemoteBootstrapSessionRequestPB req; |
501 | 1.00k | req.set_session_id(session_id()); |
502 | 1.00k | req.set_is_success(succeeded_); |
503 | 1.00k | req.set_keep_session(succeeded_); |
504 | 1.00k | EndRemoteBootstrapSessionResponsePB resp; |
505 | | |
506 | 1.00k | LOG_WITH_PREFIX(INFO) << "Ending remote bootstrap session " << session_id(); |
507 | 1.00k | auto status = proxy_->EndRemoteBootstrapSession(req, &resp, &controller); |
508 | 1.00k | if (status.ok()) { |
509 | 998 | remove_required_ = resp.session_kept(); |
510 | 998 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap session " << session_id() |
511 | 998 | << " ended successfully"; |
512 | 998 | return Status::OK(); |
513 | 998 | } |
514 | | |
515 | 4 | if (status.IsTimedOut()) { |
516 | | // Ignore timeout errors since the server could have sent the ChangeConfig request and died |
517 | | // before replying. We need to check the config to verify that this server's role changed as |
518 | | // expected, in which case, the remote bootstrap was completed successfully. |
519 | 0 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap session " << session_id() << " timed out"; |
520 | 0 | return Status::OK(); |
521 | 0 | } |
522 | | |
523 | 4 | status = UnwindRemoteError(status, controller); |
524 | 4 | return status.CloneAndPrepend( |
525 | 4 | Format("Failed to end remote bootstrap session $0", session_id())); |
526 | 4 | } |
527 | | |
528 | 941 | Status RemoteBootstrapClient::Remove() { |
529 | 941 | if (!remove_required_) { |
530 | 0 | return Status::OK(); |
531 | 0 | } |
532 | | |
533 | 941 | rpc::RpcController controller; |
534 | 941 | controller.set_timeout(MonoDelta::FromSeconds(FLAGS_remote_bootstrap_end_session_timeout_sec)); |
535 | | |
536 | 941 | RemoveSessionRequestPB req; |
537 | 941 | req.set_session_id(session_id()); |
538 | 941 | RemoveSessionResponsePB resp; |
539 | | |
540 | 941 | LOG_WITH_PREFIX(INFO) << "Removing remote bootstrap session " << session_id(); |
541 | 941 | const auto status = proxy_->RemoveSession(req, &resp, &controller); |
542 | 941 | if (status.ok()) { |
543 | 941 | LOG_WITH_PREFIX(INFO) << "Remote bootstrap session " << session_id() << " removed successfully"; |
544 | 941 | return Status::OK(); |
545 | 941 | } |
546 | | |
547 | 0 | return UnwindRemoteError(status, controller).CloneAndPrepend( |
548 | 0 | Format("Failure removing remote bootstrap session $0", session_id())); |
549 | 0 | } |
550 | | |
551 | 999 | Status RemoteBootstrapClient::DownloadWALs() { |
552 | 999 | CHECK(started_); |
553 | | |
554 | | // Delete and recreate WAL dir if it already exists, to ensure stray files are |
555 | | // not kept from previous bootstraps and runs. |
556 | 999 | const string& wal_dir = meta_->wal_dir(); |
557 | 999 | if (env().FileExists(wal_dir)) { |
558 | 0 | RETURN_NOT_OK(env().DeleteRecursively(wal_dir)); |
559 | 0 | } |
560 | 999 | auto wal_table_top_dir = DirName(wal_dir); |
561 | 999 | RETURN_NOT_OK_PREPEND(fs_manager().CreateDirIfMissing(wal_table_top_dir), |
562 | 999 | Substitute("Failed to create WAL table directory $0", wal_table_top_dir)); |
563 | | |
564 | | // fsync() parent dir. |
565 | 999 | RETURN_NOT_OK_PREPEND(env().SyncDir(DirName(wal_table_top_dir)), |
566 | 999 | Substitute("Failed to sync WAL root directory $0", |
567 | 999 | DirName(wal_table_top_dir))); |
568 | | |
569 | 999 | RETURN_NOT_OK_PREPEND(env().CreateDir(wal_dir), |
570 | 999 | Substitute("Failed to create WAL tablet directory $0", wal_dir)); |
571 | | |
572 | | // fsync() parent dir. |
573 | 999 | RETURN_NOT_OK_PREPEND(env().SyncDir(wal_table_top_dir), |
574 | 999 | Substitute("Failed to sync WAL table directory $0", wal_table_top_dir)); |
575 | | |
576 | | // Download the WAL segments. |
577 | 999 | uint64_t counter = 0; |
578 | 999 | if (first_wal_seqno_) { |
579 | 999 | LOG_WITH_PREFIX(INFO) << "Starting download of WAL segments starting from sequence number " |
580 | 999 | << first_wal_seqno_; |
581 | 2.24k | for (;;) { |
582 | 2.24k | uint64_t segment_seqno = first_wal_seqno_ + counter; |
583 | 2.24k | UpdateStatusMessage( |
584 | 2.24k | Format("Downloading WAL segment with seq. number $0 (#$1 in this session)", |
585 | 2.24k | segment_seqno, counter + 1)); |
586 | 2.24k | auto download_status = DownloadWAL(segment_seqno); |
587 | 2.24k | if (!download_status.ok()) { |
588 | 999 | std::string message_suffix; |
589 | 999 | if (counter > 0) { |
590 | 999 | message_suffix = Format(", downloaded segments in range: $0..$1", |
591 | 999 | first_wal_seqno_, segment_seqno - 1); |
592 | 0 | } else { |
593 | 0 | message_suffix = ", no segments were downloaded"; |
594 | 0 | } |
595 | 999 | if (download_status.IsNotFound()) { |
596 | 999 | LOG_WITH_PREFIX(INFO) << "Stopped downloading WAL segments" << message_suffix; |
597 | 999 | break; |
598 | 999 | } |
599 | 0 | LOG_WITH_PREFIX(WARNING) << "Downloading WAL segments failed: " |
600 | 0 | << download_status << message_suffix; |
601 | 0 | return download_status; |
602 | 0 | } |
603 | 1.24k | ++counter; |
604 | 1.24k | if (PREDICT_FALSE(FLAGS_TEST_download_partial_wal_segments) && counter > 0) { |
605 | 0 | LOG(INFO) << "Flag TEST_download_partial_wal_segments set to true. " |
606 | 0 | << "Stopping WAL files download after one file has been downloaded."; |
607 | 0 | break; |
608 | 0 | } |
609 | 1.24k | } |
610 | 0 | } else { |
611 | 0 | auto num_segments = wal_seqnos_.size(); |
612 | 0 | LOG_WITH_PREFIX(INFO) << "Starting download of " << num_segments << " WAL segments..."; |
613 | 0 | for (uint64_t seg_seqno : wal_seqnos_) { |
614 | 0 | UpdateStatusMessage(Substitute("Downloading WAL segment with seq. number $0 ($1/$2)", |
615 | 0 | seg_seqno, counter + 1, num_segments)); |
616 | 0 | RETURN_NOT_OK(DownloadWAL(seg_seqno)); |
617 | 0 | ++counter; |
618 | 0 | } |
619 | 0 | } |
620 | | |
621 | 999 | if (FLAGS_bytes_remote_bootstrap_durable_write_mb != 0) { |
622 | | // Persist directory so that recently downloaded files are accessible. |
623 | 999 | RETURN_NOT_OK_PREPEND(env().SyncDir(wal_table_top_dir), |
624 | 999 | Substitute("Failed to sync WAL table directory $0", wal_table_top_dir)); |
625 | 999 | } |
626 | | |
627 | 999 | downloaded_wal_ = true; |
628 | 999 | return Status::OK(); |
629 | 999 | } |
630 | | |
631 | 1.00k | Status RemoteBootstrapClient::CreateTabletDirectories(const string& db_dir, FsManager* fs) { |
632 | | // Create the directory table-uuid first. |
633 | 1.00k | RETURN_NOT_OK_PREPEND(fs->CreateDirIfMissing(DirName(db_dir)), |
634 | 1.00k | Substitute("Failed to create RocksDB table directory $0", |
635 | 1.00k | DirName(db_dir))); |
636 | | |
637 | 1.00k | RETURN_NOT_OK_PREPEND(fs->CreateDirIfMissing(db_dir), |
638 | 1.00k | Substitute("Failed to create RocksDB tablet directory $0", |
639 | 1.00k | db_dir)); |
640 | | |
641 | 1.00k | for (const auto& component : components_) { |
642 | 1.00k | RETURN_NOT_OK(component->CreateDirectories(db_dir, fs)); |
643 | 1.00k | } |
644 | | |
645 | 1.00k | return Status::OK(); |
646 | 1.00k | } |
647 | | |
| | // Downloads every RocksDB file listed in new_superblock_ into meta_->rocksdb_dir(). |
| | // |
| | // Sequence: |
| | // 1. Create the tablet directories for rocksdb_dir (CreateTabletDirectories). |
| | // 2. For each entry of new_superblock_.kv_store().rocksdb_files(), download it through |
| | // downloader_ with a DataIdPB of type ROCKSDB_FILE and log its name, size and duration. |
| | // 3. If <rocksdb_dir>/kIntentsSubdir exists, rename it to rocksdb_dir + kIntentsDBSuffix. |
| | // 4. If FLAGS_bytes_remote_bootstrap_durable_write_mb is non-zero, sync rocksdb_dir. |
| | // |
| | // Returns the first non-OK status encountered. downloaded_rocksdb_files_ is set to true only |
| | // when all steps succeed, so a failed attempt leaves the flag untouched. |
648 | 1.00k | Status RemoteBootstrapClient::DownloadRocksDBFiles() { |
649 | 1.00k | const auto& rocksdb_dir = meta_->rocksdb_dir(); |
650 | | |
651 | 1.00k | RETURN_NOT_OK(CreateTabletDirectories(rocksdb_dir, meta_->fs_manager())); |
652 | | |
| | // The same data_id object is handed to every DownloadFile call in the loop below. |
653 | 1.00k | DataIdPB data_id; |
654 | 1.00k | data_id.set_type(DataIdPB::ROCKSDB_FILE); |
655 | 3.51k | for (auto const& file_pb : new_superblock_.kv_store().rocksdb_files()) { |
656 | 3.51k | auto start = MonoTime::Now(); |
657 | 3.51k | RETURN_NOT_OK(downloader_.DownloadFile(file_pb, rocksdb_dir, &data_id)); |
658 | 3.50k | auto elapsed = MonoTime::Now().GetDeltaSince(start); |
659 | 3.50k | LOG_WITH_PREFIX(INFO) |
660 | 3.50k | << "Downloaded file " << file_pb.name() << " of size " << file_pb.size_bytes() |
661 | 3.50k | << " in " << elapsed.ToSeconds() << " seconds"; |
662 | 3.50k | } |
663 | | |
664 | | // To avoid adding a new file type to remote bootstrap, the intents DB arrives as a subdir of the regular DB; move it to its own directory. |
665 | 999 | auto intents_tmp_dir = JoinPathSegments(rocksdb_dir, tablet::kIntentsSubdir); |
666 | 999 | if (env().FileExists(intents_tmp_dir)) { |
667 | 52 | auto intents_dir = rocksdb_dir + tablet::kIntentsDBSuffix; |
668 | 52 | LOG_WITH_PREFIX(INFO) << "Moving intents DB: " << intents_tmp_dir << " => " << intents_dir; |
669 | 52 | RETURN_NOT_OK(env().RenameFile(intents_tmp_dir, intents_dir)); |
670 | 52 | } |
671 | 999 | if (FLAGS_bytes_remote_bootstrap_durable_write_mb != 0) { |
672 | | // Persist directory so that recently downloaded files are accessible. |
| | // The error is prefixed with context, matching how the WAL directory sync reports failures. |
673 | 999 | RETURN_NOT_OK_PREPEND(env().SyncDir(rocksdb_dir), "Failed to sync RocksDB directory"); |
674 | 999 | } |
675 | 999 | downloaded_rocksdb_files_ = true; |
676 | 999 | return Status::OK(); |
677 | 999 | } |
678 | | |
| | // Downloads the WAL segment identified by wal_segment_seqno into the tablet's WAL directory. |
| | // |
| | // The data is first written to "<final path>.tmp" and renamed to the final segment path |
| | // (fs_manager().GetWalSegmentFilePath(meta_->wal_dir(), wal_segment_seqno)) only after the |
| | // download succeeded. On every exit path where `ok` is still false the temporary file is |
| | // deleted by the scope-exit handler. |
| | // |
| | // Returns a non-OK status if the temporary file cannot be opened, the download fails, or the |
| | // rename fails; Status::OK() otherwise. |
679 | 2.24k | Status RemoteBootstrapClient::DownloadWAL(uint64_t wal_segment_seqno) { |
680 | 0 | VLOG_WITH_PREFIX(1) << "Downloading WAL segment with seqno " << wal_segment_seqno; |
681 | 2.24k | DataIdPB data_id; |
682 | 2.24k | data_id.set_type(DataIdPB::LOG_SEGMENT); |
683 | 2.24k | data_id.set_wal_segment_seqno(wal_segment_seqno); |
684 | 2.24k | const string dest_path = fs_manager().GetWalSegmentFilePath(meta_->wal_dir(), wal_segment_seqno); |
685 | 2.24k | const auto temp_dest_path = dest_path + ".tmp"; |
| | // Cleanup guard: `ok` is flipped to true only at the very end, so any early return below |
| | // triggers deletion of the temporary file. This also runs when opening the file failed; a |
| | // failed delete is only reported via WARN_NOT_OK and does not alter the returned status. |
686 | 2.24k | bool ok = false; |
687 | 2.24k | auto se = ScopeExit([this, &temp_dest_path, &ok] { |
688 | 2.24k | if (!ok) { |
689 | 999 | WARN_NOT_OK(env().DeleteFile(temp_dest_path), |
690 | 999 | "Failed to delete temporary WAL segment"); |
691 | 999 | } |
692 | 2.24k | }); |
693 | | |
694 | 2.24k | WritableFileOptions opts; |
695 | 2.24k | opts.sync_on_close = true; |
696 | 2.24k | std::unique_ptr<WritableFile> writer; |
697 | 2.24k | RETURN_NOT_OK_PREPEND(env().NewWritableFile(opts, temp_dest_path, &writer), |
698 | 2.24k | "Unable to open file for writing"); |
699 | | |
| | // `start` is taken before the download, so the logged duration covers download plus rename. |
700 | 2.24k | auto start = MonoTime::Now(); |
701 | 2.24k | RETURN_NOT_OK_PREPEND(downloader_.DownloadFile(data_id, writer.get()), |
702 | 2.24k | Substitute("Unable to download WAL segment with seq. number $0", |
703 | 1.24k | wal_segment_seqno)); |
| | // NOTE(review): writer is not explicitly closed in this function before the rename; whether |
| | // DownloadFile closes it is not visible here. If it does not, the sync requested through |
| | // sync_on_close would only happen after the rename and its result would go unchecked - confirm. |
704 | 1.24k | RETURN_NOT_OK(env().RenameFile(temp_dest_path, dest_path)); |
705 | 1.24k | auto elapsed = MonoTime::Now().GetDeltaSince(start); |
706 | 1.24k | LOG_WITH_PREFIX(INFO) << "Downloaded WAL segment with seq. number " << wal_segment_seqno |
707 | 1.24k | << " of size " << writer->Size() << " in " << elapsed.ToSeconds() |
708 | 1.24k | << " seconds"; |
709 | 1.24k | ok = true; |
710 | | |
711 | 1.24k | return Status::OK(); |
712 | 1.24k | } |
713 | | |
| | // Writes the consensus metadata for tablet_id_ from the remote committed consensus state. |
| | // |
| | // Two paths: |
| | // - cmeta_ is null: a new ConsensusMetadata is created with the config and current term of |
| | // remote_committed_cstate_, and the status of that creation is returned directly. The created |
| | // object is held only by a local unique_ptr; cmeta_ is not assigned here. |
| | // - cmeta_ is set: the remote committed state is merged into it and flushed. If |
| | // FLAGS_remote_bootstrap_save_downloaded_metadata is set, the consensus metadata file is |
| | // additionally copied to "<cmeta path>.copy.<start_time_micros_>.tmp". |
| | // |
| | // Note that the copy is made only on the second path, because the first path returns early. |
714 | 995 | Status RemoteBootstrapClient::WriteConsensusMetadata() { |
715 | | // If we didn't find a previous consensus meta file, create one. |
716 | 995 | if (!cmeta_) { |
717 | 898 | std::unique_ptr<ConsensusMetadata> cmeta; |
718 | 898 | return ConsensusMetadata::Create(&fs_manager(), tablet_id_, fs_manager().uuid(), |
719 | 898 | remote_committed_cstate_->config(), |
720 | 898 | remote_committed_cstate_->current_term(), |
721 | 898 | &cmeta); |
722 | 898 | } |
723 | | |
724 | | // Otherwise, update the consensus metadata to reflect the config and term |
725 | | // sent by the remote bootstrap source. |
726 | 97 | cmeta_->MergeCommittedConsensusStatePB(*remote_committed_cstate_); |
727 | 97 | RETURN_NOT_OK(cmeta_->Flush()); |
728 | | |
729 | 97 | if (FLAGS_remote_bootstrap_save_downloaded_metadata) { |
730 | 0 | string cmeta_path = fs_manager().GetConsensusMetadataPath(tablet_id_); |
731 | 0 | string cmeta_copy_path = Substitute("$0.copy.$1.tmp", cmeta_path, start_time_micros_); |
| | // NOTE(review): the copy goes through Env::Default() while the rest of this file uses env() |
| | // (fs_manager().env()); presumably equivalent in production - confirm for custom Envs. |
732 | 0 | RETURN_NOT_OK_PREPEND(CopyFile(Env::Default(), cmeta_path, cmeta_copy_path, |
733 | 0 | WritableFileOptions()), |
734 | 0 | "Unable to make copy of consensus metadata"); |
735 | 0 | } |
736 | | |
737 | 97 | return Status::OK(); |
738 | 97 | } |
739 | | |
| | // Returns the Env used for file operations, obtained from fs_manager().env(). |
| | // The pointer returned by fs_manager().env() is dereferenced without a null check. |
740 | 11.5k | Env& RemoteBootstrapClient::env() const { |
741 | 11.5k | return *fs_manager().env(); |
742 | 11.5k | } |
743 | | |
| | // Returns fs_manager().uuid() by const reference; no copy of the string is made here. |
744 | 139M | const std::string& RemoteBootstrapClient::permanent_uuid() const { |
745 | 139M | return fs_manager().uuid(); |
746 | 139M | } |
747 | | |
748 | | } // namespace tserver |
749 | | } // namespace yb |