/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/db_impl.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | |
24 | | #include "yb/rocksdb/db/db_impl.h" |
25 | | |
26 | | #ifndef __STDC_FORMAT_MACROS |
27 | | #define __STDC_FORMAT_MACROS |
28 | | #endif |
29 | | |
30 | | #include <inttypes.h> |
31 | | #include <stdint.h> |
32 | | #ifdef OS_SOLARIS |
33 | | #include <alloca.h> |
34 | | #endif |
35 | | |
36 | | #include <algorithm> |
37 | | #include <climits> |
38 | | #include <map> |
39 | | #include <set> |
40 | | #include <stdexcept> |
41 | | #include <string> |
42 | | #include <unordered_map> |
43 | | #include <unordered_set> |
44 | | #include <utility> |
45 | | #include <vector> |
46 | | |
47 | | #include <boost/container/small_vector.hpp> |
48 | | |
49 | | #include "yb/gutil/stringprintf.h" |
50 | | #include "yb/util/string_util.h" |
51 | | #include "yb/util/scope_exit.h" |
52 | | #include "yb/util/logging.h" |
53 | | #include "yb/util/debug-util.h" |
54 | | #include "yb/util/fault_injection.h" |
55 | | #include "yb/util/flag_tags.h" |
56 | | #include "yb/util/priority_thread_pool.h" |
57 | | #include "yb/util/atomic.h" |
58 | | |
59 | | #include "yb/rocksdb/db/auto_roll_logger.h" |
60 | | #include "yb/rocksdb/db/builder.h" |
61 | | #include "yb/rocksdb/db/compaction_job.h" |
62 | | #include "yb/rocksdb/db/compaction_picker.h" |
63 | | #include "yb/rocksdb/db/db_info_dumper.h" |
64 | | #include "yb/rocksdb/db/db_iter.h" |
65 | | #include "yb/rocksdb/db/dbformat.h" |
66 | | #include "yb/rocksdb/db/event_helpers.h" |
67 | | #include "yb/rocksdb/db/filename.h" |
68 | | #include "yb/rocksdb/db/file_numbers.h" |
69 | | #include "yb/rocksdb/db/flush_job.h" |
70 | | #include "yb/rocksdb/db/forward_iterator.h" |
71 | | #include "yb/rocksdb/db/job_context.h" |
72 | | #include "yb/rocksdb/db/log_reader.h" |
73 | | #include "yb/rocksdb/db/log_writer.h" |
74 | | #include "yb/rocksdb/db/managed_iterator.h" |
75 | | #include "yb/rocksdb/db/memtable.h" |
76 | | #include "yb/rocksdb/db/memtable_list.h" |
77 | | #include "yb/rocksdb/db/merge_context.h" |
78 | | #include "yb/rocksdb/db/merge_helper.h" |
79 | | #include "yb/rocksdb/db/table_cache.h" |
80 | | #include "yb/rocksdb/db/table_properties_collector.h" |
81 | | #include "yb/rocksdb/db/version_set.h" |
82 | | #include "yb/rocksdb/db/write_batch_internal.h" |
83 | | #include "yb/rocksdb/db/write_callback.h" |
84 | | #include "yb/rocksdb/db/writebuffer.h" |
85 | | #include "yb/rocksdb/port/likely.h" |
86 | | #include "yb/rocksdb/port/port.h" |
87 | | #include "yb/rocksdb/cache.h" |
88 | | #include "yb/rocksdb/compaction_filter.h" |
89 | | #include "yb/rocksdb/db.h" |
90 | | #include "yb/rocksdb/env.h" |
91 | | #include "yb/rocksdb/sst_file_writer.h" |
92 | | #include "yb/rocksdb/statistics.h" |
93 | | #include "yb/rocksdb/status.h" |
94 | | #include "yb/rocksdb/table.h" |
95 | | #include "yb/rocksdb/wal_filter.h" |
96 | | #include "yb/rocksdb/table/block_based_table_factory.h" |
97 | | #include "yb/rocksdb/table/merger.h" |
98 | | #include "yb/rocksdb/table/scoped_arena_iterator.h" |
99 | | #include "yb/rocksdb/table/table_builder.h" |
100 | | #include "yb/rocksdb/util/autovector.h" |
101 | | #include "yb/rocksdb/util/coding.h" |
102 | | #include "yb/rocksdb/util/compression.h" |
103 | | #include "yb/rocksdb/util/crc32c.h" |
104 | | #include "yb/rocksdb/util/file_reader_writer.h" |
105 | | #include "yb/rocksdb/util/file_util.h" |
106 | | #include "yb/rocksdb/util/log_buffer.h" |
107 | | #include "yb/rocksdb/util/logging.h" |
108 | | #include "yb/rocksdb/util/mutexlock.h" |
109 | | #include "yb/rocksdb/util/sst_file_manager_impl.h" |
110 | | #include "yb/rocksdb/util/options_helper.h" |
111 | | #include "yb/rocksdb/util/options_parser.h" |
112 | | #include "yb/rocksdb/util/perf_context_imp.h" |
113 | | #include "yb/rocksdb/util/stop_watch.h" |
114 | | #include "yb/rocksdb/util/sync_point.h" |
115 | | #include "yb/rocksdb/util/xfunc.h" |
116 | | #include "yb/rocksdb/db/db_iterator_wrapper.h" |
117 | | |
118 | | #include "yb/util/status_log.h" |
119 | | #include "yb/util/stats/iostats_context_imp.h" |
120 | | |
121 | | using namespace std::literals; |
122 | | |
123 | | DEFINE_bool(dump_dbimpl_info, false, "Dump RocksDB info during constructor."); |
124 | | DEFINE_bool(flush_rocksdb_on_shutdown, true, |
125 | | "Safely flush RocksDB when instance is destroyed, disabled for crash tests."); |
126 | | DEFINE_double(fault_crash_after_rocksdb_flush, 0.0, |
127 | | "Fraction of time to crash right after a successful RocksDB flush in tests."); |
128 | | |
129 | | DEFINE_bool(use_priority_thread_pool_for_flushes, false, |
130 | | "When true priority thread pool will be used for flushes, otherwise " |
131 | | "Env thread pool with Priority::HIGH will be used."); |
132 | | TAG_FLAG(use_priority_thread_pool_for_flushes, runtime); |
133 | | |
134 | | DEFINE_bool(use_priority_thread_pool_for_compactions, true, |
135 | | "When true priority thread pool will be used for compactions, otherwise " |
136 | | "Env thread pool with Priority::LOW will be used."); |
137 | | TAG_FLAG(use_priority_thread_pool_for_compactions, runtime); |
138 | | |
139 | | DEFINE_int32(compaction_priority_start_bound, 10, |
140 | | "Compaction task of DB that has number of SST files less than specified will have " |
141 | | "priority 0."); |
142 | | |
143 | | DEFINE_int32(compaction_priority_step_size, 5, |
144 | | "Compaction task of DB that has number of SST files greater that " |
145 | | "compaction_priority_start_bound will get 1 extra priority per every " |
146 | | "compaction_priority_step_size files."); |
147 | | |
148 | | DEFINE_int32(small_compaction_extra_priority, 1, |
149 | | "Small compaction will get small_compaction_extra_priority extra priority."); |
150 | | |
151 | | DEFINE_bool(rocksdb_use_logging_iterator, false, |
152 | | "Wrap newly created RocksDB iterators in a logging wrapper"); |
153 | | |
154 | | DEFINE_test_flag(int32, max_write_waiters, std::numeric_limits<int32_t>::max(), |
155 | | "Max allowed number of write waiters per RocksDB instance in tests."); |
156 | | |
157 | | namespace rocksdb { |
158 | | |
159 | | namespace { |
160 | | |
161 | | std::unique_ptr<Compaction> PopFirstFromCompactionQueue( |
162 | 17.7k | std::deque<std::unique_ptr<Compaction>>* queue) { |
163 | 17.7k | DCHECK(!queue->empty()); |
164 | 17.7k | auto c = std::move(queue->front()); |
165 | 17.7k | ColumnFamilyData* cfd = c->column_family_data(); |
166 | 17.7k | queue->pop_front(); |
167 | 17.7k | DCHECK(cfd->pending_compaction()); |
168 | 17.7k | cfd->set_pending_compaction(false); |
169 | 17.7k | return c; |
170 | 17.7k | } |
171 | | |
172 | 792k | void ClearCompactionQueue(std::deque<std::unique_ptr<Compaction>>* queue) { |
173 | 792k | while (!queue->empty()) { |
174 | 12 | auto c = PopFirstFromCompactionQueue(queue); |
175 | 12 | c->ReleaseCompactionFiles(STATUS(Incomplete, "DBImpl destroyed before compaction scheduled")); |
176 | 12 | auto cfd = c->column_family_data(); |
177 | 12 | c.reset(); |
178 | 12 | if (cfd->Unref()) { |
179 | 0 | delete cfd; |
180 | 0 | } |
181 | 12 | } |
182 | 792k | } |
183 | | |
184 | | } // namespace |
185 | | |
// Name of the column family that is present in every RocksDB instance.
const char kDefaultColumnFamilyName[] = "default";

// Collects objects released while performing a write so they can be destroyed
// after the DB mutex is no longer needed.
struct DBImpl::WriteContext {
  boost::container::small_vector<std::unique_ptr<SuperVersion>, 8> superversions_to_free_;
  autovector<MemTable*> memtables_to_free_;

  ~WriteContext() {
    // SuperVersions free themselves via unique_ptr; memtables are raw pointers
    // and must be deleted explicitly.
    for (auto& m : memtables_to_free_) {
      delete m;
    }
  }
};

// Kinds of background work scheduled by DBImpl.
YB_DEFINE_ENUM(BgTaskType, (kFlush)(kCompaction));
200 | | |
201 | | class DBImpl::ThreadPoolTask : public yb::PriorityThreadPoolTask { |
202 | | public: |
203 | 1.17k | explicit ThreadPoolTask(DBImpl* db_impl) : db_impl_(db_impl) {} |
204 | | |
205 | 1.17k | void Run(const Status& status, yb::PriorityThreadPoolSuspender* suspender) override { |
206 | 1.17k | if (!status.ok()) { |
207 | 1 | LOG_WITH_PREFIX(INFO) << "Task cancelled " << ToString() << ": " << status; |
208 | 1 | InstrumentedMutexLock lock(&db_impl_->mutex_); |
209 | 1 | AbortedUnlocked(status); |
210 | 1 | return; // Failed to schedule, could just drop compaction. |
211 | 1 | } |
212 | 1.16k | DoRun(suspender); |
213 | 1.16k | } |
214 | | |
215 | | virtual BgTaskType Type() const = 0; |
216 | | |
217 | | virtual int Priority() const = 0; |
218 | | |
219 | | virtual void AbortedUnlocked(const Status& status) = 0; |
220 | | |
221 | | virtual void DoRun(yb::PriorityThreadPoolSuspender* suspender) = 0; |
222 | | |
223 | | // Tries to recalculate and update task priority, returns true if priority was updated. |
224 | | virtual bool UpdatePriority() = 0; |
225 | | |
226 | 1 | const std::string& LogPrefix() const { |
227 | 1 | return db_impl_->LogPrefix(); |
228 | 1 | } |
229 | | |
230 | | protected: |
231 | | DBImpl* const db_impl_; |
232 | | }; |
233 | | |
// Priority given to compaction tasks of a shutting-down DB so they drain fast.
constexpr int kShuttingDownPriority = 200;
// Fixed priority for flush tasks.
constexpr int kFlushPriority = 100;
// Sentinel job id used before a task is bound to a JobContext.
constexpr int kNoJobId = -1;
237 | | |
// Background compaction task executed on the priority thread pool.
// For automatic compactions this task owns the Compaction object
// (compaction_holder_); for manual compactions ownership stays with the
// ManualCompaction descriptor and compaction_ is a borrowed pointer.
class DBImpl::CompactionTask : public ThreadPoolTask {
 public:
  // Manual compaction constructor. Requires db_impl->mutex_ held.
  CompactionTask(DBImpl* db_impl, DBImpl::ManualCompaction* manual_compaction)
      : ThreadPoolTask(db_impl), manual_compaction_(manual_compaction),
        compaction_(manual_compaction->compaction.get()), priority_(CalcPriority()) {
    db_impl->mutex_.AssertHeld();
    SetFileAndByteCount();
  }

  // Automatic compaction constructor; takes ownership of |compaction|.
  // Requires db_impl->mutex_ held.
  CompactionTask(DBImpl* db_impl, std::unique_ptr<Compaction> compaction)
      : ThreadPoolTask(db_impl), manual_compaction_(nullptr),
        compaction_holder_(std::move(compaction)), compaction_(compaction_holder_.get()),
        priority_(CalcPriority()) {
    db_impl->mutex_.AssertHeld();
    SetFileAndByteCount();
  }

  // The DBImpl pointer is used as the removal key when the DB shuts down.
  bool ShouldRemoveWithKey(void* key) override {
    return key == db_impl_;
  }

  void DoRun(yb::PriorityThreadPoolSuspender* suspender) override {
    compaction_->SetSuspender(suspender);
    db_impl_->BackgroundCallCompaction(manual_compaction_, std::move(compaction_holder_), this);
  }

  void AbortedUnlocked(const Status& status) override {
    db_impl_->mutex_.AssertHeld();
    if (!manual_compaction_) {
      // This corresponds to cfd->Ref() inside DBImpl::AddToCompactionQueue that is
      // unreferenced by DBImpl::BackgroundCompaction in normal workflow, but in case of cancelling
      // compaction task we don't get there.
      // Since DBImpl::AddToCompactionQueue calls Ref only for non-manual compactions, we should
      // do the same here too.
      // TODO: https://github.com/yugabyte/yugabyte-db/issues/8578
      auto cfd = compaction_->column_family_data();
      if (cfd->Unref()) {
        delete cfd;
      }
    } else {
      // Mark the manual compaction as finished with the cancellation status,
      // unless it already completed.
      if (!manual_compaction_->done) {
        manual_compaction_->in_progress = false;
        manual_compaction_->done = true;
        manual_compaction_->status = status;
      }
    }
    compaction_->ReleaseCompactionFiles(status);
    LOG_IF_WITH_PREFIX(DFATAL, db_impl_->compaction_tasks_.erase(this) != 1)
        << "Aborted unknown compaction task: " << SerialNo();
    if (db_impl_->compaction_tasks_.empty()) {
      // Wake up threads waiting for all compactions to drain (e.g. shutdown).
      db_impl_->bg_cv_.SignalAll();
    }
  }

  BgTaskType Type() const override {
    return BgTaskType::kCompaction;
  }

  std::string ToString() const override {
    int job_id_value = job_id_.Load();
    return yb::Format(
        "{ compact db: $0 is_manual: $1 serial_no: $2 job_id: $3}", db_impl_->GetName(),
        manual_compaction_ != nullptr, SerialNo(),
        ((job_id_value == kNoJobId) ? "None" : std::to_string(job_id_value)));
  }

  yb::CompactionInfo GetFileAndByteInfoIfCompaction() const override {
    return yb::CompactionInfo{file_count_, byte_count_};
  }

  void SetJobID(JobContext* job_context) {
    job_id_.Store(job_context->job_id);
  }

  // Recalculates the task priority. Returns true if it changed.
  // Requires db_impl_->mutex_ held.
  bool UpdatePriority() override {
    db_impl_->mutex_.AssertHeld();

    // Task already complete.
    if (compaction_ == nullptr) {
      return false;
    }

    auto new_priority = CalcPriority();
    if (new_priority != priority_) {
      priority_ = new_priority;
      return true;
    }
    return false;
  }

  // Marks the task as done so later UpdatePriority() calls become no-ops.
  // Requires db_impl_->mutex_ held.
  void Complete() {
    db_impl_->mutex_.AssertHeld();
    compaction_ = nullptr;
  }

  int Priority() const override {
    return priority_;
  }

 private:
  // Priority policy: tasks of a shutting-down DB get kShuttingDownPriority;
  // otherwise priority grows with the L0 file count beyond
  // FLAGS_compaction_priority_start_bound, and small compactions receive
  // FLAGS_small_compaction_extra_priority on top.
  int CalcPriority() const {
    db_impl_->mutex_.AssertHeld();

    if (db_impl_->IsShuttingDown()) {
      return kShuttingDownPriority;
    }

    auto* current_version = compaction_->column_family_data()->GetSuperVersion()->current;
    auto num_files = current_version->storage_info()->l0_delay_trigger_count();

    int result = 0;
    if (num_files >= FLAGS_compaction_priority_start_bound) {
      result =
          1 +
          (num_files - FLAGS_compaction_priority_start_bound) / FLAGS_compaction_priority_step_size;
    }

    if (!db_impl_->IsLargeCompaction(*compaction_)) {
      result += FLAGS_small_compaction_extra_priority;
    }

    return result;
  }

  // Caches input file and byte counts for reporting via
  // GetFileAndByteInfoIfCompaction().
  void SetFileAndByteCount() {
    size_t levels = compaction_->num_input_levels();
    uint64_t file_count = 0;
    for (size_t i = 0; i < levels; i++) {
      file_count += compaction_->num_input_files(i);
    }
    file_count_ = file_count;
    byte_count_ = compaction_->CalculateTotalInputSize();
  }

  DBImpl::ManualCompaction* const manual_compaction_;
  std::unique_ptr<Compaction> compaction_holder_;
  Compaction* compaction_;  // Null once Complete() has been called.
  int priority_;
  yb::AtomicInt<int> job_id_{kNoJobId};
  uint64_t file_count_;
  uint64_t byte_count_;
};
380 | | |
// Background flush task executed on the priority thread pool.
// Holds a ColumnFamilyData pointer whose memtable is to be flushed.
class DBImpl::FlushTask : public ThreadPoolTask {
 public:
  FlushTask(DBImpl* db_impl, ColumnFamilyData* cfd)
      : ThreadPoolTask(db_impl), cfd_(cfd) {}

  bool ShouldRemoveWithKey(void* key) override {
    // Pending flushes are only dropped when the DB shuts down with flushing disabled.
    return key == db_impl_ && db_impl_->disable_flush_on_shutdown_;
  }

  void DoRun(yb::PriorityThreadPoolSuspender* suspender) override {
    // Since flush tasks have the highest priority, we don't need to use the
    // suspender for them.
    db_impl_->BackgroundCallFlush(cfd_);
  }

  int Priority() const override {
    return kFlushPriority;
  }

  void AbortedUnlocked(const Status& status) override {
    db_impl_->mutex_.AssertHeld();
    cfd_->set_pending_flush(false);
    // Drop the reference taken when this flush was scheduled.
    if (cfd_->Unref()) {
      delete cfd_;
    }
    if (--db_impl_->bg_flush_scheduled_ == 0) {
      // Wake up threads waiting for all flushes to drain (e.g. shutdown).
      db_impl_->bg_cv_.SignalAll();
    }
  }

  BgTaskType Type() const override {
    return BgTaskType::kFlush;
  }

  // Flush priority is constant, so there is never anything to update.
  bool UpdatePriority() override {
    return false;
  }

  std::string ToString() const override {
    return yb::Format("{ flush db: $0 serial_no: $1 }", db_impl_->GetName(), SerialNo());
  }

 private:
  ColumnFamilyData* cfd_;
};
425 | | |
426 | | // Utility class to update task priority. |
427 | | // We use two phase update to avoid calling thread pool while holding the mutex. |
428 | | class DBImpl::TaskPriorityUpdater { |
429 | | public: |
430 | | explicit TaskPriorityUpdater(DBImpl* db) |
431 | | : db_(db), |
432 | | priority_thread_pool_for_compactions_and_flushes_( |
433 | 958k | db_->db_options_.priority_thread_pool_for_compactions_and_flushes) {} |
434 | | |
435 | 958k | void Prepare() { |
436 | 958k | db_->mutex_.AssertHeld(); |
437 | 958k | for (auto* task : db_->compaction_tasks_) { |
438 | 3.81k | if (task->UpdatePriority()) { |
439 | 480 | update_priorities_request_.push_back({task->SerialNo(), task->Priority()}); |
440 | 480 | } |
441 | 3.81k | } |
442 | 958k | db_ = nullptr; |
443 | 958k | } |
444 | | |
445 | 0 | bool Empty() const { |
446 | 0 | return update_priorities_request_.empty(); |
447 | 0 | } |
448 | | |
449 | 958k | void Apply() { |
450 | 958k | for (const auto& entry : update_priorities_request_) { |
451 | 480 | priority_thread_pool_for_compactions_and_flushes_->ChangeTaskPriority( |
452 | 480 | entry.task_serial_no, entry.new_priority); |
453 | 480 | } |
454 | 958k | } |
455 | | |
456 | | private: |
457 | | DBImpl* db_; |
458 | | yb::PriorityThreadPool* priority_thread_pool_for_compactions_and_flushes_; |
459 | | boost::container::small_vector<TaskPriorityChange, 8> update_priorities_request_; |
460 | | }; |
461 | | |
462 | | Options SanitizeOptions(const std::string& dbname, |
463 | | const InternalKeyComparator* icmp, |
464 | 422k | const Options& src) { |
465 | 422k | auto db_options = SanitizeOptions(dbname, DBOptions(src)); |
466 | 422k | auto cf_options = SanitizeOptions(db_options, icmp, ColumnFamilyOptions(src)); |
467 | 422k | return Options(db_options, cf_options); |
468 | 422k | } |
469 | | |
470 | 857k | DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { |
471 | 857k | DBOptions result = src; |
472 | | |
473 | | // result.max_open_files means an "infinite" open files. |
474 | 857k | if (result.max_open_files != -1) { |
475 | 857k | int max_max_open_files = port::GetMaxOpenFiles(); |
476 | 857k | if (max_max_open_files == -1) { |
477 | 0 | max_max_open_files = 1000000; |
478 | 0 | } |
479 | 857k | ClipToRange(&result.max_open_files, 20, max_max_open_files); |
480 | 857k | } |
481 | | |
482 | 857k | if (result.info_log == nullptr) { |
483 | 21.0k | Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); |
484 | 21.0k | if (!s.ok()) { |
485 | | // No place suitable for logging |
486 | 2 | result.info_log = nullptr; |
487 | 2 | } |
488 | 21.0k | } |
489 | 857k | if (result.base_background_compactions == -1) { |
490 | 21.0k | result.base_background_compactions = result.max_background_compactions; |
491 | 21.0k | } |
492 | 857k | if (result.base_background_compactions > result.max_background_compactions) { |
493 | 1 | result.base_background_compactions = result.max_background_compactions; |
494 | 1 | } |
495 | 857k | if (result.base_background_compactions == 1) { |
496 | 20.9k | result.num_reserved_small_compaction_threads = 0; |
497 | 20.9k | } |
498 | 857k | if (result.num_reserved_small_compaction_threads == -1 || |
499 | 857k | result.num_reserved_small_compaction_threads >= result.base_background_compactions20.9k ) { |
500 | 836k | result.num_reserved_small_compaction_threads = result.base_background_compactions - 1; |
501 | 836k | } |
502 | 857k | result.env->IncBackgroundThreadsIfNeeded( |
503 | 857k | src.max_background_compactions, Env::Priority::LOW); |
504 | 857k | result.env->IncBackgroundThreadsIfNeeded( |
505 | 857k | src.max_background_flushes, Env::Priority::HIGH); |
506 | | |
507 | 857k | if (result.rate_limiter.get() != nullptr) { |
508 | 837k | if (result.bytes_per_sync == 0) { |
509 | 837k | result.bytes_per_sync = 1024 * 1024; |
510 | 837k | } |
511 | 837k | } |
512 | | |
513 | 858k | if (result.WAL_ttl_seconds > 0857k || result.WAL_size_limit_MB > 0) { |
514 | 147 | result.recycle_log_file_num = false; |
515 | 147 | } |
516 | | |
517 | 857k | if (result.wal_dir.empty()) { |
518 | | // Use dbname as default |
519 | 857k | result.wal_dir = dbname; |
520 | 857k | } |
521 | 857k | if (result.wal_dir.back() == '/') { |
522 | 5 | result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); |
523 | 5 | } |
524 | | |
525 | 857k | if (result.db_paths.size() == 0) { |
526 | 857k | result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max()); |
527 | 857k | } |
528 | | |
529 | 857k | if (result.compaction_readahead_size > 0) { |
530 | 272 | result.new_table_reader_for_compaction_inputs = true; |
531 | 272 | } |
532 | | |
533 | 857k | return result; |
534 | 857k | } |
535 | | |
536 | | namespace { |
537 | | |
538 | | Status SanitizeOptionsByTable( |
539 | | const DBOptions& db_opts, |
540 | 435k | const std::vector<ColumnFamilyDescriptor>& column_families) { |
541 | 435k | Status s; |
542 | 438k | for (auto cf : column_families) { |
543 | 438k | s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); |
544 | 438k | if (!s.ok()) { |
545 | 1 | return s; |
546 | 1 | } |
547 | 438k | } |
548 | 435k | return Status::OK(); |
549 | 435k | } |
550 | | |
551 | 63.3k | CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { |
552 | | // Compressing memtable flushes might not help unless the sequential load |
553 | | // optimization is used for leveled compaction. Otherwise the CPU and |
554 | | // latency overhead is not offset by saving much space. |
555 | | |
556 | 63.3k | bool can_compress; |
557 | | |
558 | 63.3k | if (ioptions.compaction_style == kCompactionStyleUniversal) { |
559 | 47.2k | can_compress = |
560 | 47.2k | (ioptions.compaction_options_universal.compression_size_percent < 0); |
561 | 47.2k | } else { |
562 | | // For leveled compress when min_level_to_compress == 0. |
563 | 16.1k | can_compress = ioptions.compression_per_level.empty() || |
564 | 16.1k | ioptions.compression_per_level[0] != kNoCompression990 ; |
565 | 16.1k | } |
566 | | |
567 | 63.3k | if (can_compress) { |
568 | 62.1k | return ioptions.compression; |
569 | 62.1k | } else { |
570 | 1.17k | return kNoCompression; |
571 | 1.17k | } |
572 | 63.3k | } |
573 | | |
// Logs which compression libraries and hardware CRC32 support were compiled in.
void DumpSupportInfo(Logger* logger) {
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "Compression algorithms supported:");
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tSnappy supported: %d",
      Snappy_Supported());
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tZlib supported: %d",
      Zlib_Supported());
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tBzip supported: %d",
      BZip2_Supported());
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tLZ4 supported: %d", LZ4_Supported());
  RLOG(InfoLogLevel::INFO_LEVEL, logger, "Fast CRC32 supported: %d",
      crc32c::IsFastCrc32Supported());
}
586 | | |
587 | | } // namespace |
588 | | |
589 | | DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) |
590 | | : env_(options.env), |
591 | | checkpoint_env_(options.get_checkpoint_env()), |
592 | | dbname_(dbname), |
593 | | db_options_(SanitizeOptions(dbname, options)), |
594 | | stats_(db_options_.statistics.get()), |
595 | | db_lock_(nullptr), |
596 | | mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, options.use_adaptive_mutex), |
597 | | shutting_down_(false), |
598 | | bg_cv_(&mutex_), |
599 | | logfile_number_(0), |
600 | | log_dir_synced_(false), |
601 | | log_empty_(true), |
602 | | default_cf_handle_(nullptr), |
603 | | log_sync_cv_(&mutex_), |
604 | | total_log_size_(0), |
605 | | max_total_in_memory_state_(0), |
606 | | is_snapshot_supported_(true), |
607 | | write_buffer_(options.db_write_buffer_size, options.memory_monitor), |
608 | | write_thread_(options.enable_write_thread_adaptive_yield |
609 | | ? options.write_thread_max_yield_usec |
610 | | : 0, |
611 | | options.write_thread_slow_yield_usec), |
612 | | write_controller_(options.delayed_write_rate), |
613 | | last_batch_group_size_(0), |
614 | | unscheduled_flushes_(0), |
615 | | unscheduled_compactions_(0), |
616 | | bg_compaction_scheduled_(0), |
617 | | num_total_running_compactions_(0), |
618 | | num_running_large_compactions_(0), |
619 | | bg_flush_scheduled_(0), |
620 | | num_running_flushes_(0), |
621 | | disable_delete_obsolete_files_(0), |
622 | | delete_obsolete_files_next_run_( |
623 | | options.env->NowMicros() + |
624 | | db_options_.delete_obsolete_files_period_micros), |
625 | | last_stats_dump_time_microsec_(0), |
626 | | next_job_id_(1), |
627 | | has_unpersisted_data_(false), |
628 | | env_options_(db_options_), |
629 | | #ifndef ROCKSDB_LITE |
630 | | wal_manager_(db_options_, env_options_), |
631 | | #endif // ROCKSDB_LITE |
632 | | event_logger_(db_options_.info_log.get()), |
633 | | bg_work_paused_(0), |
634 | | bg_compaction_paused_(0), |
635 | | refitting_level_(false), |
636 | 435k | opened_successfully_(false) { |
637 | 435k | CHECK_OK(env_->GetAbsolutePath(dbname, &db_absolute_path_)); |
638 | | |
639 | | // Reserve ten files or so for other uses and give the rest to TableCache. |
640 | | // Give a large number for setting of "infinite" open files. |
641 | 435k | const int table_cache_size = (db_options_.max_open_files == -1) ? |
642 | 435k | 4194304238 : db_options_.max_open_files - 10; |
643 | 435k | table_cache_ = |
644 | 435k | NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits); |
645 | | |
646 | 435k | versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, |
647 | 435k | table_cache_.get(), &write_buffer_, |
648 | 435k | &write_controller_)); |
649 | 435k | pending_outputs_ = std::make_unique<FileNumbersProvider>(versions_.get()); |
650 | 435k | column_family_memtables_.reset( |
651 | 435k | new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); |
652 | | |
653 | 435k | if (FLAGS_dump_dbimpl_info) { |
654 | 0 | DumpDBFileSummary(db_options_, dbname_); |
655 | 0 | db_options_.Dump(db_options_.info_log.get()); |
656 | 0 | DumpSupportInfo(db_options_.info_log.get()); |
657 | 0 | } |
658 | 435k | } |
659 | | |
660 | | // Will lock the mutex_, will wait for completion if wait is true |
661 | 40 | void DBImpl::CancelAllBackgroundWork(bool wait) { |
662 | 40 | InstrumentedMutexLock l(&mutex_); |
663 | 40 | shutting_down_.store(true, std::memory_order_release); |
664 | 40 | bg_cv_.SignalAll(); |
665 | 40 | if (!wait) { |
666 | 3 | return; |
667 | 3 | } |
668 | | // Wait for background work to finish |
669 | 37 | while (CheckBackgroundWorkAndLog("Cancel")) { |
670 | 0 | bg_cv_.Wait(); |
671 | 0 | } |
672 | 37 | } |
673 | | |
674 | 396k | bool DBImpl::CheckBackgroundWorkAndLog(const char* prefix) const { |
675 | 396k | if (bg_compaction_scheduled_ || bg_flush_scheduled_396k || !compaction_tasks_.empty()396k ) { |
676 | 251 | LOG_WITH_PREFIX(INFO) |
677 | 251 | << prefix << " waiting for " << bg_compaction_scheduled_ << " scheduled compactions, " |
678 | 251 | << compaction_tasks_.size() << " compaction tasks and " |
679 | 251 | << bg_flush_scheduled_ << " flushes"; |
680 | 251 | return true; |
681 | 251 | } |
682 | 396k | return false; |
683 | 396k | } |
684 | | |
685 | 777k | void DBImpl::StartShutdown() { |
686 | 777k | bool expected = false; |
687 | 777k | if (!shutting_down_.compare_exchange_strong(expected, true)) { |
688 | 382k | return; |
689 | 382k | } |
690 | | |
691 | 394k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Shutting down RocksDB at: %s\n", |
692 | 394k | dbname_.c_str()); |
693 | | |
694 | 394k | bg_cv_.SignalAll(); |
695 | | |
696 | 394k | TaskPriorityUpdater task_priority_updater(this); |
697 | 394k | { |
698 | 394k | InstrumentedMutexLock lock(&mutex_); |
699 | 394k | task_priority_updater.Prepare(); |
700 | 394k | } |
701 | 394k | task_priority_updater.Apply(); |
702 | 394k | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
703 | 383k | db_options_.priority_thread_pool_for_compactions_and_flushes->Remove(this); |
704 | 383k | } |
705 | 394k | } |
706 | | |
707 | 395k | DBImpl::~DBImpl() { |
708 | 395k | StartShutdown(); |
709 | | |
710 | 395k | TaskPriorityUpdater task_priority_updater(this); |
711 | 395k | { |
712 | 395k | InstrumentedMutexLock lock(&mutex_); |
713 | | |
714 | 395k | if (has_unpersisted_data_) { |
715 | 66.6k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
716 | 66.6k | if (!cfd->IsDropped()66.6k && !cfd->mem()->IsEmpty()) { |
717 | 65.7k | cfd->Ref(); |
718 | 65.7k | mutex_.Unlock(); |
719 | 65.7k | if (disable_flush_on_shutdown_) { |
720 | 64.9k | LOG_WITH_PREFIX(INFO) << "Skipping mem table flush - disable_flush_on_shutdown_ is set"; |
721 | 64.9k | } else if (787 FLAGS_flush_rocksdb_on_shutdown787 ) { |
722 | 705 | LOG_WITH_PREFIX(INFO) << "Flushing mem table on shutdown"; |
723 | 705 | CHECK_OK(FlushMemTable(cfd, FlushOptions())); |
724 | 705 | } else { |
725 | 82 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
726 | 82 | "Skipping mem table flush - flush_rocksdb_on_shutdown is unset"); |
727 | 82 | } |
728 | 65.7k | mutex_.Lock(); |
729 | 65.7k | cfd->Unref(); |
730 | 65.7k | } |
731 | 66.6k | } |
732 | 66.5k | versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); |
733 | 66.5k | } |
734 | 395k | task_priority_updater.Prepare(); |
735 | 395k | } |
736 | | |
737 | 395k | task_priority_updater.Apply(); |
738 | | |
739 | 395k | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
740 | 382k | db_options_.priority_thread_pool_for_compactions_and_flushes->Remove(this); |
741 | 382k | } |
742 | | |
743 | 395k | int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); |
744 | 395k | int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); |
745 | | |
746 | 395k | mutex_.Lock(); |
747 | 395k | bg_compaction_scheduled_ -= compactions_unscheduled; |
748 | 395k | bg_flush_scheduled_ -= flushes_unscheduled; |
749 | | |
750 | | // Wait for background work to finish |
751 | 396k | while (CheckBackgroundWorkAndLog("Shutdown")) { |
752 | | // Use timed wait for periodic status logging. |
753 | 251 | bg_cv_.TimedWait(env_->NowMicros() + yb::ToMicroseconds(5s)); |
754 | 251 | } |
755 | 395k | flush_scheduler_.Clear(); |
756 | | |
757 | 395k | while (!flush_queue_.empty()) { |
758 | 11 | auto cfd = PopFirstFromFlushQueue(); |
759 | 11 | if (cfd->Unref()) { |
760 | 0 | delete cfd; |
761 | 0 | } |
762 | 11 | } |
763 | | |
764 | 395k | ClearCompactionQueue(&small_compaction_queue_); |
765 | 395k | ClearCompactionQueue(&large_compaction_queue_); |
766 | | |
767 | 396k | if (default_cf_handle_ != nullptr395k ) { |
768 | | // we need to delete handle outside of lock because it does its own locking |
769 | 396k | mutex_.Unlock(); |
770 | 396k | delete default_cf_handle_; |
771 | 396k | mutex_.Lock(); |
772 | 396k | } |
773 | | |
774 | | // Clean up obsolete files due to SuperVersion release. |
775 | | // (1) Need to delete to obsolete files before closing because RepairDB() |
776 | | // scans all existing files in the file system and builds manifest file. |
777 | | // Keeping obsolete files confuses the repair process. |
778 | | // (2) Need to check if we Open()/Recover() the DB successfully before |
779 | | // deleting because if VersionSet recover fails (may be due to corrupted |
780 | | // manifest file), it is not able to identify live files correctly. As a |
781 | | // result, all "live" files can get deleted by accident. However, corrupted |
782 | | // manifest is recoverable by RepairDB(). |
783 | 395k | if (opened_successfully_) { |
784 | 395k | JobContext job_context(next_job_id_.fetch_add(1)); |
785 | 395k | FindObsoleteFiles(&job_context, true); |
786 | | |
787 | 395k | mutex_.Unlock(); |
788 | | // manifest number starting from 2 |
789 | 395k | job_context.manifest_file_number = 1; |
790 | 395k | if (job_context.HaveSomethingToDelete()) { |
791 | 395k | PurgeObsoleteFiles(job_context); |
792 | 395k | } |
793 | 395k | job_context.Clean(); |
794 | 395k | mutex_.Lock(); |
795 | 395k | } |
796 | | |
797 | 395k | for (auto l : logs_to_free_) { |
798 | 0 | delete l; |
799 | 0 | } |
800 | 395k | for (auto& log : logs_) { |
801 | 395k | log.ClearWriter(); |
802 | 395k | } |
803 | 395k | logs_.clear(); |
804 | | |
805 | | // versions need to be destroyed before table_cache since it can hold |
806 | | // references to table_cache. |
807 | 395k | versions_.reset(); |
808 | 395k | mutex_.Unlock(); |
809 | 395k | if (db_lock_ != nullptr) { |
810 | 395k | CHECK_OK(env_->UnlockFile(db_lock_)); |
811 | 395k | } |
812 | | |
813 | 395k | LogFlush(db_options_.info_log); |
814 | | |
815 | 395k | LOG_WITH_PREFIX(INFO) << "Shutdown done"; |
816 | 395k | } |
817 | | |
818 | 424k | Status DBImpl::NewDB() { |
819 | 424k | VersionEdit new_db; |
820 | 424k | new_db.InitNewDB(); |
821 | 424k | new_db.SetLastSequence(db_options_.initial_seqno); |
822 | | |
823 | 424k | Status s; |
824 | | |
825 | 424k | RLOG(InfoLogLevel::INFO_LEVEL, |
826 | 424k | db_options_.info_log, "Creating manifest 1 \n"); |
827 | 424k | const std::string manifest = DescriptorFileName(dbname_, 1); |
828 | 424k | { |
829 | 424k | unique_ptr<WritableFile> file; |
830 | 424k | EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); |
831 | 424k | s = NewWritableFile(env_, manifest, &file, env_options); |
832 | 424k | if (!s.ok()) { |
833 | 2 | return s; |
834 | 2 | } |
835 | 424k | file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); |
836 | 424k | unique_ptr<WritableFileWriter> file_writer( |
837 | 424k | new WritableFileWriter(std::move(file), env_options)); |
838 | 424k | log::Writer log(std::move(file_writer), 0, false); |
839 | 424k | std::string record; |
840 | 424k | new_db.AppendEncodedTo(&record); |
841 | 424k | s = log.AddRecord(record); |
842 | 424k | if (s.ok()) { |
843 | 424k | s = SyncManifest(env_, &db_options_, log.file()); |
844 | 424k | } |
845 | 424k | } |
846 | 424k | if (s.ok()) { |
847 | | // Make "CURRENT" file that points to the new manifest file. |
848 | 424k | s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir(), db_options_.disableDataSync); |
849 | 424k | } else { |
850 | 291 | env_->CleanupFile(manifest); |
851 | 291 | } |
852 | 424k | return s; |
853 | 424k | } |
854 | | |
855 | 2.95M | void DBImpl::MaybeIgnoreError(Status* s) const { |
856 | 2.95M | if (s->ok() || db_options_.paranoid_checks1 ) { |
857 | | // No change needed |
858 | 2.95M | } else { |
859 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, |
860 | 0 | db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); |
861 | 0 | *s = Status::OK(); |
862 | 0 | } |
863 | 2.95M | } |
864 | | |
865 | 435k | const Status DBImpl::CreateArchivalDirectory() { |
866 | 435k | if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0435k ) { |
867 | 98 | std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); |
868 | 98 | return env_->CreateDirIfMissing(archivalPath); |
869 | 98 | } |
870 | 435k | return Status::OK(); |
871 | 435k | } |
872 | | |
873 | | // * Returns the list of live files in 'sst_live' |
874 | | // If it's doing full scan: |
875 | | // * Returns the list of all files in the filesystem in |
876 | | // 'full_scan_candidate_files'. |
877 | | // Otherwise, gets obsolete files from VersionSet. |
878 | | // no_full_scan = true -- never do the full scan using GetChildren() |
879 | | // force = false -- don't force the full scan, except every |
880 | | // db_options_.delete_obsolete_files_period_micros |
881 | | // force = true -- force the full scan |
882 | | void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, |
883 | 1.85M | bool no_full_scan) { |
884 | 1.85M | mutex_.AssertHeld(); |
885 | | |
886 | | // if deletion is disabled, do nothing |
887 | 1.85M | if (disable_delete_obsolete_files_ > 0) { |
888 | 1.24k | return; |
889 | 1.24k | } |
890 | | |
891 | 1.85M | bool doing_the_full_scan = false; |
892 | | |
893 | | // logic for figurint out if we're doing the full scan |
894 | 1.85M | if (no_full_scan) { |
895 | 839k | doing_the_full_scan = false; |
896 | 1.01M | } else if (force || db_options_.delete_obsolete_files_period_micros == 0165k ) { |
897 | 852k | doing_the_full_scan = true; |
898 | 852k | } else { |
899 | 165k | const uint64_t now_micros = env_->NowMicros(); |
900 | 165k | if (delete_obsolete_files_next_run_ < now_micros) { |
901 | 0 | doing_the_full_scan = true; |
902 | 0 | delete_obsolete_files_next_run_ = |
903 | 0 | now_micros + db_options_.delete_obsolete_files_period_micros; |
904 | 0 | } |
905 | 165k | } |
906 | | |
907 | | // Get obsolete files. This function will also update the list of |
908 | | // pending files in VersionSet(). |
909 | 1.85M | versions_->GetObsoleteFiles(*pending_outputs_, |
910 | 1.85M | &job_context->sst_delete_files, |
911 | 1.85M | &job_context->manifest_delete_files); |
912 | | |
913 | | // store the current filenum, lognum, etc |
914 | 1.85M | job_context->manifest_file_number = versions_->manifest_file_number(); |
915 | 1.85M | job_context->pending_manifest_file_number = |
916 | 1.85M | versions_->pending_manifest_file_number(); |
917 | 1.85M | job_context->log_number = versions_->MinLogNumber(); |
918 | 1.85M | job_context->prev_log_number = versions_->prev_log_number(); |
919 | | |
920 | 1.85M | versions_->AddLiveFiles(&job_context->sst_live); |
921 | 1.85M | if (doing_the_full_scan) { |
922 | 852k | InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); |
923 | 1.70M | for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++851k ) { |
924 | | // set of all files in the directory. We'll exclude files that are still |
925 | | // alive in the subsequent processings. |
926 | 851k | std::vector<std::string> files; |
927 | 851k | env_->GetChildrenWarnNotOk(db_options_.db_paths[path_id].path, &files); |
928 | 7.42M | for (std::string file : files) { |
929 | 7.42M | uint64_t number; |
930 | 7.42M | FileType type; |
931 | 7.42M | if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) || |
932 | 7.42M | pending_outputs_->HasFileNumber(number)5.72M ) { |
933 | 1.73M | continue; |
934 | 1.73M | } |
935 | | // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes |
936 | 5.69M | job_context->full_scan_candidate_files.emplace_back( |
937 | 5.69M | "/" + file, static_cast<uint32_t>(path_id)); |
938 | 5.69M | } |
939 | 851k | } |
940 | | |
941 | | // Add log files in wal_dir |
942 | 852k | if (db_options_.wal_dir != dbname_) { |
943 | 818 | std::vector<std::string> log_files; |
944 | 818 | env_->GetChildrenWarnNotOk(db_options_.wal_dir, &log_files); |
945 | 2.98k | for (std::string log_file : log_files) { |
946 | 2.98k | job_context->full_scan_candidate_files.emplace_back(log_file, 0); |
947 | 2.98k | } |
948 | 818 | } |
949 | | // Add info log files in db_log_dir |
950 | 852k | if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_441 ) { |
951 | 441 | std::vector<std::string> info_log_files; |
952 | | // Ignore errors |
953 | 441 | env_->GetChildrenWarnNotOk(db_options_.db_log_dir, &info_log_files); |
954 | 12.1k | for (std::string log_file : info_log_files) { |
955 | 12.1k | job_context->full_scan_candidate_files.emplace_back(log_file, 0); |
956 | 12.1k | } |
957 | 441 | } |
958 | 852k | } |
959 | | |
960 | 1.85M | if (!alive_log_files_.empty()) { |
961 | 1.85M | uint64_t min_log_number = versions_->MinLogNumber(); |
962 | | // find newly obsoleted log files |
963 | 1.87M | while (alive_log_files_.begin()->number < min_log_number) { |
964 | 20.1k | auto& earliest = *alive_log_files_.begin(); |
965 | 20.1k | if (db_options_.recycle_log_file_num > log_recycle_files.size()) { |
966 | 83 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
967 | 83 | "adding log %" PRIu64 " to recycle list\n", earliest.number); |
968 | 83 | log_recycle_files.push_back(earliest.number); |
969 | 20.1k | } else { |
970 | 20.1k | job_context->log_delete_files.push_back(earliest.number); |
971 | 20.1k | } |
972 | 20.1k | total_log_size_.fetch_sub(static_cast<int64_t>(earliest.size)); |
973 | 20.1k | alive_log_files_.pop_front(); |
974 | | // Current log should always stay alive since it can't have |
975 | | // number < MinLogNumber(). |
976 | 20.1k | DCHECK(alive_log_files_.size()); |
977 | 20.1k | } |
978 | 1.87M | while (!logs_.empty() && logs_.front().number < min_log_number1.87M ) { |
979 | 20.1k | auto& log = logs_.front(); |
980 | 20.1k | if (log.getting_synced) { |
981 | 0 | log_sync_cv_.Wait(); |
982 | | // logs_ could have changed while we were waiting. |
983 | 0 | continue; |
984 | 0 | } |
985 | 20.1k | logs_to_free_.push_back(log.ReleaseWriter()); |
986 | 20.1k | logs_.pop_front(); |
987 | 20.1k | } |
988 | | // Current log cannot be obsolete. |
989 | 1.85M | DCHECK(!logs_.empty()); |
990 | 1.85M | } |
991 | | |
992 | | // We're just cleaning up for DB::Write(). |
993 | 1.85M | DCHECK(job_context->logs_to_free.empty()); |
994 | 1.85M | job_context->logs_to_free = logs_to_free_; |
995 | 1.85M | logs_to_free_.clear(); |
996 | 1.85M | } |
997 | | |
998 | | namespace { |
999 | | bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, |
1000 | 15.5M | const JobContext::CandidateFileInfo& second) { |
1001 | 15.5M | if (first.file_name > second.file_name) { |
1002 | 9.00M | return true; |
1003 | 9.00M | } else if (6.53M first.file_name < second.file_name6.53M ) { |
1004 | 5.78M | return false; |
1005 | 5.78M | } else { |
1006 | 743k | return (first.path_id > second.path_id); |
1007 | 743k | } |
1008 | 15.5M | } |
1009 | | }; // namespace |
1010 | | |
1011 | | // Diffs the files listed in filenames and those that do not |
1012 | | // belong to live files are posibly removed. Also, removes all the |
1013 | | // files in sst_delete_files and log_delete_files. |
1014 | | // It is not necessary to hold the mutex when invoking this method. |
1015 | 1.27M | void DBImpl::PurgeObsoleteFiles(const JobContext& state) { |
1016 | | // we'd better have sth to delete |
1017 | 1.27M | assert(state.HaveSomethingToDelete()); |
1018 | | |
1019 | | // this checks if FindObsoleteFiles() was run before. If not, don't do |
1020 | | // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also |
1021 | | // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true |
1022 | 1.27M | if (state.manifest_file_number == 0) { |
1023 | 1.20k | return; |
1024 | 1.20k | } |
1025 | | |
1026 | | // Now, convert live list to an unordered map, WITHOUT mutex held; |
1027 | | // set is slow. |
1028 | 1.27M | std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map; |
1029 | 2.44M | for (const FileDescriptor& fd : state.sst_live) { |
1030 | 2.44M | sst_live_map[fd.GetNumber()] = &fd; |
1031 | 2.44M | } |
1032 | | |
1033 | 1.27M | auto candidate_files = state.full_scan_candidate_files; |
1034 | 1.27M | candidate_files.reserve( |
1035 | 1.27M | candidate_files.size() + state.sst_delete_files.size() + |
1036 | 1.27M | state.log_delete_files.size() + state.manifest_delete_files.size()); |
1037 | | // We may ignore the dbname when generating the file names. |
1038 | 1.27M | const char* kDumbDbName = ""; |
1039 | 1.27M | for (auto file : state.sst_delete_files) { |
1040 | | // We only put base SST file in candidate_files |
1041 | 61.7k | candidate_files.emplace_back( |
1042 | 61.7k | MakeTableFileName(kDumbDbName, file->fd.GetNumber()), |
1043 | 61.7k | file->fd.GetPathId()); |
1044 | 61.7k | delete file; |
1045 | 61.7k | } |
1046 | | |
1047 | 1.27M | for (auto file_num : state.log_delete_files) { |
1048 | 20.1k | if (file_num > 0) { |
1049 | 20.1k | candidate_files.emplace_back(LogFileName(kDumbDbName, file_num).substr(1), |
1050 | 20.1k | 0); |
1051 | 20.1k | } |
1052 | 20.1k | } |
1053 | 1.27M | for (const auto& filename : state.manifest_delete_files) { |
1054 | 266k | candidate_files.emplace_back(filename, 0); |
1055 | 266k | } |
1056 | | |
1057 | | // dedup state.candidate_files so we don't try to delete the same |
1058 | | // file twice |
1059 | 1.27M | sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile); |
1060 | 1.27M | candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()), |
1061 | 1.27M | candidate_files.end()); |
1062 | | |
1063 | 1.27M | std::vector<std::string> old_info_log_files; |
1064 | 1.27M | InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); |
1065 | 6.05M | for (const auto& candidate_file : candidate_files) { |
1066 | 6.05M | std::string to_delete = candidate_file.file_name; |
1067 | 6.05M | uint32_t path_id = candidate_file.path_id; |
1068 | 6.05M | uint64_t number; |
1069 | 6.05M | FileType type; |
1070 | | // Ignore file if we cannot recognize it. |
1071 | 6.05M | if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) { |
1072 | 2.57k | continue; |
1073 | 2.57k | } |
1074 | | |
1075 | 6.04M | bool keep = true; |
1076 | 6.04M | switch (type) { |
1077 | 893k | case kLogFile: |
1078 | 893k | keep = ((number >= state.log_number) || |
1079 | 893k | (number == state.prev_log_number)30.0k ); |
1080 | 893k | break; |
1081 | 1.12M | case kDescriptorFile: |
1082 | | // Keep my manifest file, and any newer incarnations' |
1083 | | // (can happen during manifest roll) |
1084 | 1.12M | keep = (number >= state.manifest_file_number); |
1085 | 1.12M | break; |
1086 | 164k | case kTableFile: |
1087 | | // If the second condition is not there, this makes |
1088 | | // DontDeletePendingOutputs fail |
1089 | 164k | keep = (sst_live_map.find(number) != sst_live_map.end()) || |
1090 | 164k | pending_outputs_->HasFileNumber(number)49.5k ; |
1091 | 164k | break; |
1092 | 101k | case kTableSBlockFile: |
1093 | | // Just skip, since we will process SST data file during processing of corresponding |
1094 | | // SST base file. |
1095 | 101k | keep = true; |
1096 | 101k | break; |
1097 | 62 | case kTempFile: |
1098 | | // Any temp files that are currently being written to must |
1099 | | // be recorded in pending_outputs_, which is inserted into "live". |
1100 | | // Also, SetCurrentFile creates a temp file when writing out new |
1101 | | // manifest, which is equal to state.pending_manifest_file_number. We |
1102 | | // should not delete that file |
1103 | | // |
1104 | | // TODO(yhchiang): carefully modify the third condition to safely |
1105 | | // remove the temp options files. |
1106 | 62 | keep = (sst_live_map.find(number) != sst_live_map.end()) || |
1107 | 62 | (number == state.pending_manifest_file_number) || |
1108 | 62 | (to_delete.find(kOptionsFileNamePrefix) != std::string::npos)56 ; |
1109 | 62 | break; |
1110 | 393k | case kInfoLogFile: |
1111 | 393k | keep = true; |
1112 | 393k | if (number != 0) { |
1113 | 348k | old_info_log_files.push_back(to_delete); |
1114 | 348k | } |
1115 | 393k | break; |
1116 | 851k | case kCurrentFile: |
1117 | 1.70M | case kDBLockFile: |
1118 | 2.55M | case kIdentityFile: |
1119 | 2.55M | case kMetaDatabase: |
1120 | 3.37M | case kOptionsFile: |
1121 | 3.37M | keep = true; |
1122 | 3.37M | break; |
1123 | 6.04M | } |
1124 | | |
1125 | 6.04M | if (keep) { |
1126 | 5.70M | continue; |
1127 | 5.70M | } |
1128 | | |
1129 | 348k | std::string fname; |
1130 | 348k | if (type == kTableFile) { |
1131 | | // evict from cache |
1132 | 49.5k | TableCache::Evict(table_cache_.get(), number); |
1133 | 49.5k | fname = TableFileName(db_options_.db_paths, number, path_id); |
1134 | 298k | } else { |
1135 | 298k | fname = ((type == kLogFile) ? |
1136 | 268k | db_options_.wal_dir30.0k : dbname_) + "/" + to_delete; |
1137 | 298k | } |
1138 | | |
1139 | 348k | #ifndef ROCKSDB_LITE |
1140 | 348k | if (type == kLogFile && (30.0k db_options_.WAL_ttl_seconds > 030.0k || |
1141 | 30.0k | db_options_.WAL_size_limit_MB > 029.9k )) { |
1142 | 108 | wal_manager_.ArchiveWALFile(fname, number); |
1143 | 108 | continue; |
1144 | 108 | } |
1145 | 348k | #endif // !ROCKSDB_LITE |
1146 | 348k | Status file_deletion_status; |
1147 | 348k | if (type == kTableFile) { |
1148 | 49.5k | file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id); |
1149 | 49.5k | const std::string data_fname = TableBaseToDataFileName(fname); |
1150 | 49.5k | if (file_deletion_status.ok()) { |
1151 | | // Delete corresponding data file if exists. |
1152 | 48.3k | Status s = db_options_.env->FileExists(data_fname); |
1153 | 48.3k | if (s.ok()) { |
1154 | 47.4k | file_deletion_status = DeleteSSTFile(&db_options_, data_fname, path_id); |
1155 | 47.4k | } else if (854 !s.IsNotFound()854 ) { |
1156 | 0 | file_deletion_status = s; |
1157 | 0 | } |
1158 | 48.3k | } |
1159 | 298k | } else { |
1160 | 298k | file_deletion_status = env_->DeleteFile(fname); |
1161 | 298k | } |
1162 | 348k | if (file_deletion_status.ok()) { |
1163 | 346k | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
1164 | 346k | "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id, |
1165 | 346k | fname.c_str(), type, number, |
1166 | 346k | file_deletion_status.ToString().c_str()); |
1167 | 346k | } else if (1.91k env_->FileExists(fname).IsNotFound()1.91k ) { |
1168 | 2.10k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1169 | 2.10k | "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 |
1170 | 2.10k | " -- %s\n", |
1171 | 2.10k | state.job_id, fname.c_str(), type, number, |
1172 | 2.10k | file_deletion_status.ToString().c_str()); |
1173 | 18.4E | } else { |
1174 | 18.4E | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
1175 | 18.4E | "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", |
1176 | 18.4E | state.job_id, fname.c_str(), type, number, |
1177 | 18.4E | file_deletion_status.ToString().c_str()); |
1178 | 18.4E | } |
1179 | 348k | if (type == kTableFile) { |
1180 | 49.5k | EventHelpers::LogAndNotifyTableFileDeletion( |
1181 | 49.5k | &event_logger_, state.job_id, number, fname, |
1182 | 49.5k | file_deletion_status, GetName(), |
1183 | 49.5k | db_options_.listeners); |
1184 | 49.5k | } |
1185 | 348k | } |
1186 | | |
1187 | | // Delete old info log files. |
1188 | 1.27M | size_t old_info_log_file_count = old_info_log_files.size(); |
1189 | 1.27M | if (old_info_log_file_count != 0 && |
1190 | 1.27M | old_info_log_file_count >= db_options_.keep_log_file_num12.8k ) { |
1191 | 0 | std::sort(old_info_log_files.begin(), old_info_log_files.end()); |
1192 | 0 | size_t end = old_info_log_file_count - db_options_.keep_log_file_num; |
1193 | 0 | for (unsigned int i = 0; i <= end; i++) { |
1194 | 0 | std::string& to_delete = old_info_log_files.at(i); |
1195 | 0 | std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? |
1196 | 0 | dbname_ : db_options_.db_log_dir) + "/" + to_delete; |
1197 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1198 | 0 | "[JOB %d] Delete info log file %s\n", state.job_id, |
1199 | 0 | full_path_to_delete.c_str()); |
1200 | 0 | Status s = env_->DeleteFile(full_path_to_delete); |
1201 | 0 | if (!s.ok()) { |
1202 | 0 | if (env_->FileExists(full_path_to_delete).IsNotFound()) { |
1203 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1204 | 0 | "[JOB %d] Tried to delete non-existing info log file %s FAILED " |
1205 | 0 | "-- %s\n", |
1206 | 0 | state.job_id, to_delete.c_str(), s.ToString().c_str()); |
1207 | 0 | } else { |
1208 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
1209 | 0 | "[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id, |
1210 | 0 | to_delete.c_str(), s.ToString().c_str()); |
1211 | 0 | } |
1212 | 0 | } |
1213 | 0 | } |
1214 | 0 | } |
1215 | 1.27M | #ifndef ROCKSDB_LITE |
1216 | 1.27M | wal_manager_.PurgeObsoleteWALFiles(); |
1217 | 1.27M | #endif // ROCKSDB_LITE |
1218 | 1.27M | LogFlush(db_options_.info_log); |
1219 | 1.27M | } |
1220 | | |
1221 | 435k | void DBImpl::DeleteObsoleteFiles() { |
1222 | 435k | mutex_.AssertHeld(); |
1223 | 435k | JobContext job_context(next_job_id_.fetch_add(1)); |
1224 | 435k | FindObsoleteFiles(&job_context, true); |
1225 | | |
1226 | 435k | mutex_.Unlock(); |
1227 | 435k | if (job_context.HaveSomethingToDelete()) { |
1228 | 434k | PurgeObsoleteFiles(job_context); |
1229 | 434k | } |
1230 | 435k | job_context.Clean(); |
1231 | 435k | mutex_.Lock(); |
1232 | 435k | } |
1233 | | |
1234 | | Status DBImpl::Directories::CreateAndNewDirectory( |
1235 | | Env* env, const std::string& dirname, |
1236 | 435k | std::unique_ptr<Directory>* directory) const { |
1237 | | // We call CreateDirIfMissing() as the directory may already exist (if we |
1238 | | // are reopening a DB), when this happens we don't want creating the |
1239 | | // directory to cause an error. However, we need to check if creating the |
1240 | | // directory fails or else we may get an obscure message about the lock |
1241 | | // file not existing. One real-world example of this occurring is if |
1242 | | // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. |
1243 | | // when dbname_ is "dir/db" but when "dir" doesn't exist. |
1244 | 435k | Status s = env->CreateDirIfMissing(dirname); |
1245 | 435k | if (!s.ok()) { |
1246 | 0 | return s; |
1247 | 0 | } |
1248 | 435k | return env->NewDirectory(dirname, directory); |
1249 | 435k | } |
1250 | | |
1251 | | Status DBImpl::Directories::SetDirectories( |
1252 | | Env* env, const std::string& dbname, const std::string& wal_dir, |
1253 | 435k | const std::vector<DbPath>& data_paths) { |
1254 | 435k | Status s = CreateAndNewDirectory(env, dbname, &db_dir_); |
1255 | 435k | if (!s.ok()) { |
1256 | 0 | return s; |
1257 | 0 | } |
1258 | 435k | if (!wal_dir.empty() && dbname != wal_dir435k ) { |
1259 | 413 | s = CreateAndNewDirectory(env, wal_dir, &wal_dir_); |
1260 | 413 | if (!s.ok()) { |
1261 | 0 | return s; |
1262 | 0 | } |
1263 | 413 | } |
1264 | | |
1265 | 435k | data_dirs_.clear(); |
1266 | 435k | for (auto& p : data_paths) { |
1267 | 435k | const std::string db_path = p.path; |
1268 | 435k | if (db_path == dbname435k ) { |
1269 | 435k | data_dirs_.emplace_back(nullptr); |
1270 | 18.4E | } else { |
1271 | 18.4E | std::unique_ptr<Directory> path_directory; |
1272 | 18.4E | s = CreateAndNewDirectory(env, db_path, &path_directory); |
1273 | 18.4E | if (!s.ok()) { |
1274 | 0 | return s; |
1275 | 0 | } |
1276 | 18.4E | data_dirs_.emplace_back(path_directory.release()); |
1277 | 18.4E | } |
1278 | 435k | } |
1279 | 435k | assert(data_dirs_.size() == data_paths.size()); |
1280 | 0 | return Status::OK(); |
1281 | 435k | } |
1282 | | |
1283 | 71.0k | Directory* DBImpl::Directories::GetDataDir(size_t path_id) { |
1284 | 71.0k | assert(path_id < data_dirs_.size()); |
1285 | 0 | Directory* ret_dir = data_dirs_[path_id].get(); |
1286 | 71.0k | if (ret_dir == nullptr) { |
1287 | | // Should use db_dir_ |
1288 | 70.8k | return db_dir_.get(); |
1289 | 70.8k | } |
1290 | 259 | return ret_dir; |
1291 | 71.0k | } |
1292 | | |
1293 | | Status DBImpl::Recover( |
1294 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
1295 | 435k | bool error_if_log_file_exist) { |
1296 | 435k | mutex_.AssertHeld(); |
1297 | | |
1298 | 435k | bool is_new_db = false; |
1299 | 435k | assert(db_lock_ == nullptr); |
1300 | 435k | if (!read_only435k ) { |
1301 | 435k | Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir, |
1302 | 435k | db_options_.db_paths); |
1303 | 435k | if (!s.ok()) { |
1304 | 0 | return s; |
1305 | 0 | } |
1306 | | |
1307 | 435k | s = env_->LockFile(LockFileName(dbname_), &db_lock_); |
1308 | 435k | if (!s.ok()) { |
1309 | 5 | return s; |
1310 | 5 | } |
1311 | | |
1312 | 435k | s = env_->FileExists(CurrentFileName(dbname_)); |
1313 | 435k | if (s.IsNotFound()) { |
1314 | 424k | if (db_options_.create_if_missing) { |
1315 | 424k | s = NewDB(); |
1316 | 424k | is_new_db = true; |
1317 | 424k | if (!s.ok()) { |
1318 | 2 | return s; |
1319 | 2 | } |
1320 | 424k | } else { |
1321 | 6 | return STATUS(InvalidArgument, |
1322 | 6 | dbname_, "does not exist (create_if_missing is false)"); |
1323 | 6 | } |
1324 | 424k | } else if (10.5k s.ok()10.5k ) { |
1325 | 10.6k | if (db_options_.error_if_exists) { |
1326 | 1 | return STATUS(InvalidArgument, |
1327 | 1 | dbname_, "exists (error_if_exists is true)"); |
1328 | 1 | } |
1329 | 18.4E | } else { |
1330 | | // Unexpected error reading file |
1331 | 18.4E | assert(s.IsIOError()); |
1332 | 0 | return s; |
1333 | 18.4E | } |
1334 | | // Check for the IDENTITY file and create it if not there |
1335 | 435k | s = env_->FileExists(IdentityFileName(dbname_)); |
1336 | 435k | if (s.IsNotFound()) { |
1337 | 428k | s = SetIdentityFile(env_, dbname_); |
1338 | 428k | if (!s.ok()) { |
1339 | 1 | return s; |
1340 | 1 | } |
1341 | 428k | } else if (7.03k !s.ok()7.03k ) { |
1342 | 0 | assert(s.IsIOError()); |
1343 | 0 | return s; |
1344 | 0 | } |
1345 | 435k | } |
1346 | | |
1347 | 435k | Status s = versions_->Recover(column_families, read_only); |
1348 | 435k | if (db_options_.paranoid_checks435k && s.ok()) { |
1349 | 435k | s = CheckConsistency(); |
1350 | 435k | } |
1351 | 435k | if (s.ok()) { |
1352 | 435k | SequenceNumber max_sequence(kMaxSequenceNumber); |
1353 | 435k | default_cf_handle_ = new ColumnFamilyHandleImpl( |
1354 | 435k | versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); |
1355 | 435k | default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); |
1356 | 435k | single_column_family_mode_ = |
1357 | 435k | versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; |
1358 | | |
1359 | | // Recover from all newer log files than the ones named in the |
1360 | | // descriptor (new log files may have been added by the previous |
1361 | | // incarnation without registering them in the descriptor). |
1362 | | // |
1363 | | // Note that prev_log_number() is no longer used, but we pay |
1364 | | // attention to it in case we are recovering a database |
1365 | | // produced by an older version of rocksdb. |
1366 | 435k | const uint64_t min_log = versions_->MinLogNumber(); |
1367 | 435k | const uint64_t prev_log = versions_->prev_log_number(); |
1368 | 435k | std::vector<std::string> filenames; |
1369 | 435k | s = env_->GetChildren(db_options_.wal_dir, &filenames); |
1370 | 435k | if (!s.ok()) { |
1371 | 0 | return s; |
1372 | 0 | } |
1373 | | |
1374 | 435k | std::vector<uint64_t> logs; |
1375 | 3.28M | for (size_t i = 0; i < filenames.size(); i++2.85M ) { |
1376 | 2.85M | uint64_t number; |
1377 | 2.85M | FileType type; |
1378 | 2.85M | if (ParseFileName(filenames[i], &number, &type) && type == kLogFile1.80M ) { |
1379 | 10.2k | if (is_new_db) { |
1380 | 30 | return STATUS(Corruption, |
1381 | 30 | "While creating a new Db, wal_dir contains " |
1382 | 30 | "existing log file: ", |
1383 | 30 | filenames[i]); |
1384 | 10.2k | } else if ((number >= min_log) || (number == prev_log)31 ) { |
1385 | 10.2k | logs.push_back(number); |
1386 | 10.2k | } |
1387 | 10.2k | } |
1388 | 2.85M | } |
1389 | | |
1390 | 435k | if (logs.size() > 0 && error_if_log_file_exist7.07k ) { |
1391 | 0 | return STATUS(Corruption, "" |
1392 | 0 | "The db was opened in readonly mode with error_if_log_file_exist" |
1393 | 0 | "flag but a log file already exists"); |
1394 | 0 | } |
1395 | | |
1396 | 435k | if (!logs.empty()) { |
1397 | | // Recover in the order in which the logs were generated |
1398 | 7.07k | std::sort(logs.begin(), logs.end()); |
1399 | 7.07k | s = RecoverLogFiles(logs, &max_sequence, read_only); |
1400 | 7.07k | if (!s.ok()) { |
1401 | | // Clear memtables if recovery failed |
1402 | 116 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1403 | 116 | cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), |
1404 | 116 | kMaxSequenceNumber); |
1405 | 116 | } |
1406 | 114 | } |
1407 | 7.07k | } |
1408 | | |
1409 | 435k | SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); |
1410 | 435k | } |
1411 | | |
1412 | | // Initial value |
1413 | 435k | max_total_in_memory_state_ = 0; |
1414 | 438k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1415 | 438k | auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
1416 | 438k | max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * |
1417 | 438k | mutable_cf_options->max_write_buffer_number; |
1418 | 438k | } |
1419 | | |
1420 | 435k | return s; |
1421 | 435k | } |
1422 | | |
// Replays the WAL files listed in 'log_numbers' into the memtables and,
// unless 'read_only' is set, flushes the recovered data to level-0 files and
// records the result in the MANIFEST.
//
// REQUIRES: mutex_ is held; log_numbers are sorted in ascending order.
// *max_sequence is updated to the largest sequence number replayed.
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
                               SequenceNumber* max_sequence, bool read_only) {
  // Forwards read errors from log::Reader. When 'status' is nullptr the error
  // is logged and dropped (paranoid_checks off, or corrupted records are being
  // skipped); otherwise only the FIRST error is latched into '*status'.
  struct LogReporter : public log::Reader::Reporter {
    Env* env;
    Logger* info_log;
    const char* fname;
    Status* status;  // nullptr if db_options_.paranoid_checks==false
    void Corruption(size_t bytes, const Status& s) override {
      RLOG(InfoLogLevel::WARN_LEVEL,
          info_log, "%s%s: dropping %d bytes; %s",
          (this->status == nullptr ? "(ignoring error) " : ""),
          fname, static_cast<int>(bytes), s.ToString().c_str());
      // Keep only the first error reported.
      if (this->status != nullptr && this->status->ok()) {
        *this->status = s;
      }
    }
  };

  mutex_.AssertHeld();
  Status status;
  // One VersionEdit per column family, keyed by CF ID; accumulates the
  // level-0 files produced while replaying.
  std::unordered_map<int, VersionEdit> version_edits;
  // no need to refcount because iteration is under mutex
  for (auto cfd : *versions_->GetColumnFamilySet()) {
    VersionEdit edit;
    edit.SetColumnFamily(cfd->GetID());
    // Carry the flushed frontier forward so it is not lost across recovery.
    auto frontier = versions_->FlushedFrontier();
    if (frontier) {
      edit.UpdateFlushedFrontier(frontier->Clone());
    }
    version_edits.insert({cfd->GetID(), edit});
  }
  int job_id = next_job_id_.fetch_add(1);
  {
    auto stream = event_logger_.Log();
    stream << "job" << job_id << "event"
           << "recovery_started";
    stream << "log_files";
    stream.StartArray();
    for (auto log_number : log_numbers) {
      stream << log_number;
    }
    stream.EndArray();
  }

  // Cleared when point-in-time recovery (or a WAL filter's kStopReplay)
  // decides replay must stop; the remaining logs are then only scanned to
  // report how much data is dropped.
  bool continue_replay_log = true;
  for (auto log_number : log_numbers) {
    // The previous incarnation may not have written any MANIFEST
    // records after allocating this log number. So we manually
    // update the file number allocation counter in VersionSet.
    versions_->MarkFileNumberUsedDuringRecovery(log_number);
    // Open the log file
    std::string fname = LogFileName(db_options_.wal_dir, log_number);
    unique_ptr<SequentialFileReader> file_reader;
    {
      unique_ptr<SequentialFile> file;
      status = env_->NewSequentialFile(fname, &file, env_options_);
      if (!status.ok()) {
        MaybeIgnoreError(&status);
        if (!status.ok()) {
          return status;
        } else {
          // Fail with one log file, but that's ok.
          // Try next one.
          continue;
        }
      }
      file_reader.reset(new SequentialFileReader(std::move(file)));
    }

    // Create the log reader.
    LogReporter reporter;
    reporter.env = env_;
    reporter.info_log = db_options_.info_log.get();
    reporter.fname = fname.c_str();
    if (!db_options_.paranoid_checks ||
        db_options_.wal_recovery_mode ==
            WALRecoveryMode::kSkipAnyCorruptedRecords) {
      reporter.status = nullptr;
    } else {
      reporter.status = &status;
    }
    // We intentionally make log::Reader do checksumming even if
    // paranoid_checks==false so that corruptions cause entire commits
    // to be skipped instead of propagating bad information (like overly
    // large sequence numbers).
    log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
                       true /*checksum*/, 0 /*initial_offset*/, log_number);
    RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
        db_options_.wal_recovery_mode, !continue_replay_log);

    // Read all the records and add to a memtable
    std::string scratch;
    Slice record;
    WriteBatch batch;

    if (!continue_replay_log) {
      // Replay already stopped (e.g. point-in-time recovery); just report how
      // much data in this log file is being dropped.
      uint64_t bytes;
      if (env_->GetFileSize(fname, &bytes).ok()) {
        auto info_log = db_options_.info_log.get();
        RLOG(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes",
            fname.c_str(), static_cast<int>(bytes));
      }
    }

    while (
        continue_replay_log &&
        reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) &&
        status.ok()) {
      if (record.size() < 12) {
        reporter.Corruption(record.size(),
                            STATUS(Corruption, "log record too small"));
        continue;
      }
      WriteBatchInternal::SetContents(&batch, record);

#ifndef ROCKSDB_LITE
      // Give the user-supplied WAL filter a chance to keep, drop, rewrite, or
      // stop at this record.
      if (db_options_.wal_filter != nullptr) {
        WriteBatch new_batch;
        bool batch_changed = false;

        WalFilter::WalProcessingOption wal_processing_option =
            db_options_.wal_filter->LogRecord(batch, &new_batch,
                                              &batch_changed);

        switch (wal_processing_option) {
          case WalFilter::WalProcessingOption::kContinueProcessing:
            // do nothing, proceed normally
            break;
          case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
            // skip current record
            continue;
          case WalFilter::WalProcessingOption::kStopReplay:
            // skip current record and stop replay
            continue_replay_log = false;
            continue;
          case WalFilter::WalProcessingOption::kCorruptedRecord: {
            status = STATUS(Corruption, "Corruption reported by Wal Filter ",
                            db_options_.wal_filter->Name());
            MaybeIgnoreError(&status);
            if (!status.ok()) {
              reporter.Corruption(record.size(), status);
              continue;
            }
            break;
          }
          default: {
            assert(false);  // unhandled case
            status = STATUS(NotSupported,
                            "Unknown WalProcessingOption returned"
                            " by Wal Filter ",
                            db_options_.wal_filter->Name());
            MaybeIgnoreError(&status);
            if (!status.ok()) {
              return status;
            } else {
              // Ignore the error with current record processing.
              continue;
            }
          }
        }

        if (batch_changed) {
          // Make sure that the count in the new batch is
          // within the original count.
          int new_count = WriteBatchInternal::Count(&new_batch);
          int original_count = WriteBatchInternal::Count(&batch);
          if (new_count > original_count) {
            RLOG(InfoLogLevel::FATAL_LEVEL, db_options_.info_log,
                "Recovering log #%" PRIu64
                " mode %d log filter %s returned "
                "more records (%d) than original (%d) which is not allowed. "
                "Aborting recovery.",
                log_number, db_options_.wal_recovery_mode,
                db_options_.wal_filter->Name(), new_count, original_count);
            status = STATUS(NotSupported,
                            "More than original # of records "
                            "returned by Wal Filter ",
                            db_options_.wal_filter->Name());
            return status;
          }
          // Set the same sequence number in the new_batch
          // as the original batch.
          WriteBatchInternal::SetSequence(&new_batch,
                                          WriteBatchInternal::Sequence(&batch));
          batch = new_batch;
        }
      }
#endif  // ROCKSDB_LITE

      // If column family was not found, it might mean that the WAL write
      // batch references to the column family that was dropped after the
      // insert. We don't want to fail the whole write batch in that case --
      // we just ignore the update.
      // That's why we set ignore missing column families to true
      status =
          WriteBatchInternal::InsertInto(&batch, column_family_memtables_.get(),
                                         &flush_scheduler_, true, log_number);

      MaybeIgnoreError(&status);
      if (!status.ok()) {
        // We are treating this as a failure while reading since we read valid
        // blocks that do not form coherent data
        reporter.Corruption(record.size(), status);
        continue;
      }

      const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
                                      WriteBatchInternal::Count(&batch) - 1;
      if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) {
        *max_sequence = last_seq;
      }

      if (!read_only) {
        // we can do this because this is called before client has access to the
        // DB and there is only a single thread operating on DB
        ColumnFamilyData* cfd;

        // Flush any memtable the scheduler marked as full so recovery keeps
        // memory usage bounded.
        while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
          cfd->Unref();
          // If this asserts, it means that InsertInto failed in
          // filtering updates to already-flushed column families
          assert(cfd->GetLogNumber() <= log_number);
          auto iter = version_edits.find(cfd->GetID());
          assert(iter != version_edits.end());
          VersionEdit* edit = &iter->second;
          status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
          if (!status.ok()) {
            // Reflect errors immediately so that conditions like full
            // file-systems cause the DB::Open() to fail.
            return status;
          }

          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
                                 *max_sequence);
        }
      }
    }

    if (!status.ok()) {
      // Decide, per the configured recovery mode, whether a read error is
      // fatal, ignorable, or terminates replay at this point in time.
      if (db_options_.wal_recovery_mode ==
          WALRecoveryMode::kSkipAnyCorruptedRecords) {
        // We should ignore all errors unconditionally
        status = Status::OK();
      } else if (db_options_.wal_recovery_mode ==
                 WALRecoveryMode::kPointInTimeRecovery) {
        // We should ignore the error but not continue replaying
        status = Status::OK();
        continue_replay_log = false;

        RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
            "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64,
            log_number, *max_sequence);
      } else {
        assert(db_options_.wal_recovery_mode ==
                  WALRecoveryMode::kTolerateCorruptedTailRecords
               || db_options_.wal_recovery_mode ==
                  WALRecoveryMode::kAbsoluteConsistency);
        return status;
      }
    }

    flush_scheduler_.Clear();
    if ((*max_sequence != kMaxSequenceNumber) && (versions_->LastSequence() < *max_sequence)) {
      versions_->SetLastSequence(*max_sequence);
    }
  }

  if (!read_only) {
    // no need to refcount since client still doesn't have access
    // to the DB and can not drop column families while we iterate
    auto max_log_number = log_numbers.back();
    for (auto cfd : *versions_->GetColumnFamilySet()) {
      auto iter = version_edits.find(cfd->GetID());
      assert(iter != version_edits.end());
      VersionEdit* edit = &iter->second;

      if (cfd->GetLogNumber() > max_log_number) {
        // Column family cfd has already flushed the data
        // from all logs. Memtable has to be empty because
        // we filter the updates based on log_number
        // (in WriteBatch::InsertInto)
        assert(cfd->mem()->GetFirstSequenceNumber() == 0);
        assert(edit->NumEntries() == 0);
        continue;
      }

      // flush the final memtable (if non-empty)
      if (cfd->mem()->GetFirstSequenceNumber() != 0) {
        status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
        if (!status.ok()) {
          // Recovery failed
          break;
        }

        cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
                               *max_sequence);
      }

      // write MANIFEST with update
      // writing log_number in the manifest means that any log file
      // with number strongly less than (log_number + 1) is already
      // recovered and should be ignored on next reincarnation.
      // Since we already recovered max_log_number, we want all logs
      // with numbers `<= max_log_number` (includes this one) to be ignored
      edit->SetLogNumber(max_log_number + 1);
      // we must mark the next log number as used, even though it's
      // not actually used. that is because VersionSet assumes
      // VersionSet::next_file_number_ always to be strictly greater than any
      // log number
      versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1);
      status = versions_->LogAndApply(
          cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
      if (!status.ok()) {
        // Recovery failed
        break;
      }
    }
  }

  event_logger_.Log() << "job" << job_id << "event"
                      << "recovery_finished";

  return status;
}
1750 | | |
// Builds a level-0 SST file from the contents of 'mem' (a memtable populated
// during WAL replay) and registers the resulting file in '*edit'. Used only
// during recovery, before clients can access the DB.
//
// REQUIRES: mutex_ is held on entry and exit; it is released while the table
// is actually written.
Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                           MemTable* mem, VersionEdit* edit) {
  mutex_.AssertHeld();
  const uint64_t start_micros = env_->NowMicros();
  FileMetaData meta;
  Status s;
  {
    // Reserve a file number; the holder keeps it from being reused or garbage
    // collected while the table is being written.
    auto file_number_holder = pending_outputs_->NewFileNumber();
    meta.fd = FileDescriptor(file_number_holder.Last(), 0, 0, 0);
    // Propagate the memtable's user frontiers (if any) into the file metadata.
    const auto* frontier = mem->Frontiers();
    if (frontier) {
      meta.smallest.user_frontier = frontier->Smallest().Clone();
      meta.largest.user_frontier = frontier->Largest().Clone();
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    Arena arena;
    TableProperties table_properties;
    {
      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
      RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
          "[%s] [WriteLevel0TableForRecovery]"
          " Level-0 table #%" PRIu64 ": started",
          cfd->GetName().c_str(), meta.fd.GetNumber());

      bool paranoid_file_checks =
          cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
      {
        // BuildTable performs file I/O; drop the DB mutex for its duration.
        mutex_.Unlock();
        TableFileCreationInfo info;

        SequenceNumber earliest_write_conflict_snapshot;
        std::vector<SequenceNumber> snapshot_seqs =
            snapshots_.GetAll(&earliest_write_conflict_snapshot);

        s = BuildTable(dbname_,
                       env_,
                       *cfd->ioptions(),
                       env_options_,
                       cfd->table_cache(),
                       iter.get(),
                       &meta,
                       cfd->internal_comparator(),
                       cfd->int_tbl_prop_collector_factories(),
                       cfd->GetID(),
                       snapshot_seqs,
                       earliest_write_conflict_snapshot,
                       GetCompressionFlush(*cfd->ioptions()),
                       cfd->ioptions()->compression_opts,
                       paranoid_file_checks,
                       cfd->internal_stats(),
                       db_options_.boundary_extractor.get(),
                       Env::IO_HIGH,
                       &info.table_properties);
        LogFlush(db_options_.info_log);
        RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
            "[%s] [WriteLevel0TableForRecovery]"
            " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
            cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetTotalFileSize(),
            s.ToString().c_str());

        // output to event logger
        if (s.ok()) {
          info.db_name = dbname_;
          info.cf_name = cfd->GetName();
          info.file_path = TableFileName(db_options_.db_paths,
                                         meta.fd.GetNumber(),
                                         meta.fd.GetPathId());
          info.file_size = meta.fd.GetTotalFileSize();
          info.job_id = job_id;
          EventHelpers::LogAndNotifyTableFileCreation(
              &event_logger_, db_options_.listeners, meta.fd, info);
        }
        mutex_.Lock();
      }
    }
  }

  // Note that if file_size is zero, the file has been deleted and
  // should not be added to the manifest.
  int level = 0;
  if (s.ok() && meta.fd.GetTotalFileSize() > 0) {
    edit->AddCleanedFile(level, meta);
  }

  // Record flush statistics for this column family.
  InternalStats::CompactionStats stats(1);
  stats.micros = env_->NowMicros() - start_micros;
  stats.bytes_written = meta.fd.GetTotalFileSize();
  stats.num_output_files = 1;
  cfd->internal_stats()->AddCompactionStats(level, stats);
  cfd->internal_stats()->AddCFStats(
      InternalStats::BYTES_FLUSHED, meta.fd.GetTotalFileSize());
  RecordTick(stats_, FLUSH_WRITE_BYTES, meta.fd.GetTotalFileSize());
  return s;
}
1846 | | |
// Flushes the immutable memtables of 'cfd' to a new level-0 output file.
// On success: installs the new super-version, schedules follow-up work,
// notifies listeners and the SstFileManager, and returns the holder protecting
// the new file number. On non-shutdown failure with paranoid_checks enabled,
// latches the error into bg_error_ (marking the DB effectively read-only).
//
// REQUIRES: mutex_ is held; flush_job.Run() may release and re-acquire it.
Result<FileNumbersHolder> DBImpl::FlushMemTableToOutputFile(
    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
    bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) {
  mutex_.AssertHeld();
  DCHECK_NE(cfd->imm()->NumNotFlushed(), 0);
  DCHECK(cfd->imm()->IsFlushPending());

  SequenceNumber earliest_write_conflict_snapshot;
  std::vector<SequenceNumber> snapshot_seqs =
      snapshots_.GetAll(&earliest_write_conflict_snapshot);

  // Optional user-supplied filter applied to memtable entries during flush.
  MemTableFilter mem_table_flush_filter;
  if (db_options_.mem_table_flush_filter_factory) {
    mem_table_flush_filter = (*db_options_.mem_table_flush_filter_factory)();
  }

  FlushJob flush_job(
      dbname_, cfd, db_options_, mutable_cf_options, env_options_,
      versions_.get(), &mutex_, &shutting_down_, &disable_flush_on_shutdown_, snapshot_seqs,
      earliest_write_conflict_snapshot, mem_table_flush_filter, pending_outputs_.get(),
      job_context, log_buffer, directories_.GetDbDir(), directories_.GetDataDir(0U),
      GetCompressionFlush(*cfd->ioptions()), stats_, &event_logger_);

  FileMetaData file_meta;

  // Within flush_job.Run, rocksdb may call event listener to notify
  // file creation and deletion.
  //
  // Note that flush_job.Run will unlock and lock the db_mutex,
  // and EventListener callback will be called when the db_mutex
  // is unlocked by the current thread.
  auto file_number_holder = flush_job.Run(&file_meta);

  if (file_number_holder.ok()) {
    InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context,
                                              mutable_cf_options);
    if (made_progress) {
      *made_progress = 1;
    }
    VersionStorageInfo::LevelSummaryStorage tmp;
    // Rate-limited to at most once per second to keep logs readable.
    YB_LOG_EVERY_N_SECS(INFO, 1)
        << "[" << cfd->GetName() << "] Level summary: "
        << cfd->current()->storage_info()->LevelSummary(&tmp);
  }

  if (!file_number_holder.ok() && !file_number_holder.status().IsShutdownInProgress()
      && db_options_.paranoid_checks && bg_error_.ok()) {
    // if a bad error happened (not ShutdownInProgress) and paranoid_checks is
    // true, mark DB read-only
    bg_error_ = file_number_holder.status();
  }
  RETURN_NOT_OK(file_number_holder);
  MAYBE_FAULT(FLAGS_fault_crash_after_rocksdb_flush);
#ifndef ROCKSDB_LITE
  // may temporarily unlock and lock the mutex.
  NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
                         job_context->job_id, flush_job.GetTableProperties());
#endif  // ROCKSDB_LITE
  auto sfm =
      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
  if (sfm) {
    // Notify sst_file_manager that a new file was added
    std::string file_path = MakeTableFileName(db_options_.db_paths[0].path,
                                              file_meta.fd.GetNumber());
    RETURN_NOT_OK(sfm->OnAddFile(file_path));
    if (cfd->ioptions()->table_factory->IsSplitSstForWriteSupported()) {
      // Split-SST table factories also write a separate data file; account
      // for it as well.
      RETURN_NOT_OK(sfm->OnAddFile(TableBaseToDataFileName(file_path)));
    }
    if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) {
      bg_error_ = STATUS(IOError, "Max allowed space was reached");
      TEST_SYNC_POINT(
          "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached");
    }
  }
  return file_number_holder;
}
1923 | | |
1924 | 470k | uint64_t DBImpl::GetCurrentVersionSstFilesSize() { |
1925 | 470k | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1926 | 470k | GetLiveFilesMetaData(&file_metadata); |
1927 | 470k | uint64_t total_sst_file_size = 0; |
1928 | 470k | for (const auto& meta : file_metadata) { |
1929 | 64.3k | total_sst_file_size += meta.total_size; |
1930 | 64.3k | } |
1931 | 470k | return total_sst_file_size; |
1932 | 470k | } |
1933 | | |
1934 | 470k | uint64_t DBImpl::GetCurrentVersionSstFilesUncompressedSize() { |
1935 | 470k | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1936 | 470k | GetLiveFilesMetaData(&file_metadata); |
1937 | 470k | uint64_t total_uncompressed_file_size = 0; |
1938 | 470k | for (const auto &meta : file_metadata) { |
1939 | 64.2k | total_uncompressed_file_size += meta.uncompressed_size; |
1940 | 64.2k | } |
1941 | 470k | return total_uncompressed_file_size; |
1942 | 470k | } |
1943 | | |
1944 | 1.64M | std::pair<uint64_t, uint64_t> DBImpl::GetCurrentVersionSstFilesAllSizes() { |
1945 | 1.64M | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1946 | 1.64M | GetLiveFilesMetaData(&file_metadata); |
1947 | 1.64M | uint64_t total_sst_file_size = 0; |
1948 | 1.64M | uint64_t total_uncompressed_file_size = 0; |
1949 | 1.64M | for (const auto& meta : file_metadata) { |
1950 | 223k | total_sst_file_size += meta.total_size; |
1951 | 223k | total_uncompressed_file_size += meta.uncompressed_size; |
1952 | 223k | } |
1953 | 1.64M | return std::pair<uint64_t, uint64_t>(total_sst_file_size, total_uncompressed_file_size); |
1954 | 1.64M | } |
1955 | | |
1956 | 41.0M | uint64_t DBImpl::GetCurrentVersionNumSSTFiles() { |
1957 | 41.0M | InstrumentedMutexLock lock(&mutex_); |
1958 | 41.0M | return default_cf_handle_->cfd()->current()->storage_info()->NumFiles(); |
1959 | 41.0M | } |
1960 | | |
1961 | 517k | void DBImpl::SetSSTFileTickers() { |
1962 | 517k | if (stats_) { |
1963 | 465k | auto sst_files_size = GetCurrentVersionSstFilesSize(); |
1964 | 465k | SetTickerCount(stats_, CURRENT_VERSION_SST_FILES_SIZE, sst_files_size); |
1965 | 465k | auto uncompressed_sst_files_size = GetCurrentVersionSstFilesUncompressedSize(); |
1966 | 465k | SetTickerCount( |
1967 | 465k | stats_, CURRENT_VERSION_SST_FILES_UNCOMPRESSED_SIZE, uncompressed_sst_files_size); |
1968 | 465k | auto num_sst_files = GetCurrentVersionNumSSTFiles(); |
1969 | 465k | SetTickerCount(stats_, CURRENT_VERSION_NUM_SST_FILES, num_sst_files); |
1970 | 465k | } |
1971 | 517k | } |
1972 | | |
1973 | 20 | uint64_t DBImpl::GetCurrentVersionDataSstFilesSize() { |
1974 | 20 | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1975 | 20 | GetLiveFilesMetaData(&file_metadata); |
1976 | 20 | uint64_t data_sst_file_size = 0; |
1977 | 75 | for (const auto& meta : file_metadata) { |
1978 | | // Each SST has base/metadata SST file (<number>.sst) and at least one data SST file |
1979 | | // (<number>.sst.sblock.0). |
1980 | | // We subtract SST metadata file size from total SST size to get the SST data file(s) size. |
1981 | 75 | data_sst_file_size += meta.total_size - meta.base_size; |
1982 | 75 | } |
1983 | 20 | return data_sst_file_size; |
1984 | 20 | } |
1985 | | |
// Notifies registered EventListeners that a flush of 'cfd' completed and
// refreshes the SST-file statistics tickers. No-op in ROCKSDB_LITE builds and
// when the DB is shutting down.
//
// REQUIRES: mutex_ is held on entry; it is temporarily released while the
// listener callbacks run and re-acquired before returning.
void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd,
                                    FileMetaData* file_meta,
                                    const MutableCFOptions& mutable_cf_options,
                                    int job_id, TableProperties prop) {
#ifndef ROCKSDB_LITE
  mutex_.AssertHeld();
  if (IsShuttingDown()) {
    return;
  }
  if (db_options_.listeners.size() > 0) {
    // Snapshot the write-stall state while still holding the mutex.
    int num_0level_files = cfd->current()->storage_info()->NumLevelFiles(0);
    bool triggered_writes_slowdown =
        num_0level_files >= mutable_cf_options.level0_slowdown_writes_trigger;
    bool triggered_writes_stop =
        num_0level_files >= mutable_cf_options.level0_stop_writes_trigger;
    // release lock while notifying events
    mutex_.Unlock();
    {
      FlushJobInfo info;
      info.cf_name = cfd->GetName();
      // TODO(yhchiang): make db_paths dynamic in case flush does not
      // go to L0 in the future.
      info.file_path = MakeTableFileName(db_options_.db_paths[0].path,
                                         file_meta->fd.GetNumber());
      info.thread_id = env_->GetThreadID();
      info.job_id = job_id;
      info.triggered_writes_slowdown = triggered_writes_slowdown;
      info.triggered_writes_stop = triggered_writes_stop;
      info.smallest_seqno = file_meta->smallest.seqno;
      info.largest_seqno = file_meta->largest.seqno;
      info.table_properties = prop;
      for (auto listener : db_options_.listeners) {
        listener->OnFlushCompleted(this, info);
      }
    }
  } else {
    mutex_.Unlock();
  }
  // Ticker updates also happen without holding the mutex.
  SetSSTFileTickers();
  mutex_.Lock();
  // no need to signal bg_cv_ as it will be signaled at the end of the
  // flush process.
#endif  // ROCKSDB_LITE
}
2030 | | |
// Manually compacts the key range [begin, end] (nullptr means unbounded) of
// the given column family: flushes the memtable, then runs manual compactions
// level by level, and optionally (options.change_level) moves the result to
// options.target_level while background work is paused.
Status DBImpl::CompactRange(const CompactRangeOptions& options,
                            ColumnFamilyHandle* column_family,
                            const Slice* begin, const Slice* end) {
  if (options.target_path_id >= db_options_.db_paths.size()) {
    return STATUS(InvalidArgument, "Invalid target path ID");
  }

  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();
  bool exclusive = options.exclusive_manual_compaction;

  // Flush first so everything in the memtable is in SST files before
  // compaction starts.
  Status s = FlushMemTable(cfd, FlushOptions());
  if (!s.ok()) {
    LogFlush(db_options_.info_log);
    return s;
  }

  // Find the deepest non-empty level whose files overlap the requested range.
  int max_level_with_files = 0;
  {
    InstrumentedMutexLock l(&mutex_);
    Version* base = cfd->current();
    for (int level = 1; level < base->storage_info()->num_non_empty_levels();
         level++) {
      if (base->storage_info()->OverlapInLevel(level, begin, end)) {
        max_level_with_files = level;
      }
    }
  }

  int final_output_level = 0;
  if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
      cfd->NumberLevels() > 1) {
    // Always compact all files together.
    s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
                            cfd->NumberLevels() - 1, options.target_path_id,
                            begin, end, exclusive);
    final_output_level = cfd->NumberLevels() - 1;
  } else {
    for (int level = 0; level <= max_level_with_files; level++) {
      int output_level;
      // in case the compaction is universal or if we're compacting the
      // bottom-most level, the output level will be the same as input one.
      // level 0 can never be the bottommost level (i.e. if all files are in
      // level 0, we will compact to level 1)
      if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
        output_level = level;
      } else if (level == max_level_with_files && level > 0) {
        if (options.bottommost_level_compaction ==
            BottommostLevelCompaction::kSkip) {
          // Skip bottommost level compaction
          continue;
        } else if (options.bottommost_level_compaction ==
                       BottommostLevelCompaction::kIfHaveCompactionFilter &&
                   cfd->ioptions()->compaction_filter == nullptr &&
                   cfd->ioptions()->compaction_filter_factory == nullptr) {
          // Skip bottommost level compaction since we don't have a compaction
          // filter
          continue;
        }
        output_level = level;
      } else {
        output_level = level + 1;
        if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
            cfd->ioptions()->level_compaction_dynamic_level_bytes &&
            level == 0) {
          output_level = ColumnFamilyData::kCompactToBaseLevel;
        }
      }
      s = RunManualCompaction(cfd, level, output_level, options.target_path_id,
                              begin, end, exclusive);
      if (!s.ok()) {
        break;
      }
      // Track the deepest level that received compaction output, for the
      // optional ReFitLevel step below.
      if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
        final_output_level = cfd->NumberLevels() - 1;
      } else if (output_level > final_output_level) {
        final_output_level = output_level;
      }
      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
      TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
    }
  }
  if (!s.ok()) {
    LogFlush(db_options_.info_log);
    return s;
  }

  if (options.change_level) {
    // Move the compaction output to options.target_level with background work
    // paused.
    RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[RefitLevel] waiting for background threads to stop");
    s = PauseBackgroundWork();
    if (s.ok()) {
      s = ReFitLevel(cfd, final_output_level, options.target_level);
    }
    CHECK_OK(ContinueBackgroundWork());
  }
  LogFlush(db_options_.info_log);

  {
    InstrumentedMutexLock lock(&mutex_);
    // an automatic compaction that has been scheduled might have been
    // preempted by the manual compactions. Need to schedule it back.
    if (exclusive) {
      // all compaction scheduling was stopped so we reschedule for each cf
      ColumnFamilySet* columnFamilySet = versions_->GetColumnFamilySet();
      for (auto it = columnFamilySet->begin(); it != columnFamilySet->end(); ++it) {
        SchedulePendingCompaction(*it);
      }
    } else {
      // only compactions in this column family were stopped
      SchedulePendingCompaction(cfd);
    }
    MaybeScheduleFlushOrCompaction();
  }

  return s;
}
2149 | | |
// Public entry point for manually compacting an explicit set of SST files
// into `output_level`.
//
// @param compact_options   options controlling the compaction itself.
// @param column_family     target column family; must be non-null.
// @param input_file_names  names of the input SST files to compact.
// @param output_level      level the compaction output is written to.
// @param output_path_id    db_paths index for the output; negative means
//                          "choose automatically" (resolved in
//                          CompactFilesImpl).
// @return the status of the compaction, or NotSupported in ROCKSDB_LITE.
Status DBImpl::CompactFiles(
    const CompactionOptions& compact_options,
    ColumnFamilyHandle* column_family,
    const std::vector<std::string>& input_file_names,
    const int output_level, const int output_path_id) {
#ifdef ROCKSDB_LITE
  // not supported in lite version
  return STATUS(NotSupported, "Not supported in ROCKSDB LITE");
#else
  if (column_family == nullptr) {
    return STATUS(InvalidArgument, "ColumnFamilyHandle must be non-null.");
  }

  auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
  assert(cfd);

  Status s;
  JobContext job_context(0, true);
  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                       db_options_.info_log.get());

  // Perform CompactFiles.
  // Reference the super version OUTSIDE the mutex so `sv->current` stays
  // alive for the duration of the compaction.
  SuperVersion* sv = GetAndRefSuperVersion(cfd);
  {
    InstrumentedMutexLock l(&mutex_);

    s = CompactFilesImpl(compact_options, cfd, sv->current,
                         input_file_names, output_level,
                         output_path_id, &job_context, &log_buffer);
  }
  ReturnAndCleanupSuperVersion(cfd, sv);

  // Find and delete obsolete files
  {
    InstrumentedMutexLock l(&mutex_);
    // If !s.ok(), this means that Compaction failed. In that case, we want
    // to delete all obsolete files we might have created and we force
    // FindObsoleteFiles(). This is because job_context does not
    // catch all created files if compaction failed.
    FindObsoleteFiles(&job_context, !s.ok());
  }  // release the mutex

  // delete unnecessary files if any, this is done outside the mutex
  if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
    // Have to flush the info logs before bg_compaction_scheduled_--
    // because if bg_flush_scheduled_ becomes 0 and the lock is
    // released, the deconstructor of DB can kick in and destroy all the
    // states of DB so info_log might not be available after that point.
    // It also applies to access other states that DB owns.
    log_buffer.FlushBufferToLog();
    if (job_context.HaveSomethingToDelete()) {
      // no mutex is locked here. No need to Unlock() and Lock() here.
      PurgeObsoleteFiles(job_context);
    }
    job_context.Clean();
  }

  return s;
#endif  // ROCKSDB_LITE
}
2210 | | |
2211 | | #ifndef ROCKSDB_LITE |
2212 | | Status DBImpl::CompactFilesImpl( |
2213 | | const CompactionOptions& compact_options, ColumnFamilyData* cfd, |
2214 | | Version* version, const std::vector<std::string>& input_file_names, |
2215 | | const int output_level, int output_path_id, JobContext* job_context, |
2216 | 45 | LogBuffer* log_buffer) { |
2217 | 45 | mutex_.AssertHeld(); |
2218 | | |
2219 | 45 | if (IsShuttingDown()) { |
2220 | 0 | return STATUS(ShutdownInProgress, ""); |
2221 | 0 | } |
2222 | | |
2223 | 45 | std::unordered_set<uint64_t> input_set; |
2224 | 125 | for (auto file_name : input_file_names) { |
2225 | 125 | input_set.insert(TableFileNameToNumber(file_name)); |
2226 | 125 | } |
2227 | | |
2228 | 45 | ColumnFamilyMetaData cf_meta; |
2229 | | // TODO(yhchiang): can directly use version here if none of the |
2230 | | // following functions call is pluggable to external developers. |
2231 | 45 | version->GetColumnFamilyMetaData(&cf_meta); |
2232 | | |
2233 | 45 | if (output_path_id < 0) { |
2234 | 45 | if (db_options_.db_paths.size() == 1U) { |
2235 | 45 | output_path_id = 0; |
2236 | 45 | } else { |
2237 | 0 | return STATUS(NotSupported, |
2238 | 0 | "Automatic output path selection is not " |
2239 | 0 | "yet supported in CompactFiles()"); |
2240 | 0 | } |
2241 | 45 | } |
2242 | | |
2243 | 45 | Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( |
2244 | 45 | &input_set, cf_meta, output_level); |
2245 | 45 | if (!s.ok()) { |
2246 | 6 | return s; |
2247 | 6 | } |
2248 | | |
2249 | 39 | std::vector<CompactionInputFiles> input_files; |
2250 | 39 | s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( |
2251 | 39 | &input_files, &input_set, version->storage_info(), compact_options); |
2252 | 39 | if (!s.ok()) { |
2253 | 0 | return s; |
2254 | 0 | } |
2255 | | |
2256 | 39 | for (auto inputs : input_files) { |
2257 | 39 | if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { |
2258 | 0 | return STATUS(Aborted, |
2259 | 0 | "Some of the necessary compaction input " |
2260 | 0 | "files are already being compacted"); |
2261 | 0 | } |
2262 | 39 | } |
2263 | | |
2264 | | // At this point, CompactFiles will be run. |
2265 | 39 | bg_compaction_scheduled_++; |
2266 | | |
2267 | 39 | assert(cfd->compaction_picker()); |
2268 | 0 | unique_ptr<Compaction> c = cfd->compaction_picker()->FormCompaction( |
2269 | 39 | compact_options, input_files, output_level, version->storage_info(), |
2270 | 39 | *cfd->GetLatestMutableCFOptions(), output_path_id); |
2271 | 39 | if (!c) { |
2272 | 0 | return STATUS(Aborted, "Another Level 0 compaction is running or nothing to compact"); |
2273 | 0 | } |
2274 | 39 | c->SetInputVersion(version); |
2275 | | // deletion compaction currently not allowed in CompactFiles. |
2276 | 39 | assert(!c->deletion_compaction()); |
2277 | | |
2278 | 0 | SequenceNumber earliest_write_conflict_snapshot; |
2279 | 39 | std::vector<SequenceNumber> snapshot_seqs = |
2280 | 39 | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
2281 | | |
2282 | 39 | assert(is_snapshot_supported_ || snapshots_.empty()); |
2283 | 0 | CompactionJob compaction_job( |
2284 | 39 | job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), |
2285 | 39 | &shutting_down_, log_buffer, directories_.GetDbDir(), |
2286 | 39 | directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, |
2287 | 39 | snapshot_seqs, earliest_write_conflict_snapshot, pending_outputs_.get(), table_cache_, |
2288 | 39 | &event_logger_, c->mutable_cf_options()->paranoid_file_checks, |
2289 | 39 | c->mutable_cf_options()->compaction_measure_io_stats, dbname_, |
2290 | 39 | nullptr); // Here we pass a nullptr for CompactionJobStats because |
2291 | | // CompactFiles does not trigger OnCompactionCompleted(), |
2292 | | // which is the only place where CompactionJobStats is |
2293 | | // returned. The idea of not triggering OnCompationCompleted() |
2294 | | // is that CompactFiles runs in the caller thread, so the user |
2295 | | // should always know when it completes. As a result, it makes |
2296 | | // less sense to notify the users something they should already |
2297 | | // know. |
2298 | | // |
2299 | | // In the future, if we would like to add CompactionJobStats |
2300 | | // support for CompactFiles, we should have CompactFiles API |
2301 | | // pass a pointer of CompactionJobStats as the out-value |
2302 | | // instead of using EventListener. |
2303 | | |
2304 | | // Creating a compaction influences the compaction score because the score |
2305 | | // takes running compactions into account (by skipping files that are already |
2306 | | // being compacted). Since we just changed compaction score, we recalculate it |
2307 | | // here. |
2308 | 39 | { |
2309 | 39 | CompactionOptionsFIFO dummy_compaction_options_fifo; |
2310 | 39 | version->storage_info()->ComputeCompactionScore( |
2311 | 39 | *c->mutable_cf_options(), dummy_compaction_options_fifo); |
2312 | 39 | } |
2313 | | |
2314 | 39 | compaction_job.Prepare(); |
2315 | | |
2316 | 39 | Status status; |
2317 | 39 | { |
2318 | 39 | mutex_.Unlock(); |
2319 | 39 | for (auto listener : db_options_.listeners) { |
2320 | 3 | listener->OnCompactionStarted(); |
2321 | 3 | } |
2322 | 39 | auto file_numbers_holder = compaction_job.Run(); |
2323 | 39 | TEST_SYNC_POINT("CompactFilesImpl:2"); |
2324 | 39 | TEST_SYNC_POINT("CompactFilesImpl:3"); |
2325 | 39 | mutex_.Lock(); |
2326 | | |
2327 | 39 | status = compaction_job.Install(*c->mutable_cf_options()); |
2328 | 39 | if (status.ok()) { |
2329 | 39 | InstallSuperVersionAndScheduleWorkWrapper( |
2330 | 39 | c->column_family_data(), job_context, *c->mutable_cf_options()); |
2331 | 39 | } |
2332 | 39 | c->ReleaseCompactionFiles(s); |
2333 | 39 | } |
2334 | | |
2335 | 39 | if (status.ok()) { |
2336 | | // Done |
2337 | 39 | } else if (0 status.IsShutdownInProgress()0 ) { |
2338 | | // Ignore compaction errors found during shutting down |
2339 | 0 | } else { |
2340 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
2341 | 0 | "[%s] [JOB %d] Compaction error: %s", |
2342 | 0 | c->column_family_data()->GetName().c_str(), job_context->job_id, |
2343 | 0 | status.ToString().c_str()); |
2344 | 0 | if (db_options_.paranoid_checks && bg_error_.ok()) { |
2345 | 0 | bg_error_ = status; |
2346 | 0 | } |
2347 | 0 | } |
2348 | | |
2349 | 39 | c.reset(); |
2350 | | |
2351 | 39 | bg_compaction_scheduled_--; |
2352 | 39 | if (bg_compaction_scheduled_ == 0) { |
2353 | 39 | bg_cv_.SignalAll(); |
2354 | 39 | } |
2355 | | |
2356 | 39 | return status; |
2357 | 39 | } |
2358 | | #endif // ROCKSDB_LITE |
2359 | | |
2360 | 43 | Status DBImpl::PauseBackgroundWork() { |
2361 | 43 | InstrumentedMutexLock guard_lock(&mutex_); |
2362 | 43 | bg_compaction_paused_++; |
2363 | 43 | while (CheckBackgroundWorkAndLog("Pause")) { |
2364 | 0 | bg_cv_.Wait(); |
2365 | 0 | } |
2366 | 43 | bg_work_paused_++; |
2367 | 43 | return Status::OK(); |
2368 | 43 | } |
2369 | | |
2370 | 43 | Status DBImpl::ContinueBackgroundWork() { |
2371 | 43 | InstrumentedMutexLock guard_lock(&mutex_); |
2372 | 43 | if (bg_work_paused_ == 0) { |
2373 | 0 | return STATUS(InvalidArgument, ""); |
2374 | 0 | } |
2375 | 43 | assert(bg_work_paused_ > 0); |
2376 | 0 | assert(bg_compaction_paused_ > 0); |
2377 | 0 | bg_compaction_paused_--; |
2378 | 43 | bg_work_paused_--; |
2379 | | // It's sufficient to check just bg_work_paused_ here since |
2380 | | // bg_work_paused_ is always no greater than bg_compaction_paused_ |
2381 | 43 | if (bg_work_paused_ == 0) { |
2382 | 43 | MaybeScheduleFlushOrCompaction(); |
2383 | 43 | } |
2384 | 43 | return Status::OK(); |
2385 | 43 | } |
2386 | | |
// Publishes a CompactionJobInfo describing a finished compaction to every
// registered EventListener (no-op in ROCKSDB_LITE).
//
// REQUIRES: mutex_ held on entry; it is released while listeners run and
// re-acquired before returning.
//
// @param cfd                  column family that was compacted.
// @param c                    the finished compaction.
// @param st                   final status of the compaction job.
// @param compaction_job_stats stats copied into the notification.
// @param job_id               id reported to listeners.
void DBImpl::NotifyOnCompactionCompleted(
    ColumnFamilyData* cfd, Compaction *c, const Status &st,
    const CompactionJobStats& compaction_job_stats,
    const int job_id) {
#ifndef ROCKSDB_LITE
  mutex_.AssertHeld();
  if (IsShuttingDown()) {
    return;
  }
  // Pin the current version so table properties can still be read after the
  // mutex is dropped.
  VersionPtr current = cfd->current();
  // release lock while notifying events
  mutex_.Unlock();
  if (db_options_.listeners.size() > 0) {
    CompactionJobInfo info;
    info.cf_name = cfd->GetName();
    info.status = st;
    info.thread_id = env_->GetThreadID();
    info.job_id = job_id;
    info.base_input_level = c->start_level();
    info.output_level = c->output_level();
    info.stats = compaction_job_stats;
    info.table_properties = c->GetOutputTableProperties();
    info.compaction_reason = c->compaction_reason();
    info.is_full_compaction = c->is_full_compaction();
    // Record every input file; fetch table properties for inputs not already
    // covered by the compaction's output-property map.
    for (size_t i = 0; i < c->num_input_levels(); ++i) {
      for (const auto fmd : *c->inputs(i)) {
        auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(),
                                fmd->fd.GetPathId());
        info.input_files.push_back(fn);
        if (info.table_properties.count(fn) == 0) {
          std::shared_ptr<const TableProperties> tp;
          auto s = current->GetTableProperties(&tp, fmd, &fn);
          if (s.ok()) {
            // Lookup failures are deliberately ignored: the notification is
            // best-effort.
            info.table_properties[fn] = tp;
          }
        }
      }
    }
    // Output files come from the version edit produced by the compaction.
    for (const auto& newf : c->edit()->GetNewFiles()) {
      info.output_files.push_back(
          TableFileName(db_options_.db_paths,
                        newf.second.fd.GetNumber(),
                        newf.second.fd.GetPathId()));
    }
    for (auto listener : db_options_.listeners) {
      listener->OnCompactionCompleted(this, info);
    }
  }
  SetSSTFileTickers();
  mutex_.Lock();
  // no need to signal bg_cv_ as it will be signaled at the end of the
  // flush process.
#endif  // ROCKSDB_LITE
}
2441 | | |
2442 | 382k | void DBImpl::SetDisableFlushOnShutdown(bool disable_flush_on_shutdown) { |
2443 | | // disable_flush_on_shutdown_ can only transition from false to true. This location |
2444 | | // can be called multiple times with arg as false. It is only called once with arg |
2445 | | // as true. Subsequently, the destructor reads this flag. Setting this flag |
2446 | | // to true and the destructor are expected to run on the same thread and hence |
2447 | | // it is not required for disable_flush_on_shutdown_ to be atomic. |
2448 | 382k | if (disable_flush_on_shutdown) { |
2449 | 381k | disable_flush_on_shutdown_ = disable_flush_on_shutdown; |
2450 | 381k | } |
2451 | 382k | } |
2452 | | |
// Applies a map of mutable column-family option overrides at runtime and
// persists the resulting options file.
//
// @param column_family target column family handle.
// @param options_map   option-name -> value strings; must be non-empty.
// @param dump_options  when true, dumps the resulting options to the info log.
// @return status of applying the options; persistence failure is only fatal
//         when db_options_.fail_if_options_file_error is set.
Status DBImpl::SetOptions(
    ColumnFamilyHandle* column_family,
    const std::unordered_map<std::string, std::string>& options_map,
    bool dump_options) {
#ifdef ROCKSDB_LITE
  return STATUS(NotSupported, "Not supported in ROCKSDB LITE");
#else
  auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
  if (options_map.empty()) {
    RLOG(InfoLogLevel::WARN_LEVEL,
        db_options_.info_log, "SetOptions() on column family [%s], empty input",
        cfd->GetName().c_str());
    return STATUS(InvalidArgument, "empty input");
  }

  MutableCFOptions new_options;
  Status s;
  Status persist_options_status;
  {
    InstrumentedMutexLock l(&mutex_);
    s = cfd->SetOptions(options_map);
    if (s.ok()) {
      new_options = *cfd->GetLatestMutableCFOptions();
    }
    if (s.ok()) {
      // Persist RocksDB options under the single write thread: entering the
      // write thread unbatched serializes the options-file write against
      // concurrent writers.
      WriteThread::Writer w;
      write_thread_.EnterUnbatched(&w, &mutex_);

      persist_options_status = WriteOptionsFile();

      write_thread_.ExitUnbatched(&w);
    }
  }

  RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
      "SetOptions() on column family [%s], inputs: %s",
      cfd->GetName().c_str(), yb::AsString(options_map).c_str());
  if (s.ok()) {
    RLOG(InfoLogLevel::INFO_LEVEL,
        db_options_.info_log, "[%s] SetOptions succeeded",
        cfd->GetName().c_str());
    if (dump_options) {
      new_options.Dump(db_options_.info_log.get());
    }
    if (!persist_options_status.ok()) {
      // The in-memory options were applied; failing to persist them is only
      // an error when the user asked for strict options-file handling.
      if (db_options_.fail_if_options_file_error) {
        s = STATUS(IOError,
            "SetOptions succeeded, but unable to persist options",
            persist_options_status.ToString());
      }
      RWARN(db_options_.info_log,
          "Unable to persist options in SetOptions() -- %s",
          persist_options_status.ToString().c_str());
    }
  } else {
    RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
        "[%s] SetOptions failed", cfd->GetName().c_str());
  }
  LogFlush(db_options_.info_log);
  return s;
#endif  // ROCKSDB_LITE
}
2516 | | |
2517 | | // return the same level if it cannot be moved |
2518 | | int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, |
2519 | 0 | const MutableCFOptions& mutable_cf_options, int level) { |
2520 | 0 | mutex_.AssertHeld(); |
2521 | 0 | const auto* vstorage = cfd->current()->storage_info(); |
2522 | 0 | int minimum_level = level; |
2523 | 0 | for (int i = level - 1; i > 0; --i) { |
2524 | | // stop if level i is not empty |
2525 | 0 | if (vstorage->NumLevelFiles(i) > 0) break; |
2526 | | // stop if level i is too small (cannot fit the level files) |
2527 | 0 | if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { |
2528 | 0 | break; |
2529 | 0 | } |
2530 | | |
2531 | 0 | minimum_level = i; |
2532 | 0 | } |
2533 | 0 | return minimum_level; |
2534 | 0 | } |
2535 | | |
2536 | | // REQUIREMENT: block all background work by calling PauseBackgroundWork() |
2537 | | // before calling this function |
// Moves every file at `level` to `target_level` via a trivial (metadata-only)
// move, installing a new super version afterwards.
//
// REQUIREMENT: block all background work by calling PauseBackgroundWork()
// before calling this function.
//
// @param cfd          column family to refit.
// @param level        source level whose files are moved.
// @param target_level destination level; negative means "find the minimum
//                     empty level that fits".
Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
  assert(level < cfd->NumberLevels());
  if (target_level >= cfd->NumberLevels()) {
    return STATUS(InvalidArgument, "Target level exceeds number of levels");
  }

  // Allocate the replacement super version up front, outside any error path.
  std::unique_ptr<SuperVersion> superversion_to_free;
  std::unique_ptr<SuperVersion> new_superversion(new SuperVersion());

  Status status;

  InstrumentedMutexLock guard_lock(&mutex_);

  // only allow one thread refitting
  if (refitting_level_) {
    RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[ReFitLevel] another thread is refitting");
    return STATUS(NotSupported, "another thread is refitting");
  }
  refitting_level_ = true;

  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
  // move to a smaller level
  int to_level = target_level;
  if (target_level < 0) {
    to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
  }

  auto* vstorage = cfd->current()->storage_info();
  if (to_level > level) {
    if (level == 0) {
      return STATUS(NotSupported,
          "Cannot change from level 0 to other levels.");
    }
    // Check levels are empty for a trivial move
    for (int l = level + 1; l <= to_level; l++) {
      if (vstorage->NumLevelFiles(l) > 0) {
        return STATUS(NotSupported,
            "Levels between source and target are not empty for a move.");
      }
    }
  }
  if (to_level != level) {
    RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
        "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
        cfd->current()->DebugString().data());

    // Build a single version edit that deletes every file at `level` and
    // re-adds it (unchanged) at `to_level`.
    VersionEdit edit;
    edit.SetColumnFamily(cfd->GetID());
    for (const auto& f : vstorage->LevelFiles(level)) {
      edit.DeleteFile(level, f->fd.GetNumber());
      edit.AddCleanedFile(to_level, *f);
    }
    RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
        "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
        edit.DebugString().data());

    status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
                                    directories_.GetDbDir());
    superversion_to_free = InstallSuperVersionAndScheduleWork(
        cfd, new_superversion.release(), mutable_cf_options);

    RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
        "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
        status.ToString().data());

    if (status.ok()) {
      RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
          "[%s] After refitting:\n%s", cfd->GetName().c_str(),
          cfd->current()->DebugString().data());
    }
  }

  // NOTE(review): the early returns above leave refitting_level_ == true only
  // on the paths before it is set; all paths after setting it fall through to
  // here, clearing the flag under the mutex.
  refitting_level_ = false;

  return status;
}
2615 | | |
2616 | 1.34k | int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { |
2617 | 1.34k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2618 | 1.34k | return cfh->cfd()->NumberLevels(); |
2619 | 1.34k | } |
2620 | | |
// Always returns 0; the column family argument is intentionally unused.
int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
  return 0;
}
2624 | | |
2625 | 0 | int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { |
2626 | 0 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2627 | 0 | InstrumentedMutexLock l(&mutex_); |
2628 | 0 | return cfh->cfd()->GetSuperVersion()-> |
2629 | 0 | mutable_cf_options.level0_stop_writes_trigger; |
2630 | 0 | } |
2631 | | |
2632 | | Status DBImpl::Flush(const FlushOptions& flush_options, |
2633 | 276k | ColumnFamilyHandle* column_family) { |
2634 | 276k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2635 | 276k | return FlushMemTable(cfh->cfd(), flush_options); |
2636 | 276k | } |
2637 | | |
2638 | 174 | Status DBImpl::WaitForFlush(ColumnFamilyHandle* column_family) { |
2639 | 174 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2640 | | // Wait until the flush completes. |
2641 | 174 | return WaitForFlushMemTable(cfh->cfd()); |
2642 | 174 | } |
2643 | | |
// Syncs all WAL files up to the current log number to durable storage,
// without flushing memtables. The actual fsync runs outside the DB mutex;
// in-flight logs are claimed via the per-log getting_synced flag.
Status DBImpl::SyncWAL() {
  autovector<log::Writer*, 1> logs_to_sync;
  bool need_log_dir_sync;
  uint64_t current_log_number;

  {
    InstrumentedMutexLock l(&mutex_);
    assert(!logs_.empty());

    // This SyncWAL() call only cares about logs up to this number.
    current_log_number = logfile_number_;

    // Wait until no other thread is syncing the front log in our range.
    while (logs_.front().number <= current_log_number &&
           logs_.front().getting_synced) {
      log_sync_cv_.Wait();
    }
    // First check that logs are safe to sync in background.
    for (auto it = logs_.begin();
         it != logs_.end() && it->number <= current_log_number; ++it) {
      if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
        return STATUS(NotSupported,
          "SyncWAL() is not supported for this implementation of WAL file",
          db_options_.allow_mmap_writes
            ? "try setting Options::allow_mmap_writes to false"
            : yb::Slice());
      }
    }
    // Second pass: claim every log in range so concurrent SyncWAL calls
    // cannot sync or delete them while we hold them without the mutex.
    for (auto it = logs_.begin();
         it != logs_.end() && it->number <= current_log_number; ++it) {
      auto& log = *it;
      assert(!log.getting_synced);
      log.getting_synced = true;
      logs_to_sync.push_back(log.writer);
    }

    need_log_dir_sync = !log_dir_synced_;
  }

  RecordTick(stats_, WAL_FILE_SYNCED);
  Status status;
  // Sync each claimed log without the mutex; stop at the first failure.
  for (log::Writer* log : logs_to_sync) {
    status = log->file()->SyncWithoutFlush(db_options_.use_fsync);
    if (!status.ok()) {
      break;
    }
  }
  if (status.ok() && need_log_dir_sync) {
    status = directories_.GetWalDir()->Fsync();
  }

  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");

  {
    // Release the getting_synced claims and record the result.
    InstrumentedMutexLock l(&mutex_);
    MarkLogsSynced(current_log_number, need_log_dir_sync, status);
  }

  return status;
}
2704 | | |
// Clears the getting_synced claim on every log up to `up_to` after a sync
// attempt, releasing fully-synced non-current logs for deletion.
//
// REQUIRES: mutex_ held.
//
// @param up_to      highest log number covered by the sync attempt.
// @param synced_dir whether the WAL directory itself was fsynced.
// @param status     outcome of the sync; on failure the logs are merely
//                   un-claimed, not released.
void DBImpl::MarkLogsSynced(
    uint64_t up_to, bool synced_dir, const Status& status) {
  mutex_.AssertHeld();
  if (synced_dir &&
      logfile_number_ == up_to &&
      status.ok()) {
    log_dir_synced_ = true;
  }
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
    auto& log = *it;
    assert(log.getting_synced);
    if (status.ok() && logs_.size() > 1) {
      // Synced and not the only (i.e. current) log: hand the writer over for
      // deferred deletion and drop it from the active list.
      logs_to_free_.push_back(log.ReleaseWriter());
      it = logs_.erase(it);
    } else {
      // Keep the log (sync failed, or it is the sole remaining log) but
      // release the claim so other threads may retry.
      log.getting_synced = false;
      ++it;
    }
  }
  assert(logs_.empty() || logs_[0].number > up_to ||
         (logs_.size() == 1 && !logs_[0].getting_synced));
  // Wake any SyncWAL() callers waiting on getting_synced flags.
  log_sync_cv_.SignalAll();
}
2728 | | |
// Thin accessor: returns the VersionSet's last assigned sequence number.
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
  return versions_->LastSequence();
}
2732 | | |
2733 | 1.17k | void DBImpl::SubmitCompactionOrFlushTask(std::unique_ptr<ThreadPoolTask> task) { |
2734 | 1.17k | mutex_.AssertHeld(); |
2735 | 1.17k | if (task->Type() == BgTaskType::kCompaction) { |
2736 | 1.14k | compaction_tasks_.insert(down_cast<CompactionTask*>(task.get())); |
2737 | 1.14k | } |
2738 | 1.17k | auto status = db_options_.priority_thread_pool_for_compactions_and_flushes->Submit( |
2739 | 1.17k | task->Priority(), &task); |
2740 | 1.17k | if (!status.ok()) { |
2741 | 0 | task->AbortedUnlocked(status); |
2742 | 0 | } |
2743 | 1.17k | } |
2744 | | |
2745 | | Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, |
2746 | | int output_level, uint32_t output_path_id, |
2747 | | const Slice* begin, const Slice* end, |
2748 | 6.07k | bool exclusive, bool disallow_trivial_move) { |
2749 | 6.07k | TEST_SYNC_POINT("DBImpl::RunManualCompaction"); |
2750 | | |
2751 | 6.07k | DCHECK(input_level == ColumnFamilyData::kCompactAllLevels || |
2752 | 6.07k | input_level >= 0); |
2753 | | |
2754 | 6.07k | InternalKey begin_storage, end_storage; |
2755 | 6.07k | CompactionArg* ca; |
2756 | | |
2757 | 6.07k | bool scheduled = false; |
2758 | 6.07k | bool manual_conflict = false; |
2759 | 6.07k | ManualCompaction manual_compaction; |
2760 | 6.07k | manual_compaction.cfd = cfd; |
2761 | 6.07k | manual_compaction.input_level = input_level; |
2762 | 6.07k | manual_compaction.output_level = output_level; |
2763 | 6.07k | manual_compaction.output_path_id = output_path_id; |
2764 | 6.07k | manual_compaction.done = false; |
2765 | 6.07k | manual_compaction.in_progress = false; |
2766 | 6.07k | manual_compaction.incomplete = false; |
2767 | 6.07k | manual_compaction.exclusive = exclusive; |
2768 | 6.07k | manual_compaction.disallow_trivial_move = disallow_trivial_move; |
2769 | | // For universal compaction, we enforce every manual compaction to compact |
2770 | | // all files. |
2771 | 6.07k | if (begin == nullptr || |
2772 | 6.07k | cfd->ioptions()->compaction_style == kCompactionStyleUniversal1.20k || |
2773 | 6.07k | cfd->ioptions()->compaction_style == kCompactionStyleFIFO1.14k ) { |
2774 | 4.93k | manual_compaction.begin = nullptr; |
2775 | 4.93k | } else { |
2776 | 1.13k | begin_storage = InternalKey::MaxPossibleForUserKey(*begin); |
2777 | 1.13k | manual_compaction.begin = &begin_storage; |
2778 | 1.13k | } |
2779 | 6.07k | if (end == nullptr || |
2780 | 6.07k | cfd->ioptions()->compaction_style == kCompactionStyleUniversal1.25k || |
2781 | 6.07k | cfd->ioptions()->compaction_style == kCompactionStyleFIFO1.19k ) { |
2782 | 4.89k | manual_compaction.end = nullptr; |
2783 | 4.89k | } else { |
2784 | 1.18k | end_storage = InternalKey::MinPossibleForUserKey(*end); |
2785 | 1.18k | manual_compaction.end = &end_storage; |
2786 | 1.18k | } |
2787 | | |
2788 | 6.07k | InstrumentedMutexLock l(&mutex_); |
2789 | | |
2790 | | // When a manual compaction arrives, if it is exclusive, run all scheduled |
2791 | | // and unscheduled compactions (from the queue) and then run the manual |
2792 | | // one. This is to ensure that any key range can be compacted without |
2793 | | // conflict. Otherwise, we let the manual compaction conflict until all |
2794 | | // automatic compactions from the same column family have been scheduled |
2795 | | // and run in the background. |
2796 | | // |
2797 | | // HasPendingManualCompaction() is true when at least one thread is inside |
2798 | | // RunManualCompaction(), i.e. during that time no other compaction will |
2799 | | // get scheduled (see MaybeScheduleFlushOrCompaction). |
2800 | | // |
2801 | | // Note that the following loop doesn't stop more that one thread calling |
2802 | | // RunManualCompaction() from getting to the second while loop below. |
2803 | | // However, only one of them will actually schedule compaction, while |
2804 | | // others will wait on a condition variable until it completes. |
2805 | | |
2806 | 6.07k | AddManualCompaction(&manual_compaction); |
2807 | 6.07k | TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); |
2808 | 6.07k | if (exclusive) { |
2809 | 6.29k | while (unscheduled_compactions_ + bg_compaction_scheduled_ + compaction_tasks_.size() > 0) { |
2810 | 300 | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::Conflict"); |
2811 | 300 | MaybeScheduleFlushOrCompaction(); |
2812 | 621 | while (bg_compaction_scheduled_ + compaction_tasks_.size() > 0) { |
2813 | 321 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2814 | 321 | "[%s] Manual compaction waiting for all other scheduled background " |
2815 | 321 | "compactions to finish", |
2816 | 321 | cfd->GetName().c_str()); |
2817 | 321 | bg_cv_.Wait(); |
2818 | 321 | if (IsShuttingDown()) { |
2819 | 0 | return STATUS(ShutdownInProgress, ""); |
2820 | 0 | } |
2821 | 321 | } |
2822 | 300 | } |
2823 | 5.99k | } |
2824 | | |
2825 | 6.07k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2826 | 6.07k | "[%s] Manual compaction starting", |
2827 | 6.07k | cfd->GetName().c_str()); |
2828 | | |
2829 | 6.07k | size_t compaction_task_serial_no = 0; |
2830 | | // We don't check bg_error_ here, because if we get the error in compaction, |
2831 | | // the compaction will set manual_compaction.status to bg_error_ and set manual_compaction.done to |
2832 | | // true. |
2833 | 17.6k | while (!manual_compaction.done) { |
2834 | 11.5k | DCHECK(HasPendingManualCompaction()); |
2835 | 11.5k | manual_conflict = false; |
2836 | 11.5k | if (ShouldntRunManualCompaction(&manual_compaction) || manual_compaction.in_progress6.37k || |
2837 | 11.5k | scheduled6.33k || |
2838 | 11.5k | (6.27k (manual_compaction.manual_end = &manual_compaction.tmp_storage1)6.27k && ( |
2839 | 6.27k | (manual_compaction.compaction = manual_compaction.cfd->CompactRange( |
2840 | 6.27k | *manual_compaction.cfd->GetLatestMutableCFOptions(), |
2841 | 6.27k | manual_compaction.input_level, manual_compaction.output_level, |
2842 | 6.27k | manual_compaction.output_path_id, manual_compaction.begin, manual_compaction.end, |
2843 | 6.27k | &manual_compaction.manual_end, &manual_conflict)) == |
2844 | 6.27k | nullptr) && |
2845 | 6.27k | manual_conflict1.06k )) { |
2846 | 5.28k | DCHECK(!exclusive || !manual_conflict) |
2847 | 0 | << "exclusive manual compactions should not see a conflict during CompactRange"; |
2848 | 5.28k | if (manual_conflict) { |
2849 | 2 | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::Conflict"); |
2850 | 2 | } |
2851 | | // Running either this or some other manual compaction |
2852 | 5.28k | bg_cv_.Wait(); |
2853 | 5.28k | if (IsShuttingDown()) { |
2854 | 3 | if (!scheduled) { |
2855 | 0 | return STATUS(ShutdownInProgress, ""); |
2856 | 0 | } |
2857 | | // If manual compaction is already scheduled, we increase its priority and will wait for it |
2858 | | // to be aborted. We can't just exit, because compaction task can access manual_compaction |
2859 | | // by raw pointer. |
2860 | 3 | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
2861 | 3 | mutex_.Unlock(); |
2862 | 3 | db_options_.priority_thread_pool_for_compactions_and_flushes->ChangeTaskPriority( |
2863 | 3 | compaction_task_serial_no, kShuttingDownPriority); |
2864 | 3 | mutex_.Lock(); |
2865 | 3 | } |
2866 | 3 | } |
2867 | | |
2868 | 5.28k | if (scheduled && manual_compaction.incomplete == true5.28k ) { |
2869 | 192 | DCHECK(!manual_compaction.in_progress); |
2870 | 192 | scheduled = false; |
2871 | 192 | manual_compaction.incomplete = false; |
2872 | 192 | } |
2873 | 6.27k | } else if (!scheduled) { |
2874 | 6.27k | if (manual_compaction.compaction == nullptr) { |
2875 | 1.05k | manual_compaction.done = true; |
2876 | 1.05k | bg_cv_.SignalAll(); |
2877 | 1.05k | continue; |
2878 | 1.05k | } |
2879 | 5.21k | manual_compaction.incomplete = false; |
2880 | 5.21k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
2881 | 5.21k | FLAGS_use_priority_thread_pool_for_compactions441 ) { |
2882 | 441 | auto compaction_task = std::make_unique<CompactionTask>(this, &manual_compaction); |
2883 | 441 | compaction_task_serial_no = compaction_task->SerialNo(); |
2884 | 441 | SubmitCompactionOrFlushTask(std::move(compaction_task)); |
2885 | 4.77k | } else { |
2886 | 4.77k | bg_compaction_scheduled_++; |
2887 | 4.77k | ca = new CompactionArg; |
2888 | 4.77k | ca->db = this; |
2889 | 4.77k | ca->m = &manual_compaction; |
2890 | 4.77k | env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, |
2891 | 4.77k | &DBImpl::UnscheduleCallback); |
2892 | 4.77k | } |
2893 | 5.21k | scheduled = true; |
2894 | 5.21k | } |
2895 | 11.5k | } |
2896 | | |
2897 | 6.07k | DCHECK(!manual_compaction.in_progress); |
2898 | 6.07k | DCHECK(HasPendingManualCompaction()); |
2899 | 6.07k | RemoveManualCompaction(&manual_compaction); |
2900 | 6.07k | bg_cv_.SignalAll(); |
2901 | 6.07k | return manual_compaction.status; |
2902 | 6.07k | } |
2903 | | |
2904 | | InternalIterator* DBImpl::NewInternalIterator( |
2905 | 404 | Arena* arena, ColumnFamilyHandle* column_family) { |
2906 | 404 | ColumnFamilyData* cfd; |
2907 | 404 | if (column_family == nullptr) { |
2908 | 1 | cfd = default_cf_handle_->cfd(); |
2909 | 403 | } else { |
2910 | 403 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2911 | 403 | cfd = cfh->cfd(); |
2912 | 403 | } |
2913 | | |
2914 | 404 | mutex_.Lock(); |
2915 | 404 | SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); |
2916 | 404 | mutex_.Unlock(); |
2917 | 404 | ReadOptions roptions; |
2918 | 404 | return NewInternalIterator(roptions, cfd, super_version, arena); |
2919 | 404 | } |
2920 | | |
2921 | 3.36k | int DBImpl::GetCfdImmNumNotFlushed() { |
2922 | 3.36k | auto cfd = down_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd(); |
2923 | 3.36k | InstrumentedMutexLock guard_lock(&mutex_); |
2924 | 3.36k | return cfd->imm()->NumNotFlushed(); |
2925 | 3.36k | } |
2926 | | |
2927 | 58.8M | FlushAbility DBImpl::GetFlushAbility() { |
2928 | 58.8M | auto cfd = down_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd(); |
2929 | 58.8M | InstrumentedMutexLock guard_lock(&mutex_); |
2930 | 58.8M | if (cfd->imm()->NumNotFlushed() != 0) { |
2931 | 31.1k | return FlushAbility::kAlreadyFlushing; |
2932 | 31.1k | } |
2933 | 58.7M | return cfd->mem()->IsEmpty() ? FlushAbility::kNoNewData38.1M : FlushAbility::kHasNewData20.6M ; |
2934 | 58.8M | } |
2935 | | |
2936 | | Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, |
2937 | 281k | const FlushOptions& flush_options) { |
2938 | 281k | Status s; |
2939 | 281k | { |
2940 | 281k | WriteContext context; |
2941 | 281k | InstrumentedMutexLock guard_lock(&mutex_); |
2942 | | |
2943 | 281k | if (last_flush_at_tick_ > flush_options.ignore_if_flushed_after_tick) { |
2944 | 0 | return STATUS(AlreadyPresent, "Mem table already flushed"); |
2945 | 0 | } |
2946 | | |
2947 | 281k | if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()281k ) { |
2948 | | // Nothing to flush |
2949 | 265k | return Status::OK(); |
2950 | 265k | } |
2951 | | |
2952 | 15.7k | last_flush_at_tick_ = FlushTick(); |
2953 | | |
2954 | 15.7k | WriteThread::Writer w; |
2955 | 15.7k | write_thread_.EnterUnbatched(&w, &mutex_); |
2956 | | |
2957 | | // SwitchMemtable() will release and reacquire mutex |
2958 | | // during execution |
2959 | 15.7k | s = SwitchMemtable(cfd, &context); |
2960 | 15.7k | write_thread_.ExitUnbatched(&w); |
2961 | | |
2962 | 15.7k | cfd->imm()->FlushRequested(); |
2963 | | |
2964 | | // schedule flush |
2965 | 15.7k | SchedulePendingFlush(cfd); |
2966 | 15.7k | MaybeScheduleFlushOrCompaction(); |
2967 | 15.7k | } |
2968 | | |
2969 | 15.7k | if (s.ok() && flush_options.wait15.7k ) { |
2970 | | // Wait until the compaction completes |
2971 | 14.8k | s = WaitForFlushMemTable(cfd); |
2972 | 14.8k | } |
2973 | 15.7k | return s; |
2974 | 281k | } |
2975 | | |
2976 | 23.3k | Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { |
2977 | 23.3k | Status s; |
2978 | | // Wait until the flush completes |
2979 | 23.3k | InstrumentedMutexLock l(&mutex_); |
2980 | 44.2k | while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()20.9k ) { |
2981 | 20.9k | if (IsShuttingDown() && disable_flush_on_shutdown_1.14k ) { |
2982 | 0 | return STATUS(ShutdownInProgress, ""); |
2983 | 0 | } |
2984 | 20.9k | bg_cv_.Wait(); |
2985 | 20.9k | } |
2986 | 23.3k | if (!bg_error_.ok()) { |
2987 | 8 | s = bg_error_; |
2988 | 8 | } |
2989 | 23.3k | return s; |
2990 | 23.3k | } |
2991 | | |
2992 | | Status DBImpl::EnableAutoCompaction( |
2993 | 421k | const std::vector<ColumnFamilyHandle*>& column_family_handles) { |
2994 | 421k | TEST_SYNC_POINT("DBImpl::EnableAutoCompaction"); |
2995 | 421k | Status s; |
2996 | 421k | for (auto cf_ptr : column_family_handles) { |
2997 | 421k | Status status = |
2998 | 421k | this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}, false); |
2999 | 421k | if (status.ok()421k ) { |
3000 | 421k | ColumnFamilyData* cfd = down_cast<ColumnFamilyHandleImpl*>(cf_ptr)->cfd(); |
3001 | 421k | InstrumentedMutexLock guard_lock(&mutex_); |
3002 | 421k | InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
3003 | 18.4E | } else { |
3004 | 18.4E | s = status; |
3005 | 18.4E | } |
3006 | 421k | } |
3007 | | |
3008 | 421k | return s; |
3009 | 421k | } |
3010 | | |
3011 | 1.61M | void DBImpl::MaybeScheduleFlushOrCompaction() { |
3012 | 1.61M | mutex_.AssertHeld(); |
3013 | 1.61M | if (!opened_successfully_) { |
3014 | | // Compaction may introduce data race to DB open |
3015 | 438k | return; |
3016 | 438k | } |
3017 | 1.18M | if (bg_work_paused_ > 0) { |
3018 | | // we paused the background work |
3019 | 41 | return; |
3020 | 1.18M | } else if (IsShuttingDown() && disable_flush_on_shutdown_4.09k ) { |
3021 | | // DB is being deleted; no more background compactions and flushes. |
3022 | 14 | return; |
3023 | 14 | } |
3024 | | |
3025 | 1.24M | while (1.18M unscheduled_flushes_ > 0 && |
3026 | 1.24M | bg_flush_scheduled_ < db_options_.max_background_flushes64.9k ) { |
3027 | 59.5k | unscheduled_flushes_--; |
3028 | 59.5k | bg_flush_scheduled_++; |
3029 | 59.5k | env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); |
3030 | 59.5k | } |
3031 | | |
3032 | 1.18M | size_t bg_compactions_allowed = BGCompactionsAllowed(); |
3033 | | |
3034 | | // special case -- if max_background_flushes == 0, then schedule flush on a |
3035 | | // compaction thread |
3036 | 1.18M | if (db_options_.max_background_flushes == 0) { |
3037 | 968 | while (unscheduled_flushes_ > 0 && |
3038 | 968 | bg_flush_scheduled_ + bg_compaction_scheduled_ + compaction_tasks_.size() < |
3039 | 130 | bg_compactions_allowed) { |
3040 | 130 | unscheduled_flushes_--; |
3041 | 130 | bg_flush_scheduled_++; |
3042 | 130 | env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); |
3043 | 130 | } |
3044 | 838 | } |
3045 | | |
3046 | 1.18M | if (IsShuttingDown()) { |
3047 | 4.07k | return; |
3048 | 4.07k | } |
3049 | | |
3050 | 1.17M | if (bg_compaction_paused_ > 0) { |
3051 | | // we paused the background compaction |
3052 | 0 | return; |
3053 | 0 | } |
3054 | | |
3055 | 1.27M | while (1.17M bg_compaction_scheduled_ + compaction_tasks_.size() < bg_compactions_allowed && |
3056 | 1.27M | unscheduled_compactions_ > 01.11M ) { |
3057 | 101k | bg_compaction_scheduled_++; |
3058 | 101k | unscheduled_compactions_--; |
3059 | 101k | CompactionArg* ca = new CompactionArg; |
3060 | 101k | ca->db = this; |
3061 | 101k | ca->m = nullptr; |
3062 | 101k | env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, |
3063 | 101k | &DBImpl::UnscheduleCallback); |
3064 | 101k | } |
3065 | 1.17M | } |
3066 | | |
3067 | 1.26M | int DBImpl::BGCompactionsAllowed() const { |
3068 | 1.26M | if (write_controller_.NeedSpeedupCompaction()) { |
3069 | 1.26M | return db_options_.max_background_compactions; |
3070 | 1.26M | } else { |
3071 | 533 | return db_options_.base_background_compactions; |
3072 | 533 | } |
3073 | 1.26M | } |
3074 | | |
3075 | 0 | bool DBImpl::IsEmptyCompactionQueue() { |
3076 | 0 | return small_compaction_queue_.empty() && large_compaction_queue_.empty(); |
3077 | 0 | } |
3078 | | |
3079 | 36.1k | bool DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { |
3080 | 36.1k | mutex_.AssertHeld(); |
3081 | | |
3082 | 36.1k | assert(!cfd->pending_compaction()); |
3083 | | |
3084 | 0 | const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
3085 | 36.1k | std::unique_ptr<Compaction> c; |
3086 | | |
3087 | 36.1k | if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()34.9k |
3088 | 36.1k | && !(34.9k HasExclusiveManualCompaction()34.9k || HaveManualCompaction(cfd)34.6k )) { |
3089 | 34.6k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3090 | 34.6k | c = cfd->PickCompaction(*cfd->GetLatestMutableCFOptions(), &log_buffer); |
3091 | 34.6k | log_buffer.FlushBufferToLog(); |
3092 | 34.6k | if (c) { |
3093 | 18.4k | cfd->Ref(); |
3094 | 18.4k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
3095 | 18.4k | FLAGS_use_priority_thread_pool_for_compactions699 ) { |
3096 | 699 | SubmitCompactionOrFlushTask(std::make_unique<CompactionTask>(this, std::move(c))); |
3097 | | // True means that we need to schedule one more compaction, since it is already scheduled |
3098 | | // one line above we return false. |
3099 | 699 | return false; |
3100 | 17.7k | } else if (!IsLargeCompaction(*c)) { |
3101 | 17.7k | small_compaction_queue_.push_back(std::move(c)); |
3102 | 17.7k | } else { |
3103 | 6 | large_compaction_queue_.push_back(std::move(c)); |
3104 | 6 | } |
3105 | 17.7k | cfd->set_pending_compaction(true); |
3106 | 17.7k | return true; |
3107 | 18.4k | } |
3108 | 34.6k | } |
3109 | | |
3110 | 17.6k | return false; |
3111 | 36.1k | } |
3112 | | |
3113 | 17.7k | std::unique_ptr<Compaction> DBImpl::PopFirstFromSmallCompactionQueue() { |
3114 | 17.7k | return PopFirstFromCompactionQueue(&small_compaction_queue_); |
3115 | 17.7k | } |
3116 | | |
3117 | 6 | std::unique_ptr<Compaction> DBImpl::PopFirstFromLargeCompactionQueue() { |
3118 | 6 | return PopFirstFromCompactionQueue(&large_compaction_queue_); |
3119 | 6 | } |
3120 | | |
3121 | 22.2k | bool DBImpl::IsLargeCompaction(const Compaction& compaction) { |
3122 | 22.2k | return compaction.CalculateTotalInputSize() >= db_options_.compaction_size_threshold_bytes; |
3123 | 22.2k | } |
3124 | | |
3125 | 59.6k | void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { |
3126 | 59.6k | assert(!cfd->pending_flush()); |
3127 | 0 | cfd->Ref(); |
3128 | 59.6k | flush_queue_.push_back(cfd); |
3129 | 59.6k | cfd->set_pending_flush(true); |
3130 | 59.6k | } |
3131 | | |
3132 | 59.6k | ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { |
3133 | 59.6k | assert(!flush_queue_.empty()); |
3134 | 0 | auto cfd = *flush_queue_.begin(); |
3135 | 59.6k | flush_queue_.pop_front(); |
3136 | 59.6k | assert(cfd->pending_flush()); |
3137 | 0 | cfd->set_pending_flush(false); |
3138 | 59.6k | return cfd; |
3139 | 59.6k | } |
3140 | | |
3141 | 996k | void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) { |
3142 | 996k | if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()977k ) { |
3143 | 59.7k | for (auto listener : db_options_.listeners) { |
3144 | 4.40k | listener->OnFlushScheduled(this); |
3145 | 4.40k | } |
3146 | 59.7k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
3147 | 59.7k | FLAGS_use_priority_thread_pool_for_flushes37.4k ) { |
3148 | 30 | ++bg_flush_scheduled_; |
3149 | 30 | cfd->Ref(); |
3150 | 30 | cfd->set_pending_flush(true); |
3151 | 30 | SubmitCompactionOrFlushTask(std::make_unique<FlushTask>(this, cfd)); |
3152 | 59.6k | } else { |
3153 | 59.6k | AddToFlushQueue(cfd); |
3154 | 59.6k | ++unscheduled_flushes_; |
3155 | 59.6k | } |
3156 | 59.7k | } |
3157 | 996k | } |
3158 | | |
3159 | 1.02M | void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { |
3160 | 1.02M | mutex_.AssertHeld(); |
3161 | | |
3162 | 1.02M | if (!cfd->pending_compaction() && cfd->NeedsCompaction()1.00M && !IsShuttingDown()36.2k ) { |
3163 | 36.1k | if (AddToCompactionQueue(cfd)) { |
3164 | 17.7k | ++unscheduled_compactions_; |
3165 | 17.7k | } |
3166 | 36.1k | } |
3167 | 1.02M | TEST_SYNC_POINT("DBImpl::SchedulePendingCompaction:Done"); |
3168 | 1.02M | } |
3169 | | |
3170 | 59.6k | void DBImpl::BGWorkFlush(void* db) { |
3171 | 59.6k | IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); |
3172 | 59.6k | TEST_SYNC_POINT("DBImpl::BGWorkFlush"); |
3173 | 59.6k | reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush(nullptr /* cfd */); |
3174 | 59.6k | TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); |
3175 | 59.6k | } |
3176 | | |
3177 | 106k | void DBImpl::BGWorkCompaction(void* arg) { |
3178 | 106k | CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg)); |
3179 | 106k | delete reinterpret_cast<CompactionArg*>(arg); |
3180 | 106k | IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); |
3181 | 106k | TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); |
3182 | 106k | reinterpret_cast<DBImpl*>(ca.db)->BackgroundCallCompaction(ca.m); |
3183 | 106k | } |
3184 | | |
3185 | 2 | void DBImpl::UnscheduleCallback(void* arg) { |
3186 | 2 | CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg)); |
3187 | 2 | delete reinterpret_cast<CompactionArg*>(arg); |
3188 | 2 | if (ca.m != nullptr) { |
3189 | 0 | ca.m->compaction.reset(); |
3190 | 0 | } |
3191 | 2 | TEST_SYNC_POINT("DBImpl::UnscheduleCallback"); |
3192 | 2 | } |
3193 | | |
3194 | | Result<FileNumbersHolder> DBImpl::BackgroundFlush( |
3195 | 59.6k | bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, ColumnFamilyData* cfd) { |
3196 | 59.6k | mutex_.AssertHeld(); |
3197 | | |
3198 | 59.6k | auto scope_exit = yb::ScopeExit([&cfd] { |
3199 | 59.6k | if (cfd && cfd->Unref()59.6k ) { |
3200 | 0 | delete cfd; |
3201 | 0 | } |
3202 | 59.6k | }); |
3203 | | |
3204 | 59.6k | if (cfd) { |
3205 | | // cfd is not nullptr when we get here from DBImpl::FlushTask and in this case we need to reset |
3206 | | // pending flush flag. |
3207 | | // In other cases (getting here from DBImpl::BGWorkFlush) this is done by |
3208 | | // DBImpl::PopFirstFromFlushQueue called below. |
3209 | 30 | cfd->set_pending_flush(false); |
3210 | 30 | } |
3211 | | |
3212 | 59.6k | Status status = bg_error_; |
3213 | 59.6k | if (status.ok() && IsShuttingDown()59.6k && disable_flush_on_shutdown_1.14k ) { |
3214 | 3 | status = STATUS(ShutdownInProgress, ""); |
3215 | 3 | } |
3216 | | |
3217 | 59.6k | if (!status.ok()) { |
3218 | 3 | return status; |
3219 | 3 | } |
3220 | | |
3221 | 59.6k | if (cfd == nullptr) { |
3222 | 59.6k | while (!flush_queue_.empty()) { |
3223 | | // This cfd is already referenced |
3224 | 59.6k | auto first_cfd = PopFirstFromFlushQueue(); |
3225 | | |
3226 | 59.6k | if (first_cfd->IsDropped()59.6k || !first_cfd->imm()->IsFlushPending()) { |
3227 | | // can't flush this CF, try next one |
3228 | 1 | if (first_cfd->Unref()) { |
3229 | 0 | delete first_cfd; |
3230 | 0 | } |
3231 | 1 | continue; |
3232 | 1 | } |
3233 | | |
3234 | | // found a flush! |
3235 | 59.6k | cfd = first_cfd; |
3236 | 59.6k | break; |
3237 | 59.6k | } |
3238 | 59.6k | } |
3239 | | |
3240 | 59.6k | if (cfd == nullptr) { |
3241 | 1 | return FileNumbersHolder(); |
3242 | 1 | } |
3243 | 59.6k | const MutableCFOptions mutable_cf_options = |
3244 | 59.6k | *cfd->GetLatestMutableCFOptions(); |
3245 | 59.6k | YB_LOG_WITH_PREFIX_EVERY_N_SECS3.26k (INFO, 1) |
3246 | 3.26k | << "Calling FlushMemTableToOutputFile with column " |
3247 | 3.26k | << "family [" << cfd->GetName() << "], " |
3248 | 3.26k | << "flush slots scheduled " << bg_flush_scheduled_ << ", " |
3249 | 3.26k | << "total flush slots " << db_options_.max_background_flushes << ", " |
3250 | 3.26k | << "compaction slots scheduled " << bg_compaction_scheduled_ << ", " |
3251 | 3.26k | << "compaction tasks " << yb::ToString(compaction_tasks_) << ", " |
3252 | 3.26k | << "total compaction slots " << BGCompactionsAllowed(); |
3253 | 59.6k | return FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, |
3254 | 59.6k | job_context, log_buffer); |
3255 | 59.6k | } |
3256 | | |
3257 | | void DBImpl::WaitAfterBackgroundError( |
3258 | 166k | const Status& s, const char* job_name, LogBuffer* log_buffer) { |
3259 | 166k | if (!s.ok() && !s.IsShutdownInProgress()136 ) { |
3260 | | // Wait a little bit before retrying background job in |
3261 | | // case this is an environmental problem and we do not want to |
3262 | | // chew up resources for failed jobs for the duration of |
3263 | | // the problem. |
3264 | 73 | uint64_t error_cnt = default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); |
3265 | 73 | bg_cv_.SignalAll(); // In case a waiter can proceed despite the error |
3266 | 73 | mutex_.Unlock(); |
3267 | 73 | log_buffer->FlushBufferToLog(); |
3268 | 73 | RLOG( |
3269 | 73 | InfoLogLevel::ERROR_LEVEL, db_options_.info_log, Format( |
3270 | 73 | "Waiting after background $0 error: $1, Accumulated background error counts: $2", |
3271 | 73 | job_name, s, error_cnt).c_str()); |
3272 | 73 | LogFlush(db_options_.info_log); |
3273 | 73 | env_->SleepForMicroseconds(1000000); |
3274 | 73 | mutex_.Lock(); |
3275 | 73 | } |
3276 | 166k | } |
3277 | | |
3278 | | void DBImpl::BackgroundJobComplete( |
3279 | 166k | const Status& s, JobContext* job_context, LogBuffer* log_buffer) { |
3280 | 166k | mutex_.AssertHeld(); |
3281 | | |
3282 | 166k | TaskPriorityUpdater task_priority_updater(this); |
3283 | 166k | task_priority_updater.Prepare(); |
3284 | | |
3285 | | // If flush or compaction failed, we want to delete all temporary files that we might have |
3286 | | // created. Thus, we force full scan in FindObsoleteFiles() |
3287 | 166k | FindObsoleteFiles(job_context, !s.ok() && !s.IsShutdownInProgress()136 ); |
3288 | | |
3289 | | // delete unnecessary files if any, this is done outside the mutex |
3290 | 166k | if (job_context->HaveSomethingToDelete() || !log_buffer->IsEmpty()0 || |
3291 | 166k | !task_priority_updater.Empty()0 || HasFilesChangedListener()0 ) { |
3292 | 166k | mutex_.Unlock(); |
3293 | | // Have to flush the info logs before bg_flush_scheduled_-- |
3294 | | // because if bg_flush_scheduled_ becomes 0 and the lock is |
3295 | | // released, the destructor of DB can kick in and destroy all the |
3296 | | // state of DB so info_log might not be available after that point. |
3297 | | // It also applies to access to other state that DB owns. |
3298 | 166k | log_buffer->FlushBufferToLog(); |
3299 | 166k | if (job_context->HaveSomethingToDelete()166k ) { |
3300 | 166k | PurgeObsoleteFiles(*job_context); |
3301 | 166k | } |
3302 | 166k | job_context->Clean(); |
3303 | | |
3304 | 166k | task_priority_updater.Apply(); |
3305 | | |
3306 | 166k | FilesChanged(); |
3307 | | |
3308 | 166k | mutex_.Lock(); |
3309 | 166k | } |
3310 | 166k | } |
3311 | | |
3312 | 59.6k | void DBImpl::BackgroundCallFlush(ColumnFamilyData* cfd) { |
3313 | 59.6k | bool made_progress = false; |
3314 | 59.6k | JobContext job_context(next_job_id_.fetch_add(1), true); |
3315 | | |
3316 | 59.6k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3317 | | |
3318 | 59.6k | InstrumentedMutexLock l(&mutex_); |
3319 | 59.6k | assert(bg_flush_scheduled_); |
3320 | 0 | num_running_flushes_++; |
3321 | | |
3322 | 59.6k | Status s; |
3323 | 59.6k | { |
3324 | 59.6k | auto file_number_holder = BackgroundFlush(&made_progress, &job_context, &log_buffer, cfd); |
3325 | 59.6k | s = yb::ResultToStatus(file_number_holder); |
3326 | 59.6k | WaitAfterBackgroundError(s, "flush", &log_buffer); |
3327 | 59.6k | } |
3328 | | |
3329 | 59.6k | BackgroundJobComplete(s, &job_context, &log_buffer); |
3330 | | |
3331 | 59.6k | assert(num_running_flushes_ > 0); |
3332 | 0 | num_running_flushes_--; |
3333 | 59.6k | bg_flush_scheduled_--; |
3334 | | // See if there's more work to be done |
3335 | 59.6k | MaybeScheduleFlushOrCompaction(); |
3336 | 59.6k | bg_cv_.SignalAll(); |
3337 | | // IMPORTANT: there should be no code after calling SignalAll. This call may |
3338 | | // signal the DB destructor that it's OK to proceed with destruction. In |
3339 | | // that case, all DB variables will be dealloacated and referencing them |
3340 | | // will cause trouble. |
3341 | 59.6k | } |
3342 | | |
3343 | | void DBImpl::BackgroundCallCompaction(ManualCompaction* m, std::unique_ptr<Compaction> compaction, |
3344 | 107k | CompactionTask* compaction_task) { |
3345 | 107k | bool made_progress = false; |
3346 | 107k | JobContext job_context(next_job_id_.fetch_add(1), true); |
3347 | 107k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3348 | 107k | if (compaction_task) { |
3349 | 1.13k | compaction_task->SetJobID(&job_context); |
3350 | 1.13k | } |
3351 | 107k | InstrumentedMutexLock l(&mutex_); |
3352 | 107k | num_total_running_compactions_++; |
3353 | | |
3354 | 107k | if (compaction_task) { |
3355 | 1.13k | LOG_IF_WITH_PREFIX0 (DFATAL, compaction_tasks_.count(compaction_task) != 1) |
3356 | 0 | << "Running compaction for unknown task: " << compaction_task; |
3357 | 106k | } else { |
3358 | 18.4E | LOG_IF_WITH_PREFIX(DFATAL, bg_compaction_scheduled_ == 0) |
3359 | 18.4E | << "Running compaction while no compactions were scheduled"; |
3360 | 106k | } |
3361 | | |
3362 | 107k | Status s; |
3363 | 107k | { |
3364 | 107k | auto file_numbers_holder = BackgroundCompaction( |
3365 | 107k | &made_progress, &job_context, &log_buffer, m, std::move(compaction)); |
3366 | | |
3367 | 107k | if (compaction_task) { |
3368 | 1.12k | compaction_task->Complete(); |
3369 | 1.12k | } |
3370 | | |
3371 | 107k | s = yb::ResultToStatus(file_numbers_holder); |
3372 | 107k | TEST_SYNC_POINT("BackgroundCallCompaction:1"); |
3373 | 107k | WaitAfterBackgroundError(s, "compaction", &log_buffer); |
3374 | 107k | } |
3375 | | |
3376 | 107k | BackgroundJobComplete(s, &job_context, &log_buffer); |
3377 | | |
3378 | 107k | assert(num_total_running_compactions_ > 0); |
3379 | 0 | num_total_running_compactions_--; |
3380 | 107k | if (compaction_task) { |
3381 | 1.12k | LOG_IF_WITH_PREFIX0 (DFATAL, compaction_tasks_.erase(compaction_task) != 1) |
3382 | 0 | << "Finished compaction with unknown task serial no: " << yb::ToString(compaction_task); |
3383 | 106k | } else { |
3384 | 106k | bg_compaction_scheduled_--; |
3385 | 106k | } |
3386 | | |
3387 | 107k | versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); |
3388 | | |
3389 | | // See if there's more work to be done |
3390 | 107k | MaybeScheduleFlushOrCompaction(); |
3391 | 107k | if (made_progress || (bg_compaction_scheduled_ + compaction_tasks_.size()) == 083.4k || |
3392 | 107k | HasPendingManualCompaction()83.4k ) { |
3393 | | // signal if |
3394 | | // * made_progress -- need to wakeup DelayWrite |
3395 | | // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl |
3396 | | // * HasPendingManualCompaction -- need to wakeup RunManualCompaction |
3397 | | // If none of this is true, there is no need to signal since nobody is |
3398 | | // waiting for it |
3399 | 23.6k | bg_cv_.SignalAll(); |
3400 | 23.6k | } |
3401 | | // IMPORTANT: there should be no code after calling SignalAll. This call may |
3402 | | // signal the DB destructor that it's OK to proceed with destruction. In |
3403 | | // that case, all DB variables will be dealloacated and referencing them |
3404 | | // will cause trouble. |
3405 | 107k | } |
3406 | | |
3407 | | Result<FileNumbersHolder> DBImpl::BackgroundCompaction( |
3408 | | bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, |
3409 | 107k | ManualCompaction* manual_compaction, std::unique_ptr<Compaction> compaction) { |
3410 | 107k | *made_progress = false; |
3411 | 107k | mutex_.AssertHeld(); |
3412 | | |
3413 | 107k | bool is_manual = (manual_compaction != nullptr); |
3414 | 107k | if (is_manual && compaction5.21k ) { |
3415 | 0 | return STATUS( |
3416 | 0 | InvalidArgument, |
3417 | 0 | "Both is_manual and compaction are specified in BackgroundCompaction, only one of them is " |
3418 | 0 | "allowed"); |
3419 | 0 | } |
3420 | 107k | DCHECK(!is_manual || !compaction); |
3421 | 107k | bool is_large_compaction = false; |
3422 | | |
3423 | | // (manual_compaction->in_progress == false); |
3424 | 107k | bool trivial_move_disallowed = |
3425 | 107k | is_manual && manual_compaction->disallow_trivial_move5.21k ; |
3426 | | |
3427 | 107k | CompactionJobStats compaction_job_stats; |
3428 | 107k | Status status = bg_error_; |
3429 | 107k | if (status.ok() && IsShuttingDown()107k ) { |
3430 | 1 | status = STATUS(ShutdownInProgress, ""); |
3431 | 1 | } |
3432 | | |
3433 | 107k | if (!status.ok()) { |
3434 | 4 | if (is_manual) { |
3435 | 1 | manual_compaction->status = status; |
3436 | 1 | manual_compaction->done = true; |
3437 | 1 | manual_compaction->in_progress = false; |
3438 | 1 | manual_compaction->compaction.reset(); |
3439 | 1 | manual_compaction = nullptr; |
3440 | 1 | } |
3441 | 4 | if (compaction && compaction->column_family_data()->Unref()0 ) { |
3442 | 0 | delete compaction->column_family_data(); |
3443 | 0 | } |
3444 | 4 | return status; |
3445 | 4 | } |
3446 | | |
3447 | 107k | if (is_manual) { |
3448 | | // another thread cannot pick up the same work |
3449 | 5.20k | manual_compaction->in_progress = true; |
3450 | 5.20k | } |
3451 | | |
3452 | 107k | unique_ptr<Compaction> c; |
3453 | | // InternalKey manual_end_storage; |
3454 | | // InternalKey* manual_end = &manual_end_storage; |
3455 | 107k | if (is_manual) { |
3456 | 5.20k | ManualCompaction* m = manual_compaction; |
3457 | 5.20k | assert(m->in_progress); |
3458 | 0 | c = std::move(m->compaction); |
3459 | 5.20k | if (!c) { |
3460 | 0 | m->done = true; |
3461 | 0 | m->manual_end = nullptr; |
3462 | 0 | LOG_TO_BUFFER(log_buffer, |
3463 | 0 | "[%s] Manual compaction from level-%d from %s .. " |
3464 | 0 | "%s; nothing to do\n", |
3465 | 0 | m->cfd->GetName().c_str(), m->input_level, |
3466 | 0 | (m->begin ? m->begin->DebugString().c_str() : "(begin)"), |
3467 | 0 | (m->end ? m->end->DebugString().c_str() : "(end)")); |
3468 | 5.20k | } else { |
3469 | 5.20k | LOG_TO_BUFFER(log_buffer, |
3470 | 5.20k | "[%s] Manual compaction from level-%d to level-%d from %s .. " |
3471 | 5.20k | "%s; will stop at %s\n", |
3472 | 5.20k | m->cfd->GetName().c_str(), m->input_level, c->output_level(), |
3473 | 5.20k | (m->begin ? m->begin->DebugString().c_str() : "(begin)"), |
3474 | 5.20k | (m->end ? m->end->DebugString().c_str() : "(end)"), |
3475 | 5.20k | ((m->done || m->manual_end == nullptr) |
3476 | 5.20k | ? "(end)" |
3477 | 5.20k | : m->manual_end->DebugString().c_str())); |
3478 | 5.20k | } |
3479 | 101k | } else { |
3480 | | // cfd is referenced here |
3481 | 101k | if (compaction) { |
3482 | 699 | c = std::move(compaction); |
3483 | 699 | is_large_compaction = IsLargeCompaction(*c); |
3484 | 101k | } else if (!large_compaction_queue_.empty() && BGCompactionsAllowed() > |
3485 | 83.5k | num_running_large_compactions() + db_options_.num_reserved_small_compaction_threads) { |
3486 | 6 | c = PopFirstFromLargeCompactionQueue(); |
3487 | 6 | is_large_compaction = true; |
3488 | 101k | } else if (!small_compaction_queue_.empty()) { |
3489 | 17.7k | c = PopFirstFromSmallCompactionQueue(); |
3490 | 17.7k | is_large_compaction = false; |
3491 | 83.4k | } else { |
3492 | 83.4k | LOG_IF(DFATAL, large_compaction_queue_.empty()) |
3493 | 0 | << "Don't have compactions in BackgroundCompaction"; |
3494 | 83.4k | LOG_TO_BUFFER(log_buffer, "No small compactions in queue. Large compaction threads busy."); |
3495 | 83.4k | unscheduled_compactions_++; |
3496 | 83.4k | return FileNumbersHolder(); |
3497 | 83.4k | } |
3498 | | |
3499 | 18.4k | ColumnFamilyData* cfd = c->column_family_data(); |
3500 | | |
3501 | | // We unreference here because the following code will take a Ref() on |
3502 | | // this cfd if it is going to use it (Compaction class holds a |
3503 | | // reference). |
3504 | | // This will all happen under a mutex so we don't have to be afraid of |
3505 | | // somebody else deleting it. |
3506 | 18.4k | if (cfd->Unref()) { |
3507 | 0 | delete cfd; |
3508 | | // This was the last reference of the column family, so no need to |
3509 | | // compact. |
3510 | 0 | return FileNumbersHolder(); |
3511 | 0 | } |
3512 | | |
3513 | 18.4k | if (is_large_compaction) { |
3514 | 6 | num_running_large_compactions_++; |
3515 | 6 | TEST_SYNC_POINT("DBImpl:BackgroundCompaction:LargeCompaction"); |
3516 | 18.4k | } else { |
3517 | 18.4k | TEST_SYNC_POINT("DBImpl:BackgroundCompaction:SmallCompaction"); |
3518 | 18.4k | } |
3519 | | |
3520 | 18.4k | if (c != nullptr) { |
3521 | | // update statistics |
3522 | 18.4k | MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, |
3523 | 18.4k | c->inputs(0)->size()); |
3524 | | // There are three things that can change compaction score: |
3525 | | // 1) When flush or compaction finish. This case is covered by |
3526 | | // InstallSuperVersionAndScheduleWork |
3527 | | // 2) When MutableCFOptions changes. This case is also covered by |
3528 | | // InstallSuperVersionAndScheduleWork, because this is when the new |
3529 | | // options take effect. |
3530 | | // 3) When we Pick a new compaction, we "remove" those files being |
3531 | | // compacted from the calculation, which then influences compaction |
3532 | | // score. Here we check if we need the new compaction even without the |
3533 | | // files that are currently being compacted. If we need another |
3534 | | // compaction, we might be able to execute it in parallel, so we add it |
3535 | | // to the queue and schedule a new thread. |
3536 | | |
3537 | 18.4k | SchedulePendingCompaction(cfd); |
3538 | 18.4k | MaybeScheduleFlushOrCompaction(); |
3539 | 18.4k | } |
3540 | 18.4k | } |
3541 | | |
3542 | 23.6k | Result<FileNumbersHolder> result = FileNumbersHolder(); |
3543 | 23.6k | for (auto listener : db_options_.listeners) { |
3544 | 705 | listener->OnCompactionStarted(); |
3545 | 705 | } |
3546 | 23.6k | if (c->deletion_compaction()) { |
3547 | | // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old |
3548 | | // file if there is alive snapshot pointing to it |
3549 | 12 | assert(c->num_input_files(1) == 0); |
3550 | 0 | assert(c->level() == 0); |
3551 | 0 | assert(c->column_family_data()->ioptions()->compaction_style == |
3552 | 12 | kCompactionStyleFIFO); |
3553 | | |
3554 | 0 | compaction_job_stats.num_input_files = c->num_input_files(0); |
3555 | | |
3556 | 16 | for (const auto& f : *c->inputs(0)) { |
3557 | 16 | c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); |
3558 | 16 | } |
3559 | 12 | status = versions_->LogAndApply(c->column_family_data(), |
3560 | 12 | *c->mutable_cf_options(), c->edit(), |
3561 | 12 | &mutex_, directories_.GetDbDir()); |
3562 | 12 | InstallSuperVersionAndScheduleWorkWrapper( |
3563 | 12 | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3564 | 12 | LOG_TO_BUFFER(log_buffer, "[%s] Deleted %d files\n", |
3565 | 12 | c->column_family_data()->GetName().c_str(), |
3566 | 12 | c->num_input_files(0)); |
3567 | 12 | *made_progress = true; |
3568 | 23.6k | } else if (!trivial_move_disallowed && c->IsTrivialMove()23.6k ) { |
3569 | 12.3k | TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); |
3570 | | |
3571 | 12.3k | compaction_job_stats.num_input_files = c->num_input_files(0); |
3572 | | |
3573 | | // Move files to next level |
3574 | 12.3k | int32_t moved_files = 0; |
3575 | 12.3k | int64_t moved_bytes = 0; |
3576 | 26.7k | for (unsigned int l = 0; l < c->num_input_levels(); l++14.4k ) { |
3577 | 14.4k | if (c->level(l) == c->output_level()) { |
3578 | 224 | continue; |
3579 | 224 | } |
3580 | 27.9k | for (size_t i = 0; 14.1k i < c->num_input_files(l); i++13.7k ) { |
3581 | 13.7k | FileMetaData* f = c->input(l, i); |
3582 | 13.7k | c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); |
3583 | 13.7k | c->edit()->AddCleanedFile(c->output_level(), *f); |
3584 | | |
3585 | 13.7k | LOG_TO_BUFFER(log_buffer, |
3586 | 13.7k | "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", |
3587 | 13.7k | c->column_family_data()->GetName().c_str(), |
3588 | 13.7k | f->fd.GetNumber(), c->output_level(), f->fd.GetTotalFileSize()); |
3589 | 13.7k | ++moved_files; |
3590 | 13.7k | moved_bytes += f->fd.GetTotalFileSize(); |
3591 | 13.7k | } |
3592 | 14.1k | } |
3593 | | |
3594 | 12.3k | status = versions_->LogAndApply(c->column_family_data(), |
3595 | 12.3k | *c->mutable_cf_options(), c->edit(), |
3596 | 12.3k | &mutex_, directories_.GetDbDir()); |
3597 | | // Use latest MutableCFOptions |
3598 | 12.3k | InstallSuperVersionAndScheduleWorkWrapper( |
3599 | 12.3k | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3600 | | |
3601 | 12.3k | VersionStorageInfo::LevelSummaryStorage tmp; |
3602 | 12.3k | c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), |
3603 | 12.3k | moved_bytes); |
3604 | 12.3k | { |
3605 | 12.3k | event_logger_.LogToBuffer(log_buffer) |
3606 | 12.3k | << "job" << job_context->job_id << "event" |
3607 | 12.3k | << "trivial_move" |
3608 | 12.3k | << "destination_level" << c->output_level() << "files" << moved_files |
3609 | 12.3k | << "total_files_size" << moved_bytes; |
3610 | 12.3k | } |
3611 | 12.3k | LOG_TO_BUFFER( |
3612 | 12.3k | log_buffer, |
3613 | 12.3k | "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n", |
3614 | 12.3k | c->column_family_data()->GetName().c_str(), moved_files, |
3615 | 12.3k | c->output_level(), moved_bytes, status.ToString().c_str(), |
3616 | 12.3k | c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); |
3617 | 12.3k | *made_progress = true; |
3618 | 12.3k | } else { |
3619 | 11.3k | int output_level __attribute__((unused)) = c->output_level(); |
3620 | 11.3k | TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", |
3621 | 11.3k | &output_level); |
3622 | | |
3623 | 11.3k | SequenceNumber earliest_write_conflict_snapshot; |
3624 | 11.3k | std::vector<SequenceNumber> snapshot_seqs = |
3625 | 11.3k | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
3626 | | |
3627 | 11.3k | assert(is_snapshot_supported_ || snapshots_.empty()); |
3628 | 0 | CompactionJob compaction_job( |
3629 | 11.3k | job_context->job_id, c.get(), db_options_, env_options_, |
3630 | 11.3k | versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), |
3631 | 11.3k | directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, |
3632 | 11.3k | &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot, |
3633 | 11.3k | pending_outputs_.get(), table_cache_, &event_logger_, |
3634 | 11.3k | c->mutable_cf_options()->paranoid_file_checks, |
3635 | 11.3k | c->mutable_cf_options()->compaction_measure_io_stats, dbname_, |
3636 | 11.3k | &compaction_job_stats); |
3637 | 11.3k | compaction_job.Prepare(); |
3638 | | |
3639 | 11.3k | mutex_.Unlock(); |
3640 | 11.3k | result = compaction_job.Run(); |
3641 | 11.3k | TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); |
3642 | 11.3k | mutex_.Lock(); |
3643 | | |
3644 | 11.3k | status = compaction_job.Install(*c->mutable_cf_options()); |
3645 | 11.3k | if (status.ok()) { |
3646 | 11.2k | InstallSuperVersionAndScheduleWorkWrapper( |
3647 | 11.2k | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3648 | 11.2k | } |
3649 | 11.3k | *made_progress = true; |
3650 | 11.3k | } |
3651 | | |
3652 | 0 | NotifyOnCompactionCompleted( |
3653 | 23.6k | c->column_family_data(), c.get(), status, |
3654 | 23.6k | compaction_job_stats, job_context->job_id); |
3655 | | |
3656 | 23.6k | c->ReleaseCompactionFiles(status); |
3657 | | |
3658 | | // It is possible that a compaction was needed in the column family but we could not |
3659 | | // add it to the queue when this compaction was popped because of L0 conflicts |
3660 | | // or other picker internals, so we try to schedule again. |
3661 | 23.6k | SchedulePendingCompaction(c->column_family_data()); |
3662 | | |
3663 | 23.6k | *made_progress = true; |
3664 | | // this will unref its input_version and column_family_data |
3665 | 23.6k | c.reset(); |
3666 | | |
3667 | 23.6k | if (status.ok()) { |
3668 | | // Done |
3669 | 23.5k | } else if (118 status.IsShutdownInProgress()118 ) { |
3670 | | // Ignore compaction errors found during shutting down |
3671 | 63 | } else { |
3672 | 63 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", |
3673 | 63 | status.ToString().c_str()); |
3674 | 63 | if (db_options_.paranoid_checks && bg_error_.ok()27 ) { |
3675 | 22 | bg_error_ = status; |
3676 | 22 | } |
3677 | 63 | } |
3678 | | |
3679 | 23.6k | if (is_manual) { |
3680 | 5.20k | ManualCompaction* m = manual_compaction; |
3681 | 5.20k | if (!status.ok()) { |
3682 | 32 | m->status = status; |
3683 | 32 | m->done = true; |
3684 | 32 | } |
3685 | | // For universal compaction: |
3686 | | // Because universal compaction always happens at level 0, so one |
3687 | | // compaction will pick up all overlapped files. No files will be |
3688 | | // filtered out due to size limit and left for a successive compaction. |
3689 | | // So we can safely conclude the current compaction. |
3690 | | // |
3691 | | // Also note that, if we don't stop here, then the current compaction |
3692 | | // writes a new file back to level 0, which will be used in successive |
3693 | | // compaction. Hence the manual compaction will never finish. |
3694 | | // |
3695 | | // Stop the compaction if manual_end points to nullptr -- this means |
3696 | | // that we compacted the whole range. manual_end should always point |
3697 | | // to nullptr in case of universal compaction |
3698 | 5.20k | if (m->manual_end == nullptr) { |
3699 | 5.01k | m->done = true; |
3700 | 5.01k | } |
3701 | 5.20k | if (!m->done) { |
3702 | | // We only compacted part of the requested range. Update *m |
3703 | | // to the range that is left to be compacted. |
3704 | | // Universal and FIFO compactions should always compact the whole range |
3705 | 192 | assert(m->cfd->ioptions()->compaction_style != |
3706 | 192 | kCompactionStyleUniversal || |
3707 | 192 | m->cfd->ioptions()->num_levels > 1); |
3708 | 0 | assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); |
3709 | 0 | m->tmp_storage = *m->manual_end; |
3710 | 192 | m->begin = &m->tmp_storage; |
3711 | 192 | m->incomplete = true; |
3712 | 192 | } |
3713 | 0 | m->in_progress = false; // not being processed anymore |
3714 | 5.20k | } |
3715 | | |
3716 | 23.6k | if (is_large_compaction) { |
3717 | 6 | num_running_large_compactions_--; |
3718 | 6 | } |
3719 | | |
3720 | 23.6k | RETURN_NOT_OK(status); |
3721 | | |
3722 | 23.5k | return result; |
3723 | 23.6k | } |
3724 | | |
3725 | 101k | bool DBImpl::HasPendingManualCompaction() { |
3726 | 101k | return (!manual_compaction_dequeue_.empty()); |
3727 | 101k | } |
3728 | | |
3729 | 6.07k | void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) { |
3730 | 6.07k | manual_compaction_dequeue_.push_back(m); |
3731 | 6.07k | } |
3732 | | |
3733 | 6.07k | void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { |
3734 | | // Remove from queue |
3735 | 6.07k | std::deque<ManualCompaction*>::iterator it = |
3736 | 6.07k | manual_compaction_dequeue_.begin(); |
3737 | 6.07k | while (it != manual_compaction_dequeue_.end()) { |
3738 | 6.07k | if (m == (*it)) { |
3739 | 6.07k | it = manual_compaction_dequeue_.erase(it); |
3740 | 6.07k | return; |
3741 | 6.07k | } |
3742 | 1 | it++; |
3743 | 1 | } |
3744 | 0 | assert(false); |
3745 | 0 | return; |
3746 | 6.07k | } |
3747 | | |
// Decides whether the given manual compaction must wait before running.
// An exclusive manual compaction waits for every scheduled/running
// background compaction to drain. A non-exclusive one only waits for
// conflicting (overlapping) manual compactions that were queued ahead of it
// and have not started yet — queue order is respected to avoid starvation.
bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) {
  if (m->exclusive) {
    // Exclusive: any in-flight or scheduled background compaction blocks us.
    return (bg_compaction_scheduled_ + compaction_tasks_.size() > 0);
  }
  std::deque<ManualCompaction*>::iterator it =
      manual_compaction_dequeue_.begin();
  // 'seen' flips to true once we pass m itself, so entries after m in the
  // queue (behind us) never block us.
  bool seen = false;
  while (it != manual_compaction_dequeue_.end()) {
    if (m == (*it)) {
      it++;
      seen = true;
      continue;
    } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
      // Consider the other manual compaction *it, conflicts if:
      //   overlaps with m
      //   and (*it) is ahead in the queue and is not yet in progress
      return true;
    }
    it++;
  }
  return false;
}
3770 | | |
// Returns true if a manual compaction should block automatic compactions
// for |cfd|: either some queued manual compaction is exclusive, or a manual
// compaction for this column family is queued but has not started (and is
// not done) yet.
bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
  // Scan the manual compaction queue front to back.
  std::deque<ManualCompaction*>::iterator it =
      manual_compaction_dequeue_.begin();
  while (it != manual_compaction_dequeue_.end()) {
    if ((*it)->exclusive) {
      // An exclusive manual compaction blocks everything, regardless of CF.
      return true;
    }
    if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
      // A manual compaction for this column family is waiting to start;
      // automatic compaction is still allowed once it is in progress.
      return true;
    }
    it++;
  }
  return false;
}
3788 | | |
3789 | 34.9k | bool DBImpl::HasExclusiveManualCompaction() { |
3790 | | // Remove from priority queue |
3791 | 34.9k | std::deque<ManualCompaction*>::iterator it = |
3792 | 34.9k | manual_compaction_dequeue_.begin(); |
3793 | 35.0k | while (it != manual_compaction_dequeue_.end()) { |
3794 | 313 | if ((*it)->exclusive) { |
3795 | 296 | return true; |
3796 | 296 | } |
3797 | 17 | it++; |
3798 | 17 | } |
3799 | 34.6k | return false; |
3800 | 34.9k | } |
3801 | | |
3802 | 3 | bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) { |
3803 | 3 | if ((m->exclusive) || (m1->exclusive)) { |
3804 | 0 | return true; |
3805 | 0 | } |
3806 | 3 | if (m->cfd != m1->cfd) { |
3807 | 3 | return false; |
3808 | 3 | } |
3809 | 0 | return true; |
3810 | 3 | } |
3811 | | |
namespace {
// Cleanup payload attached to iterators returned by NewInternalIterator.
// Holds the SuperVersion the iterator pins, plus the DB and its mutex,
// which are needed to release the pin safely when the iterator dies.
struct IterState {
  IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version)
      : db(_db), mu(_mu), super_version(_super_version) {}

  DBImpl* db;                   // owning DB; used to collect obsolete files
  InstrumentedMutex* mu;        // DB mutex; held during cleanup bookkeeping
  SuperVersion* super_version;  // version pinned for the iterator's lifetime
};

// Iterator cleanup callback: drops the reference on the pinned SuperVersion
// and, if that was the last reference, frees it and garbage-collects any
// files that became obsolete as a result.
static void CleanupIteratorState(void* arg1, void* arg2) {
  IterState* state = reinterpret_cast<IterState*>(arg1);

  if (state->super_version->Unref()) {
    // Job id == 0 means that this is not our background process, but rather
    // user thread
    JobContext job_context(0);

    state->mu->Lock();
    state->super_version->Cleanup();
    state->db->FindObsoleteFiles(&job_context, false, true);
    state->mu->Unlock();

    delete state->super_version;
    // Deleting obsolete files does I/O, so do it outside the DB mutex.
    if (job_context.HaveSomethingToDelete()) {
      state->db->PurgeObsoleteFiles(job_context);
    }
    job_context.Clean();
  }

  delete state;
}
}  // namespace
3845 | | |
3846 | | InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, |
3847 | | ColumnFamilyData* cfd, |
3848 | | SuperVersion* super_version, |
3849 | 38.1M | Arena* arena) { |
3850 | 38.1M | InternalIterator* internal_iter; |
3851 | 38.1M | assert(arena != nullptr); |
3852 | | // Need to create internal iterator from the arena. |
3853 | 0 | MergeIteratorBuilder merge_iter_builder(cfd->internal_comparator().get(), arena); |
3854 | | // Collect iterator for mutable mem |
3855 | 38.1M | merge_iter_builder.AddIterator( |
3856 | 38.1M | super_version->mem->NewIterator(read_options, arena)); |
3857 | | // Collect all needed child iterators for immutable memtables |
3858 | 38.1M | super_version->imm->AddIterators(read_options, &merge_iter_builder); |
3859 | | // Collect iterators for files in L0 - Ln |
3860 | 38.1M | super_version->current->AddIterators(read_options, env_options_, |
3861 | 38.1M | &merge_iter_builder); |
3862 | 38.1M | internal_iter = merge_iter_builder.Finish(); |
3863 | 38.1M | IterState* cleanup = new IterState(this, &mutex_, super_version); |
3864 | 38.1M | internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); |
3865 | | |
3866 | 38.1M | return internal_iter; |
3867 | 38.1M | } |
3868 | | |
3869 | 131M | ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { |
3870 | 131M | return default_cf_handle_; |
3871 | 131M | } |
3872 | | |
3873 | | Status DBImpl::Get(const ReadOptions& read_options, |
3874 | | ColumnFamilyHandle* column_family, const Slice& key, |
3875 | 6.54M | std::string* value) { |
3876 | 6.54M | return GetImpl(read_options, column_family, key, value); |
3877 | 6.54M | } |
3878 | | |
// JobContext gets created and destructed outside of the lock -- we use this
// conveniently to:
// * allocate one SuperVersion() outside of the lock -- new_superversion
// * delete SuperVersion()s outside of the lock -- superversions_to_free
//
// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
// same job_context, we can't reuse the SuperVersion() that was allocated
// because the first call already used it. In that rare case, we take a hit
// and create a new SuperVersion() inside of the mutex. We do a similar thing
// for superversions_to_free.
3889 | | // new SuperVersion() inside of the mutex. We do similar thing |
3890 | | // for superversion_to_free |
3891 | | void DBImpl::InstallSuperVersionAndScheduleWorkWrapper( |
3892 | | ColumnFamilyData* cfd, JobContext* job_context, |
3893 | 83.5k | const MutableCFOptions& mutable_cf_options) { |
3894 | 83.5k | mutex_.AssertHeld(); |
3895 | 83.5k | auto old_superversion = InstallSuperVersionAndScheduleWork( |
3896 | 83.5k | cfd, job_context->new_superversion, mutable_cf_options); |
3897 | 83.5k | job_context->new_superversion = nullptr; |
3898 | 83.5k | job_context->superversions_to_free.push_back(old_superversion.release()); |
3899 | 83.5k | } |
3900 | | |
// Installs |new_sv| (or a freshly allocated SuperVersion if it is null) as
// the current super version of |cfd|, schedules any flush/compaction work
// the new state calls for, and keeps max_total_in_memory_state_ in sync
// with the column family's (possibly changed) memtable budget.
// Returns the replaced SuperVersion so the caller can delete it outside
// the mutex. Caller must hold mutex_.
std::unique_ptr<SuperVersion> DBImpl::InstallSuperVersionAndScheduleWork(
    ColumnFamilyData* cfd, SuperVersion* new_sv,
    const MutableCFOptions& mutable_cf_options) {
  mutex_.AssertHeld();

  // Capture the old memtable budget before installing, since installation
  // replaces the super version (and with it the old mutable options).
  size_t old_memtable_size = 0;
  auto* old_sv = cfd->GetSuperVersion();
  if (old_sv) {
    old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
                        old_sv->mutable_cf_options.max_write_buffer_number;
  }

  auto old = cfd->InstallSuperVersion(
      new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);

  // Whenever we install new SuperVersion, we might need to issue new flushes or
  // compactions.
  SchedulePendingFlush(cfd);
  SchedulePendingCompaction(cfd);
  MaybeScheduleFlushOrCompaction();

  // Swap the old memtable budget for the new one in the global accounting.
  max_total_in_memory_state_ =
      max_total_in_memory_state_ - old_memtable_size +
      mutable_cf_options.write_buffer_size *
          mutable_cf_options.max_write_buffer_number;
  return old;
}
3930 | | |
// Core point-lookup path shared by Get(). Resolves the read snapshot,
// pins a SuperVersion, then probes (in order) the mutable memtable, the
// immutable memtables, and finally the SST files of the current version.
// |value_found| is forwarded to Version::Get (used by callers such as
// KeyMayExist-style probes; may be null).
Status DBImpl::GetImpl(const ReadOptions& read_options,
                       ColumnFamilyHandle* column_family, const Slice& key,
                       std::string* value, bool* value_found) {
  StopWatch sw(env_, stats_, DB_GET);
  PERF_TIMER_GUARD(get_snapshot_time);

  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();

  // Read at the caller-provided snapshot, or at the latest sequence number.
  SequenceNumber snapshot;
  if (read_options.snapshot != nullptr) {
    snapshot = reinterpret_cast<const SnapshotImpl*>(
        read_options.snapshot)->number_;
  } else {
    snapshot = versions_->LastSequence();
  }
  // Acquire SuperVersion
  SuperVersion* sv = GetAndRefSuperVersion(cfd);
  // Prepare to store a list of merge operations if merge occurs.
  MergeContext merge_context;

  Status s;
  // First look in the memtable, then in the immutable memtable (if any).
  // s is both in/out. When in, s could either be OK or MergeInProgress.
  // merge_operands will contain the sequence of merges in the latter case.
  LookupKey lkey(key, snapshot);
  PERF_TIMER_STOP(get_snapshot_time);

  // kPersistedTier reads must not see data that only lives in memtables.
  bool skip_memtable =
      (read_options.read_tier == kPersistedTier && has_unpersisted_data_);
  bool done = false;
  if (!skip_memtable) {
    if (sv->mem->Get(lkey, value, &s, &merge_context)) {
      done = true;
      RecordTick(stats_, MEMTABLE_HIT);
    } else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
      done = true;
      RecordTick(stats_, MEMTABLE_HIT);
    }
  }
  if (!done) {
    // Fall through to the SST files of the current version.
    PERF_TIMER_GUARD(get_from_output_files_time);
    sv->current->Get(read_options, lkey, value, &s, &merge_context,
                     value_found);
    RecordTick(stats_, MEMTABLE_MISS);
  }

  {
    PERF_TIMER_GUARD(get_post_process_time);

    // Drop our pin on the super version (may trigger cleanup).
    ReturnAndCleanupSuperVersion(cfd, sv);

    RecordTick(stats_, NUMBER_KEYS_READ);
    RecordTick(stats_, BYTES_READ, value->size());
    MeasureTime(stats_, BYTES_PER_READ, value->size());
  }
  return s;
}
3989 | | |
// Batched point lookup. All keys are read at a single consistent snapshot:
// one super version per distinct column family is pinned under the mutex,
// every key is then looked up lock-free (memtable -> immutable memtables ->
// SST files), and the pins are released at the end. Returns one Status per
// key, in the same order as |keys|; |values| is always resized to match.
std::vector<Status> DBImpl::MultiGet(
    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_family,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {

  StopWatch sw(env_, stats_, DB_MULTIGET);
  PERF_TIMER_GUARD(get_snapshot_time);

  // Per-column-family state: the CFD and the super version pinned for it.
  struct MultiGetColumnFamilyData {
    ColumnFamilyData* cfd;
    SuperVersion* super_version;
  };
  // Keyed by column family ID; entries are deleted at the end of the call.
  std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
  // fill up and allocate outside of mutex
  for (auto cf : column_family) {
    auto cfh = down_cast<ColumnFamilyHandleImpl*>(cf);
    auto cfd = cfh->cfd();
    if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
      auto mgcfd = new MultiGetColumnFamilyData();
      mgcfd->cfd = cfd;
      multiget_cf_data.insert({cfd->GetID(), mgcfd});
    }
  }

  // Under the mutex: fix the snapshot and pin one super version per CF so
  // all lookups below observe the same consistent state.
  mutex_.Lock();
  SequenceNumber snapshot;
  if (read_options.snapshot != nullptr) {
    snapshot = reinterpret_cast<const SnapshotImpl*>(
        read_options.snapshot)->number_;
  } else {
    snapshot = versions_->LastSequence();
  }
  for (auto mgd_iter : multiget_cf_data) {
    mgd_iter.second->super_version =
        mgd_iter.second->cfd->GetSuperVersion()->Ref();
  }
  mutex_.Unlock();

  // Contain a list of merge operations if merge occurs.
  MergeContext merge_context;

  // Note: this always resizes the values array
  size_t num_keys = keys.size();
  std::vector<Status> stat_list(num_keys);
  values->resize(num_keys);

  // Keep track of bytes that we read for statistics-recording later
  uint64_t bytes_read = 0;
  PERF_TIMER_STOP(get_snapshot_time);

  // For each of the given keys, apply the entire "get" process as follows:
  // First look in the memtable, then in the immutable memtable (if any).
  // s is both in/out. When in, s could either be OK or MergeInProgress.
  // merge_operands will contain the sequence of merges in the latter case.
  for (size_t i = 0; i < num_keys; ++i) {
    merge_context.Clear();
    Status& s = stat_list[i];
    std::string* value = &(*values)[i];

    LookupKey lkey(keys[i], snapshot);
    auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family[i]);
    auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
    assert(mgd_iter != multiget_cf_data.end());
    auto mgd = mgd_iter->second;
    auto super_version = mgd->super_version;
    // kPersistedTier reads must not see data that only lives in memtables.
    bool skip_memtable =
        (read_options.read_tier == kPersistedTier && has_unpersisted_data_);
    bool done = false;
    if (!skip_memtable) {
      if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
        done = true;
        // TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
      } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
        done = true;
        // TODO(?): RecordTick(stats_, MEMTABLE_HIT)?
      }
    }
    if (!done) {
      // Fall through to the SST files of the pinned version.
      PERF_TIMER_GUARD(get_from_output_files_time);
      super_version->current->Get(read_options, lkey, value, &s,
                                  &merge_context);
      // TODO(?): RecordTick(stats_, MEMTABLE_MISS)?
    }

    if (s.ok()) {
      bytes_read += value->size();
    }
  }

  // Post processing (decrement reference counts and record statistics)
  PERF_TIMER_GUARD(get_post_process_time);
  autovector<SuperVersion*> superversions_to_delete;

  // TODO(icanadi) do we need lock here or just around Cleanup()?
  mutex_.Lock();
  for (auto mgd_iter : multiget_cf_data) {
    auto mgd = mgd_iter.second;
    if (mgd->super_version->Unref()) {
      // Last reference: clean up under the mutex, delete outside it.
      mgd->super_version->Cleanup();
      superversions_to_delete.push_back(mgd->super_version);
    }
  }
  mutex_.Unlock();

  for (auto td : superversions_to_delete) {
    delete td;
  }
  for (auto mgd : multiget_cf_data) {
    delete mgd.second;
  }

  RecordTick(stats_, NUMBER_MULTIGET_CALLS);
  RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
  RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
  MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read);
  PERF_TIMER_STOP(get_post_process_time);

  return stat_list;
}
4109 | | |
4110 | | #ifndef ROCKSDB_LITE |
// Validates an externally generated SST file and, on success, forwards the
// extracted metadata (size, version, entry count, key range) to the
// AddFile(ExternalSstFileInfo*) overload for actual ingestion. Only
// version-1 files (all sequence numbers == 0) are accepted; any other
// version or a non-zero sequence number is rejected.
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
                       const std::string& file_path, bool move_file) {
  Status status;
  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  ColumnFamilyData* cfd = cfh->cfd();

  ExternalSstFileInfo file_info;
  file_info.file_path = file_path;
  status = env_->GetFileSize(file_path, &file_info.base_file_size);
  if (!status.ok()) {
    return status;
  }

  // Access the file using TableReader to extract
  // version, number of entries, smallest user key, largest user key
  std::unique_ptr<RandomAccessFile> base_sst_file;
  status = env_->NewRandomAccessFile(file_path, &base_sst_file, env_options_);
  if (!status.ok()) {
    return status;
  }
  std::unique_ptr<RandomAccessFileReader> base_sst_file_reader;
  base_sst_file_reader.reset(new RandomAccessFileReader(std::move(base_sst_file)));

  std::unique_ptr<TableReader> table_reader;
  status = cfd->ioptions()->table_factory->NewTableReader(
      TableReaderOptions(*cfd->ioptions(), env_options_,
                         cfd->internal_comparator()),
      std::move(base_sst_file_reader), file_info.base_file_size,
      &table_reader);
  if (!status.ok()) {
    return status;
  }

  // Get the external sst file version from table properties
  const UserCollectedProperties& user_collected_properties =
      table_reader->GetTableProperties()->user_collected_properties;
  UserCollectedProperties::const_iterator external_sst_file_version_iter =
      user_collected_properties.find(ExternalSstFilePropertyNames::kVersion);
  if (external_sst_file_version_iter == user_collected_properties.end()) {
    return STATUS(InvalidArgument, "Generated table version not found");
  }

  // Split SSTs keep metadata and data in two files; attach the data file
  // reader so iteration below can read actual blocks.
  file_info.is_split_sst = table_reader->IsSplitSst();
  if (file_info.is_split_sst) {
    std::unique_ptr<RandomAccessFile> data_sst_file;
    status = env_->NewRandomAccessFile(TableBaseToDataFileName(file_path), &data_sst_file,
                                       env_options_);
    if (!status.ok()) {
      return status;
    }
    std::unique_ptr<RandomAccessFileReader> data_sst_file_reader;
    data_sst_file_reader.reset(new RandomAccessFileReader(std::move(data_sst_file)));
    table_reader->SetDataFileReader(std::move(data_sst_file_reader));
  }

  // Total logical size = base (metadata) file plus data file, if split.
  file_info.file_size = file_info.base_file_size +
      (file_info.is_split_sst ? table_reader->GetTableProperties()->data_size : 0);

  file_info.version =
      DecodeFixed32(external_sst_file_version_iter->second.c_str());
  if (file_info.version == 1) {
    // version 1 imply that all sequence numbers in table equal 0
    file_info.sequence_number = 0;
  } else {
    return STATUS(InvalidArgument, "Generated table version is not supported");
  }

  // Get number of entries in table
  file_info.num_entries = table_reader->GetTableProperties()->num_entries;

  ParsedInternalKey key;
  std::unique_ptr<InternalIterator> iter(
      table_reader->NewIterator(ReadOptions()));

  // Get first (smallest) key from file
  iter->SeekToFirst();
  if (!ParseInternalKey(iter->key(), &key)) {
    return STATUS(Corruption, "Generated table have corrupted keys");
  }
  if (key.sequence != 0) {
    return STATUS(Corruption, "Generated table have non zero sequence number");
  }
  file_info.smallest_key = key.user_key.ToString();

  // Get last (largest) key from file
  iter->SeekToLast();
  if (!ParseInternalKey(iter->key(), &key)) {
    return STATUS(Corruption, "Generated table have corrupted keys");
  }
  if (key.sequence != 0) {
    return STATUS(Corruption, "Generated table have non zero sequence number");
  }
  file_info.largest_key = key.user_key.ToString();

  return AddFile(column_family, &file_info, move_file);
}
4207 | | |
4208 | | namespace { |
4209 | | |
4210 | | // Helper function for copying file from src_path to dst_path. If try_hard_link is true it tries |
4211 | | // to make a hard link instead of copyging if possible. |
4212 | | Status AddFile(Env* env, const std::string& src_path, const std::string& dst_path, |
4213 | 19.7k | bool try_hard_link) { |
4214 | 19.7k | Status status; |
4215 | 19.7k | if (try_hard_link) { |
4216 | 326 | status = env->LinkFile(src_path, dst_path); |
4217 | 326 | if (status.IsNotSupported()) { |
4218 | | // Original file is on a different FS, use copy instead of hard linking |
4219 | 0 | status = CopyFile(env, src_path, dst_path, 0); |
4220 | 0 | } |
4221 | 19.4k | } else { |
4222 | 19.4k | status = CopyFile(env, src_path, dst_path, 0); |
4223 | 19.4k | } |
4224 | 19.7k | return status; |
4225 | 19.7k | } |
4226 | | |
4227 | | // Deletes file and logs error message in case of failure. error_format should have format |
4228 | | // specifications exactly for 2 string arguments: path and status. |
4229 | | void DeleteFile(Env* env, const std::string& path, const shared_ptr<Logger>& info_log, |
4230 | 9.17k | const char* error_format) { |
4231 | 9.17k | Status s = env->DeleteFile(path); |
4232 | 9.17k | if (!s.ok()) { |
4233 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, info_log, error_format, path.c_str(), s.ToString().c_str()); |
4234 | 0 | } |
4235 | 9.17k | } |
4236 | | |
4237 | | } // namespace |
4238 | | |
// Ingests the externally generated SST file described by |file_info| into
// level 0 of |column_family|.
//
// Preconditions enforced below: the file must contain at least one entry,
// use format version 1 (which implies Put-only entries, all with sequence
// number 0), must not overlap any key currently in the DB, and no snapshots
// may be live while it is added. If |move_file| is true the file is
// hard-linked into the DB directory when possible (falling back to a copy)
// and the original link is removed on success; otherwise it is copied.
//
// Returns non-OK and removes any partially copied files on failure.
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
                       const ExternalSstFileInfo* file_info, bool move_file) {
  Status status;
  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  ColumnFamilyData* cfd = cfh->cfd();

  if (file_info->num_entries == 0) {
    return STATUS(InvalidArgument, "File contain no entries");
  }
  if (file_info->version != 1) {
    return STATUS(InvalidArgument, "Generated table version is not supported");
  }
  // Version 1 implies that the file has only Put operations, all with
  // sequence number = 0.

  FileMetaData meta;
  meta.smallest.key = InternalKey(file_info->smallest_key,
                                  file_info->sequence_number,
                                  ValueType::kTypeValue);
  meta.largest.key = InternalKey(file_info->largest_key,
                                 file_info->sequence_number,
                                 ValueType::kTypeValue);
  if (!meta.smallest.key.Valid() || !meta.largest.key.Valid()) {
    return STATUS(Corruption, "Generated table have corrupted keys");
  }
  meta.smallest.seqno = file_info->sequence_number;
  meta.largest.seqno = file_info->sequence_number;
  if (meta.smallest.seqno != 0 || meta.largest.seqno != 0) {
    return STATUS(InvalidArgument,
                  "Non zero sequence numbers are not supported");
  }

  std::string db_base_fname;
  std::string db_data_fname;
  std::string data_file_path;
  {
    // Generate a location for the new table
    auto file_number_holder = pending_outputs_->NewFileNumber();
    meta.fd = FileDescriptor(file_number_holder.Last(), 0, file_info->file_size,
                             file_info->base_file_size);

    db_base_fname = TableFileName(
        db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
    status = ::rocksdb::AddFile(env_, file_info->file_path, db_base_fname, move_file);

    // For split SSTs also bring in the companion data file; if that fails,
    // roll back the base file copied just above.
    if (status.ok() && file_info->is_split_sst) {
      data_file_path = TableBaseToDataFileName(file_info->file_path);
      db_data_fname = TableBaseToDataFileName(db_base_fname);
      status = ::rocksdb::AddFile(env_, data_file_path, db_data_fname, move_file);
      if (!status.ok()) {
        ::rocksdb::DeleteFile(env_, db_base_fname, db_options_.info_log,
                              "AddFile() clean up for file %s failed : %s");
      }
    }

    TEST_SYNC_POINT("DBImpl::AddFile:FileCopied");
    if (!status.ok()) {
      return status;
    }

    {
      InstrumentedMutexLock l(&mutex_);
      const MutableCFOptions mutable_cf_options =
          *cfd->GetLatestMutableCFOptions();

      // Become the sole writer so no concurrent write can race with the
      // overlap check or the version change below.
      WriteThread::Writer w;
      write_thread_.EnterUnbatched(&w, &mutex_);

      if (!snapshots_.empty()) {
        // Check that no snapshots are being held
        status =
            STATUS(NotSupported, "Cannot add a file while holding snapshots");
      }

      if (status.ok()) {
        // Verify that the added file's key range doesn't overlap with any
        // keys already in the DB.
        // NOTE(review): sv is Ref()'d here but no matching Unref()/cleanup is
        // visible in this function -- confirm the SuperVersion reference is
        // released elsewhere, otherwise this leaks a reference.
        SuperVersion* sv = cfd->GetSuperVersion()->Ref();
        Arena arena;
        ReadOptions ro;
        ro.total_order_seek = true;
        ScopedArenaIterator iter(NewInternalIterator(ro, cfd, sv, &arena));

        // Seek to the first entry at or after the file's smallest key...
        InternalKey range_start(file_info->smallest_key, kMaxSequenceNumber, kTypeValue);
        iter->Seek(range_start.Encode());
        status = iter->status();

        // ...and reject if that entry's user key is <= the file's largest key.
        if (status.ok() && iter->Valid()) {
          ParsedInternalKey seek_result;
          if (ParseInternalKey(iter->key(), &seek_result)) {
            auto* vstorage = cfd->current()->storage_info();
            if (vstorage->InternalComparator()->user_comparator()->Compare(
                    seek_result.user_key, file_info->largest_key) <= 0) {
              status = STATUS(NotSupported, "Cannot add overlapping range");
            }
          } else {
            status = STATUS(Corruption, "DB have corrupted keys");
          }
        }
      }

      if (status.ok()) {
        // Add file to L0
        VersionEdit edit;
        edit.SetColumnFamily(cfd->GetID());
        edit.AddCleanedFile(0, meta);

        status = versions_->LogAndApply(
            cfd, mutable_cf_options, &edit, &mutex_, directories_.GetDbDir());
      }
      write_thread_.ExitUnbatched(&w);

      if (status.ok()) {
        InstallSuperVersionAndScheduleWork(cfd, nullptr, mutable_cf_options);
      }
    }
  }

  if (!status.ok()) {
    // We failed to add the file to the database: remove whatever we copied.
    const char* error_format = "AddFile() clean up for file %s failed : %s";
    ::rocksdb::DeleteFile(env_, db_base_fname, db_options_.info_log, error_format);
    if (file_info->is_split_sst) {
      ::rocksdb::DeleteFile(env_, db_data_fname, db_options_.info_log, error_format);
    }
  } else if (status.ok()) {
    // NOTE(review): the condition above is redundant -- this branch is only
    // reachable when status.ok() is already true.
    if (move_file) {
      // The file was moved and added successfully, remove original file link
      const char* error_format =
          "%s was added to DB successfully but failed to remove original file link : %s";
      ::rocksdb::DeleteFile(env_, file_info->file_path, db_options_.info_log, error_format);
      if (file_info->is_split_sst) {
        ::rocksdb::DeleteFile(env_, data_file_path, db_options_.info_log, error_format);
      }
    }
    FilesChanged();
  }
  return status;
}
4376 | | #endif // ROCKSDB_LITE |
4377 | | |
4378 | 172k | std::function<void()> DBImpl::GetFilesChangedListener() const { |
4379 | 172k | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4380 | 172k | return files_changed_listener_; |
4381 | 172k | } |
4382 | | |
4383 | 0 | bool DBImpl::HasFilesChangedListener() const { |
4384 | 0 | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4385 | 0 | return files_changed_listener_ != nullptr; |
4386 | 0 | } |
4387 | | |
4388 | 554k | void DBImpl::ListenFilesChanged(std::function<void()> files_changed_listener) { |
4389 | 554k | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4390 | 554k | files_changed_listener_ = std::move(files_changed_listener); |
4391 | 554k | } |
4392 | | |
4393 | 172k | void DBImpl::FilesChanged() { |
4394 | 172k | auto files_changed_listener = GetFilesChangedListener(); |
4395 | 172k | if (files_changed_listener) { |
4396 | 35.8k | files_changed_listener(); |
4397 | 35.8k | } |
4398 | 172k | } |
4399 | | |
4400 | | Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
4401 | | const std::string& column_family_name, |
4402 | 1.71k | ColumnFamilyHandle** handle) { |
4403 | 1.71k | Status s; |
4404 | 1.71k | Status persist_options_status; |
4405 | 1.71k | *handle = nullptr; |
4406 | | |
4407 | 1.71k | s = CheckCompressionSupported(cf_options); |
4408 | 1.71k | if (s.ok() && db_options_.allow_concurrent_memtable_write1.71k ) { |
4409 | 34 | s = CheckConcurrentWritesSupported(cf_options); |
4410 | 34 | } |
4411 | 1.71k | if (!s.ok()) { |
4412 | 2 | return s; |
4413 | 2 | } |
4414 | | |
4415 | 1.71k | { |
4416 | 1.71k | InstrumentedMutexLock l(&mutex_); |
4417 | | |
4418 | 1.71k | if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != |
4419 | 1.71k | nullptr) { |
4420 | 0 | return STATUS(InvalidArgument, "Column family already exists"); |
4421 | 0 | } |
4422 | 1.71k | VersionEdit edit; |
4423 | 1.71k | edit.AddColumnFamily(column_family_name); |
4424 | 1.71k | uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); |
4425 | 1.71k | edit.SetColumnFamily(new_id); |
4426 | 1.71k | edit.SetLogNumber(logfile_number_); |
4427 | 1.71k | edit.SetComparatorName(cf_options.comparator->Name()); |
4428 | | |
4429 | | // LogAndApply will both write the creation in MANIFEST and create |
4430 | | // ColumnFamilyData object |
4431 | 1.71k | Options opt(db_options_, cf_options); |
4432 | 1.71k | { // write thread |
4433 | 1.71k | WriteThread::Writer w; |
4434 | 1.71k | write_thread_.EnterUnbatched(&w, &mutex_); |
4435 | | // LogAndApply will both write the creation in MANIFEST and create |
4436 | | // ColumnFamilyData object |
4437 | 1.71k | s = versions_->LogAndApply( |
4438 | 1.71k | nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, |
4439 | 1.71k | &mutex_, directories_.GetDbDir(), false, &cf_options); |
4440 | | |
4441 | 1.71k | if (s.ok()) { |
4442 | | // If the column family was created successfully, we then persist |
4443 | | // the updated RocksDB options under the same single write thread |
4444 | 1.71k | persist_options_status = WriteOptionsFile(); |
4445 | 1.71k | } |
4446 | 1.71k | write_thread_.ExitUnbatched(&w); |
4447 | 1.71k | } |
4448 | 1.71k | if (s.ok()) { |
4449 | 1.71k | single_column_family_mode_ = false; |
4450 | 1.71k | auto* cfd = |
4451 | 1.71k | versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); |
4452 | 1.71k | assert(cfd != nullptr); |
4453 | 0 | InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
4454 | | |
4455 | 1.71k | if (!cfd->mem()->IsSnapshotSupported()) { |
4456 | 30 | is_snapshot_supported_ = false; |
4457 | 30 | } |
4458 | | |
4459 | 1.71k | *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); |
4460 | 1.71k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4461 | 1.71k | "Created column family [%s] (ID %u)", |
4462 | 1.71k | column_family_name.c_str(), (unsigned)cfd->GetID()); |
4463 | 1.71k | } else { |
4464 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
4465 | 0 | "Creating column family [%s] FAILED -- %s", |
4466 | 0 | column_family_name.c_str(), s.ToString().c_str()); |
4467 | 0 | } |
4468 | 1.71k | } // InstrumentedMutexLock l(&mutex_) |
4469 | | |
4470 | | // this is outside the mutex |
4471 | 1.71k | if (s.ok()) { |
4472 | 1.71k | if (!persist_options_status.ok()) { |
4473 | 0 | if (db_options_.fail_if_options_file_error) { |
4474 | 0 | s = STATUS(IOError, |
4475 | 0 | "ColumnFamily has been created, but unable to persist" |
4476 | 0 | "options in CreateColumnFamily()", |
4477 | 0 | persist_options_status.ToString().c_str()); |
4478 | 0 | } |
4479 | 0 | RWARN(db_options_.info_log, |
4480 | 0 | "Unable to persist options in CreateColumnFamily() -- %s", |
4481 | 0 | persist_options_status.ToString().c_str()); |
4482 | 0 | } |
4483 | 1.71k | } |
4484 | 1.71k | return s; |
4485 | 1.71k | } |
4486 | | |
// Drops the given column family (marks it dropped in the MANIFEST and
// rewrites the options file). The default column family (ID 0) cannot be
// dropped. The handle itself remains valid and must still be destroyed by
// the caller.
Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();
  if (cfd->GetID() == 0) {
    return STATUS(InvalidArgument, "Can't drop default column family");
  }

  // Remember this before the drop: if this CF was one that blocked snapshot
  // support we may be able to re-enable snapshots afterwards.
  bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();

  VersionEdit edit;
  edit.DropColumnFamily();
  edit.SetColumnFamily(cfd->GetID());

  Status s;
  Status options_persist_status;
  {
    InstrumentedMutexLock l(&mutex_);
    if (cfd->IsDropped()) {
      s = STATUS(InvalidArgument, "Column family already dropped!\n");
    }
    if (s.ok()) {
      // we drop column family from a single write thread
      WriteThread::Writer w;
      write_thread_.EnterUnbatched(&w, &mutex_);
      s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                 &edit, &mutex_);
      if (s.ok()) {
        // If the column family was dropped successfully, we then persist
        // the updated RocksDB options under the same single write thread
        options_persist_status = WriteOptionsFile();
      }
      write_thread_.ExitUnbatched(&w);
    }

    if (!cf_support_snapshot) {
      // Dropped Column Family doesn't support snapshot. Need to recalculate
      // is_snapshot_supported_ by scanning the remaining (non-dropped) CFs.
      bool new_is_snapshot_supported = true;
      for (auto c : *versions_->GetColumnFamilySet()) {
        if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
          new_is_snapshot_supported = false;
          break;
        }
      }
      is_snapshot_supported_ = new_is_snapshot_supported;
    }
  }

  if (s.ok()) {
    // Note that here we erase the associated cf_info of the to-be-dropped
    // cfd before its ref-count goes to zero to avoid having to erase cf_info
    // later inside db_mutex.
    assert(cfd->IsDropped());
    auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
    // The dropped CF's write buffers no longer count towards the global
    // in-memory budget.
    max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
                                  mutable_cf_options->max_write_buffer_number;
    RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
         "Dropped column family with id %u\n", cfd->GetID());

    if (!options_persist_status.ok()) {
      if (db_options_.fail_if_options_file_error) {
        s = STATUS(IOError,
                   "ColumnFamily has been dropped, but unable to persist "
                   "options in DropColumnFamily()",
                   options_persist_status.ToString().c_str());
      }
      RWARN(db_options_.info_log,
            "Unable to persist options in DropColumnFamily() -- %s",
            options_persist_status.ToString().c_str());
    }
  } else {
    RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
         "Dropping column family with id %u FAILED -- %s\n",
         cfd->GetID(), s.ToString().c_str());
  }

  return s;
}
4565 | | |
4566 | | bool DBImpl::KeyMayExist(const ReadOptions& read_options, |
4567 | | ColumnFamilyHandle* column_family, const Slice& key, |
4568 | 385 | std::string* value, bool* value_found) { |
4569 | 385 | if (value_found != nullptr) { |
4570 | | // falsify later if key-may-exist but can't fetch value |
4571 | 146 | *value_found = true; |
4572 | 146 | } |
4573 | 385 | ReadOptions roptions = read_options; |
4574 | 385 | roptions.read_tier = kBlockCacheTier; // read from block cache only |
4575 | 385 | auto s = GetImpl(roptions, column_family, key, value, value_found); |
4576 | | |
4577 | | // If block_cache is enabled and the index block of the table was |
4578 | | // not present in block_cache, the return value will be Status::Incomplete. |
4579 | | // In this case, key may still exist in the table. |
4580 | 385 | return s.ok() || s.IsIncomplete()103 ; |
4581 | 385 | } |
4582 | | |
// Returns a heap-allocated iterator over |column_family| (caller owns it).
// Depending on read_options this is a ManagedIterator, a tailing
// ForwardIterator-backed DB iterator, or the regular arena-allocated DB
// iterator tree. kPersistedTier reads are rejected with an error iterator.
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
                              ColumnFamilyHandle* column_family) {
  if (read_options.read_tier == kPersistedTier) {
    return NewErrorIterator(STATUS(NotSupported,
        "ReadTier::kPersistedData is not yet supported in iterators."));
  }
  auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();

  XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
             reinterpret_cast<DBImpl*>(this),
             const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
  if (read_options.managed) {
#ifdef ROCKSDB_LITE
    // not supported in lite version
    return NewErrorIterator(STATUS(InvalidArgument,
        "Managed Iterators not supported in RocksDBLite."));
#else
    if ((read_options.tailing) || (read_options.snapshot != nullptr) ||
        (is_snapshot_supported_)) {
      return new ManagedIterator(this, read_options, cfd);
    }
    // Managed iter not supported
    return NewErrorIterator(STATUS(InvalidArgument,
        "Managed Iterators not supported without snapshots."));
#endif
  } else if (read_options.tailing) {
#ifdef ROCKSDB_LITE
    // not supported in lite version
    return nullptr;
#else
    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
    auto iter = new ForwardIterator(this, read_options, cfd, sv);
    return NewDBIterator(
        env_, *cfd->ioptions(), cfd->user_comparator(), iter,
        kMaxSequenceNumber,
        sv->mutable_cf_options.max_sequential_skip_in_iterations,
        sv->version_number, read_options.iterate_upper_bound,
        read_options.prefix_same_as_start, read_options.pin_data);
#endif
  } else {
    SequenceNumber latest_snapshot = versions_->LastSequence();
    SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);

    // Read at the explicit snapshot if one was supplied, otherwise at the
    // latest sequence number.
    auto snapshot =
        read_options.snapshot != nullptr
            ? reinterpret_cast<const SnapshotImpl*>(
                read_options.snapshot)->number_
            : latest_snapshot;

    // Try to generate a DB iterator tree in a continuous memory area to be
    // cache friendly: ArenaWrappedDBIter inlines an arena in which the whole
    // iterator tree (DBIter -> MergingIterator -> the child memtable/table
    // iterators) is allocated in the order it is accessed when querying.
    // Laying the iterators out in access order makes it likely that an
    // iterator pointer and the iterator it points to share a cache line
    // and/or page.
    ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
        env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
        sv->mutable_cf_options.max_sequential_skip_in_iterations,
        sv->version_number, read_options.iterate_upper_bound,
        read_options.prefix_same_as_start, read_options.pin_data);

    InternalIterator* internal_iter =
        NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
    db_iter->SetIterUnderDBIter(internal_iter);

    // Optional debug wrapper that logs iterator state transitions.
    if (yb::GetAtomicFlag(&FLAGS_rocksdb_use_logging_iterator)) {
      return new TransitionLoggingIteratorWrapper(db_iter, LogPrefix());
    }
    return db_iter;
  }
  // To stop compiler from complaining
  return nullptr;
}
4693 | | |
// Batch form of NewIterator(): fills |*iterators| with one heap-allocated
// iterator per entry of |column_families| (caller owns them all). Clears the
// output vector first; kPersistedTier reads are rejected up front.
// NOTE(review): the user-visible "interator" misspellings below are runtime
// strings and are left unchanged here.
Status DBImpl::NewIterators(
    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_families,
    std::vector<Iterator*>* iterators) {
  if (read_options.read_tier == kPersistedTier) {
    return STATUS(NotSupported,
        "ReadTier::kPersistedData is not yet supported in iterators.");
  }
  iterators->clear();
  iterators->reserve(column_families.size());
  XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new,
             reinterpret_cast<DBImpl*>(this),
             const_cast<ReadOptions*>(&read_options), is_snapshot_supported_);
  if (read_options.managed) {
#ifdef ROCKSDB_LITE
    return STATUS(InvalidArgument,
        "Managed interator not supported in RocksDB lite");
#else
    if ((!read_options.tailing) && (read_options.snapshot == nullptr) &&
        (!is_snapshot_supported_)) {
      return STATUS(InvalidArgument,
          "Managed interator not supported without snapshots");
    }
    for (auto cfh : column_families) {
      auto cfd = down_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
      auto iter = new ManagedIterator(this, read_options, cfd);
      iterators->push_back(iter);
    }
#endif
  } else if (read_options.tailing) {
#ifdef ROCKSDB_LITE
    return STATUS(InvalidArgument,
        "Tailing interator not supported in RocksDB lite");
#else
    for (auto cfh : column_families) {
      auto cfd = down_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
      SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
      auto iter = new ForwardIterator(this, read_options, cfd, sv);
      iterators->push_back(NewDBIterator(
          env_, *cfd->ioptions(), cfd->user_comparator(), iter,
          kMaxSequenceNumber,
          sv->mutable_cf_options.max_sequential_skip_in_iterations,
          sv->version_number, nullptr, false, read_options.pin_data));
    }
#endif
  } else {
    SequenceNumber latest_snapshot = versions_->LastSequence();

    for (size_t i = 0; i < column_families.size(); ++i) {
      auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd();
      SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);

      // Read at the explicit snapshot if supplied, else at the latest
      // sequence number (sampled once for all column families).
      auto snapshot =
          read_options.snapshot != nullptr
              ? reinterpret_cast<const SnapshotImpl*>(
                  read_options.snapshot)->number_
              : latest_snapshot;

      ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
          env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
          sv->mutable_cf_options.max_sequential_skip_in_iterations,
          sv->version_number, nullptr, false, read_options.pin_data);
      InternalIterator* internal_iter =
          NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
      db_iter->SetIterUnderDBIter(internal_iter);
      iterators->push_back(db_iter);
    }
  }

  return Status::OK();
}
4765 | | |
4766 | 3.20k | const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } |
4767 | | |
4768 | | #ifndef ROCKSDB_LITE |
4769 | 71 | const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { |
4770 | 71 | return GetSnapshotImpl(true); |
4771 | 71 | } |
4772 | | #endif // ROCKSDB_LITE |
4773 | | |
4774 | 3.27k | const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { |
4775 | 3.27k | int64_t unix_time = 0; |
4776 | 3.27k | WARN_NOT_OK(env_->GetCurrentTime(&unix_time), "Failed to get current time"); |
4777 | 3.27k | SnapshotImpl* s = new SnapshotImpl; |
4778 | | |
4779 | 3.27k | InstrumentedMutexLock l(&mutex_); |
4780 | | // returns null if the underlying memtable does not support snapshot. |
4781 | 3.27k | if (!is_snapshot_supported_) { |
4782 | 0 | delete s; |
4783 | 0 | return nullptr; |
4784 | 0 | } |
4785 | 3.27k | return snapshots_.New(s, versions_->LastSequence(), unix_time, |
4786 | 3.27k | is_write_conflict_boundary); |
4787 | 3.27k | } |
4788 | | |
4789 | 3.27k | void DBImpl::ReleaseSnapshot(const Snapshot* s) { |
4790 | 3.27k | const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s); |
4791 | 3.27k | { |
4792 | 3.27k | InstrumentedMutexLock l(&mutex_); |
4793 | 3.27k | snapshots_.Delete(casted_s); |
4794 | 3.27k | } |
4795 | 3.27k | delete casted_s; |
4796 | 3.27k | } |
4797 | | |
4798 | | // Convenience methods |
4799 | | Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, |
4800 | 14.9M | const Slice& key, const Slice& val) { |
4801 | 14.9M | return DB::Put(o, column_family, key, val); |
4802 | 14.9M | } |
4803 | | |
4804 | | Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, |
4805 | 89.9k | const Slice& key, const Slice& val) { |
4806 | 89.9k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4807 | 89.9k | if (!cfh->cfd()->ioptions()->merge_operator) { |
4808 | 1 | return STATUS(NotSupported, "Provide a merge_operator when opening DB"); |
4809 | 89.9k | } else { |
4810 | 89.9k | return DB::Merge(o, column_family, key, val); |
4811 | 89.9k | } |
4812 | 89.9k | } |
4813 | | |
4814 | | Status DBImpl::Delete(const WriteOptions& write_options, |
4815 | 544k | ColumnFamilyHandle* column_family, const Slice& key) { |
4816 | 544k | return DB::Delete(write_options, column_family, key); |
4817 | 544k | } |
4818 | | |
4819 | | Status DBImpl::SingleDelete(const WriteOptions& write_options, |
4820 | | ColumnFamilyHandle* column_family, |
4821 | 185 | const Slice& key) { |
4822 | 185 | return DB::SingleDelete(write_options, column_family, key); |
4823 | 185 | } |
4824 | | |
4825 | 30.0M | Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { |
4826 | 30.0M | return WriteImpl(write_options, my_batch, nullptr); |
4827 | 30.0M | } |
4828 | | |
4829 | | #ifndef ROCKSDB_LITE |
4830 | | Status DBImpl::WriteWithCallback(const WriteOptions& write_options, |
4831 | | WriteBatch* my_batch, |
4832 | 353 | WriteCallback* callback) { |
4833 | 353 | return WriteImpl(write_options, my_batch, callback); |
4834 | 353 | } |
4835 | | #endif // ROCKSDB_LITE |
4836 | | |
4837 | | Status DBImpl::WriteImpl(const WriteOptions& write_options, |
4838 | 30.0M | WriteBatch* my_batch, WriteCallback* callback) { |
4839 | | |
4840 | 30.0M | if (my_batch == nullptr) { |
4841 | 0 | return STATUS(Corruption, "Batch is nullptr!"); |
4842 | 0 | } |
4843 | 30.0M | if (write_options.timeout_hint_us != 0) { |
4844 | 1 | return STATUS(InvalidArgument, "timeout_hint_us is deprecated"); |
4845 | 1 | } |
4846 | | |
4847 | 30.0M | Status status; |
4848 | | |
4849 | 30.0M | bool xfunc_attempted_write = false; |
4850 | 30.0M | XFUNC_TEST("transaction", "transaction_xftest_write_impl", |
4851 | 30.0M | xf_transaction_write1, xf_transaction_write, write_options, |
4852 | 30.0M | db_options_, my_batch, callback, this, &status, |
4853 | 30.0M | &xfunc_attempted_write); |
4854 | 30.0M | if (xfunc_attempted_write) { |
4855 | | // Test already did the write |
4856 | 0 | return status; |
4857 | 0 | } |
4858 | | |
4859 | 30.0M | PERF_TIMER_GUARD(write_pre_and_post_process_time); |
4860 | 30.0M | WriteThread::Writer w; |
4861 | 30.0M | w.batch = my_batch; |
4862 | 30.0M | w.sync = write_options.sync; |
4863 | 30.0M | w.disableWAL = write_options.disableWAL; |
4864 | 30.0M | w.in_batch_group = false; |
4865 | 30.0M | w.callback = callback; |
4866 | | |
4867 | 30.0M | if (!write_options.disableWAL) { |
4868 | 17.3M | RecordTick(stats_, WRITE_WITH_WAL); |
4869 | 17.3M | } |
4870 | | |
4871 | 30.0M | StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE); |
4872 | | |
4873 | 30.0M | #ifndef NDEBUG |
4874 | 30.0M | auto num_write_waiters = write_waiters_.fetch_add(1, std::memory_order_acq_rel); |
4875 | 30.0M | #endif |
4876 | | |
4877 | 30.0M | write_thread_.JoinBatchGroup(&w); |
4878 | | |
4879 | 30.0M | #ifndef NDEBUG |
4880 | 30.0M | write_waiters_.fetch_sub(1, std::memory_order_acq_rel); |
4881 | 30.0M | DCHECK_LE(num_write_waiters, FLAGS_TEST_max_write_waiters); |
4882 | 30.0M | #endif |
4883 | | |
4884 | 30.0M | if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { |
4885 | | // we are a non-leader in a parallel group |
4886 | 42.1k | PERF_TIMER_GUARD(write_memtable_time); |
4887 | | |
4888 | 42.4k | if (!w.CallbackFailed()42.1k ) { |
4889 | 42.4k | ColumnFamilyMemTablesImpl column_family_memtables( |
4890 | 42.4k | versions_->GetColumnFamilySet()); |
4891 | 42.4k | WriteBatchInternal::SetSequence(w.batch, w.sequence); |
4892 | 42.4k | InsertFlags insert_flags{InsertFlag::kConcurrentMemtableWrites}; |
4893 | 42.4k | w.status = WriteBatchInternal::InsertInto( |
4894 | 42.4k | w.batch, &column_family_memtables, &flush_scheduler_, |
4895 | 42.4k | write_options.ignore_missing_column_families, 0 /*log_number*/, this, insert_flags); |
4896 | 42.4k | } |
4897 | | |
4898 | 42.1k | if (write_thread_.CompleteParallelWorker(&w)) { |
4899 | | // we're responsible for early exit |
4900 | 11.9k | auto last_sequence = w.parallel_group->last_sequence; |
4901 | 11.9k | SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); |
4902 | 11.9k | versions_->SetLastSequence(last_sequence); |
4903 | 11.9k | write_thread_.EarlyExitParallelGroup(&w); |
4904 | 11.9k | } |
4905 | 42.1k | assert(w.state == WriteThread::STATE_COMPLETED); |
4906 | | // STATE_COMPLETED conditional below handles exit |
4907 | | |
4908 | 0 | status = w.FinalStatus(); |
4909 | 42.1k | } |
4910 | 30.0M | if (w.state == WriteThread::STATE_COMPLETED) { |
4911 | | // write is complete and leader has updated sequence |
4912 | 1.18M | RecordTick(stats_, WRITE_DONE_BY_OTHER); |
4913 | 1.18M | return w.FinalStatus(); |
4914 | 1.18M | } |
4915 | | // else we are the leader of the write batch group |
4916 | 28.8M | assert(w.state == WriteThread::STATE_GROUP_LEADER); |
4917 | | |
4918 | 0 | WriteContext context; |
4919 | 28.8M | mutex_.Lock(); |
4920 | | |
4921 | 28.8M | if (!write_options.disableWAL) { |
4922 | 16.1M | default_cf_internal_stats_->AddDBStats(InternalDBStatsType::WRITE_WITH_WAL, 1); |
4923 | 16.1M | } |
4924 | | |
4925 | 28.8M | RecordTick(stats_, WRITE_DONE_BY_SELF); |
4926 | 28.8M | default_cf_internal_stats_->AddDBStats(InternalDBStatsType::WRITE_DONE_BY_SELF, 1); |
4927 | | |
4928 | | // Once reaches this point, the current writer "w" will try to do its write |
4929 | | // job. It may also pick up some of the remaining writers in the "writers_" |
4930 | | // when it finds suitable, and finish them in the same write batch. |
4931 | | // This is how a write job could be done by the other writer. |
4932 | 28.8M | assert(!single_column_family_mode_ || |
4933 | 28.8M | versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); |
4934 | | |
4935 | 28.8M | uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) |
4936 | 28.8M | ? 4 * max_total_in_memory_state_28.7M |
4937 | 28.8M | : db_options_.max_total_wal_size75.4k ; |
4938 | 28.8M | if (UNLIKELY(!single_column_family_mode_ && |
4939 | 28.8M | alive_log_files_.begin()->getting_flushed == false && |
4940 | 28.8M | total_log_size() > max_total_wal_size)) { |
4941 | 17 | uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; |
4942 | 17 | alive_log_files_.begin()->getting_flushed = true; |
4943 | 17 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4944 | 17 | "Flushing all column families with data in WAL number %" PRIu64 |
4945 | 17 | ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, |
4946 | 17 | flush_column_family_if_log_file, total_log_size(), max_total_wal_size); |
4947 | | // no need to refcount because drop is happening in write thread, so can't |
4948 | | // happen while we're in the write thread |
4949 | 37 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
4950 | 37 | if (cfd->IsDropped()) { |
4951 | 1 | continue; |
4952 | 1 | } |
4953 | 36 | if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { |
4954 | 18 | status = SwitchMemtable(cfd, &context); |
4955 | 18 | if (!status.ok()) { |
4956 | 0 | break; |
4957 | 0 | } |
4958 | 18 | cfd->imm()->FlushRequested(); |
4959 | 18 | SchedulePendingFlush(cfd); |
4960 | 18 | } |
4961 | 36 | } |
4962 | 17 | MaybeScheduleFlushOrCompaction(); |
4963 | 28.8M | } else if (UNLIKELY(write_buffer_.ShouldFlush())) { |
4964 | 1.41k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4965 | 1.41k | "Flushing column family with largest mem table size. Write buffer is " |
4966 | 1.41k | "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", |
4967 | 1.41k | write_buffer_.memory_usage(), write_buffer_.buffer_size()); |
4968 | | // no need to refcount because drop is happening in write thread, so can't |
4969 | | // happen while we're in the write thread |
4970 | 1.41k | ColumnFamilyData* largest_cfd = nullptr; |
4971 | 1.41k | size_t largest_cfd_size = 0; |
4972 | | |
4973 | 1.42k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
4974 | 1.42k | if (cfd->IsDropped()) { |
4975 | 0 | continue; |
4976 | 0 | } |
4977 | 1.42k | if (!cfd->mem()->IsEmpty()) { |
4978 | | // We only consider active mem table, hoping immutable memtable is |
4979 | | // already in the process of flushing. |
4980 | 1.42k | size_t cfd_size = cfd->mem()->ApproximateMemoryUsage(); |
4981 | 1.42k | if (largest_cfd == nullptr || cfd_size > largest_cfd_size11 ) { |
4982 | 1.42k | largest_cfd = cfd; |
4983 | 1.42k | largest_cfd_size = cfd_size; |
4984 | 1.42k | } |
4985 | 1.42k | } |
4986 | 1.42k | } |
4987 | 1.41k | if (largest_cfd != nullptr) { |
4988 | 1.41k | status = SwitchMemtable(largest_cfd, &context); |
4989 | 1.41k | if (status.ok()) { |
4990 | 1.41k | largest_cfd->imm()->FlushRequested(); |
4991 | 1.41k | SchedulePendingFlush(largest_cfd); |
4992 | 1.41k | MaybeScheduleFlushOrCompaction(); |
4993 | 1.41k | } |
4994 | 1.41k | } |
4995 | 1.41k | } |
4996 | | |
4997 | 28.8M | if (UNLIKELY(status.ok() && !bg_error_.ok())) { |
4998 | 828k | status = bg_error_; |
4999 | 828k | } |
5000 | | |
5001 | 28.8M | if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { |
5002 | 11.2k | status = ScheduleFlushes(&context); |
5003 | 11.2k | } |
5004 | | |
5005 | 28.8M | if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || |
5006 | 28.8M | write_controller_.NeedsDelay()))) { |
5007 | 14.1k | PERF_TIMER_STOP(write_pre_and_post_process_time); |
5008 | 14.1k | PERF_TIMER_GUARD(write_delay_time); |
5009 | | // We don't know size of curent batch so that we always use the size |
5010 | | // for previous one. It might create a fairness issue that expiration |
5011 | | // might happen for smaller writes but larger writes can go through. |
5012 | | // Can optimize it if it is an issue. |
5013 | 14.1k | status = DelayWrite(last_batch_group_size_); |
5014 | 14.1k | PERF_TIMER_START(write_pre_and_post_process_time); |
5015 | 14.1k | } |
5016 | | |
5017 | 28.8M | uint64_t last_sequence = versions_->LastSequence(); |
5018 | 28.8M | WriteThread::Writer* last_writer = &w; |
5019 | 28.8M | autovector<WriteThread::Writer*> write_group; |
5020 | 28.8M | bool need_log_sync = !write_options.disableWAL && write_options.sync16.1M ; |
5021 | 28.8M | bool need_log_dir_sync = need_log_sync && !log_dir_synced_107 ; |
5022 | | |
5023 | 28.8M | if (status.ok()) { |
5024 | 27.9M | if (need_log_sync) { |
5025 | 107 | while (logs_.front().getting_synced) { |
5026 | 0 | log_sync_cv_.Wait(); |
5027 | 0 | } |
5028 | 110 | for (auto& log : logs_) { |
5029 | 110 | assert(!log.getting_synced); |
5030 | 0 | log.getting_synced = true; |
5031 | 110 | } |
5032 | 107 | } |
5033 | | |
5034 | | // Add to log and apply to memtable. We can release the lock |
5035 | | // during this phase since &w is currently responsible for logging |
5036 | | // and protects against concurrent loggers and concurrent writes |
5037 | | // into memtables |
5038 | 27.9M | } |
5039 | | |
5040 | 28.8M | mutex_.Unlock(); |
5041 | | |
5042 | | // At this point the mutex is unlocked |
5043 | | |
5044 | 28.8M | bool exit_completed_early = false; |
5045 | 28.8M | last_batch_group_size_ = |
5046 | 28.8M | write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); |
5047 | | |
5048 | 28.8M | if (status.ok()) { |
5049 | | // Rules for when we can update the memtable concurrently |
5050 | | // 1. supported by memtable |
5051 | | // 2. Puts are not okay if inplace_update_support |
5052 | | // 3. Deletes or SingleDeletes are not okay if filtering deletes |
5053 | | // (controlled by both batch and memtable setting) |
5054 | | // 4. Merges are not okay |
5055 | | // 5. YugaByte-specific user-specified sequence numbers are currently not compatible with |
5056 | | // parallel memtable writes. |
5057 | | // |
5058 | | // Rules 1..3 are enforced by checking the options |
5059 | | // during startup (CheckConcurrentWritesSupported), so if |
5060 | | // options.allow_concurrent_memtable_write is true then they can be |
5061 | | // assumed to be true. Rule 4 is checked for each batch. We could |
5062 | | // relax rules 2 and 3 if we could prevent write batches from referring |
5063 | | // more than once to a particular key. |
5064 | 27.9M | bool parallel = |
5065 | 27.9M | db_options_.allow_concurrent_memtable_write && write_group.size() > 127.0k ; |
5066 | 27.9M | size_t total_count = 0; |
5067 | 27.9M | uint64_t total_byte_size = 0; |
5068 | 29.1M | for (auto writer : write_group) { |
5069 | 29.1M | if (writer->CheckCallback(this)) { |
5070 | 29.1M | total_count += WriteBatchInternal::Count(writer->batch); |
5071 | 29.1M | total_byte_size = WriteBatchInternal::AppendedByteSize( |
5072 | 29.1M | total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); |
5073 | 29.1M | parallel = parallel && !writer->batch->HasMerge()55.9k ; |
5074 | 29.1M | } |
5075 | 29.1M | } |
5076 | | |
5077 | 27.9M | const SequenceNumber current_sequence = last_sequence + 1; |
5078 | | |
5079 | 27.9M | #ifndef NDEBUG |
5080 | 27.9M | if (current_sequence <= last_sequence) { |
5081 | 0 | RLOG(InfoLogLevel::FATAL_LEVEL, db_options_.info_log, |
5082 | 0 | "Current sequence number %" PRIu64 " is <= last sequence number %" PRIu64, |
5083 | 0 | current_sequence, last_sequence); |
5084 | 0 | } |
5085 | 27.9M | #endif |
5086 | | |
5087 | | // Reserve sequence numbers for all individual updates in this batch group. |
5088 | 27.9M | last_sequence += total_count; |
5089 | | |
5090 | | // Record statistics |
5091 | 27.9M | RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); |
5092 | 27.9M | RecordTick(stats_, BYTES_WRITTEN, total_byte_size); |
5093 | 27.9M | MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); |
5094 | 27.9M | PERF_TIMER_STOP(write_pre_and_post_process_time); |
5095 | | |
5096 | 27.9M | if (write_options.disableWAL) { |
5097 | 12.6M | has_unpersisted_data_ = true; |
5098 | 12.6M | } |
5099 | | |
5100 | 27.9M | uint64_t log_size = 0; |
5101 | 27.9M | if (!write_options.disableWAL) { |
5102 | 15.3M | PERF_TIMER_GUARD(write_wal_time); |
5103 | | |
5104 | 15.3M | WriteBatch* merged_batch = nullptr; |
5105 | 15.3M | if (write_group.size() == 1 && !write_group[0]->CallbackFailed()14.9M ) { |
5106 | 14.9M | merged_batch = write_group[0]->batch; |
5107 | 14.9M | } else { |
5108 | | // WAL needs all of the batches flattened into a single batch. |
5109 | | // We could avoid copying here with an iov-like AddRecord |
5110 | | // interface |
5111 | 333k | merged_batch = &tmp_batch_; |
5112 | 1.59M | for (auto writer : write_group) { |
5113 | 1.59M | if (!writer->CallbackFailed()) { |
5114 | 1.59M | WriteBatchInternal::Append(merged_batch, writer->batch); |
5115 | 1.59M | } |
5116 | 1.59M | } |
5117 | 333k | } |
5118 | 15.3M | WriteBatchInternal::SetSequence(merged_batch, current_sequence); |
5119 | | |
5120 | 15.3M | CHECK_EQ(WriteBatchInternal::Count(merged_batch), total_count); |
5121 | | |
5122 | 15.3M | Slice log_entry = WriteBatchInternal::Contents(merged_batch); |
5123 | 15.3M | log::Writer* log_writer; |
5124 | 15.3M | LogFileNumberSize* last_alive_log_file; |
5125 | 15.3M | { |
5126 | 15.3M | InstrumentedMutexLock l(&mutex_); |
5127 | 15.3M | log_writer = logs_.back().writer; |
5128 | 15.3M | last_alive_log_file = &alive_log_files_.back(); |
5129 | 15.3M | } |
5130 | 15.3M | status = log_writer->AddRecord(log_entry); |
5131 | 15.3M | total_log_size_.fetch_add(static_cast<int64_t>(log_entry.size())); |
5132 | 15.3M | last_alive_log_file->AddSize(log_entry.size()); |
5133 | 15.3M | log_empty_ = false; |
5134 | 15.3M | log_size = log_entry.size(); |
5135 | 15.3M | RecordTick(stats_, WAL_FILE_BYTES, log_size); |
5136 | 15.3M | if (status.ok() && need_log_sync15.3M ) { |
5137 | 107 | RecordTick(stats_, WAL_FILE_SYNCED); |
5138 | 107 | StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); |
5139 | | // It's safe to access logs_ with unlocked mutex_ here because: |
5140 | | // - we've set getting_synced=true for all logs, |
5141 | | // so other threads won't pop from logs_ while we're here, |
5142 | | // - only writer thread can push to logs_, and we're in |
5143 | | // writer thread, so no one will push to logs_, |
5144 | | // - as long as other threads don't modify it, it's safe to read |
5145 | | // from std::deque from multiple threads concurrently. |
5146 | 110 | for (auto& log : logs_) { |
5147 | 110 | status = log.writer->file()->Sync(db_options_.use_fsync); |
5148 | 110 | if (!status.ok()) { |
5149 | 0 | break; |
5150 | 0 | } |
5151 | 110 | } |
5152 | 107 | if (status.ok() && need_log_dir_sync) { |
5153 | | // We only sync WAL directory the first time WAL syncing is |
5154 | | // requested, so that in case users never turn on WAL sync, |
5155 | | // we can avoid the disk I/O in the write code path. |
5156 | 70 | status = directories_.GetWalDir()->Fsync(); |
5157 | 70 | } |
5158 | 107 | } |
5159 | | |
5160 | 15.3M | if (merged_batch == &tmp_batch_) { |
5161 | 333k | tmp_batch_.Clear(); |
5162 | 333k | } |
5163 | 15.3M | } |
5164 | 27.9M | if (status.ok()27.9M ) { |
5165 | 27.9M | PERF_TIMER_GUARD(write_memtable_time); |
5166 | | |
5167 | 27.9M | { |
5168 | | // Update stats while we are an exclusive group leader, so we know |
5169 | | // that nobody else can be writing to these particular stats. |
5170 | | // We're optimistic, updating the stats before we successfully |
5171 | | // commit. That lets us release our leader status early in |
5172 | | // some cases. |
5173 | 27.9M | auto stats = default_cf_internal_stats_; |
5174 | 27.9M | stats->AddDBStats(InternalDBStatsType::BYTES_WRITTEN, total_byte_size); |
5175 | 27.9M | stats->AddDBStats(InternalDBStatsType::NUMBER_KEYS_WRITTEN, total_count); |
5176 | 27.9M | if (!write_options.disableWAL) { |
5177 | 15.3M | if (write_options.sync) { |
5178 | 107 | stats->AddDBStats(InternalDBStatsType::WAL_FILE_SYNCED, 1); |
5179 | 107 | } |
5180 | 15.3M | stats->AddDBStats(InternalDBStatsType::WAL_FILE_BYTES, log_size); |
5181 | 15.3M | } |
5182 | 27.9M | uint64_t for_other = write_group.size() - 1; |
5183 | 27.9M | if (for_other > 0) { |
5184 | 333k | stats->AddDBStats(InternalDBStatsType::WRITE_DONE_BY_OTHER, for_other); |
5185 | 333k | if (!write_options.disableWAL) { |
5186 | 333k | stats->AddDBStats(InternalDBStatsType::WRITE_WITH_WAL, for_other); |
5187 | 333k | } |
5188 | 333k | } |
5189 | 27.9M | } |
5190 | | |
5191 | 27.9M | if (!parallel) { |
5192 | 27.9M | InsertFlags insert_flags{InsertFlag::kFilterDeletes}; |
5193 | 27.9M | status = WriteBatchInternal::InsertInto( |
5194 | 27.9M | write_group, current_sequence, column_family_memtables_.get(), |
5195 | 27.9M | &flush_scheduler_, write_options.ignore_missing_column_families, |
5196 | 27.9M | 0 /*log_number*/, this, insert_flags); |
5197 | | |
5198 | 27.9M | if (status.ok()27.9M ) { |
5199 | | // There were no write failures. Set leader's status |
5200 | | // in case the write callback returned a non-ok status. |
5201 | 27.9M | status = w.FinalStatus(); |
5202 | 27.9M | } |
5203 | 29.1M | for (const auto& writer : write_group) { |
5204 | 29.1M | last_sequence += writer->batch->DirectEntries(); |
5205 | 29.1M | } |
5206 | | |
5207 | 27.9M | } else { |
5208 | 7.79k | WriteThread::ParallelGroup pg; |
5209 | 7.79k | pg.leader = &w; |
5210 | 7.79k | pg.last_writer = last_writer; |
5211 | 7.79k | pg.last_sequence = last_sequence; |
5212 | 7.79k | pg.early_exit_allowed = !need_log_sync; |
5213 | 7.79k | pg.running.store(static_cast<uint32_t>(write_group.size()), |
5214 | 7.79k | std::memory_order_relaxed); |
5215 | 7.79k | write_thread_.LaunchParallelFollowers(&pg, current_sequence); |
5216 | | |
5217 | 12.2k | if (!w.CallbackFailed()7.79k ) { |
5218 | | // do leader write |
5219 | 12.2k | ColumnFamilyMemTablesImpl column_family_memtables( |
5220 | 12.2k | versions_->GetColumnFamilySet()); |
5221 | 12.2k | assert(w.sequence == current_sequence); |
5222 | 0 | WriteBatchInternal::SetSequence(w.batch, w.sequence); |
5223 | 12.2k | InsertFlags insert_flags{InsertFlag::kConcurrentMemtableWrites}; |
5224 | 12.2k | w.status = WriteBatchInternal::InsertInto( |
5225 | 12.2k | w.batch, &column_family_memtables, &flush_scheduler_, |
5226 | 12.2k | write_options.ignore_missing_column_families, 0 /*log_number*/, |
5227 | 12.2k | this, insert_flags); |
5228 | 12.2k | } |
5229 | | |
5230 | | // CompleteParallelWorker returns true if this thread should |
5231 | | // handle exit, false means somebody else did |
5232 | 0 | exit_completed_early = !write_thread_.CompleteParallelWorker(&w); |
5233 | 7.79k | status = w.FinalStatus(); |
5234 | 7.79k | } |
5235 | | |
5236 | 27.9M | if (!exit_completed_early && w.status.ok()27.9M ) { |
5237 | 27.9M | SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); |
5238 | 27.9M | versions_->SetLastSequence(last_sequence); |
5239 | 27.9M | if (!need_log_sync) { |
5240 | 27.9M | write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); |
5241 | 27.9M | exit_completed_early = true; |
5242 | 27.9M | } |
5243 | 27.9M | } |
5244 | | |
5245 | | // A non-OK status here indicates that the state implied by the |
5246 | | // WAL has diverged from the in-memory state. This could be |
5247 | | // because of a corrupt write_batch (very bad), or because the |
5248 | | // client specified an invalid column family and didn't specify |
5249 | | // ignore_missing_column_families. |
5250 | | // |
5251 | | // Is setting bg_error_ enough here? This will at least stop |
5252 | | // compaction and fail any further writes. |
5253 | 27.9M | if (!status.ok() && bg_error_.ok()105 && !w.CallbackFailed()105 ) { |
5254 | 1 | bg_error_ = status; |
5255 | 1 | } |
5256 | 27.9M | } |
5257 | 27.9M | } |
5258 | 28.8M | PERF_TIMER_START(write_pre_and_post_process_time); |
5259 | | |
5260 | 28.8M | if (db_options_.paranoid_checks && !status.ok()28.7M && !w.CallbackFailed()828k && !status.IsBusy()828k ) { |
5261 | 828k | mutex_.Lock(); |
5262 | 828k | if (bg_error_.ok()) { |
5263 | 8 | bg_error_ = status; // stop compaction & fail any further writes |
5264 | 8 | } |
5265 | 828k | mutex_.Unlock(); |
5266 | 828k | } |
5267 | | |
5268 | 28.8M | if (need_log_sync) { |
5269 | 107 | mutex_.Lock(); |
5270 | 107 | MarkLogsSynced(logfile_number_, need_log_dir_sync, status); |
5271 | 107 | mutex_.Unlock(); |
5272 | 107 | } |
5273 | | |
5274 | 28.8M | if (!exit_completed_early) { |
5275 | 828k | write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); |
5276 | 828k | } |
5277 | | |
5278 | 28.8M | return status; |
5279 | 30.0M | } |
5280 | | |
5281 | | // REQUIRES: mutex_ is held |
5282 | | // REQUIRES: this thread is currently at the front of the writer queue |
5283 | 14.1k | Status DBImpl::DelayWrite(uint64_t num_bytes) { |
5284 | 14.1k | uint64_t time_delayed = 0; |
5285 | 14.1k | bool delayed = false; |
5286 | 14.1k | { |
5287 | 14.1k | auto delay = write_controller_.GetDelay(env_, num_bytes); |
5288 | 14.1k | if (delay > 0) { |
5289 | 2.09k | mutex_.Unlock(); |
5290 | 2.09k | delayed = true; |
5291 | 2.09k | TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); |
5292 | | // hopefully we don't have to sleep more than 2 billion microseconds |
5293 | 2.09k | env_->SleepForMicroseconds(static_cast<int>(delay)); |
5294 | 2.09k | mutex_.Lock(); |
5295 | 2.09k | } |
5296 | | |
5297 | | // If we are shutting down, background job that make WriteController stopped could be aborted |
5298 | | // and never release WriteControllerToken, so we need to check IsShuttingDown to not stuck here |
5299 | | // in this case. |
5300 | 16.2k | while (bg_error_.ok() && write_controller_.IsStopped()16.2k && !IsShuttingDown()2.12k ) { |
5301 | 2.12k | delayed = true; |
5302 | 2.12k | TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); |
5303 | 2.12k | bg_cv_.Wait(); |
5304 | 2.12k | } |
5305 | 14.1k | } |
5306 | 14.1k | if (delayed) { |
5307 | 3.76k | RecordTick(stats_, STALL_MICROS, time_delayed); |
5308 | 3.76k | } |
5309 | | |
5310 | 14.1k | return bg_error_; |
5311 | 14.1k | } |
5312 | | |
5313 | 11.2k | Status DBImpl::ScheduleFlushes(WriteContext* context) { |
5314 | 11.2k | ColumnFamilyData* cfd; |
5315 | 23.2k | while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { |
5316 | 11.9k | auto status = SwitchMemtable(cfd, context); |
5317 | 11.9k | if (cfd->Unref()) { |
5318 | 0 | delete cfd; |
5319 | 0 | } |
5320 | 11.9k | if (!status.ok()) { |
5321 | 6 | return status; |
5322 | 6 | } |
5323 | 11.9k | } |
5324 | 11.2k | return Status::OK(); |
5325 | 11.2k | } |
5326 | | |
5327 | | // REQUIRES: mutex_ is held |
5328 | | // REQUIRES: this thread is currently at the front of the writer queue |
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
//
// Seals cfd's active memtable into the immutable list and installs a fresh
// one. If the current WAL has data (log_empty_ is false), also rolls to a new
// WAL file (possibly recycling an old one). The expensive work — WAL file
// creation and memtable construction — is done with mutex_ temporarily
// released; the mutex is re-acquired before any shared state is mutated.
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
  mutex_.AssertHeld();
  unique_ptr<WritableFile> lfile;
  log::Writer* new_log = nullptr;
  MemTable* new_mem = nullptr;

  // Attempt to switch to a new memtable and trigger flush of old.
  // Do this without holding the dbmutex lock.
  assert(versions_->prev_log_number() == 0);
  bool creating_new_log = !log_empty_;
  uint64_t recycle_log_number = 0;
  if (creating_new_log && db_options_.recycle_log_file_num &&
      !log_recycle_files.empty()) {
    // Reuse an old WAL file number/file instead of allocating a new one.
    recycle_log_number = log_recycle_files.front();
    log_recycle_files.pop_front();
  }
  // When not rolling the WAL, the new memtable keeps pointing at the
  // current log file.
  uint64_t new_log_number =
      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
  SuperVersion* new_superversion = nullptr;
  // Snapshot mutable CF options while still holding the mutex.
  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
  mutex_.Unlock();
  Status s;
  {
    if (creating_new_log) {
      EnvOptions opt_env_opt =
          env_->OptimizeForLogWrite(env_options_, db_options_);
      if (recycle_log_number) {
        RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
            "reusing log %" PRIu64 " from recycle list\n", recycle_log_number);
        s = env_->ReuseWritableFile(
            LogFileName(db_options_.wal_dir, new_log_number),
            LogFileName(db_options_.wal_dir, recycle_log_number), &lfile,
            opt_env_opt);
      } else {
        s = NewWritableFile(env_,
                            LogFileName(db_options_.wal_dir, new_log_number),
                            &lfile, opt_env_opt);
      }
      if (s.ok()) {
        // Our final size should be less than write_buffer_size
        // (compression, etc) but err on the side of caution.
        lfile->SetPreallocationBlockSize(
            mutable_cf_options.write_buffer_size / 10 +
            mutable_cf_options.write_buffer_size);
        unique_ptr<WritableFileWriter> file_writer(
            new WritableFileWriter(std::move(lfile), opt_env_opt));
        new_log = new log::Writer(std::move(file_writer), new_log_number,
                                  db_options_.recycle_log_file_num > 0);
      }
    }

    if (s.ok()) {
      SequenceNumber seq = versions_->LastSequence();
      new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
      new_superversion = new SuperVersion();
    }
  }
  RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
      "[%s] New memtable created with log file: #%" PRIu64 "\n",
      cfd->GetName().c_str(), new_log_number);
  mutex_.Lock();
  if (!s.ok()) {
    // how do we fail if we're not creating new log?
    assert(creating_new_log);
    assert(!new_mem);
    assert(!new_log);
    return s;
  }
  if (creating_new_log) {
    // Publish the new WAL: from now on writes land in new_log.
    logfile_number_ = new_log_number;
    assert(new_log != nullptr);
    log_empty_ = true;
    log_dir_synced_ = false;
    logs_.emplace_back(logfile_number_, new_log);
    alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
    for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
      // all this is just optimization to delete logs that
      // are no longer needed -- if CF is empty, that means it
      // doesn't need that particular log to stay alive, so we just
      // advance the log number. no need to persist this in the manifest
      if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
          loop_cfd->imm()->NumNotFlushed() == 0) {
        loop_cfd->SetLogNumber(logfile_number_);
      }
    }
  }
  // Move the old memtable to the immutable list and install the new one,
  // then publish everything atomically via a new SuperVersion.
  cfd->mem()->SetFlushStartTime(std::chrono::steady_clock::now());
  cfd->mem()->SetNextLogNumber(logfile_number_);
  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
  new_mem->Ref();
  cfd->SetMemtable(new_mem);
  context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork(
      cfd, new_superversion, mutable_cf_options));

  return s;
}
5425 | | |
5426 | | #ifndef ROCKSDB_LITE |
5427 | | Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, |
5428 | 56 | TablePropertiesCollection* props) { |
5429 | 56 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5430 | 56 | auto cfd = cfh->cfd(); |
5431 | | |
5432 | | // Increment the ref count |
5433 | 56 | mutex_.Lock(); |
5434 | 56 | auto version = cfd->current(); |
5435 | 56 | version->Ref(); |
5436 | 56 | mutex_.Unlock(); |
5437 | | |
5438 | 56 | auto s = version->GetPropertiesOfAllTables(props); |
5439 | | |
5440 | | // Decrement the ref count |
5441 | 56 | mutex_.Lock(); |
5442 | 56 | version->Unref(); |
5443 | 56 | mutex_.Unlock(); |
5444 | | |
5445 | 56 | return s; |
5446 | 56 | } |
5447 | | |
5448 | | Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, |
5449 | | const Range* range, std::size_t n, |
5450 | 0 | TablePropertiesCollection* props) { |
5451 | 0 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5452 | 0 | auto cfd = cfh->cfd(); |
5453 | | |
5454 | | // Increment the ref count |
5455 | 0 | mutex_.Lock(); |
5456 | 0 | auto version = cfd->current(); |
5457 | 0 | version->Ref(); |
5458 | 0 | mutex_.Unlock(); |
5459 | |
|
5460 | 0 | auto s = version->GetPropertiesOfTablesInRange(range, n, props); |
5461 | | |
5462 | | // Decrement the ref count |
5463 | 0 | mutex_.Lock(); |
5464 | 0 | version->Unref(); |
5465 | 0 | mutex_.Unlock(); |
5466 | |
|
5467 | 0 | return s; |
5468 | 0 | } |
5469 | | |
// Thread-safe entry point: takes the DB mutex and delegates to the
// unlocked variant to snapshot each live column family's name and options.
void DBImpl::GetColumnFamiliesOptions(
    std::vector<std::string>* column_family_names,
    std::vector<ColumnFamilyOptions>* column_family_options) {
  DCHECK(column_family_names);    // both out-parameters must be non-null
  DCHECK(column_family_options);
  InstrumentedMutexLock lock(&mutex_);
  GetColumnFamiliesOptionsUnlocked(column_family_names, column_family_options);
}
5478 | | |
5479 | | void DBImpl::GetColumnFamiliesOptionsUnlocked( |
5480 | | std::vector<std::string>* column_family_names, |
5481 | 1.27M | std::vector<ColumnFamilyOptions>* column_family_options) { |
5482 | 1.28M | for (auto cfd : *versions_->GetColumnFamilySet()) { |
5483 | 1.28M | if (cfd->IsDropped()) { |
5484 | 29 | continue; |
5485 | 29 | } |
5486 | 1.28M | column_family_names->push_back(cfd->GetName()); |
5487 | 1.28M | column_family_options->push_back( |
5488 | 1.28M | BuildColumnFamilyOptions(*cfd->options(), *cfd->GetLatestMutableCFOptions())); |
5489 | 1.28M | } |
5490 | 1.27M | } |
5491 | | #endif // ROCKSDB_LITE |
5492 | | |
// Returns the database path/name this instance was opened with.
const std::string& DBImpl::GetName() const {
  return dbname_;
}
5496 | | |
// Returns the Env used for general file and system operations.
Env* DBImpl::GetEnv() const {
  return env_;
}
5500 | | |
// Returns the Env used for checkpoint operations (may differ from env_).
Env* DBImpl::GetCheckpointEnv() const {
  return checkpoint_env_;
}
5504 | | |
5505 | 22.5M | const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { |
5506 | 22.5M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5507 | 22.5M | return *cfh->cfd()->options(); |
5508 | 22.5M | } |
5509 | | |
// Returns the DB-wide options this instance was opened with.
const DBOptions& DBImpl::GetDBOptions() const { return db_options_; }
5511 | | |
5512 | | bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, |
5513 | 10.4k | const Slice& property, std::string* value) { |
5514 | 10.4k | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5515 | 10.4k | value->clear(); |
5516 | 10.4k | auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
5517 | 10.4k | if (property_info == nullptr) { |
5518 | 0 | return false; |
5519 | 10.4k | } else if (property_info->handle_int) { |
5520 | 426 | uint64_t int_value; |
5521 | 426 | bool ret_value = |
5522 | 426 | GetIntPropertyInternal(cfd, *property_info, false, &int_value); |
5523 | 426 | if (ret_value) { |
5524 | 426 | *value = ToString(int_value); |
5525 | 426 | } |
5526 | 426 | return ret_value; |
5527 | 10.0k | } else if (property_info->handle_string) { |
5528 | 10.0k | InstrumentedMutexLock l(&mutex_); |
5529 | 10.0k | return cfd->internal_stats()->GetStringProperty(*property_info, property, |
5530 | 10.0k | value); |
5531 | 10.0k | } |
5532 | | // Shouldn't reach here since exactly one of handle_string and handle_int |
5533 | | // should be non-nullptr. |
5534 | 0 | assert(false); |
5535 | 0 | return false; |
5536 | 10.4k | } |
5537 | | |
5538 | | bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, |
5539 | 587 | const Slice& property, uint64_t* value) { |
5540 | 587 | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5541 | 587 | if (property_info == nullptr || property_info->handle_int == nullptr) { |
5542 | 0 | return false; |
5543 | 0 | } |
5544 | 587 | auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
5545 | 587 | return GetIntPropertyInternal(cfd, *property_info, false, value); |
5546 | 587 | } |
5547 | | |
// Reads one integer property for |cfd|.
// |is_locked| says whether the caller already holds mutex_:
//  - mutex-protected properties: assert the lock is held, or take it here;
//  - out-of-mutex properties: pin a SuperVersion instead (through the
//    thread-local cache only when we are NOT already holding the mutex,
//    since that path may itself need the mutex).
bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
                                    const DBPropertyInfo& property_info,
                                    bool is_locked, uint64_t* value) {
  assert(property_info.handle_int != nullptr);
  if (!property_info.need_out_of_mutex) {
    if (is_locked) {
      mutex_.AssertHeld();
      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
    } else {
      InstrumentedMutexLock l(&mutex_);
      return cfd->internal_stats()->GetIntProperty(property_info, value, this);
    }
  } else {
    SuperVersion* sv = nullptr;
    if (!is_locked) {
      // Takes its own reference; must be returned below.
      sv = GetAndRefSuperVersion(cfd);
    } else {
      // Caller holds the mutex, so reading the raw SuperVersion is safe.
      sv = cfd->GetSuperVersion();
    }

    bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
        property_info, sv->current, value);

    if (!is_locked) {
      ReturnAndCleanupSuperVersion(cfd, sv);
    }

    return ret;
  }
}
5578 | | |
5579 | | bool DBImpl::GetAggregatedIntProperty(const Slice& property, |
5580 | 12 | uint64_t* aggregated_value) { |
5581 | 12 | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5582 | 12 | if (property_info == nullptr || property_info->handle_int == nullptr) { |
5583 | 1 | return false; |
5584 | 1 | } |
5585 | | |
5586 | 11 | uint64_t sum = 0; |
5587 | 11 | { |
5588 | | // Needs mutex to protect the list of column families. |
5589 | 11 | InstrumentedMutexLock l(&mutex_); |
5590 | 11 | uint64_t value; |
5591 | 55 | for (auto* cfd : *versions_->GetColumnFamilySet()) { |
5592 | 55 | if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { |
5593 | 55 | sum += value; |
5594 | 55 | } else { |
5595 | 0 | return false; |
5596 | 0 | } |
5597 | 55 | } |
5598 | 11 | } |
5599 | 11 | *aggregated_value = sum; |
5600 | 11 | return true; |
5601 | 11 | } |
5602 | | |
// Returns a referenced SuperVersion for |cfd| via the thread-local cache.
// Caller must release it with ReturnAndCleanupSuperVersion().
SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
  // TODO(ljin): consider using GetReferencedSuperVersion() directly
  return cfd->GetThreadLocalSuperVersion(&mutex_);
}
5607 | | |
5608 | | // REQUIRED: this function should only be called on the write thread or if the |
5609 | | // mutex is held. |
5610 | 59 | SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { |
5611 | 59 | auto column_family_set = versions_->GetColumnFamilySet(); |
5612 | 59 | auto cfd = column_family_set->GetColumnFamily(column_family_id); |
5613 | 59 | if (!cfd) { |
5614 | 0 | return nullptr; |
5615 | 0 | } |
5616 | | |
5617 | 59 | return GetAndRefSuperVersion(cfd); |
5618 | 59 | } |
5619 | | |
5620 | | // REQUIRED: mutex is NOT held |
5621 | 0 | SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) { |
5622 | 0 | ColumnFamilyData* cfd; |
5623 | 0 | { |
5624 | 0 | InstrumentedMutexLock l(&mutex_); |
5625 | 0 | auto column_family_set = versions_->GetColumnFamilySet(); |
5626 | 0 | cfd = column_family_set->GetColumnFamily(column_family_id); |
5627 | 0 | } |
5628 | |
|
5629 | 0 | if (!cfd) { |
5630 | 0 | return nullptr; |
5631 | 0 | } |
5632 | | |
5633 | 0 | return GetAndRefSuperVersion(cfd); |
5634 | 0 | } |
5635 | | |
5636 | | void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, |
5637 | 6.55M | SuperVersion* sv) { |
5638 | 6.55M | bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); |
5639 | | |
5640 | 6.55M | if (unref_sv) { |
5641 | | // Release SuperVersion |
5642 | 177 | if (sv->Unref()) { |
5643 | 140 | { |
5644 | 140 | InstrumentedMutexLock l(&mutex_); |
5645 | 140 | sv->Cleanup(); |
5646 | 140 | } |
5647 | 140 | delete sv; |
5648 | 140 | RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); |
5649 | 140 | } |
5650 | 177 | RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); |
5651 | 177 | } |
5652 | 6.55M | } |
5653 | | |
5654 | | // REQUIRED: this function should only be called on the write thread. |
5655 | | void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, |
5656 | 59 | SuperVersion* sv) { |
5657 | 59 | auto column_family_set = versions_->GetColumnFamilySet(); |
5658 | 59 | auto cfd = column_family_set->GetColumnFamily(column_family_id); |
5659 | | |
5660 | | // If SuperVersion is held, and we successfully fetched a cfd using |
5661 | | // GetAndRefSuperVersion(), it must still exist. |
5662 | 59 | assert(cfd != nullptr); |
5663 | 0 | ReturnAndCleanupSuperVersion(cfd, sv); |
5664 | 59 | } |
5665 | | |
5666 | | // REQUIRED: Mutex should NOT be held. |
5667 | | void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id, |
5668 | 0 | SuperVersion* sv) { |
5669 | 0 | ColumnFamilyData* cfd; |
5670 | 0 | { |
5671 | 0 | InstrumentedMutexLock l(&mutex_); |
5672 | 0 | auto column_family_set = versions_->GetColumnFamilySet(); |
5673 | 0 | cfd = column_family_set->GetColumnFamily(column_family_id); |
5674 | 0 | } |
5675 | | |
5676 | | // If SuperVersion is held, and we successfully fetched a cfd using |
5677 | | // GetAndRefSuperVersion(), it must still exist. |
5678 | 0 | assert(cfd != nullptr); |
5679 | 0 | ReturnAndCleanupSuperVersion(cfd, sv); |
5680 | 0 | } |
5681 | | |
5682 | | // REQUIRED: this function should only be called on the write thread or if the |
5683 | | // mutex is held. |
5684 | 0 | ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { |
5685 | 0 | ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); |
5686 | |
|
5687 | 0 | if (!cf_memtables->Seek(column_family_id)) { |
5688 | 0 | return nullptr; |
5689 | 0 | } |
5690 | | |
5691 | 0 | return cf_memtables->GetColumnFamilyHandle(); |
5692 | 0 | } |
5693 | | |
5694 | | // REQUIRED: mutex is NOT held. |
5695 | | ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( |
5696 | 0 | uint32_t column_family_id) { |
5697 | 0 | ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); |
5698 | |
|
5699 | 0 | InstrumentedMutexLock l(&mutex_); |
5700 | |
|
5701 | 0 | if (!cf_memtables->Seek(column_family_id)) { |
5702 | 0 | return nullptr; |
5703 | 0 | } |
5704 | | |
5705 | 0 | return cf_memtables->GetColumnFamilyHandle(); |
5706 | 0 | } |
5707 | | |
5708 | 0 | Status DBImpl::Import(const std::string& source_dir) { |
5709 | 0 | const auto seqno = versions_->LastSequence(); |
5710 | 0 | FlushOptions options; |
5711 | 0 | RETURN_NOT_OK(Flush(options)); |
5712 | 0 | VersionEdit edit; |
5713 | 0 | auto status = versions_->Import(source_dir, seqno, &edit); |
5714 | 0 | if (!status.ok()) { |
5715 | 0 | return status; |
5716 | 0 | } |
5717 | 0 | return ApplyVersionEdit(&edit); |
5718 | 0 | } |
5719 | | |
5720 | 0 | bool DBImpl::AreWritesStopped() { |
5721 | 0 | return write_controller_.IsStopped(); |
5722 | 0 | } |
5723 | | |
5724 | 5.53M | bool DBImpl::NeedsDelay() { |
5725 | 5.53M | return write_controller_.NeedsDelay(); |
5726 | 5.53M | } |
5727 | | |
5728 | 144 | Result<std::string> DBImpl::GetMiddleKey() { |
5729 | 144 | InstrumentedMutexLock lock(&mutex_); |
5730 | 144 | return default_cf_handle_->cfd()->current()->GetMiddleKey(); |
5731 | 144 | } |
5732 | | |
5733 | 0 | void DBImpl::TEST_SwitchMemtable() { |
5734 | 0 | std::lock_guard<InstrumentedMutex> lock(mutex_); |
5735 | 0 | WriteContext context; |
5736 | 0 | CHECK_OK(SwitchMemtable(default_cf_handle_->cfd(), &context)); |
5737 | 0 | } |
5738 | | |
5739 | | void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, |
5740 | | const Range* range, int n, uint64_t* sizes, |
5741 | 14.0k | bool include_memtable) { |
5742 | 14.0k | Version* v; |
5743 | 14.0k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5744 | 14.0k | auto cfd = cfh->cfd(); |
5745 | 14.0k | SuperVersion* sv = GetAndRefSuperVersion(cfd); |
5746 | 14.0k | v = sv->current; |
5747 | | |
5748 | 28.1k | for (int i = 0; i < n; i++14.0k ) { |
5749 | | // Convert user_key into a corresponding internal key. |
5750 | 14.0k | InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); |
5751 | 14.0k | InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); |
5752 | 14.0k | sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); |
5753 | 14.0k | if (include_memtable) { |
5754 | 9 | sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode()); |
5755 | 9 | sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode()); |
5756 | 9 | } |
5757 | 14.0k | } |
5758 | | |
5759 | 14.0k | ReturnAndCleanupSuperVersion(cfd, sv); |
5760 | 14.0k | } |
5761 | | |
5762 | | #ifndef ROCKSDB_LITE |
5763 | | Status DBImpl::GetUpdatesSince( |
5764 | | SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter, |
5765 | 48 | const TransactionLogIterator::ReadOptions& read_options) { |
5766 | | |
5767 | 48 | RecordTick(stats_, GET_UPDATES_SINCE_CALLS); |
5768 | 48 | if (seq > versions_->LastSequence()) { |
5769 | 0 | return STATUS(NotFound, "Requested sequence not yet written in the db"); |
5770 | 0 | } |
5771 | 48 | return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); |
5772 | 48 | } |
5773 | | |
5774 | 297 | Status DBImpl::DeleteFile(std::string name) { |
5775 | 297 | uint64_t number; |
5776 | 297 | FileType type; |
5777 | 297 | WalFileType log_type; |
5778 | 297 | if (!ParseFileName(name, &number, &type, &log_type) || |
5779 | 297 | (type != kTableFile && type != kLogFile2 )) { |
5780 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5781 | 0 | "DeleteFile %s failed.\n", name.c_str()); |
5782 | 0 | return STATUS(InvalidArgument, "Invalid file name"); |
5783 | 0 | } |
5784 | | |
5785 | 297 | Status status; |
5786 | 297 | if (type == kLogFile) { |
5787 | | // Only allow deleting archived log files |
5788 | 2 | if (log_type != kArchivedLogFile) { |
5789 | 1 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5790 | 1 | "DeleteFile %s failed - not archived log.\n", |
5791 | 1 | name.c_str()); |
5792 | 1 | return STATUS(NotSupported, "Delete only supported for archived logs"); |
5793 | 1 | } |
5794 | 1 | status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); |
5795 | 1 | if (!status.ok()) { |
5796 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5797 | 0 | "DeleteFile %s failed -- %s.\n", |
5798 | 0 | name.c_str(), status.ToString().c_str()); |
5799 | 0 | } |
5800 | 1 | return status; |
5801 | 2 | } |
5802 | | |
5803 | 295 | int level; |
5804 | 295 | FileMetaData* metadata; |
5805 | 295 | ColumnFamilyData* cfd; |
5806 | 295 | VersionEdit edit; |
5807 | 295 | JobContext job_context(next_job_id_.fetch_add(1), true); |
5808 | 295 | { |
5809 | 295 | InstrumentedMutexLock l(&mutex_); |
5810 | | // Delete file is infrequent operation, so could just busy wait here. |
5811 | 295 | while (versions_->has_manifest_writers()) { |
5812 | 0 | mutex_.unlock(); |
5813 | 0 | std::this_thread::sleep_for(10ms); |
5814 | 0 | mutex_.lock(); |
5815 | 0 | } |
5816 | | |
5817 | 295 | status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); |
5818 | 295 | if (!status.ok()) { |
5819 | 1 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5820 | 1 | "DeleteFile %s failed. File not found\n", name.c_str()); |
5821 | 1 | job_context.Clean(); |
5822 | 1 | return STATUS(InvalidArgument, "File not found"); |
5823 | 1 | } |
5824 | 294 | assert(level < cfd->NumberLevels()); |
5825 | | |
5826 | | // If the file is being compacted no need to delete. |
5827 | 294 | if (metadata->being_compacted) { |
5828 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
5829 | 0 | "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); |
5830 | 0 | job_context.Clean(); |
5831 | 0 | return Status::OK(); |
5832 | 0 | } |
5833 | | |
5834 | | // Only the files in the last level can be deleted externally. |
5835 | | // This is to make sure that any deletion tombstones are not |
5836 | | // lost. Check that the level passed is the last level. |
5837 | 294 | auto* vstoreage = cfd->current()->storage_info(); |
5838 | 386 | for (int i = level + 1; i < cfd->NumberLevels(); i++92 ) { |
5839 | 93 | if (vstoreage->NumLevelFiles(i) != 0) { |
5840 | 1 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5841 | 1 | "DeleteFile %s FAILED. File not in last level\n", name.c_str()); |
5842 | 1 | job_context.Clean(); |
5843 | 1 | return STATUS(InvalidArgument, "File not in last level"); |
5844 | 1 | } |
5845 | 93 | } |
5846 | | // if level == 0, it has to be the oldest file |
5847 | 293 | if (level == 0 && |
5848 | 293 | vstoreage->LevelFiles(0).back()->fd.GetNumber() != number242 ) { |
5849 | 7 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5850 | 7 | "DeleteFile %s failed ---" |
5851 | 7 | " target file in level 0 must be the oldest. Expected: %" PRIu64, name.c_str(), number); |
5852 | 7 | job_context.Clean(); |
5853 | 7 | return STATUS(InvalidArgument, "File in level 0, but not oldest"); |
5854 | 7 | } |
5855 | | |
5856 | 286 | TEST_SYNC_POINT("DBImpl::DeleteFile:DecidedToDelete"); |
5857 | | |
5858 | 286 | metadata->being_deleted = true; |
5859 | | |
5860 | 286 | edit.SetColumnFamily(cfd->GetID()); |
5861 | 286 | edit.DeleteFile(level, number); |
5862 | 286 | status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), |
5863 | 286 | &edit, &mutex_, directories_.GetDbDir()); |
5864 | 286 | if (status.ok()) { |
5865 | 286 | InstallSuperVersionAndScheduleWorkWrapper( |
5866 | 286 | cfd, &job_context, *cfd->GetLatestMutableCFOptions()); |
5867 | 286 | } |
5868 | 286 | FindObsoleteFiles(&job_context, false); |
5869 | 286 | } // lock released here |
5870 | | |
5871 | 0 | LogFlush(db_options_.info_log); |
5872 | | // remove files outside the db-lock |
5873 | 286 | if (job_context.HaveSomethingToDelete()) { |
5874 | | // Call PurgeObsoleteFiles() without holding mutex. |
5875 | 286 | PurgeObsoleteFiles(job_context); |
5876 | 286 | } |
5877 | 286 | job_context.Clean(); |
5878 | | |
5879 | 286 | FilesChanged(); |
5880 | | |
5881 | 286 | return status; |
5882 | 293 | } |
5883 | | |
5884 | | Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, |
5885 | 3 | const Slice* begin, const Slice* end) { |
5886 | 3 | Status status; |
5887 | 3 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5888 | 3 | ColumnFamilyData* cfd = cfh->cfd(); |
5889 | 3 | VersionEdit edit; |
5890 | 3 | std::vector<FileMetaData*> deleted_files; |
5891 | 3 | JobContext job_context(next_job_id_.fetch_add(1), true); |
5892 | 3 | { |
5893 | 3 | InstrumentedMutexLock l(&mutex_); |
5894 | 3 | Version* input_version = cfd->current(); |
5895 | | |
5896 | 3 | auto* vstorage = input_version->storage_info(); |
5897 | 12 | for (int i = 1; i < cfd->NumberLevels(); i++9 ) { |
5898 | 9 | if (vstorage->LevelFiles(i).empty() || |
5899 | 9 | !vstorage->OverlapInLevel(i, begin, end)) { |
5900 | 4 | continue; |
5901 | 4 | } |
5902 | 5 | std::vector<FileMetaData*> level_files; |
5903 | 5 | InternalKey begin_storage, end_storage, *begin_key, *end_key; |
5904 | 5 | if (begin == nullptr) { |
5905 | 3 | begin_key = nullptr; |
5906 | 3 | } else { |
5907 | 2 | begin_storage = InternalKey::MaxPossibleForUserKey(*begin); |
5908 | 2 | begin_key = &begin_storage; |
5909 | 2 | } |
5910 | 5 | if (end == nullptr) { |
5911 | 3 | end_key = nullptr; |
5912 | 3 | } else { |
5913 | 2 | end_storage = InternalKey::MinPossibleForUserKey(*end); |
5914 | 2 | end_key = &end_storage; |
5915 | 2 | } |
5916 | | |
5917 | 5 | vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, |
5918 | 5 | nullptr, false); |
5919 | 5 | FileMetaData* level_file; |
5920 | 41 | for (uint32_t j = 0; j < level_files.size(); j++36 ) { |
5921 | 36 | level_file = level_files[j]; |
5922 | 36 | if (((begin == nullptr) || |
5923 | 36 | (cfd->internal_comparator()->user_comparator()->Compare( |
5924 | 6 | level_file->smallest.key.user_key(), *begin) >= 0)) && |
5925 | 36 | (35 (end == nullptr)35 || |
5926 | 35 | (cfd->internal_comparator()->user_comparator()->Compare( |
5927 | 34 | level_file->largest.key.user_key(), *end) <= 0))) { |
5928 | 34 | if (level_file->being_compacted) { |
5929 | 0 | continue; |
5930 | 0 | } |
5931 | 34 | edit.SetColumnFamily(cfd->GetID()); |
5932 | 34 | edit.DeleteFile(i, level_file->fd.GetNumber()); |
5933 | 34 | deleted_files.push_back(level_file); |
5934 | 34 | level_file->being_compacted = true; |
5935 | 34 | } |
5936 | 36 | } |
5937 | 5 | } |
5938 | 3 | if (edit.GetDeletedFiles().empty()) { |
5939 | 1 | job_context.Clean(); |
5940 | 1 | return Status::OK(); |
5941 | 1 | } |
5942 | 2 | input_version->Ref(); |
5943 | 2 | status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), |
5944 | 2 | &edit, &mutex_, directories_.GetDbDir()); |
5945 | 2 | if (status.ok()) { |
5946 | 2 | InstallSuperVersionAndScheduleWorkWrapper( |
5947 | 2 | cfd, &job_context, *cfd->GetLatestMutableCFOptions()); |
5948 | 2 | } |
5949 | 34 | for (auto* deleted_file : deleted_files) { |
5950 | 34 | deleted_file->being_compacted = false; |
5951 | 34 | } |
5952 | 2 | input_version->Unref(); |
5953 | 2 | FindObsoleteFiles(&job_context, false); |
5954 | 2 | } // lock released here |
5955 | | |
5956 | 0 | LogFlush(db_options_.info_log); |
5957 | | // remove files outside the db-lock |
5958 | 2 | if (job_context.HaveSomethingToDelete()) { |
5959 | | // Call PurgeObsoleteFiles() without holding mutex. |
5960 | 2 | PurgeObsoleteFiles(job_context); |
5961 | 2 | } |
5962 | 2 | job_context.Clean(); |
5963 | 2 | return status; |
5964 | 3 | } |
5965 | | |
5966 | 2.78M | void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) { |
5967 | 2.78M | InstrumentedMutexLock l(&mutex_); |
5968 | 2.78M | versions_->GetLiveFilesMetaData(metadata); |
5969 | 2.78M | } |
5970 | | |
5971 | 21.1M | UserFrontierPtr DBImpl::GetFlushedFrontier() { |
5972 | 21.1M | InstrumentedMutexLock l(&mutex_); |
5973 | 21.1M | auto result = versions_->FlushedFrontier(); |
5974 | 21.1M | if (result) { |
5975 | 3.20M | return result->Clone(); |
5976 | 3.20M | } |
5977 | 17.9M | std::vector<LiveFileMetaData> files; |
5978 | 17.9M | versions_->GetLiveFilesMetaData(&files); |
5979 | 17.9M | UserFrontierPtr accumulated; |
5980 | 17.9M | for (const auto& file : files) { |
5981 | 0 | if (!file.imported) { |
5982 | 0 | UserFrontier::Update( |
5983 | 0 | file.largest.user_frontier.get(), UpdateUserValueType::kLargest, &accumulated); |
5984 | 0 | } |
5985 | 0 | } |
5986 | 17.9M | return accumulated; |
5987 | 21.1M | } |
5988 | | |
5989 | 19.0M | UserFrontierPtr DBImpl::GetMutableMemTableFrontier(UpdateUserValueType type) { |
5990 | 19.0M | InstrumentedMutexLock l(&mutex_); |
5991 | 19.0M | UserFrontierPtr accumulated; |
5992 | 19.0M | for (auto cfd : *versions_->GetColumnFamilySet()) { |
5993 | 19.0M | if (cfd) { |
5994 | 19.0M | const auto* mem = cfd->mem(); |
5995 | 19.0M | if (mem) { |
5996 | 19.0M | if (!cfd->IsDropped() && cfd->imm()->NumNotFlushed() == 0 && !mem->IsEmpty()19.0M ) { |
5997 | 2.46M | auto frontier = mem->GetFrontier(type); |
5998 | 2.46M | if (frontier) { |
5999 | 2.46M | UserFrontier::Update(frontier.get(), type, &accumulated); |
6000 | 2.46M | } else { |
6001 | 0 | YB_LOG_EVERY_N_SECS(DFATAL, 5) |
6002 | 0 | << db_options_.log_prefix << "[" << cfd->GetName() |
6003 | 0 | << "] " << ToString(type) << " frontier is not initialized for non-empty MemTable"; |
6004 | 0 | } |
6005 | 2.46M | } |
6006 | 19.0M | } else { |
6007 | 0 | YB_LOG_EVERY_N_SECS(WARNING, 5) << db_options_.log_prefix |
6008 | 0 | << "[" << cfd->GetName() |
6009 | 0 | << "] mem is expected to be non-nullptr here"; |
6010 | 0 | } |
6011 | 19.0M | } else { |
6012 | 0 | YB_LOG_EVERY_N_SECS(WARNING, 5) << db_options_.log_prefix |
6013 | 0 | << "cfd is expected to be non-nullptr here"; |
6014 | 0 | } |
6015 | 19.0M | } |
6016 | 19.0M | return accumulated; |
6017 | 19.0M | } |
6018 | | |
6019 | 262k | Status DBImpl::ApplyVersionEdit(VersionEdit* edit) { |
6020 | 262k | auto cfd = versions_->GetColumnFamilySet()->GetDefault(); |
6021 | 262k | std::unique_ptr<SuperVersion> superversion_to_free_after_unlock_because_of_install; |
6022 | 262k | std::unique_ptr<SuperVersion> superversion_to_free_after_unlock_because_of_unref; |
6023 | 262k | InstrumentedMutexLock lock(&mutex_); |
6024 | 262k | auto current_sv = cfd->GetSuperVersion()->Ref(); |
6025 | 262k | auto se = yb::ScopeExit([&superversion_to_free_after_unlock_because_of_unref, current_sv]() { |
6026 | 262k | if (current_sv->Unref()) { |
6027 | 262k | current_sv->Cleanup(); |
6028 | 262k | superversion_to_free_after_unlock_because_of_unref.reset(current_sv); |
6029 | 262k | } |
6030 | 262k | }); |
6031 | 262k | auto status = versions_->LogAndApply(cfd, current_sv->mutable_cf_options, edit, &mutex_); |
6032 | 262k | if (!status.ok()) { |
6033 | 1 | return status; |
6034 | 1 | } |
6035 | 262k | superversion_to_free_after_unlock_because_of_install = cfd->InstallSuperVersion( |
6036 | 262k | new SuperVersion(), &mutex_); |
6037 | | |
6038 | 262k | return Status::OK(); |
6039 | 262k | } |
6040 | | |
6041 | 262k | Status DBImpl::ModifyFlushedFrontier(UserFrontierPtr frontier, FrontierModificationMode mode) { |
6042 | 262k | VersionEdit edit; |
6043 | 262k | edit.ModifyFlushedFrontier(std::move(frontier), mode); |
6044 | 262k | return ApplyVersionEdit(&edit); |
6045 | 262k | } |
6046 | | |
6047 | | void DBImpl::GetColumnFamilyMetaData( |
6048 | | ColumnFamilyHandle* column_family, |
6049 | 610 | ColumnFamilyMetaData* cf_meta) { |
6050 | 610 | assert(column_family); |
6051 | 0 | auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
6052 | 610 | auto* sv = GetAndRefSuperVersion(cfd); |
6053 | 610 | sv->current->GetColumnFamilyMetaData(cf_meta); |
6054 | 610 | ReturnAndCleanupSuperVersion(cfd, sv); |
6055 | 610 | } |
6056 | | |
6057 | | #endif // ROCKSDB_LITE |
6058 | | |
6059 | 435k | Status DBImpl::CheckConsistency() { |
6060 | 435k | mutex_.AssertHeld(); |
6061 | 435k | std::vector<LiveFileMetaData> metadata; |
6062 | 435k | versions_->GetLiveFilesMetaData(&metadata); |
6063 | | |
6064 | 435k | std::string corruption_messages; |
6065 | 435k | for (const auto& md : metadata) { |
6066 | | // md.name has a leading "/". |
6067 | 22.0k | std::string base_file_path = md.db_path + md.name; |
6068 | 22.0k | uint64_t base_fsize = 0; |
6069 | 22.0k | Status s = env_->GetFileSize(base_file_path, &base_fsize); |
6070 | 22.0k | if (!s.ok() && |
6071 | 22.0k | env_->GetFileSize(Rocks2LevelTableFileName(base_file_path), &base_fsize).ok()7 ) { |
6072 | 5 | s = Status::OK(); |
6073 | 5 | } |
6074 | 22.0k | if (!s.ok()) { |
6075 | 2 | corruption_messages += |
6076 | 2 | "Can't access " + md.name + ": " + s.ToString() + "\n"; |
6077 | 22.0k | } else if (base_fsize != md.base_size) { |
6078 | 2 | corruption_messages += "Sst base file size mismatch: " + base_file_path + |
6079 | 2 | ". Size recorded in manifest " + |
6080 | 2 | ToString(md.base_size) + ", actual size " + |
6081 | 2 | ToString(base_fsize) + "\n"; |
6082 | 2 | } |
6083 | 22.0k | if (md.total_size > md.base_size) { |
6084 | 20.9k | const std::string data_file_path = TableBaseToDataFileName(base_file_path); |
6085 | 20.9k | uint64_t data_fsize = 0; |
6086 | 20.9k | s = env_->GetFileSize(data_file_path, &data_fsize); |
6087 | 20.9k | const uint64_t md_data_size = md.total_size - md.base_size; |
6088 | 20.9k | if (!s.ok()) { |
6089 | 0 | corruption_messages += |
6090 | 0 | "Can't access " + TableBaseToDataFileName(md.name) + ": " + s.ToString() + "\n"; |
6091 | 20.9k | } else if (data_fsize != md_data_size) { |
6092 | 0 | corruption_messages += "Sst data file size mismatch: " + data_file_path + |
6093 | 0 | ". Data size based on total and base size recorded in manifest " + |
6094 | 0 | ToString(md_data_size) + ", actual data size " + |
6095 | 0 | ToString(data_fsize) + "\n"; |
6096 | 0 | } |
6097 | 20.9k | } |
6098 | 22.0k | } |
6099 | 435k | if (corruption_messages.size() == 0) { |
6100 | 435k | return Status::OK(); |
6101 | 435k | } else { |
6102 | 49 | return STATUS(Corruption, corruption_messages); |
6103 | 49 | } |
6104 | 435k | } |
6105 | | |
6106 | 15 | Status DBImpl::GetDbIdentity(std::string* identity) const { |
6107 | 15 | std::string idfilename = IdentityFileName(dbname_); |
6108 | 15 | const EnvOptions soptions; |
6109 | 15 | unique_ptr<SequentialFileReader> id_file_reader; |
6110 | 15 | Status s; |
6111 | 15 | { |
6112 | 15 | unique_ptr<SequentialFile> idfile; |
6113 | 15 | s = env_->NewSequentialFile(idfilename, &idfile, soptions); |
6114 | 15 | if (!s.ok()) { |
6115 | 0 | return s; |
6116 | 0 | } |
6117 | 15 | id_file_reader.reset(new SequentialFileReader(std::move(idfile))); |
6118 | 15 | } |
6119 | | |
6120 | 0 | uint64_t file_size; |
6121 | 15 | s = env_->GetFileSize(idfilename, &file_size); |
6122 | 15 | if (!s.ok()) { |
6123 | 0 | return s; |
6124 | 0 | } |
6125 | 15 | uint8_t* buffer = reinterpret_cast<uint8_t*>(alloca(file_size)); |
6126 | 15 | Slice id; |
6127 | 15 | s = id_file_reader->Read(static_cast<size_t>(file_size), &id, buffer); |
6128 | 15 | if (!s.ok()) { |
6129 | 0 | return s; |
6130 | 0 | } |
6131 | 15 | identity->assign(id.cdata(), id.size()); |
6132 | | // If last character is '\n' remove it from identity |
6133 | 15 | if (!identity->empty() && identity->back() == '\n') { |
6134 | 0 | identity->pop_back(); |
6135 | 0 | } |
6136 | 15 | return s; |
6137 | 15 | } |
6138 | | |
6139 | | // Default implementations of convenience methods that subclasses of DB |
6140 | | // can call if they wish |
6141 | | Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6142 | 14.9M | const Slice& key, const Slice& value) { |
6143 | | // Pre-allocate size of write batch conservatively. |
6144 | | // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, |
6145 | | // and we allocate 11 extra bytes for key length, as well as value length. |
6146 | 14.9M | WriteBatch batch(key.size() + value.size() + 24); |
6147 | 14.9M | batch.Put(column_family, key, value); |
6148 | 14.9M | return Write(opt, &batch); |
6149 | 14.9M | } |
6150 | | |
6151 | | Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6152 | 544k | const Slice& key) { |
6153 | 544k | WriteBatch batch; |
6154 | 544k | batch.Delete(column_family, key); |
6155 | 544k | return Write(opt, &batch); |
6156 | 544k | } |
6157 | | |
6158 | | Status DB::SingleDelete(const WriteOptions& opt, |
6159 | 185 | ColumnFamilyHandle* column_family, const Slice& key) { |
6160 | 185 | WriteBatch batch; |
6161 | 185 | batch.SingleDelete(column_family, key); |
6162 | 185 | return Write(opt, &batch); |
6163 | 185 | } |
6164 | | |
6165 | | Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6166 | 89.9k | const Slice& key, const Slice& value) { |
6167 | 89.9k | WriteBatch batch; |
6168 | 89.9k | batch.Merge(column_family, key, value); |
6169 | 89.9k | return Write(opt, &batch); |
6170 | 89.9k | } |
6171 | | |
6172 | | // Default implementation -- returns not supported status |
6173 | | Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
6174 | | const std::string& column_family_name, |
6175 | 0 | ColumnFamilyHandle** handle) { |
6176 | 0 | return STATUS(NotSupported, ""); |
6177 | 0 | } |
6178 | 0 | Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { |
6179 | 0 | return STATUS(NotSupported, ""); |
6180 | 0 | } |
6181 | | |
6182 | 396k | DB::~DB() { } |
6183 | | |
6184 | 432k | Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { |
6185 | 432k | DBOptions db_options(options); |
6186 | 432k | ColumnFamilyOptions cf_options(options); |
6187 | 432k | std::vector<ColumnFamilyDescriptor> column_families; |
6188 | 432k | column_families.push_back( |
6189 | 432k | ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); |
6190 | 432k | std::vector<ColumnFamilyHandle*> handles; |
6191 | 432k | Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); |
6192 | 432k | if (s.ok()) { |
6193 | 432k | assert(handles.size() == 1); |
6194 | | // i can delete the handle since DBImpl is always holding a reference to |
6195 | | // default column family |
6196 | 0 | delete handles[0]; |
6197 | 432k | } |
6198 | 0 | return s; |
6199 | 432k | } |
6200 | | |
6201 | | Status DB::Open(const DBOptions& db_options, const std::string& dbname, |
6202 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
6203 | 435k | std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) { |
6204 | 435k | Status s = SanitizeOptionsByTable(db_options, column_families); |
6205 | 435k | if (!s.ok()) { |
6206 | 1 | return s; |
6207 | 1 | } |
6208 | | |
6209 | 438k | for (auto& cfd : column_families)435k { |
6210 | 438k | s = CheckCompressionSupported(cfd.options); |
6211 | 438k | if (s.ok() && db_options.allow_concurrent_memtable_write438k ) { |
6212 | 352 | s = CheckConcurrentWritesSupported(cfd.options); |
6213 | 352 | } |
6214 | 438k | if (!s.ok()) { |
6215 | 2 | return s; |
6216 | 2 | } |
6217 | 438k | if (db_options.db_paths.size() > 1) { |
6218 | 90 | if ((cfd.options.compaction_style != kCompactionStyleUniversal) && |
6219 | 90 | (cfd.options.compaction_style != kCompactionStyleLevel)38 ) { |
6220 | 0 | return STATUS(NotSupported, |
6221 | 0 | "More than one DB paths are only supported in " |
6222 | 0 | "universal and level compaction styles. "); |
6223 | 0 | } |
6224 | 90 | } |
6225 | 438k | } |
6226 | | |
6227 | 435k | if (db_options.db_paths.size() > 4) { |
6228 | 1 | return STATUS(NotSupported, |
6229 | 1 | "More than four DB paths are not supported yet. "); |
6230 | 1 | } |
6231 | | |
6232 | 435k | *dbptr = nullptr; |
6233 | 435k | handles->clear(); |
6234 | | |
6235 | 435k | size_t max_write_buffer_size = 0; |
6236 | 438k | for (auto cf : column_families) { |
6237 | 438k | max_write_buffer_size = |
6238 | 438k | std::max(max_write_buffer_size, cf.options.write_buffer_size); |
6239 | 438k | } |
6240 | | |
6241 | 435k | DBImpl* impl = new DBImpl(db_options, dbname); |
6242 | 435k | for (auto db_path : impl->db_options_.db_paths) { |
6243 | 435k | s = impl->env_->CreateDirIfMissing(db_path.path); |
6244 | 435k | if (!s.ok()) { |
6245 | 0 | break; |
6246 | 0 | } |
6247 | 435k | } |
6248 | | // WAL dir could be inside other paths, so we create it after. |
6249 | 435k | if (s.ok()) { |
6250 | 435k | s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); |
6251 | 435k | } |
6252 | | |
6253 | 435k | if (!s.ok()) { |
6254 | 0 | delete impl; |
6255 | 0 | return s; |
6256 | 0 | } |
6257 | | |
6258 | 435k | s = impl->CreateArchivalDirectory(); |
6259 | 435k | if (!s.ok()) { |
6260 | 0 | delete impl; |
6261 | 0 | return s; |
6262 | 0 | } |
6263 | 435k | impl->mutex_.Lock(); |
6264 | | // Handles create_if_missing, error_if_exists |
6265 | 435k | s = impl->Recover(column_families); |
6266 | 435k | if (s.ok()) { |
6267 | 435k | uint64_t new_log_number = impl->versions_->NewFileNumber(); |
6268 | 435k | unique_ptr<WritableFile> lfile; |
6269 | 435k | EnvOptions soptions(db_options); |
6270 | 435k | EnvOptions opt_env_options = |
6271 | 435k | impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_); |
6272 | 435k | s = NewWritableFile(impl->db_options_.env, |
6273 | 435k | LogFileName(impl->db_options_.wal_dir, new_log_number), |
6274 | 435k | &lfile, opt_env_options); |
6275 | 435k | if (s.ok()) { |
6276 | 435k | lfile->SetPreallocationBlockSize((max_write_buffer_size / 10) + max_write_buffer_size); |
6277 | 435k | impl->logfile_number_ = new_log_number; |
6278 | 435k | unique_ptr<WritableFileWriter> file_writer( |
6279 | 435k | new WritableFileWriter(std::move(lfile), opt_env_options)); |
6280 | 435k | impl->logs_.emplace_back( |
6281 | 435k | new_log_number, |
6282 | 435k | new log::Writer(std::move(file_writer), new_log_number, |
6283 | 435k | impl->db_options_.recycle_log_file_num > 0)); |
6284 | | |
6285 | | // set column family handles |
6286 | 437k | for (auto cf : column_families) { |
6287 | 437k | auto cfd = |
6288 | 437k | impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); |
6289 | 437k | if (cfd != nullptr) { |
6290 | 437k | handles->push_back( |
6291 | 437k | new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); |
6292 | 437k | } else { |
6293 | 62 | if (db_options.create_missing_column_families) { |
6294 | | // missing column family, create it |
6295 | 17 | ColumnFamilyHandle* handle; |
6296 | 17 | impl->mutex_.Unlock(); |
6297 | 17 | s = impl->CreateColumnFamily(cf.options, cf.name, &handle); |
6298 | 17 | impl->mutex_.Lock(); |
6299 | 17 | if (s.ok()) { |
6300 | 17 | handles->push_back(handle); |
6301 | 17 | } else { |
6302 | 0 | break; |
6303 | 0 | } |
6304 | 45 | } else { |
6305 | 45 | s = STATUS(InvalidArgument, "Column family not found: ", cf.name); |
6306 | 45 | break; |
6307 | 45 | } |
6308 | 62 | } |
6309 | 437k | } |
6310 | 435k | } |
6311 | 435k | if (s.ok()) { |
6312 | 437k | for (auto cfd : *impl->versions_->GetColumnFamilySet()) { |
6313 | 437k | impl->InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
6314 | 437k | } |
6315 | 434k | impl->alive_log_files_.push_back( |
6316 | 434k | DBImpl::LogFileNumberSize(impl->logfile_number_)); |
6317 | 434k | impl->DeleteObsoleteFiles(); |
6318 | 434k | s = impl->directories_.GetDbDir()->Fsync(); |
6319 | 434k | } |
6320 | 435k | } |
6321 | | |
6322 | 435k | if (s.ok()) { |
6323 | 437k | for (auto cfd : *impl->versions_->GetColumnFamilySet()) { |
6324 | 437k | if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { |
6325 | 257 | auto* vstorage = cfd->current()->storage_info(); |
6326 | 257 | for (int i = 1; i < vstorage->num_levels(); ++i0 ) { |
6327 | 0 | int num_files = vstorage->NumLevelFiles(i); |
6328 | 0 | if (num_files > 0) { |
6329 | 0 | s = STATUS(InvalidArgument, |
6330 | 0 | "Not all files are at level 0. Cannot " |
6331 | 0 | "open with FIFO compaction style."); |
6332 | 0 | break; |
6333 | 0 | } |
6334 | 0 | } |
6335 | 257 | } |
6336 | 437k | if (!cfd->mem()->IsSnapshotSupported()) { |
6337 | 60 | impl->is_snapshot_supported_ = false; |
6338 | 60 | } |
6339 | 437k | if (cfd->ioptions()->merge_operator != nullptr && |
6340 | 437k | !cfd->mem()->IsMergeOperatorSupported()530 ) { |
6341 | 0 | s = STATUS(InvalidArgument, |
6342 | 0 | "The memtable of column family %s does not support merge operator " |
6343 | 0 | "its options.merge_operator is non-null", cfd->GetName().c_str()); |
6344 | 0 | } |
6345 | 437k | if (!s.ok()) { |
6346 | 0 | break; |
6347 | 0 | } |
6348 | 437k | } |
6349 | 434k | } |
6350 | 435k | TEST_SYNC_POINT("DBImpl::Open:Opened"); |
6351 | 435k | Status persist_options_status; |
6352 | 435k | if (s.ok()) { |
6353 | | // Persist RocksDB Options before scheduling the compaction. |
6354 | | // The WriteOptionsFile() will release and lock the mutex internally. |
6355 | 435k | persist_options_status = impl->WriteOptionsFile(); |
6356 | | |
6357 | 435k | *dbptr = impl; |
6358 | 435k | impl->opened_successfully_ = true; |
6359 | 435k | impl->MaybeScheduleFlushOrCompaction(); |
6360 | 435k | } |
6361 | 435k | impl->mutex_.Unlock(); |
6362 | | |
6363 | 435k | auto sfm = static_cast<SstFileManagerImpl*>( |
6364 | 435k | impl->db_options_.sst_file_manager.get()); |
6365 | 435k | if (s.ok() && sfm435k ) { |
6366 | | // Notify SstFileManager about all sst files that already exist in |
6367 | | // db_paths[0] when the DB is opened. |
6368 | 11 | auto& db_path = impl->db_options_.db_paths[0]; |
6369 | 11 | std::vector<std::string> existing_files; |
6370 | 11 | RETURN_NOT_OK(impl->db_options_.env->GetChildren(db_path.path, &existing_files)); |
6371 | 112 | for (auto& file_name : existing_files)11 { |
6372 | 112 | uint64_t file_number; |
6373 | 112 | FileType file_type; |
6374 | 112 | std::string file_path = db_path.path + "/" + file_name; |
6375 | 112 | if (ParseFileName(file_name, &file_number, &file_type) && |
6376 | 112 | (76 file_type == kTableFile76 || file_type == kTableSBlockFile72 )) { |
6377 | 8 | RETURN_NOT_OK(sfm->OnAddFile(file_path)); |
6378 | 8 | } |
6379 | 112 | } |
6380 | 11 | } |
6381 | | |
6382 | 435k | if (s.ok()) { |
6383 | 435k | LogFlush(impl->db_options_.info_log); |
6384 | 435k | if (!persist_options_status.ok()) { |
6385 | 56 | if (db_options.fail_if_options_file_error) { |
6386 | 0 | s = STATUS(IOError, |
6387 | 0 | "DB::Open() failed --- Unable to persist Options file", |
6388 | 0 | persist_options_status.ToString()); |
6389 | 0 | } |
6390 | 56 | RWARN(impl->db_options_.info_log, |
6391 | 56 | "Unable to persist options in DB::Open() -- %s", |
6392 | 56 | persist_options_status.ToString().c_str()); |
6393 | 56 | } |
6394 | 435k | } |
6395 | 435k | if (!s.ok()) { |
6396 | 175 | for (auto* h : *handles) { |
6397 | 0 | delete h; |
6398 | 0 | } |
6399 | 175 | handles->clear(); |
6400 | 175 | delete impl; |
6401 | 175 | *dbptr = nullptr; |
6402 | 435k | } else if (435k impl435k ) { |
6403 | 435k | impl->SetSSTFileTickers(); |
6404 | 435k | } |
6405 | | |
6406 | 435k | return s; |
6407 | 435k | } |
6408 | | |
6409 | 249 | yb::Result<std::unique_ptr<DB>> DB::Open(const Options& options, const std::string& name) { |
6410 | 249 | DB* db = nullptr; |
6411 | 249 | Status status = Open(options, name, &db); |
6412 | 249 | if (!status.ok()) { |
6413 | 0 | delete db; |
6414 | 0 | return status; |
6415 | 0 | } |
6416 | 249 | return std::unique_ptr<DB>(db); |
6417 | 249 | } |
6418 | | |
6419 | | Status DB::ListColumnFamilies(const DBOptions& db_options, |
6420 | | const std::string& name, |
6421 | 8 | std::vector<std::string>* column_families) { |
6422 | 8 | return VersionSet::ListColumnFamilies(column_families, |
6423 | 8 | name, |
6424 | 8 | db_options.boundary_extractor.get(), |
6425 | 8 | db_options.env); |
6426 | 8 | } |
6427 | | |
// Intentionally empty out-of-line destructor for the Snapshot base class
// (declared in the public header; defined here in the .cc).
Snapshot::~Snapshot() {
}
6430 | | |
// Destroys the database at `dbname`: deletes every recognized DB file in the
// DB dir, every db_paths[] data dir, the WAL dir, and the WAL archive dir,
// then removes the (now-empty) directories themselves.
// The DB lock file is acquired first and deleted last so a concurrently
// running instance cannot be clobbered. The first deletion failure is
// remembered and returned; later failures do not overwrite it.
Status DestroyDB(const std::string& dbname, const Options& options) {
  const InternalKeyComparator comparator(options.comparator);
  const Options& soptions(SanitizeOptions(dbname, &comparator, options));
  Env* env = soptions.env;
  std::vector<std::string> filenames;

  // Ignore error in case directory does not exist
  env->GetChildrenWarnNotOk(dbname, &filenames);

  FileLock* lock;
  const std::string lockname = LockFileName(dbname);
  Status result = env->LockFile(lockname, &lock);
  if (result.ok()) {
    uint64_t number;
    FileType type;
    InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname);
    // Pass 1: delete everything in the main DB directory except the lock file.
    for (size_t i = 0; i < filenames.size(); i++) {
      if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) &&
          type != kDBLockFile) {  // Lock file will be deleted at end
        Status del;
        std::string path_to_delete = dbname + "/" + filenames[i];
        if (type == kMetaDatabase) {
          // Nested meta-database: destroy it recursively.
          del = DestroyDB(path_to_delete, options);
        } else if (type == kTableFile || type == kTableSBlockFile) {
          // SST data goes through DeleteSSTFile so rate limiting /
          // SstFileManager accounting applies.
          del = DeleteSSTFile(&options, path_to_delete, 0);
        } else {
          del = env->DeleteFile(path_to_delete);
        }
        // Keep only the FIRST failure; continue deleting regardless.
        if (result.ok() && !del.ok()) {
          result = del;
        }
      }
    }

    // Pass 2: delete SST files placed in the auxiliary db_paths directories.
    for (size_t path_id = 0; path_id < options.db_paths.size(); path_id++) {
      const auto& db_path = options.db_paths[path_id];
      env->GetChildrenWarnNotOk(db_path.path, &filenames);
      for (size_t i = 0; i < filenames.size(); i++) {
        if (ParseFileName(filenames[i], &number, &type) &&
            // Lock file will be deleted at end
            (type == kTableFile || type == kTableSBlockFile)) {
          std::string table_path = db_path.path + "/" + filenames[i];
          Status del = DeleteSSTFile(&options, table_path,
                                     static_cast<uint32_t>(path_id));
          if (result.ok() && !del.ok()) {
            result = del;
          }
        }
      }
    }

    // The WAL dir (and its archive dir) may differ from the DB dir; only then
    // does it need a separate listing.
    std::vector<std::string> walDirFiles;
    std::string archivedir = ArchivalDirectory(dbname);
    if (dbname != soptions.wal_dir) {
      env->GetChildrenWarnNotOk(soptions.wal_dir, &walDirFiles);
      archivedir = ArchivalDirectory(soptions.wal_dir);
    }

    // Delete log files in the WAL dir
    for (const auto& file : walDirFiles) {
      if (ParseFileName(file, &number, &type) && type == kLogFile) {
        Status del = env->DeleteFile(soptions.wal_dir + "/" + file);
        if (result.ok() && !del.ok()) {
          result = del;
        }
      }
    }

    // ignore case where no archival directory is present.
    if (env->FileExists(archivedir).ok()) {
      std::vector<std::string> archiveFiles;
      env->GetChildrenWarnNotOk(archivedir, &archiveFiles);
      // Delete archival files.
      for (size_t i = 0; i < archiveFiles.size(); ++i) {
        if (ParseFileName(archiveFiles[i], &number, &type) &&
            type == kLogFile) {
          Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]);
          if (result.ok() && !del.ok()) {
            result = del;
          }
        }
      }

      WARN_NOT_OK(env->DeleteDir(archivedir), "Failed to cleanup dir " + archivedir);
    }
    // Release and remove the lock file last, then remove the directories.
    WARN_NOT_OK(env->UnlockFile(lock), "Unlock file failed");
    env->CleanupFile(lockname);
    if (env->FileExists(dbname).ok()) {
      WARN_NOT_OK(env->DeleteDir(dbname), "Failed to cleanup dir " + dbname);
    }
    if (env->FileExists(soptions.wal_dir).ok()) {
      WARN_NOT_OK(env->DeleteDir(soptions.wal_dir),
                  "Failed to cleanup wal dir " + soptions.wal_dir);
    }
  }
  return result;
}
6528 | | |
// Persists the current DBOptions plus every column family's options to an
// options file in the DB directory: writes a temp file first, then renames it
// into place (see RenameTempFileToOptionsFile).
// Pre: mutex_ is held. The mutex is released around the file I/O and
// re-acquired before returning, so callers see it held across the call.
Status DBImpl::WriteOptionsFile() {
#ifndef ROCKSDB_LITE
  mutex_.AssertHeld();

  std::vector<std::string> cf_names;
  std::vector<ColumnFamilyOptions> cf_opts;

  // This part requires mutex to protect the column family options
  GetColumnFamiliesOptionsUnlocked(&cf_names, &cf_opts);

  // Unlock during expensive operations. New writes cannot get here
  // because the single write thread ensures all new writes get queued.
  mutex_.Unlock();

  // Use a fresh file number so the temp file name cannot collide.
  std::string file_name =
      TempOptionsFileName(GetName(), versions_->NewFileNumber());
  Status s = PersistRocksDBOptions(GetDBOptions(), cf_names, cf_opts, file_name,
                                   GetEnv());

  if (s.ok()) {
    // Atomically publish the temp file as the real options file.
    s = RenameTempFileToOptionsFile(file_name);
  }
  mutex_.Lock();
  return s;
#else
  return Status::OK();
#endif  // !ROCKSDB_LITE
}
6557 | | |
6558 | | #ifndef ROCKSDB_LITE |
6559 | | namespace { |
6560 | | void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames, |
6561 | | const size_t num_files_to_keep, |
6562 | | const std::shared_ptr<Logger>& info_log, |
6563 | 1.27M | Env* env) { |
6564 | 1.27M | if (filenames.size() <= num_files_to_keep) { |
6565 | 851k | return; |
6566 | 851k | } |
6567 | 428k | for (auto iter = std::next(filenames.begin(), num_files_to_keep); |
6568 | 856k | iter != filenames.end(); ++iter428k ) { |
6569 | 428k | if (!env->DeleteFile(iter->second).ok()) { |
6570 | 0 | RWARN(info_log, "Unable to delete options file %s", iter->second.c_str()); |
6571 | 0 | } |
6572 | 428k | } |
6573 | 428k | } |
6574 | | } // namespace |
6575 | | #endif // !ROCKSDB_LITE |
6576 | | |
6577 | 1.27M | Status DBImpl::DeleteObsoleteOptionsFiles() { |
6578 | 1.27M | #ifndef ROCKSDB_LITE |
6579 | 1.27M | std::vector<std::string> filenames; |
6580 | | // use ordered map to store keep the filenames sorted from the newest |
6581 | | // to the oldest. |
6582 | 1.27M | std::map<uint64_t, std::string> options_filenames; |
6583 | 1.27M | Status s; |
6584 | 1.27M | s = GetEnv()->GetChildren(GetName(), &filenames); |
6585 | 1.27M | if (!s.ok()) { |
6586 | 1 | return s; |
6587 | 1 | } |
6588 | 12.2M | for (auto& filename : filenames)1.27M { |
6589 | 12.2M | uint64_t file_number; |
6590 | 12.2M | FileType type; |
6591 | 12.2M | if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile9.54M ) { |
6592 | 2.55M | options_filenames.insert( |
6593 | 2.55M | {std::numeric_limits<uint64_t>::max() - file_number, |
6594 | 2.55M | GetName() + "/" + filename}); |
6595 | 2.55M | } |
6596 | 12.2M | } |
6597 | | |
6598 | | // Keeps the latest 2 Options file |
6599 | 1.27M | const size_t kNumOptionsFilesKept = 2; |
6600 | 1.27M | DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept, |
6601 | 1.27M | db_options_.info_log, GetEnv()); |
6602 | 1.27M | return Status::OK(); |
6603 | | #else |
6604 | | return Status::OK(); |
6605 | | #endif // !ROCKSDB_LITE |
6606 | 1.27M | } |
6607 | | |
6608 | 1.27M | Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { |
6609 | 1.27M | #ifndef ROCKSDB_LITE |
6610 | 1.27M | Status s; |
6611 | 1.27M | std::string options_file_name = |
6612 | 1.27M | OptionsFileName(GetName(), versions_->NewFileNumber()); |
6613 | | // Retry if the file name happen to conflict with an existing one. |
6614 | 1.27M | s = GetEnv()->RenameFile(file_name, options_file_name); |
6615 | | |
6616 | 1.27M | WARN_NOT_OK(DeleteObsoleteOptionsFiles(), "Failed to cleanup obsolete options file"); |
6617 | 1.27M | return s; |
6618 | | #else |
6619 | | return Status::OK(); |
6620 | | #endif // !ROCKSDB_LITE |
6621 | 1.27M | } |
6622 | | |
6623 | | #ifndef ROCKSDB_LITE |
6624 | | SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, |
6625 | 135 | bool include_history) { |
6626 | | // Find the earliest sequence number that we know we can rely on reading |
6627 | | // from the memtable without needing to check sst files. |
6628 | 135 | SequenceNumber earliest_seq = |
6629 | 135 | sv->imm->GetEarliestSequenceNumber(include_history); |
6630 | 135 | if (earliest_seq == kMaxSequenceNumber) { |
6631 | 112 | earliest_seq = sv->mem->GetEarliestSequenceNumber(); |
6632 | 112 | } |
6633 | 135 | assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq); |
6634 | | |
6635 | 0 | return earliest_seq; |
6636 | 135 | } |
6637 | | #endif // ROCKSDB_LITE |
6638 | | |
6639 | | #ifndef ROCKSDB_LITE |
6640 | | Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, |
6641 | | bool cache_only, SequenceNumber* seq, |
6642 | 138 | bool* found_record_for_key) { |
6643 | 138 | Status s; |
6644 | 138 | MergeContext merge_context; |
6645 | | |
6646 | 138 | SequenceNumber current_seq = versions_->LastSequence(); |
6647 | 138 | LookupKey lkey(key, current_seq); |
6648 | | |
6649 | 138 | *seq = kMaxSequenceNumber; |
6650 | 138 | *found_record_for_key = false; |
6651 | | |
6652 | | // Check if there is a record for this key in the latest memtable |
6653 | 138 | sv->mem->Get(lkey, nullptr, &s, &merge_context, seq); |
6654 | | |
6655 | 138 | if (!(s.ok() || s.IsNotFound()5 || s.IsMergeInProgress()0 )) { |
6656 | | // unexpected error reading memtable. |
6657 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6658 | 0 | "Unexpected status returned from MemTable::Get: %s\n", |
6659 | 0 | s.ToString().c_str()); |
6660 | |
|
6661 | 0 | return s; |
6662 | 0 | } |
6663 | | |
6664 | 138 | if (*seq != kMaxSequenceNumber) { |
6665 | | // Found a sequence number, no need to check immutable memtables |
6666 | 81 | *found_record_for_key = true; |
6667 | 81 | return Status::OK(); |
6668 | 81 | } |
6669 | | |
6670 | | // Check if there is a record for this key in the immutable memtables |
6671 | 57 | sv->imm->Get(lkey, nullptr, &s, &merge_context, seq); |
6672 | | |
6673 | 57 | if (!(s.ok() || s.IsNotFound()0 || s.IsMergeInProgress()0 )) { |
6674 | | // unexpected error reading memtable. |
6675 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6676 | 0 | "Unexpected status returned from MemTableList::Get: %s\n", |
6677 | 0 | s.ToString().c_str()); |
6678 | |
|
6679 | 0 | return s; |
6680 | 0 | } |
6681 | | |
6682 | 57 | if (*seq != kMaxSequenceNumber) { |
6683 | | // Found a sequence number, no need to check memtable history |
6684 | 0 | *found_record_for_key = true; |
6685 | 0 | return Status::OK(); |
6686 | 0 | } |
6687 | | |
6688 | | // Check if there is a record for this key in the immutable memtables |
6689 | 57 | sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, seq); |
6690 | | |
6691 | 57 | if (!(s.ok() || s.IsNotFound()0 || s.IsMergeInProgress()0 )) { |
6692 | | // unexpected error reading memtable. |
6693 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6694 | 0 | "Unexpected status returned from MemTableList::GetFromHistory: %s\n", |
6695 | 0 | s.ToString().c_str()); |
6696 | |
|
6697 | 0 | return s; |
6698 | 0 | } |
6699 | | |
6700 | 57 | if (*seq != kMaxSequenceNumber) { |
6701 | | // Found a sequence number, no need to check SST files |
6702 | 1 | *found_record_for_key = true; |
6703 | 1 | return Status::OK(); |
6704 | 1 | } |
6705 | | |
6706 | | // TODO(agiardullo): possible optimization: consider checking cached |
6707 | | // SST files if cache_only=true? |
6708 | 56 | if (!cache_only) { |
6709 | | // Check tables |
6710 | 21 | ReadOptions read_options; |
6711 | | |
6712 | 21 | sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, |
6713 | 21 | nullptr /* value_found */, found_record_for_key, seq); |
6714 | | |
6715 | 21 | if (!(s.ok() || s.IsNotFound()9 || s.IsMergeInProgress()0 )) { |
6716 | | // unexpected error reading SST files |
6717 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6718 | 0 | "Unexpected status returned from Version::Get: %s\n", |
6719 | 0 | s.ToString().c_str()); |
6720 | |
|
6721 | 0 | return s; |
6722 | 0 | } |
6723 | 21 | } |
6724 | | |
6725 | 56 | return Status::OK(); |
6726 | 56 | } |
6727 | | #endif // ROCKSDB_LITE |
6728 | | |
6729 | 465k | const std::string& DBImpl::LogPrefix() const { |
6730 | 465k | static const std::string kEmptyString; |
6731 | 465k | return db_options_.info_log ? db_options_.info_log->Prefix()465k : kEmptyString86 ; |
6732 | 465k | } |
6733 | | |
6734 | | } // namespace rocksdb |