/Users/deen/code/yugabyte-db/src/yb/rocksdb/db/db_impl.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under the BSD-style license found in the |
3 | | // LICENSE file in the root directory of this source tree. An additional grant |
4 | | // of patent rights can be found in the PATENTS file in the same directory. |
5 | | // |
6 | | // The following only applies to changes made to this file as part of YugaByte development. |
7 | | // |
8 | | // Portions Copyright (c) YugaByte, Inc. |
9 | | // |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
11 | | // in compliance with the License. You may obtain a copy of the License at |
12 | | // |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // |
15 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
16 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
17 | | // or implied. See the License for the specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
21 | | // Use of this source code is governed by a BSD-style license that can be |
22 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
23 | | |
24 | | #include "yb/rocksdb/db/db_impl.h" |
25 | | |
26 | | #ifndef __STDC_FORMAT_MACROS |
27 | | #define __STDC_FORMAT_MACROS |
28 | | #endif |
29 | | |
30 | | #include <inttypes.h> |
31 | | #include <stdint.h> |
32 | | #ifdef OS_SOLARIS |
33 | | #include <alloca.h> |
34 | | #endif |
35 | | |
36 | | #include <algorithm> |
37 | | #include <climits> |
38 | | #include <map> |
39 | | #include <set> |
40 | | #include <stdexcept> |
41 | | #include <string> |
42 | | #include <unordered_map> |
43 | | #include <unordered_set> |
44 | | #include <utility> |
45 | | #include <vector> |
46 | | |
47 | | #include <boost/container/small_vector.hpp> |
48 | | |
49 | | #include "yb/gutil/stringprintf.h" |
50 | | #include "yb/util/string_util.h" |
51 | | #include "yb/util/scope_exit.h" |
52 | | #include "yb/util/logging.h" |
53 | | #include "yb/util/debug-util.h" |
54 | | #include "yb/util/fault_injection.h" |
55 | | #include "yb/util/flag_tags.h" |
56 | | #include "yb/util/priority_thread_pool.h" |
57 | | #include "yb/util/atomic.h" |
58 | | |
59 | | #include "yb/rocksdb/db/auto_roll_logger.h" |
60 | | #include "yb/rocksdb/db/builder.h" |
61 | | #include "yb/rocksdb/db/compaction_job.h" |
62 | | #include "yb/rocksdb/db/compaction_picker.h" |
63 | | #include "yb/rocksdb/db/db_info_dumper.h" |
64 | | #include "yb/rocksdb/db/db_iter.h" |
65 | | #include "yb/rocksdb/db/dbformat.h" |
66 | | #include "yb/rocksdb/db/event_helpers.h" |
67 | | #include "yb/rocksdb/db/filename.h" |
68 | | #include "yb/rocksdb/db/file_numbers.h" |
69 | | #include "yb/rocksdb/db/flush_job.h" |
70 | | #include "yb/rocksdb/db/forward_iterator.h" |
71 | | #include "yb/rocksdb/db/job_context.h" |
72 | | #include "yb/rocksdb/db/log_reader.h" |
73 | | #include "yb/rocksdb/db/log_writer.h" |
74 | | #include "yb/rocksdb/db/managed_iterator.h" |
75 | | #include "yb/rocksdb/db/memtable.h" |
76 | | #include "yb/rocksdb/db/memtable_list.h" |
77 | | #include "yb/rocksdb/db/merge_context.h" |
78 | | #include "yb/rocksdb/db/merge_helper.h" |
79 | | #include "yb/rocksdb/db/table_cache.h" |
80 | | #include "yb/rocksdb/db/table_properties_collector.h" |
81 | | #include "yb/rocksdb/db/version_set.h" |
82 | | #include "yb/rocksdb/db/write_batch_internal.h" |
83 | | #include "yb/rocksdb/db/write_callback.h" |
84 | | #include "yb/rocksdb/db/writebuffer.h" |
85 | | #include "yb/rocksdb/port/likely.h" |
86 | | #include "yb/rocksdb/port/port.h" |
87 | | #include "yb/rocksdb/cache.h" |
88 | | #include "yb/rocksdb/compaction_filter.h" |
89 | | #include "yb/rocksdb/db.h" |
90 | | #include "yb/rocksdb/env.h" |
91 | | #include "yb/rocksdb/sst_file_writer.h" |
92 | | #include "yb/rocksdb/statistics.h" |
93 | | #include "yb/rocksdb/status.h" |
94 | | #include "yb/rocksdb/table.h" |
95 | | #include "yb/rocksdb/wal_filter.h" |
96 | | #include "yb/rocksdb/table/block_based_table_factory.h" |
97 | | #include "yb/rocksdb/table/merger.h" |
98 | | #include "yb/rocksdb/table/scoped_arena_iterator.h" |
99 | | #include "yb/rocksdb/table/table_builder.h" |
100 | | #include "yb/rocksdb/util/autovector.h" |
101 | | #include "yb/rocksdb/util/coding.h" |
102 | | #include "yb/rocksdb/util/compression.h" |
103 | | #include "yb/rocksdb/util/crc32c.h" |
104 | | #include "yb/rocksdb/util/file_reader_writer.h" |
105 | | #include "yb/rocksdb/util/file_util.h" |
106 | | #include "yb/rocksdb/util/log_buffer.h" |
107 | | #include "yb/rocksdb/util/logging.h" |
108 | | #include "yb/rocksdb/util/mutexlock.h" |
109 | | #include "yb/rocksdb/util/sst_file_manager_impl.h" |
110 | | #include "yb/rocksdb/util/options_helper.h" |
111 | | #include "yb/rocksdb/util/options_parser.h" |
112 | | #include "yb/rocksdb/util/perf_context_imp.h" |
113 | | #include "yb/rocksdb/util/stop_watch.h" |
114 | | #include "yb/rocksdb/util/sync_point.h" |
115 | | #include "yb/rocksdb/util/xfunc.h" |
116 | | #include "yb/rocksdb/db/db_iterator_wrapper.h" |
117 | | |
118 | | #include "yb/util/status_log.h" |
119 | | #include "yb/util/stats/iostats_context_imp.h" |
120 | | |
121 | | using namespace std::literals; |
122 | | |
123 | | DEFINE_bool(dump_dbimpl_info, false, "Dump RocksDB info during constructor."); |
124 | | DEFINE_bool(flush_rocksdb_on_shutdown, true, |
125 | | "Safely flush RocksDB when instance is destroyed, disabled for crash tests."); |
126 | | DEFINE_double(fault_crash_after_rocksdb_flush, 0.0, |
127 | | "Fraction of time to crash right after a successful RocksDB flush in tests."); |
128 | | |
129 | | DEFINE_bool(use_priority_thread_pool_for_flushes, false, |
130 | | "When true priority thread pool will be used for flushes, otherwise " |
131 | | "Env thread pool with Priority::HIGH will be used."); |
132 | | TAG_FLAG(use_priority_thread_pool_for_flushes, runtime); |
133 | | |
134 | | DEFINE_bool(use_priority_thread_pool_for_compactions, true, |
135 | | "When true priority thread pool will be used for compactions, otherwise " |
136 | | "Env thread pool with Priority::LOW will be used."); |
137 | | TAG_FLAG(use_priority_thread_pool_for_compactions, runtime); |
138 | | |
139 | | DEFINE_int32(compaction_priority_start_bound, 10, |
140 | | "Compaction task of DB that has number of SST files less than specified will have " |
141 | | "priority 0."); |
142 | | |
143 | | DEFINE_int32(compaction_priority_step_size, 5, |
144 | | "Compaction task of DB that has number of SST files greater that " |
145 | | "compaction_priority_start_bound will get 1 extra priority per every " |
146 | | "compaction_priority_step_size files."); |
147 | | |
148 | | DEFINE_int32(small_compaction_extra_priority, 1, |
149 | | "Small compaction will get small_compaction_extra_priority extra priority."); |
150 | | |
151 | | DEFINE_bool(rocksdb_use_logging_iterator, false, |
152 | | "Wrap newly created RocksDB iterators in a logging wrapper"); |
153 | | |
154 | | DEFINE_test_flag(int32, max_write_waiters, std::numeric_limits<int32_t>::max(), |
155 | | "Max allowed number of write waiters per RocksDB instance in tests."); |
156 | | |
157 | | namespace rocksdb { |
158 | | |
159 | | namespace { |
160 | | |
161 | | std::unique_ptr<Compaction> PopFirstFromCompactionQueue( |
162 | 17.7k | std::deque<std::unique_ptr<Compaction>>* queue) { |
163 | 17.7k | DCHECK(!queue->empty()); |
164 | 17.7k | auto c = std::move(queue->front()); |
165 | 17.7k | ColumnFamilyData* cfd = c->column_family_data(); |
166 | 17.7k | queue->pop_front(); |
167 | 17.7k | DCHECK(cfd->pending_compaction()); |
168 | 17.7k | cfd->set_pending_compaction(false); |
169 | 17.7k | return c; |
170 | 17.7k | } |
171 | | |
172 | 646k | void ClearCompactionQueue(std::deque<std::unique_ptr<Compaction>>* queue) { |
173 | 646k | while (!queue->empty()) { |
174 | 11 | auto c = PopFirstFromCompactionQueue(queue); |
175 | 11 | c->ReleaseCompactionFiles(STATUS(Incomplete, "DBImpl destroyed before compaction scheduled")); |
176 | 11 | auto cfd = c->column_family_data(); |
177 | 11 | c.reset(); |
178 | 11 | if (cfd->Unref()) { |
179 | 0 | delete cfd; |
180 | 0 | } |
181 | 11 | } |
182 | 646k | } |
183 | | |
184 | | } // namespace |
185 | | |
186 | | const char kDefaultColumnFamilyName[] = "default"; |
187 | | |
188 | | struct DBImpl::WriteContext { |
189 | | boost::container::small_vector<std::unique_ptr<SuperVersion>, 8> superversions_to_free_; |
190 | | autovector<MemTable*> memtables_to_free_; |
191 | | |
192 | 23.0M | ~WriteContext() { |
193 | 0 | for (auto& m : memtables_to_free_) { |
194 | 0 | delete m; |
195 | 0 | } |
196 | 23.0M | } |
197 | | }; |
198 | | |
199 | | YB_DEFINE_ENUM(BgTaskType, (kFlush)(kCompaction)); |
200 | | |
201 | | class DBImpl::ThreadPoolTask : public yb::PriorityThreadPoolTask { |
202 | | public: |
203 | 471 | explicit ThreadPoolTask(DBImpl* db_impl) : db_impl_(db_impl) {} |
204 | | |
205 | 471 | void Run(const Status& status, yb::PriorityThreadPoolSuspender* suspender) override { |
206 | 471 | if (!status.ok()) { |
207 | 1 | LOG_WITH_PREFIX(INFO) << "Task cancelled " << ToString() << ": " << status; |
208 | 1 | InstrumentedMutexLock lock(&db_impl_->mutex_); |
209 | 1 | AbortedUnlocked(status); |
210 | 1 | return; // Failed to schedule, could just drop compaction. |
211 | 1 | } |
212 | 470 | DoRun(suspender); |
213 | 470 | } |
214 | | |
215 | | virtual BgTaskType Type() const = 0; |
216 | | |
217 | | virtual int Priority() const = 0; |
218 | | |
219 | | virtual void AbortedUnlocked(const Status& status) = 0; |
220 | | |
221 | | virtual void DoRun(yb::PriorityThreadPoolSuspender* suspender) = 0; |
222 | | |
223 | | // Tries to recalculate and update task priority, returns true if priority was updated. |
224 | | virtual bool UpdatePriority() = 0; |
225 | | |
226 | 1 | const std::string& LogPrefix() const { |
227 | 1 | return db_impl_->LogPrefix(); |
228 | 1 | } |
229 | | |
230 | | protected: |
231 | | DBImpl* const db_impl_; |
232 | | }; |
233 | | |
234 | | constexpr int kShuttingDownPriority = 200; |
235 | | constexpr int kFlushPriority = 100; |
236 | | constexpr int kNoJobId = -1; |
237 | | |
238 | | class DBImpl::CompactionTask : public ThreadPoolTask { |
239 | | public: |
240 | | CompactionTask(DBImpl* db_impl, DBImpl::ManualCompaction* manual_compaction) |
241 | | : ThreadPoolTask(db_impl), manual_compaction_(manual_compaction), |
242 | 94 | compaction_(manual_compaction->compaction.get()), priority_(CalcPriority()) { |
243 | 94 | db_impl->mutex_.AssertHeld(); |
244 | 94 | SetFileAndByteCount(); |
245 | 94 | } |
246 | | |
247 | | CompactionTask(DBImpl* db_impl, std::unique_ptr<Compaction> compaction) |
248 | | : ThreadPoolTask(db_impl), manual_compaction_(nullptr), |
249 | | compaction_holder_(std::move(compaction)), compaction_(compaction_holder_.get()), |
250 | 346 | priority_(CalcPriority()) { |
251 | 346 | db_impl->mutex_.AssertHeld(); |
252 | 346 | SetFileAndByteCount(); |
253 | 346 | } |
254 | | |
255 | 59 | bool ShouldRemoveWithKey(void* key) override { |
256 | 59 | return key == db_impl_; |
257 | 59 | } |
258 | | |
259 | 439 | void DoRun(yb::PriorityThreadPoolSuspender* suspender) override { |
260 | 439 | compaction_->SetSuspender(suspender); |
261 | 439 | db_impl_->BackgroundCallCompaction(manual_compaction_, std::move(compaction_holder_), this); |
262 | 439 | } |
263 | | |
264 | 1 | void AbortedUnlocked(const Status& status) override { |
265 | 1 | db_impl_->mutex_.AssertHeld(); |
266 | 1 | if (!manual_compaction_) { |
267 | | // This corresponds to cfd->Ref() inside DBImpl::AddToCompactionQueue that is |
268 | | // unreferenced by DBImpl::BackgroundCompaction in normal workflow, but in case of cancelling |
269 | | // compaction task we don't get there. |
270 | | // Since DBImpl::AddToCompactionQueue calls Ref only for non-manual compactions, we should |
271 | | // do the same here too. |
272 | | // TODO: https://github.com/yugabyte/yugabyte-db/issues/8578 |
273 | 0 | auto cfd = compaction_->column_family_data(); |
274 | 0 | if (cfd->Unref()) { |
275 | 0 | delete cfd; |
276 | 0 | } |
277 | 1 | } else { |
278 | 1 | if (!manual_compaction_->done) { |
279 | 1 | manual_compaction_->in_progress = false; |
280 | 1 | manual_compaction_->done = true; |
281 | 1 | manual_compaction_->status = status; |
282 | 1 | } |
283 | 1 | } |
284 | 1 | compaction_->ReleaseCompactionFiles(status); |
285 | 0 | LOG_IF_WITH_PREFIX(DFATAL, db_impl_->compaction_tasks_.erase(this) != 1) |
286 | 0 | << "Aborted unknown compaction task: " << SerialNo(); |
287 | 1 | if (db_impl_->compaction_tasks_.empty()) { |
288 | 1 | db_impl_->bg_cv_.SignalAll(); |
289 | 1 | } |
290 | 1 | } |
291 | | |
292 | 440 | BgTaskType Type() const override { |
293 | 440 | return BgTaskType::kCompaction; |
294 | 440 | } |
295 | | |
296 | 343 | std::string ToString() const override { |
297 | 343 | int job_id_value = job_id_.Load(); |
298 | 343 | return yb::Format( |
299 | 343 | "{ compact db: $0 is_manual: $1 serial_no: $2 job_id: $3}", db_impl_->GetName(), |
300 | 343 | manual_compaction_ != nullptr, SerialNo(), |
301 | 243 | ((job_id_value == kNoJobId) ? "None" : std::to_string(job_id_value))); |
302 | 343 | } |
303 | | |
304 | 1.45k | yb::CompactionInfo GetFileAndByteInfoIfCompaction() const override { |
305 | 1.45k | return yb::CompactionInfo{file_count_, byte_count_}; |
306 | 1.45k | } |
307 | | |
308 | 439 | void SetJobID(JobContext* job_context) { |
309 | 439 | job_id_.Store(job_context->job_id); |
310 | 439 | } |
311 | | |
312 | 2.85k | bool UpdatePriority() override { |
313 | 2.85k | db_impl_->mutex_.AssertHeld(); |
314 | | |
315 | | // Task already complete. |
316 | 2.85k | if (compaction_ == nullptr) { |
317 | 449 | return false; |
318 | 449 | } |
319 | | |
320 | 2.41k | auto new_priority = CalcPriority(); |
321 | 2.41k | if (new_priority != priority_) { |
322 | 568 | priority_ = new_priority; |
323 | 568 | return true; |
324 | 568 | } |
325 | 1.84k | return false; |
326 | 1.84k | } |
327 | | |
328 | 436 | void Complete() { |
329 | 436 | db_impl_->mutex_.AssertHeld(); |
330 | 436 | compaction_ = nullptr; |
331 | 436 | } |
332 | | |
333 | 1.00k | int Priority() const override { |
334 | 1.00k | return priority_; |
335 | 1.00k | } |
336 | | |
337 | | private: |
338 | 2.85k | int CalcPriority() const { |
339 | 2.85k | db_impl_->mutex_.AssertHeld(); |
340 | | |
341 | 2.85k | if (db_impl_->IsShuttingDown()) { |
342 | 2 | return kShuttingDownPriority; |
343 | 2 | } |
344 | | |
345 | 2.84k | auto* current_version = compaction_->column_family_data()->GetSuperVersion()->current; |
346 | 2.84k | auto num_files = current_version->storage_info()->l0_delay_trigger_count(); |
347 | | |
348 | 2.84k | int result = 0; |
349 | 2.84k | if (num_files >= FLAGS_compaction_priority_start_bound) { |
350 | 1.99k | result = |
351 | 1.99k | 1 + |
352 | 1.99k | (num_files - FLAGS_compaction_priority_start_bound) / FLAGS_compaction_priority_step_size; |
353 | 1.99k | } |
354 | | |
355 | 2.84k | if (!db_impl_->IsLargeCompaction(*compaction_)) { |
356 | 2.84k | result += FLAGS_small_compaction_extra_priority; |
357 | 2.84k | } |
358 | | |
359 | 2.84k | return result; |
360 | 2.84k | } |
361 | | |
362 | 440 | void SetFileAndByteCount() { |
363 | 440 | size_t levels = compaction_->num_input_levels(); |
364 | 440 | uint64_t file_count = 0; |
365 | 880 | for (size_t i = 0; i < levels; i++) { |
366 | 440 | file_count += compaction_->num_input_files(i); |
367 | 440 | } |
368 | 440 | file_count_ = file_count; |
369 | 440 | byte_count_ = compaction_->CalculateTotalInputSize(); |
370 | 440 | } |
371 | | |
372 | | DBImpl::ManualCompaction* const manual_compaction_; |
373 | | std::unique_ptr<Compaction> compaction_holder_; |
374 | | Compaction* compaction_; |
375 | | int priority_; |
376 | | yb::AtomicInt<int> job_id_{kNoJobId}; |
377 | | uint64_t file_count_; |
378 | | uint64_t byte_count_; |
379 | | }; |
380 | | |
381 | | class DBImpl::FlushTask : public ThreadPoolTask { |
382 | | public: |
383 | | FlushTask(DBImpl* db_impl, ColumnFamilyData* cfd) |
384 | 31 | : ThreadPoolTask(db_impl), cfd_(cfd) {} |
385 | | |
386 | 1 | bool ShouldRemoveWithKey(void* key) override { |
387 | 1 | return key == db_impl_ && db_impl_->disable_flush_on_shutdown_; |
388 | 1 | } |
389 | | |
390 | 31 | void DoRun(yb::PriorityThreadPoolSuspender* suspender) override { |
391 | | // Since flush tasks has highest priority we could don't use suspender for them. |
392 | 31 | db_impl_->BackgroundCallFlush(cfd_); |
393 | 31 | } |
394 | | |
395 | 31 | int Priority() const override { |
396 | 31 | return kFlushPriority; |
397 | 31 | } |
398 | | |
399 | 0 | void AbortedUnlocked(const Status& status) override { |
400 | 0 | db_impl_->mutex_.AssertHeld(); |
401 | 0 | cfd_->set_pending_flush(false); |
402 | 0 | if (cfd_->Unref()) { |
403 | 0 | delete cfd_; |
404 | 0 | } |
405 | 0 | if (--db_impl_->bg_flush_scheduled_ == 0) { |
406 | 0 | db_impl_->bg_cv_.SignalAll(); |
407 | 0 | } |
408 | 0 | } |
409 | | |
410 | 31 | BgTaskType Type() const override { |
411 | 31 | return BgTaskType::kFlush; |
412 | 31 | } |
413 | | |
414 | 0 | bool UpdatePriority() override { |
415 | 0 | return false; |
416 | 0 | } |
417 | | |
418 | 1 | std::string ToString() const override { |
419 | 1 | return yb::Format("{ flush db: $0 serial_no: $1 }", db_impl_->GetName(), SerialNo()); |
420 | 1 | } |
421 | | |
422 | | private: |
423 | | ColumnFamilyData* cfd_; |
424 | | }; |
425 | | |
426 | | // Utility class to update task priority. |
427 | | // We use two phase update to avoid calling thread pool while holding the mutex. |
428 | | class DBImpl::TaskPriorityUpdater { |
429 | | public: |
430 | | explicit TaskPriorityUpdater(DBImpl* db) |
431 | | : db_(db), |
432 | | priority_thread_pool_for_compactions_and_flushes_( |
433 | 766k | db_->db_options_.priority_thread_pool_for_compactions_and_flushes) {} |
434 | | |
435 | 766k | void Prepare() { |
436 | 766k | db_->mutex_.AssertHeld(); |
437 | 2.85k | for (auto* task : db_->compaction_tasks_) { |
438 | 2.85k | if (task->UpdatePriority()) { |
439 | 568 | update_priorities_request_.push_back({task->SerialNo(), task->Priority()}); |
440 | 568 | } |
441 | 2.85k | } |
442 | 766k | db_ = nullptr; |
443 | 766k | } |
444 | | |
445 | 0 | bool Empty() const { |
446 | 0 | return update_priorities_request_.empty(); |
447 | 0 | } |
448 | | |
449 | 766k | void Apply() { |
450 | 568 | for (const auto& entry : update_priorities_request_) { |
451 | 568 | priority_thread_pool_for_compactions_and_flushes_->ChangeTaskPriority( |
452 | 568 | entry.task_serial_no, entry.new_priority); |
453 | 568 | } |
454 | 766k | } |
455 | | |
456 | | private: |
457 | | DBImpl* db_; |
458 | | yb::PriorityThreadPool* priority_thread_pool_for_compactions_and_flushes_; |
459 | | boost::container::small_vector<TaskPriorityChange, 8> update_priorities_request_; |
460 | | }; |
461 | | |
462 | | Options SanitizeOptions(const std::string& dbname, |
463 | | const InternalKeyComparator* icmp, |
464 | 346k | const Options& src) { |
465 | 346k | auto db_options = SanitizeOptions(dbname, DBOptions(src)); |
466 | 346k | auto cf_options = SanitizeOptions(db_options, icmp, ColumnFamilyOptions(src)); |
467 | 346k | return Options(db_options, cf_options); |
468 | 346k | } |
469 | | |
470 | 688k | DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { |
471 | 688k | DBOptions result = src; |
472 | | |
473 | | // result.max_open_files means an "infinite" open files. |
474 | 688k | if (result.max_open_files != -1) { |
475 | 688k | int max_max_open_files = port::GetMaxOpenFiles(); |
476 | 688k | if (max_max_open_files == -1) { |
477 | 0 | max_max_open_files = 1000000; |
478 | 0 | } |
479 | 688k | ClipToRange(&result.max_open_files, 20, max_max_open_files); |
480 | 688k | } |
481 | | |
482 | 688k | if (result.info_log == nullptr) { |
483 | 21.0k | Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); |
484 | 21.0k | if (!s.ok()) { |
485 | | // No place suitable for logging |
486 | 2 | result.info_log = nullptr; |
487 | 2 | } |
488 | 21.0k | } |
489 | 688k | if (result.base_background_compactions == -1) { |
490 | 21.0k | result.base_background_compactions = result.max_background_compactions; |
491 | 21.0k | } |
492 | 688k | if (result.base_background_compactions > result.max_background_compactions) { |
493 | 1 | result.base_background_compactions = result.max_background_compactions; |
494 | 1 | } |
495 | 688k | if (result.base_background_compactions == 1) { |
496 | 20.9k | result.num_reserved_small_compaction_threads = 0; |
497 | 20.9k | } |
498 | 688k | if (result.num_reserved_small_compaction_threads == -1 || |
499 | 667k | result.num_reserved_small_compaction_threads >= result.base_background_compactions) { |
500 | 667k | result.num_reserved_small_compaction_threads = result.base_background_compactions - 1; |
501 | 667k | } |
502 | 688k | result.env->IncBackgroundThreadsIfNeeded( |
503 | 688k | src.max_background_compactions, Env::Priority::LOW); |
504 | 688k | result.env->IncBackgroundThreadsIfNeeded( |
505 | 688k | src.max_background_flushes, Env::Priority::HIGH); |
506 | | |
507 | 688k | if (result.rate_limiter.get() != nullptr) { |
508 | 667k | if (result.bytes_per_sync == 0) { |
509 | 667k | result.bytes_per_sync = 1024 * 1024; |
510 | 667k | } |
511 | 667k | } |
512 | | |
513 | 688k | if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { |
514 | 147 | result.recycle_log_file_num = false; |
515 | 147 | } |
516 | | |
517 | 688k | if (result.wal_dir.empty()) { |
518 | | // Use dbname as default |
519 | 687k | result.wal_dir = dbname; |
520 | 687k | } |
521 | 688k | if (result.wal_dir.back() == '/') { |
522 | 5 | result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); |
523 | 5 | } |
524 | | |
525 | 688k | if (result.db_paths.size() == 0) { |
526 | 688k | result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max()); |
527 | 688k | } |
528 | | |
529 | 688k | if (result.compaction_readahead_size > 0) { |
530 | 272 | result.new_table_reader_for_compaction_inputs = true; |
531 | 272 | } |
532 | | |
533 | 688k | return result; |
534 | 688k | } |
535 | | |
536 | | namespace { |
537 | | |
538 | | Status SanitizeOptionsByTable( |
539 | | const DBOptions& db_opts, |
540 | 341k | const std::vector<ColumnFamilyDescriptor>& column_families) { |
541 | 341k | Status s; |
542 | 344k | for (auto cf : column_families) { |
543 | 344k | s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); |
544 | 344k | if (!s.ok()) { |
545 | 1 | return s; |
546 | 1 | } |
547 | 344k | } |
548 | 341k | return Status::OK(); |
549 | 341k | } |
550 | | |
551 | 35.5k | CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { |
552 | | // Compressing memtable flushes might not help unless the sequential load |
553 | | // optimization is used for leveled compaction. Otherwise the CPU and |
554 | | // latency overhead is not offset by saving much space. |
555 | | |
556 | 35.5k | bool can_compress; |
557 | | |
558 | 35.5k | if (ioptions.compaction_style == kCompactionStyleUniversal) { |
559 | 19.8k | can_compress = |
560 | 19.8k | (ioptions.compaction_options_universal.compression_size_percent < 0); |
561 | 15.7k | } else { |
562 | | // For leveled compress when min_level_to_compress == 0. |
563 | 15.7k | can_compress = ioptions.compression_per_level.empty() || |
564 | 991 | ioptions.compression_per_level[0] != kNoCompression; |
565 | 15.7k | } |
566 | | |
567 | 35.5k | if (can_compress) { |
568 | 34.3k | return ioptions.compression; |
569 | 1.17k | } else { |
570 | 1.17k | return kNoCompression; |
571 | 1.17k | } |
572 | 35.5k | } |
573 | | |
574 | 0 | void DumpSupportInfo(Logger* logger) { |
575 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "Compression algorithms supported:"); |
576 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tSnappy supported: %d", |
577 | 0 | Snappy_Supported()); |
578 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tZlib supported: %d", |
579 | 0 | Zlib_Supported()); |
580 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tBzip supported: %d", |
581 | 0 | BZip2_Supported()); |
582 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "\tLZ4 supported: %d", LZ4_Supported()); |
583 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, logger, "Fast CRC32 supported: %d", |
584 | 0 | crc32c::IsFastCrc32Supported()); |
585 | 0 | } |
586 | | |
587 | | } // namespace |
588 | | |
589 | | DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) |
590 | | : env_(options.env), |
591 | | checkpoint_env_(options.get_checkpoint_env()), |
592 | | dbname_(dbname), |
593 | | db_options_(SanitizeOptions(dbname, options)), |
594 | | stats_(db_options_.statistics.get()), |
595 | | db_lock_(nullptr), |
596 | | mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, options.use_adaptive_mutex), |
597 | | shutting_down_(false), |
598 | | bg_cv_(&mutex_), |
599 | | logfile_number_(0), |
600 | | log_dir_synced_(false), |
601 | | log_empty_(true), |
602 | | default_cf_handle_(nullptr), |
603 | | log_sync_cv_(&mutex_), |
604 | | total_log_size_(0), |
605 | | max_total_in_memory_state_(0), |
606 | | is_snapshot_supported_(true), |
607 | | write_buffer_(options.db_write_buffer_size, options.memory_monitor), |
608 | | write_thread_(options.enable_write_thread_adaptive_yield |
609 | | ? options.write_thread_max_yield_usec |
610 | | : 0, |
611 | | options.write_thread_slow_yield_usec), |
612 | | write_controller_(options.delayed_write_rate), |
613 | | last_batch_group_size_(0), |
614 | | unscheduled_flushes_(0), |
615 | | unscheduled_compactions_(0), |
616 | | bg_compaction_scheduled_(0), |
617 | | num_total_running_compactions_(0), |
618 | | num_running_large_compactions_(0), |
619 | | bg_flush_scheduled_(0), |
620 | | num_running_flushes_(0), |
621 | | disable_delete_obsolete_files_(0), |
622 | | delete_obsolete_files_next_run_( |
623 | | options.env->NowMicros() + |
624 | | db_options_.delete_obsolete_files_period_micros), |
625 | | last_stats_dump_time_microsec_(0), |
626 | | next_job_id_(1), |
627 | | has_unpersisted_data_(false), |
628 | | env_options_(db_options_), |
629 | | #ifndef ROCKSDB_LITE |
630 | | wal_manager_(db_options_, env_options_), |
631 | | #endif // ROCKSDB_LITE |
632 | | event_logger_(db_options_.info_log.get()), |
633 | | bg_work_paused_(0), |
634 | | bg_compaction_paused_(0), |
635 | | refitting_level_(false), |
636 | 341k | opened_successfully_(false) { |
637 | 341k | CHECK_OK(env_->GetAbsolutePath(dbname, &db_absolute_path_)); |
638 | | |
639 | | // Reserve ten files or so for other uses and give the rest to TableCache. |
640 | | // Give a large number for setting of "infinite" open files. |
641 | 341k | const int table_cache_size = (db_options_.max_open_files == -1) ? |
642 | 341k | 4194304 : db_options_.max_open_files - 10; |
643 | 341k | table_cache_ = |
644 | 341k | NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits); |
645 | | |
646 | 341k | versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, |
647 | 341k | table_cache_.get(), &write_buffer_, |
648 | 341k | &write_controller_)); |
649 | 341k | pending_outputs_ = std::make_unique<FileNumbersProvider>(versions_.get()); |
650 | 341k | column_family_memtables_.reset( |
651 | 341k | new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); |
652 | | |
653 | 341k | if (FLAGS_dump_dbimpl_info) { |
654 | 0 | DumpDBFileSummary(db_options_, dbname_); |
655 | 0 | db_options_.Dump(db_options_.info_log.get()); |
656 | 0 | DumpSupportInfo(db_options_.info_log.get()); |
657 | 0 | } |
658 | 341k | } |
659 | | |
660 | | // Will lock the mutex_, will wait for completion if wait is true |
661 | 40 | void DBImpl::CancelAllBackgroundWork(bool wait) { |
662 | 40 | InstrumentedMutexLock l(&mutex_); |
663 | 40 | shutting_down_.store(true, std::memory_order_release); |
664 | 40 | bg_cv_.SignalAll(); |
665 | 40 | if (!wait) { |
666 | 3 | return; |
667 | 3 | } |
668 | | // Wait for background work to finish |
669 | 37 | while (CheckBackgroundWorkAndLog("Cancel")) { |
670 | 0 | bg_cv_.Wait(); |
671 | 0 | } |
672 | 37 | } |
673 | | |
674 | 323k | bool DBImpl::CheckBackgroundWorkAndLog(const char* prefix) const { |
675 | 323k | if (bg_compaction_scheduled_ || bg_flush_scheduled_ || !compaction_tasks_.empty()) { |
676 | 231 | LOG_WITH_PREFIX(INFO) |
677 | 231 | << prefix << " waiting for " << bg_compaction_scheduled_ << " scheduled compactions, " |
678 | 231 | << compaction_tasks_.size() << " compaction tasks and " |
679 | 231 | << bg_flush_scheduled_ << " flushes"; |
680 | 231 | return true; |
681 | 231 | } |
682 | 323k | return false; |
683 | 323k | } |
684 | | |
685 | 632k | void DBImpl::StartShutdown() { |
686 | 632k | bool expected = false; |
687 | 632k | if (!shutting_down_.compare_exchange_strong(expected, true)) { |
688 | 310k | return; |
689 | 310k | } |
690 | | |
691 | 322k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Shutting down RocksDB at: %s\n", |
692 | 322k | dbname_.c_str()); |
693 | | |
694 | 322k | bg_cv_.SignalAll(); |
695 | | |
696 | 322k | TaskPriorityUpdater task_priority_updater(this); |
697 | 322k | { |
698 | 322k | InstrumentedMutexLock lock(&mutex_); |
699 | 322k | task_priority_updater.Prepare(); |
700 | 322k | } |
701 | 322k | task_priority_updater.Apply(); |
702 | 322k | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
703 | 310k | db_options_.priority_thread_pool_for_compactions_and_flushes->Remove(this); |
704 | 310k | } |
705 | 322k | } |
706 | | |
707 | 323k | DBImpl::~DBImpl() { |
708 | 323k | StartShutdown(); |
709 | | |
710 | 323k | TaskPriorityUpdater task_priority_updater(this); |
711 | 323k | { |
712 | 323k | InstrumentedMutexLock lock(&mutex_); |
713 | | |
714 | 323k | if (has_unpersisted_data_) { |
715 | 40.9k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
716 | 40.9k | if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) { |
717 | 40.6k | cfd->Ref(); |
718 | 40.6k | mutex_.Unlock(); |
719 | 40.6k | if (disable_flush_on_shutdown_) { |
720 | 40.0k | LOG_WITH_PREFIX(INFO) << "Skipping mem table flush - disable_flush_on_shutdown_ is set"; |
721 | 664 | } else if (FLAGS_flush_rocksdb_on_shutdown) { |
722 | 617 | LOG_WITH_PREFIX(INFO) << "Flushing mem table on shutdown"; |
723 | 617 | CHECK_OK(FlushMemTable(cfd, FlushOptions())); |
724 | 47 | } else { |
725 | 47 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
726 | 47 | "Skipping mem table flush - flush_rocksdb_on_shutdown is unset"); |
727 | 47 | } |
728 | 40.6k | mutex_.Lock(); |
729 | 40.6k | cfd->Unref(); |
730 | 40.6k | } |
731 | 40.9k | } |
732 | 40.9k | versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); |
733 | 40.9k | } |
734 | 323k | task_priority_updater.Prepare(); |
735 | 323k | } |
736 | | |
737 | 323k | task_priority_updater.Apply(); |
738 | | |
739 | 323k | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
740 | 310k | db_options_.priority_thread_pool_for_compactions_and_flushes->Remove(this); |
741 | 310k | } |
742 | | |
743 | 323k | int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); |
744 | 323k | int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); |
745 | | |
746 | 323k | mutex_.Lock(); |
747 | 323k | bg_compaction_scheduled_ -= compactions_unscheduled; |
748 | 323k | bg_flush_scheduled_ -= flushes_unscheduled; |
749 | | |
750 | | // Wait for background work to finish |
751 | 323k | while (CheckBackgroundWorkAndLog("Shutdown")) { |
752 | | // Use timed wait for periodic status logging. |
753 | 231 | bg_cv_.TimedWait(env_->NowMicros() + yb::ToMicroseconds(5s)); |
754 | 231 | } |
755 | 323k | flush_scheduler_.Clear(); |
756 | | |
757 | 323k | while (!flush_queue_.empty()) { |
758 | 13 | auto cfd = PopFirstFromFlushQueue(); |
759 | 13 | if (cfd->Unref()) { |
760 | 0 | delete cfd; |
761 | 0 | } |
762 | 13 | } |
763 | | |
764 | 323k | ClearCompactionQueue(&small_compaction_queue_); |
765 | 323k | ClearCompactionQueue(&large_compaction_queue_); |
766 | | |
767 | 323k | if (default_cf_handle_ != nullptr) { |
768 | | // we need to delete handle outside of lock because it does its own locking |
769 | 323k | mutex_.Unlock(); |
770 | 323k | delete default_cf_handle_; |
771 | 323k | mutex_.Lock(); |
772 | 323k | } |
773 | | |
774 | | // Clean up obsolete files due to SuperVersion release. |
775 | | // (1) Need to delete to obsolete files before closing because RepairDB() |
776 | | // scans all existing files in the file system and builds manifest file. |
777 | | // Keeping obsolete files confuses the repair process. |
778 | | // (2) Need to check if we Open()/Recover() the DB successfully before |
779 | | // deleting because if VersionSet recover fails (may be due to corrupted |
780 | | // manifest file), it is not able to identify live files correctly. As a |
781 | | // result, all "live" files can get deleted by accident. However, corrupted |
782 | | // manifest is recoverable by RepairDB(). |
783 | 323k | if (opened_successfully_) { |
784 | 323k | JobContext job_context(next_job_id_.fetch_add(1)); |
785 | 323k | FindObsoleteFiles(&job_context, true); |
786 | | |
787 | 323k | mutex_.Unlock(); |
788 | | // manifest number starting from 2 |
789 | 323k | job_context.manifest_file_number = 1; |
790 | 323k | if (job_context.HaveSomethingToDelete()) { |
791 | 322k | PurgeObsoleteFiles(job_context); |
792 | 322k | } |
793 | 323k | job_context.Clean(); |
794 | 323k | mutex_.Lock(); |
795 | 323k | } |
796 | | |
797 | 0 | for (auto l : logs_to_free_) { |
798 | 0 | delete l; |
799 | 0 | } |
800 | 323k | for (auto& log : logs_) { |
801 | 323k | log.ClearWriter(); |
802 | 323k | } |
803 | 323k | logs_.clear(); |
804 | | |
805 | | // versions need to be destroyed before table_cache since it can hold |
806 | | // references to table_cache. |
807 | 323k | versions_.reset(); |
808 | 323k | mutex_.Unlock(); |
809 | 323k | if (db_lock_ != nullptr) { |
810 | 323k | CHECK_OK(env_->UnlockFile(db_lock_)); |
811 | 323k | } |
812 | | |
813 | 323k | LogFlush(db_options_.info_log); |
814 | | |
815 | 323k | LOG_WITH_PREFIX(INFO) << "Shutdown done"; |
816 | 323k | } |
817 | | |
818 | 333k | Status DBImpl::NewDB() { |
819 | 333k | VersionEdit new_db; |
820 | 333k | new_db.InitNewDB(); |
821 | 333k | new_db.SetLastSequence(db_options_.initial_seqno); |
822 | | |
823 | 333k | Status s; |
824 | | |
825 | 333k | RLOG(InfoLogLevel::INFO_LEVEL, |
826 | 333k | db_options_.info_log, "Creating manifest 1 \n"); |
827 | 333k | const std::string manifest = DescriptorFileName(dbname_, 1); |
828 | 333k | { |
829 | 333k | unique_ptr<WritableFile> file; |
830 | 333k | EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); |
831 | 333k | s = NewWritableFile(env_, manifest, &file, env_options); |
832 | 333k | if (!s.ok()) { |
833 | 0 | return s; |
834 | 0 | } |
835 | 333k | file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); |
836 | 333k | unique_ptr<WritableFileWriter> file_writer( |
837 | 333k | new WritableFileWriter(std::move(file), env_options)); |
838 | 333k | log::Writer log(std::move(file_writer), 0, false); |
839 | 333k | std::string record; |
840 | 333k | new_db.AppendEncodedTo(&record); |
841 | 333k | s = log.AddRecord(record); |
842 | 333k | if (s.ok()) { |
843 | 333k | s = SyncManifest(env_, &db_options_, log.file()); |
844 | 333k | } |
845 | 333k | } |
846 | 333k | if (s.ok()) { |
847 | | // Make "CURRENT" file that points to the new manifest file. |
848 | 332k | s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir(), db_options_.disableDataSync); |
849 | 165 | } else { |
850 | 165 | env_->CleanupFile(manifest); |
851 | 165 | } |
852 | 333k | return s; |
853 | 333k | } |
854 | | |
855 | 2.95M | void DBImpl::MaybeIgnoreError(Status* s) const { |
856 | 2.95M | if (s->ok() || db_options_.paranoid_checks) { |
857 | | // No change needed |
858 | 0 | } else { |
859 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, |
860 | 0 | db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); |
861 | 0 | *s = Status::OK(); |
862 | 0 | } |
863 | 2.95M | } |
864 | | |
865 | 341k | const Status DBImpl::CreateArchivalDirectory() { |
866 | 341k | if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { |
867 | 98 | std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); |
868 | 98 | return env_->CreateDirIfMissing(archivalPath); |
869 | 98 | } |
870 | 341k | return Status::OK(); |
871 | 341k | } |
872 | | |
873 | | // * Returns the list of live files in 'sst_live' |
874 | | // If it's doing full scan: |
875 | | // * Returns the list of all files in the filesystem in |
876 | | // 'full_scan_candidate_files'. |
877 | | // Otherwise, gets obsolete files from VersionSet. |
878 | | // no_full_scan = true -- never do the full scan using GetChildren() |
879 | | // force = false -- don't force the full scan, except every |
880 | | // db_options_.delete_obsolete_files_period_micros |
881 | | // force = true -- force the full scan |
882 | | void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, |
883 | 1.47M | bool no_full_scan) { |
884 | 1.47M | mutex_.AssertHeld(); |
885 | | |
886 | | // if deletion is disabled, do nothing |
887 | 1.47M | if (disable_delete_obsolete_files_ > 0) { |
888 | 745 | return; |
889 | 745 | } |
890 | | |
891 | 1.47M | bool doing_the_full_scan = false; |
892 | | |
893 | | // logic for figurint out if we're doing the full scan |
894 | 1.47M | if (no_full_scan) { |
895 | 671k | doing_the_full_scan = false; |
896 | 803k | } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { |
897 | 684k | doing_the_full_scan = true; |
898 | 119k | } else { |
899 | 119k | const uint64_t now_micros = env_->NowMicros(); |
900 | 119k | if (delete_obsolete_files_next_run_ < now_micros) { |
901 | 0 | doing_the_full_scan = true; |
902 | 0 | delete_obsolete_files_next_run_ = |
903 | 0 | now_micros + db_options_.delete_obsolete_files_period_micros; |
904 | 0 | } |
905 | 119k | } |
906 | | |
907 | | // Get obsolete files. This function will also update the list of |
908 | | // pending files in VersionSet(). |
909 | 1.47M | versions_->GetObsoleteFiles(*pending_outputs_, |
910 | 1.47M | &job_context->sst_delete_files, |
911 | 1.47M | &job_context->manifest_delete_files); |
912 | | |
913 | | // store the current filenum, lognum, etc |
914 | 1.47M | job_context->manifest_file_number = versions_->manifest_file_number(); |
915 | 1.47M | job_context->pending_manifest_file_number = |
916 | 1.47M | versions_->pending_manifest_file_number(); |
917 | 1.47M | job_context->log_number = versions_->MinLogNumber(); |
918 | 1.47M | job_context->prev_log_number = versions_->prev_log_number(); |
919 | | |
920 | 1.47M | versions_->AddLiveFiles(&job_context->sst_live); |
921 | 1.47M | if (doing_the_full_scan) { |
922 | 684k | InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); |
923 | 1.36M | for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { |
924 | | // set of all files in the directory. We'll exclude files that are still |
925 | | // alive in the subsequent processings. |
926 | 684k | std::vector<std::string> files; |
927 | 684k | env_->GetChildrenWarnNotOk(db_options_.db_paths[path_id].path, &files); |
928 | 6.09M | for (std::string file : files) { |
929 | 6.09M | uint64_t number; |
930 | 6.09M | FileType type; |
931 | 6.09M | if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) || |
932 | 4.71M | pending_outputs_->HasFileNumber(number)) { |
933 | 1.39M | continue; |
934 | 1.39M | } |
935 | | // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes |
936 | 4.69M | job_context->full_scan_candidate_files.emplace_back( |
937 | 4.69M | "/" + file, static_cast<uint32_t>(path_id)); |
938 | 4.69M | } |
939 | 684k | } |
940 | | |
941 | | // Add log files in wal_dir |
942 | 684k | if (db_options_.wal_dir != dbname_) { |
943 | 818 | std::vector<std::string> log_files; |
944 | 818 | env_->GetChildrenWarnNotOk(db_options_.wal_dir, &log_files); |
945 | 2.98k | for (std::string log_file : log_files) { |
946 | 2.98k | job_context->full_scan_candidate_files.emplace_back(log_file, 0); |
947 | 2.98k | } |
948 | 818 | } |
949 | | // Add info log files in db_log_dir |
950 | 684k | if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { |
951 | 441 | std::vector<std::string> info_log_files; |
952 | | // Ignore errors |
953 | 441 | env_->GetChildrenWarnNotOk(db_options_.db_log_dir, &info_log_files); |
954 | 12.1k | for (std::string log_file : info_log_files) { |
955 | 12.1k | job_context->full_scan_candidate_files.emplace_back(log_file, 0); |
956 | 12.1k | } |
957 | 441 | } |
958 | 684k | } |
959 | | |
960 | 1.47M | if (!alive_log_files_.empty()) { |
961 | 1.47M | uint64_t min_log_number = versions_->MinLogNumber(); |
962 | | // find newly obsoleted log files |
963 | 1.49M | while (alive_log_files_.begin()->number < min_log_number) { |
964 | 20.1k | auto& earliest = *alive_log_files_.begin(); |
965 | 20.1k | if (db_options_.recycle_log_file_num > log_recycle_files.size()) { |
966 | 83 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
967 | 83 | "adding log %" PRIu64 " to recycle list\n", earliest.number); |
968 | 83 | log_recycle_files.push_back(earliest.number); |
969 | 20.0k | } else { |
970 | 20.0k | job_context->log_delete_files.push_back(earliest.number); |
971 | 20.0k | } |
972 | 20.1k | total_log_size_.fetch_sub(static_cast<int64_t>(earliest.size)); |
973 | 20.1k | alive_log_files_.pop_front(); |
974 | | // Current log should always stay alive since it can't have |
975 | | // number < MinLogNumber(). |
976 | 20.1k | DCHECK(alive_log_files_.size()); |
977 | 20.1k | } |
978 | 1.49M | while (!logs_.empty() && logs_.front().number < min_log_number) { |
979 | 20.1k | auto& log = logs_.front(); |
980 | 20.1k | if (log.getting_synced) { |
981 | 0 | log_sync_cv_.Wait(); |
982 | | // logs_ could have changed while we were waiting. |
983 | 0 | continue; |
984 | 0 | } |
985 | 20.1k | logs_to_free_.push_back(log.ReleaseWriter()); |
986 | 20.1k | logs_.pop_front(); |
987 | 20.1k | } |
988 | | // Current log cannot be obsolete. |
989 | 1.47M | DCHECK(!logs_.empty()); |
990 | 1.47M | } |
991 | | |
992 | | // We're just cleaning up for DB::Write(). |
993 | 1.47M | DCHECK(job_context->logs_to_free.empty()); |
994 | 1.47M | job_context->logs_to_free = logs_to_free_; |
995 | 1.47M | logs_to_free_.clear(); |
996 | 1.47M | } |
997 | | |
998 | | namespace { |
999 | | bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, |
1000 | 13.2M | const JobContext::CandidateFileInfo& second) { |
1001 | 13.2M | if (first.file_name > second.file_name) { |
1002 | 7.67M | return true; |
1003 | 5.61M | } else if (first.file_name < second.file_name) { |
1004 | 5.01M | return false; |
1005 | 607k | } else { |
1006 | 607k | return (first.path_id > second.path_id); |
1007 | 607k | } |
1008 | 13.2M | } |
1009 | | }; // namespace |
1010 | | |
1011 | | // Diffs the files listed in filenames and those that do not |
1012 | | // belong to live files are posibly removed. Also, removes all the |
1013 | | // files in sst_delete_files and log_delete_files. |
1014 | | // It is not necessary to hold the mutex when invoking this method. |
1015 | 1.04M | void DBImpl::PurgeObsoleteFiles(const JobContext& state) { |
1016 | | // we'd better have sth to delete |
1017 | 1.04M | assert(state.HaveSomethingToDelete()); |
1018 | | |
1019 | | // this checks if FindObsoleteFiles() was run before. If not, don't do |
1020 | | // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also |
1021 | | // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true |
1022 | 1.04M | if (state.manifest_file_number == 0) { |
1023 | 710 | return; |
1024 | 710 | } |
1025 | | |
1026 | | // Now, convert live list to an unordered map, WITHOUT mutex held; |
1027 | | // set is slow. |
1028 | 1.04M | std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map; |
1029 | 2.10M | for (const FileDescriptor& fd : state.sst_live) { |
1030 | 2.10M | sst_live_map[fd.GetNumber()] = &fd; |
1031 | 2.10M | } |
1032 | | |
1033 | 1.04M | auto candidate_files = state.full_scan_candidate_files; |
1034 | 1.04M | candidate_files.reserve( |
1035 | 1.04M | candidate_files.size() + state.sst_delete_files.size() + |
1036 | 1.04M | state.log_delete_files.size() + state.manifest_delete_files.size()); |
1037 | | // We may ignore the dbname when generating the file names. |
1038 | 1.04M | const char* kDumbDbName = ""; |
1039 | 58.9k | for (auto file : state.sst_delete_files) { |
1040 | | // We only put base SST file in candidate_files |
1041 | 58.9k | candidate_files.emplace_back( |
1042 | 58.9k | MakeTableFileName(kDumbDbName, file->fd.GetNumber()), |
1043 | 58.9k | file->fd.GetPathId()); |
1044 | 58.9k | delete file; |
1045 | 58.9k | } |
1046 | | |
1047 | 20.0k | for (auto file_num : state.log_delete_files) { |
1048 | 20.0k | if (file_num > 0) { |
1049 | 20.0k | candidate_files.emplace_back(LogFileName(kDumbDbName, file_num).substr(1), |
1050 | 20.0k | 0); |
1051 | 20.0k | } |
1052 | 20.0k | } |
1053 | 251k | for (const auto& filename : state.manifest_delete_files) { |
1054 | 251k | candidate_files.emplace_back(filename, 0); |
1055 | 251k | } |
1056 | | |
1057 | | // dedup state.candidate_files so we don't try to delete the same |
1058 | | // file twice |
1059 | 1.04M | sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile); |
1060 | 1.04M | candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()), |
1061 | 1.04M | candidate_files.end()); |
1062 | | |
1063 | 1.04M | std::vector<std::string> old_info_log_files; |
1064 | 1.04M | InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); |
1065 | 5.03M | for (const auto& candidate_file : candidate_files) { |
1066 | 5.03M | std::string to_delete = candidate_file.file_name; |
1067 | 5.03M | uint32_t path_id = candidate_file.path_id; |
1068 | 5.03M | uint64_t number; |
1069 | 5.03M | FileType type; |
1070 | | // Ignore file if we cannot recognize it. |
1071 | 5.03M | if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) { |
1072 | 2.57k | continue; |
1073 | 2.57k | } |
1074 | | |
1075 | 5.02M | bool keep = true; |
1076 | 5.02M | switch (type) { |
1077 | 725k | case kLogFile: |
1078 | 725k | keep = ((number >= state.log_number) || |
1079 | 29.8k | (number == state.prev_log_number)); |
1080 | 725k | break; |
1081 | 937k | case kDescriptorFile: |
1082 | | // Keep my manifest file, and any newer incarnations' |
1083 | | // (can happen during manifest roll) |
1084 | 937k | keep = (number >= state.manifest_file_number); |
1085 | 937k | break; |
1086 | 153k | case kTableFile: |
1087 | | // If the second condition is not there, this makes |
1088 | | // DontDeletePendingOutputs fail |
1089 | 153k | keep = (sst_live_map.find(number) != sst_live_map.end()) || |
1090 | 46.7k | pending_outputs_->HasFileNumber(number); |
1091 | 153k | break; |
1092 | 92.7k | case kTableSBlockFile: |
1093 | | // Just skip, since we will process SST data file during processing of corresponding |
1094 | | // SST base file. |
1095 | 92.7k | keep = true; |
1096 | 92.7k | break; |
1097 | 65 | case kTempFile: |
1098 | | // Any temp files that are currently being written to must |
1099 | | // be recorded in pending_outputs_, which is inserted into "live". |
1100 | | // Also, SetCurrentFile creates a temp file when writing out new |
1101 | | // manifest, which is equal to state.pending_manifest_file_number. We |
1102 | | // should not delete that file |
1103 | | // |
1104 | | // TODO(yhchiang): carefully modify the third condition to safely |
1105 | | // remove the temp options files. |
1106 | 65 | keep = (sst_live_map.find(number) != sst_live_map.end()) || |
1107 | 65 | (number == state.pending_manifest_file_number) || |
1108 | 56 | (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); |
1109 | 65 | break; |
1110 | 392k | case kInfoLogFile: |
1111 | 392k | keep = true; |
1112 | 392k | if (number != 0) { |
1113 | 348k | old_info_log_files.push_back(to_delete); |
1114 | 348k | } |
1115 | 392k | break; |
1116 | 684k | case kCurrentFile: |
1117 | 1.36M | case kDBLockFile: |
1118 | 2.05M | case kIdentityFile: |
1119 | 2.05M | case kMetaDatabase: |
1120 | 2.72M | case kOptionsFile: |
1121 | 2.72M | keep = true; |
1122 | 2.72M | break; |
1123 | 5.03M | } |
1124 | | |
1125 | 5.03M | if (keep) { |
1126 | 4.70M | continue; |
1127 | 4.70M | } |
1128 | | |
1129 | 329k | std::string fname; |
1130 | 329k | if (type == kTableFile) { |
1131 | | // evict from cache |
1132 | 46.7k | TableCache::Evict(table_cache_.get(), number); |
1133 | 46.7k | fname = TableFileName(db_options_.db_paths, number, path_id); |
1134 | 282k | } else { |
1135 | 282k | fname = ((type == kLogFile) ? |
1136 | 252k | db_options_.wal_dir : dbname_) + "/" + to_delete; |
1137 | 282k | } |
1138 | | |
1139 | 329k | #ifndef ROCKSDB_LITE |
1140 | 329k | if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 || |
1141 | 29.6k | db_options_.WAL_size_limit_MB > 0)) { |
1142 | 108 | wal_manager_.ArchiveWALFile(fname, number); |
1143 | 108 | continue; |
1144 | 108 | } |
1145 | 328k | #endif // !ROCKSDB_LITE |
1146 | 328k | Status file_deletion_status; |
1147 | 328k | if (type == kTableFile) { |
1148 | 46.7k | file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id); |
1149 | 46.7k | const std::string data_fname = TableBaseToDataFileName(fname); |
1150 | 46.7k | if (file_deletion_status.ok()) { |
1151 | | // Delete corresponding data file if exists. |
1152 | 45.6k | Status s = db_options_.env->FileExists(data_fname); |
1153 | 45.6k | if (s.ok()) { |
1154 | 44.7k | file_deletion_status = DeleteSSTFile(&db_options_, data_fname, path_id); |
1155 | 918 | } else if (!s.IsNotFound()) { |
1156 | 0 | file_deletion_status = s; |
1157 | 0 | } |
1158 | 45.6k | } |
1159 | 282k | } else { |
1160 | 282k | file_deletion_status = env_->DeleteFile(fname); |
1161 | 282k | } |
1162 | 328k | if (file_deletion_status.ok()) { |
1163 | 326k | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
1164 | 326k | "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id, |
1165 | 326k | fname.c_str(), type, number, |
1166 | 326k | file_deletion_status.ToString().c_str()); |
1167 | 2.01k | } else if (env_->FileExists(fname).IsNotFound()) { |
1168 | 1.98k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1169 | 1.98k | "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64 |
1170 | 1.98k | " -- %s\n", |
1171 | 1.98k | state.job_id, fname.c_str(), type, number, |
1172 | 1.98k | file_deletion_status.ToString().c_str()); |
1173 | 29 | } else { |
1174 | 29 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
1175 | 29 | "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n", |
1176 | 29 | state.job_id, fname.c_str(), type, number, |
1177 | 29 | file_deletion_status.ToString().c_str()); |
1178 | 29 | } |
1179 | 328k | if (type == kTableFile) { |
1180 | 46.7k | EventHelpers::LogAndNotifyTableFileDeletion( |
1181 | 46.7k | &event_logger_, state.job_id, number, fname, |
1182 | 46.7k | file_deletion_status, GetName(), |
1183 | 46.7k | db_options_.listeners); |
1184 | 46.7k | } |
1185 | 328k | } |
1186 | | |
1187 | | // Delete old info log files. |
1188 | 1.04M | size_t old_info_log_file_count = old_info_log_files.size(); |
1189 | 1.04M | if (old_info_log_file_count != 0 && |
1190 | 12.8k | old_info_log_file_count >= db_options_.keep_log_file_num) { |
1191 | 0 | std::sort(old_info_log_files.begin(), old_info_log_files.end()); |
1192 | 0 | size_t end = old_info_log_file_count - db_options_.keep_log_file_num; |
1193 | 0 | for (unsigned int i = 0; i <= end; i++) { |
1194 | 0 | std::string& to_delete = old_info_log_files.at(i); |
1195 | 0 | std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? |
1196 | 0 | dbname_ : db_options_.db_log_dir) + "/" + to_delete; |
1197 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1198 | 0 | "[JOB %d] Delete info log file %s\n", state.job_id, |
1199 | 0 | full_path_to_delete.c_str()); |
1200 | 0 | Status s = env_->DeleteFile(full_path_to_delete); |
1201 | 0 | if (!s.ok()) { |
1202 | 0 | if (env_->FileExists(full_path_to_delete).IsNotFound()) { |
1203 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1204 | 0 | "[JOB %d] Tried to delete non-existing info log file %s FAILED " |
1205 | 0 | "-- %s\n", |
1206 | 0 | state.job_id, to_delete.c_str(), s.ToString().c_str()); |
1207 | 0 | } else { |
1208 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
1209 | 0 | "[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id, |
1210 | 0 | to_delete.c_str(), s.ToString().c_str()); |
1211 | 0 | } |
1212 | 0 | } |
1213 | 0 | } |
1214 | 0 | } |
1215 | 1.04M | #ifndef ROCKSDB_LITE |
1216 | 1.04M | wal_manager_.PurgeObsoleteWALFiles(); |
1217 | 1.04M | #endif // ROCKSDB_LITE |
1218 | 1.04M | LogFlush(db_options_.info_log); |
1219 | 1.04M | } |
1220 | | |
1221 | 341k | void DBImpl::DeleteObsoleteFiles() { |
1222 | 341k | mutex_.AssertHeld(); |
1223 | 341k | JobContext job_context(next_job_id_.fetch_add(1)); |
1224 | 341k | FindObsoleteFiles(&job_context, true); |
1225 | | |
1226 | 341k | mutex_.Unlock(); |
1227 | 341k | if (job_context.HaveSomethingToDelete()) { |
1228 | 341k | PurgeObsoleteFiles(job_context); |
1229 | 341k | } |
1230 | 341k | job_context.Clean(); |
1231 | 341k | mutex_.Lock(); |
1232 | 341k | } |
1233 | | |
1234 | | Status DBImpl::Directories::CreateAndNewDirectory( |
1235 | | Env* env, const std::string& dirname, |
1236 | 342k | std::unique_ptr<Directory>* directory) const { |
1237 | | // We call CreateDirIfMissing() as the directory may already exist (if we |
1238 | | // are reopening a DB), when this happens we don't want creating the |
1239 | | // directory to cause an error. However, we need to check if creating the |
1240 | | // directory fails or else we may get an obscure message about the lock |
1241 | | // file not existing. One real-world example of this occurring is if |
1242 | | // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. |
1243 | | // when dbname_ is "dir/db" but when "dir" doesn't exist. |
1244 | 342k | Status s = env->CreateDirIfMissing(dirname); |
1245 | 342k | if (!s.ok()) { |
1246 | 0 | return s; |
1247 | 0 | } |
1248 | 342k | return env->NewDirectory(dirname, directory); |
1249 | 342k | } |
1250 | | |
1251 | | Status DBImpl::Directories::SetDirectories( |
1252 | | Env* env, const std::string& dbname, const std::string& wal_dir, |
1253 | 341k | const std::vector<DbPath>& data_paths) { |
1254 | 341k | Status s = CreateAndNewDirectory(env, dbname, &db_dir_); |
1255 | 341k | if (!s.ok()) { |
1256 | 0 | return s; |
1257 | 0 | } |
1258 | 341k | if (!wal_dir.empty() && dbname != wal_dir) { |
1259 | 413 | s = CreateAndNewDirectory(env, wal_dir, &wal_dir_); |
1260 | 413 | if (!s.ok()) { |
1261 | 0 | return s; |
1262 | 0 | } |
1263 | 341k | } |
1264 | | |
1265 | 341k | data_dirs_.clear(); |
1266 | 341k | for (auto& p : data_paths) { |
1267 | 341k | const std::string db_path = p.path; |
1268 | 341k | if (db_path == dbname) { |
1269 | 341k | data_dirs_.emplace_back(nullptr); |
1270 | 18.4E | } else { |
1271 | 18.4E | std::unique_ptr<Directory> path_directory; |
1272 | 18.4E | s = CreateAndNewDirectory(env, db_path, &path_directory); |
1273 | 18.4E | if (!s.ok()) { |
1274 | 0 | return s; |
1275 | 0 | } |
1276 | 18.4E | data_dirs_.emplace_back(path_directory.release()); |
1277 | 18.4E | } |
1278 | 341k | } |
1279 | 341k | assert(data_dirs_.size() == data_paths.size()); |
1280 | 341k | return Status::OK(); |
1281 | 341k | } |
1282 | | |
1283 | 42.5k | Directory* DBImpl::Directories::GetDataDir(size_t path_id) { |
1284 | 42.5k | assert(path_id < data_dirs_.size()); |
1285 | 42.5k | Directory* ret_dir = data_dirs_[path_id].get(); |
1286 | 42.5k | if (ret_dir == nullptr) { |
1287 | | // Should use db_dir_ |
1288 | 42.2k | return db_dir_.get(); |
1289 | 42.2k | } |
1290 | 256 | return ret_dir; |
1291 | 256 | } |
1292 | | |
1293 | | Status DBImpl::Recover( |
1294 | | const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, |
1295 | 341k | bool error_if_log_file_exist) { |
1296 | 341k | mutex_.AssertHeld(); |
1297 | | |
1298 | 341k | bool is_new_db = false; |
1299 | 341k | assert(db_lock_ == nullptr); |
1300 | 341k | if (!read_only) { |
1301 | 341k | Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir, |
1302 | 341k | db_options_.db_paths); |
1303 | 341k | if (!s.ok()) { |
1304 | 0 | return s; |
1305 | 0 | } |
1306 | | |
1307 | 341k | s = env_->LockFile(LockFileName(dbname_), &db_lock_); |
1308 | 341k | if (!s.ok()) { |
1309 | 5 | return s; |
1310 | 5 | } |
1311 | | |
1312 | 341k | s = env_->FileExists(CurrentFileName(dbname_)); |
1313 | 341k | if (s.IsNotFound()) { |
1314 | 333k | if (db_options_.create_if_missing) { |
1315 | 333k | s = NewDB(); |
1316 | 333k | is_new_db = true; |
1317 | 333k | if (!s.ok()) { |
1318 | 0 | return s; |
1319 | 0 | } |
1320 | 7 | } else { |
1321 | 7 | return STATUS(InvalidArgument, |
1322 | 7 | dbname_, "does not exist (create_if_missing is false)"); |
1323 | 7 | } |
1324 | 8.73k | } else if (s.ok()) { |
1325 | 8.73k | if (db_options_.error_if_exists) { |
1326 | 1 | return STATUS(InvalidArgument, |
1327 | 1 | dbname_, "exists (error_if_exists is true)"); |
1328 | 1 | } |
1329 | 18.4E | } else { |
1330 | | // Unexpected error reading file |
1331 | 18.4E | assert(s.IsIOError()); |
1332 | 18.4E | return s; |
1333 | 18.4E | } |
1334 | | // Check for the IDENTITY file and create it if not there |
1335 | 341k | s = env_->FileExists(IdentityFileName(dbname_)); |
1336 | 341k | if (s.IsNotFound()) { |
1337 | 335k | s = SetIdentityFile(env_, dbname_); |
1338 | 335k | if (!s.ok()) { |
1339 | 0 | return s; |
1340 | 0 | } |
1341 | 6.80k | } else if (!s.ok()) { |
1342 | 0 | assert(s.IsIOError()); |
1343 | 0 | return s; |
1344 | 0 | } |
1345 | 341k | } |
1346 | | |
1347 | 341k | Status s = versions_->Recover(column_families, read_only); |
1348 | 341k | if (db_options_.paranoid_checks && s.ok()) { |
1349 | 341k | s = CheckConsistency(); |
1350 | 341k | } |
1351 | 341k | if (s.ok()) { |
1352 | 341k | SequenceNumber max_sequence(kMaxSequenceNumber); |
1353 | 341k | default_cf_handle_ = new ColumnFamilyHandleImpl( |
1354 | 341k | versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); |
1355 | 341k | default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); |
1356 | 341k | single_column_family_mode_ = |
1357 | 341k | versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; |
1358 | | |
1359 | | // Recover from all newer log files than the ones named in the |
1360 | | // descriptor (new log files may have been added by the previous |
1361 | | // incarnation without registering them in the descriptor). |
1362 | | // |
1363 | | // Note that prev_log_number() is no longer used, but we pay |
1364 | | // attention to it in case we are recovering a database |
1365 | | // produced by an older version of rocksdb. |
1366 | 341k | const uint64_t min_log = versions_->MinLogNumber(); |
1367 | 341k | const uint64_t prev_log = versions_->prev_log_number(); |
1368 | 341k | std::vector<std::string> filenames; |
1369 | 341k | s = env_->GetChildren(db_options_.wal_dir, &filenames); |
1370 | 341k | if (!s.ok()) { |
1371 | 0 | return s; |
1372 | 0 | } |
1373 | | |
1374 | 341k | std::vector<uint64_t> logs; |
1375 | 2.62M | for (size_t i = 0; i < filenames.size(); i++) { |
1376 | 2.28M | uint64_t number; |
1377 | 2.28M | FileType type; |
1378 | 2.28M | if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) { |
1379 | 10.0k | if (is_new_db) { |
1380 | 30 | return STATUS(Corruption, |
1381 | 30 | "While creating a new Db, wal_dir contains " |
1382 | 30 | "existing log file: ", |
1383 | 30 | filenames[i]); |
1384 | 10.0k | } else if ((number >= min_log) || (number == prev_log)) { |
1385 | 9.99k | logs.push_back(number); |
1386 | 9.99k | } |
1387 | 10.0k | } |
1388 | 2.28M | } |
1389 | | |
1390 | 341k | if (logs.size() > 0 && error_if_log_file_exist) { |
1391 | 0 | return STATUS(Corruption, "" |
1392 | 0 | "The db was opened in readonly mode with error_if_log_file_exist" |
1393 | 0 | "flag but a log file already exists"); |
1394 | 0 | } |
1395 | | |
1396 | 341k | if (!logs.empty()) { |
1397 | | // Recover in the order in which the logs were generated |
1398 | 6.84k | std::sort(logs.begin(), logs.end()); |
1399 | 6.84k | s = RecoverLogFiles(logs, &max_sequence, read_only); |
1400 | 6.84k | if (!s.ok()) { |
1401 | | // Clear memtables if recovery failed |
1402 | 116 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1403 | 116 | cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), |
1404 | 116 | kMaxSequenceNumber); |
1405 | 116 | } |
1406 | 114 | } |
1407 | 6.84k | } |
1408 | | |
1409 | 341k | SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); |
1410 | 341k | } |
1411 | | |
1412 | | // Initial value |
1413 | 341k | max_total_in_memory_state_ = 0; |
1414 | 344k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1415 | 344k | auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
1416 | 344k | max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * |
1417 | 344k | mutable_cf_options->max_write_buffer_number; |
1418 | 344k | } |
1419 | | |
1420 | 341k | return s; |
1421 | 341k | } |
1422 | | |
1423 | | // REQUIRES: log_numbers are sorted in ascending order |
1424 | | Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers, |
1425 | 6.82k | SequenceNumber* max_sequence, bool read_only) { |
1426 | 6.82k | struct LogReporter : public log::Reader::Reporter { |
1427 | 6.82k | Env* env; |
1428 | 6.82k | Logger* info_log; |
1429 | 6.82k | const char* fname; |
1430 | 6.82k | Status* status; // nullptr if db_options_.paranoid_checks==false |
1431 | 319 | void Corruption(size_t bytes, const Status& s) override { |
1432 | 319 | RLOG(InfoLogLevel::WARN_LEVEL, |
1433 | 319 | info_log, "%s%s: dropping %d bytes; %s", |
1434 | 319 | (this->status == nullptr ? "(ignoring error) " : ""), |
1435 | 319 | fname, static_cast<int>(bytes), s.ToString().c_str()); |
1436 | 319 | if (this->status != nullptr && this->status->ok()) { |
1437 | 151 | *this->status = s; |
1438 | 151 | } |
1439 | 319 | } |
1440 | 6.82k | }; |
1441 | | |
1442 | 6.82k | mutex_.AssertHeld(); |
1443 | 6.82k | Status status; |
1444 | 6.82k | std::unordered_map<int, VersionEdit> version_edits; |
1445 | | // no need to refcount because iteration is under mutex |
1446 | 9.74k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1447 | 9.74k | VersionEdit edit; |
1448 | 9.74k | edit.SetColumnFamily(cfd->GetID()); |
1449 | 9.74k | auto frontier = versions_->FlushedFrontier(); |
1450 | 9.74k | if (frontier) { |
1451 | 176 | edit.UpdateFlushedFrontier(frontier->Clone()); |
1452 | 176 | } |
1453 | 9.74k | version_edits.insert({cfd->GetID(), edit}); |
1454 | 9.74k | } |
1455 | 6.82k | int job_id = next_job_id_.fetch_add(1); |
1456 | 6.82k | { |
1457 | 6.82k | auto stream = event_logger_.Log(); |
1458 | 6.82k | stream << "job" << job_id << "event" |
1459 | 6.82k | << "recovery_started"; |
1460 | 6.82k | stream << "log_files"; |
1461 | 6.82k | stream.StartArray(); |
1462 | 9.97k | for (auto log_number : log_numbers) { |
1463 | 9.97k | stream << log_number; |
1464 | 9.97k | } |
1465 | 6.82k | stream.EndArray(); |
1466 | 6.82k | } |
1467 | | |
1468 | 6.82k | bool continue_replay_log = true; |
1469 | 9.49k | for (auto log_number : log_numbers) { |
1470 | | // The previous incarnation may not have written any MANIFEST |
1471 | | // records after allocating this log number. So we manually |
1472 | | // update the file number allocation counter in VersionSet. |
1473 | 9.49k | versions_->MarkFileNumberUsedDuringRecovery(log_number); |
1474 | | // Open the log file |
1475 | 9.49k | std::string fname = LogFileName(db_options_.wal_dir, log_number); |
1476 | 9.49k | unique_ptr<SequentialFileReader> file_reader; |
1477 | 9.49k | { |
1478 | 9.49k | unique_ptr<SequentialFile> file; |
1479 | 9.49k | status = env_->NewSequentialFile(fname, &file, env_options_); |
1480 | 9.49k | if (!status.ok()) { |
1481 | 0 | MaybeIgnoreError(&status); |
1482 | 0 | if (!status.ok()) { |
1483 | 0 | return status; |
1484 | 0 | } else { |
1485 | | // Fail with one log file, but that's ok. |
1486 | | // Try next one. |
1487 | 0 | continue; |
1488 | 0 | } |
1489 | 9.49k | } |
1490 | 9.49k | file_reader.reset(new SequentialFileReader(std::move(file))); |
1491 | 9.49k | } |
1492 | | |
1493 | | // Create the log reader. |
1494 | 9.49k | LogReporter reporter; |
1495 | 9.49k | reporter.env = env_; |
1496 | 9.49k | reporter.info_log = db_options_.info_log.get(); |
1497 | 9.49k | reporter.fname = fname.c_str(); |
1498 | 9.49k | if (!db_options_.paranoid_checks || |
1499 | 9.48k | db_options_.wal_recovery_mode == |
1500 | 892 | WALRecoveryMode::kSkipAnyCorruptedRecords) { |
1501 | 892 | reporter.status = nullptr; |
1502 | 8.60k | } else { |
1503 | 8.60k | reporter.status = &status; |
1504 | 8.60k | } |
1505 | | // We intentially make log::Reader do checksumming even if |
1506 | | // paranoid_checks==false so that corruptions cause entire commits |
1507 | | // to be skipped instead of propagating bad information (like overly |
1508 | | // large sequence numbers). |
1509 | 9.49k | log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, |
1510 | 9.49k | true /*checksum*/, 0 /*initial_offset*/, log_number); |
1511 | 9.49k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1512 | 9.49k | "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number, |
1513 | 9.49k | db_options_.wal_recovery_mode, !continue_replay_log); |
1514 | | |
1515 | | // Determine if we should tolerate incomplete records at the tail end of the |
1516 | | // Read all the records and add to a memtable |
1517 | 9.49k | std::string scratch; |
1518 | 9.49k | Slice record; |
1519 | 9.49k | WriteBatch batch; |
1520 | | |
1521 | 9.49k | if (!continue_replay_log) { |
1522 | 180 | uint64_t bytes; |
1523 | 180 | if (env_->GetFileSize(fname, &bytes).ok()) { |
1524 | 180 | auto info_log = db_options_.info_log.get(); |
1525 | 180 | RLOG(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes", |
1526 | 180 | fname.c_str(), static_cast<int>(bytes)); |
1527 | 180 | } |
1528 | 180 | } |
1529 | | |
1530 | 9.49k | while ( |
1531 | 2.96M | continue_replay_log && |
1532 | 2.96M | reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) && |
1533 | 2.95M | status.ok()) { |
1534 | 2.95M | if (record.size() < 12) { |
1535 | 0 | reporter.Corruption(record.size(), |
1536 | 0 | STATUS(Corruption, "log record too small")); |
1537 | 0 | continue; |
1538 | 0 | } |
1539 | 2.95M | WriteBatchInternal::SetContents(&batch, record); |
1540 | | |
1541 | 2.95M | #ifndef ROCKSDB_LITE |
1542 | 2.95M | if (db_options_.wal_filter != nullptr) { |
1543 | 17 | WriteBatch new_batch; |
1544 | 17 | bool batch_changed = false; |
1545 | | |
1546 | 17 | WalFilter::WalProcessingOption wal_processing_option = |
1547 | 17 | db_options_.wal_filter->LogRecord(batch, &new_batch, |
1548 | 17 | &batch_changed); |
1549 | | |
1550 | 17 | switch (wal_processing_option) { |
1551 | 14 | case WalFilter::WalProcessingOption::kContinueProcessing: |
1552 | | // do nothing, proceeed normally |
1553 | 14 | break; |
1554 | 1 | case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: |
1555 | | // skip current record |
1556 | 1 | continue; |
1557 | 1 | case WalFilter::WalProcessingOption::kStopReplay: |
1558 | | // skip current record and stop replay |
1559 | 1 | continue_replay_log = false; |
1560 | 1 | continue; |
1561 | 1 | case WalFilter::WalProcessingOption::kCorruptedRecord: { |
1562 | 1 | status = STATUS(Corruption, "Corruption reported by Wal Filter ", |
1563 | 1 | db_options_.wal_filter->Name()); |
1564 | 1 | MaybeIgnoreError(&status); |
1565 | 1 | if (!status.ok()) { |
1566 | 1 | reporter.Corruption(record.size(), status); |
1567 | 1 | continue; |
1568 | 1 | } |
1569 | 0 | break; |
1570 | 0 | } |
1571 | 0 | default: { |
1572 | 0 | assert(false); // unhandled case |
1573 | 0 | status = STATUS(NotSupported, |
1574 | 0 | "Unknown WalProcessingOption returned" |
1575 | 0 | " by Wal Filter ", |
1576 | 0 | db_options_.wal_filter->Name()); |
1577 | 0 | MaybeIgnoreError(&status); |
1578 | 0 | if (!status.ok()) { |
1579 | 0 | return status; |
1580 | 0 | } else { |
1581 | | // Ignore the error with current record processing. |
1582 | 0 | continue; |
1583 | 0 | } |
1584 | 14 | } |
1585 | 14 | } |
1586 | | |
1587 | 14 | if (batch_changed) { |
1588 | | // Make sure that the count in the new batch is |
1589 | | // within the orignal count. |
1590 | 3 | int new_count = WriteBatchInternal::Count(&new_batch); |
1591 | 3 | int original_count = WriteBatchInternal::Count(&batch); |
1592 | 3 | if (new_count > original_count) { |
1593 | 1 | RLOG(InfoLogLevel::FATAL_LEVEL, db_options_.info_log, |
1594 | 1 | "Recovering log #%" PRIu64 |
1595 | 1 | " mode %d log filter %s returned " |
1596 | 1 | "more records (%d) than original (%d) which is not allowed. " |
1597 | 1 | "Aborting recovery.", |
1598 | 1 | log_number, db_options_.wal_recovery_mode, |
1599 | 1 | db_options_.wal_filter->Name(), new_count, original_count); |
1600 | 1 | status = STATUS(NotSupported, |
1601 | 1 | "More than original # of records " |
1602 | 1 | "returned by Wal Filter ", |
1603 | 1 | db_options_.wal_filter->Name()); |
1604 | 1 | return status; |
1605 | 1 | } |
1606 | | // Set the same sequence number in the new_batch |
1607 | | // as the original batch. |
1608 | 2 | WriteBatchInternal::SetSequence(&new_batch, |
1609 | 2 | WriteBatchInternal::Sequence(&batch)); |
1610 | 2 | batch = new_batch; |
1611 | 2 | } |
1612 | 14 | } |
1613 | 2.95M | #endif // ROCKSDB_LITE |
1614 | | |
1615 | | // If column family was not found, it might mean that the WAL write |
1616 | | // batch references to the column family that was dropped after the |
1617 | | // insert. We don't want to fail the whole write batch in that case -- |
1618 | | // we just ignore the update. |
1619 | | // That's why we set ignore missing column families to true |
1620 | 2.95M | status = |
1621 | 2.95M | WriteBatchInternal::InsertInto(&batch, column_family_memtables_.get(), |
1622 | 2.95M | &flush_scheduler_, true, log_number); |
1623 | | |
1624 | 2.95M | MaybeIgnoreError(&status); |
1625 | 2.95M | if (!status.ok()) { |
1626 | | // We are treating this as a failure while reading since we read valid |
1627 | | // blocks that do not form coherent data |
1628 | 0 | reporter.Corruption(record.size(), status); |
1629 | 0 | continue; |
1630 | 0 | } |
1631 | | |
1632 | 2.95M | const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + |
1633 | 2.95M | WriteBatchInternal::Count(&batch) - 1; |
1634 | 2.95M | if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) { |
1635 | 2.95M | *max_sequence = last_seq; |
1636 | 2.95M | } |
1637 | | |
1638 | 2.95M | if (!read_only) { |
1639 | | // we can do this because this is called before client has access to the |
1640 | | // DB and there is only a single thread operating on DB |
1641 | 2.95M | ColumnFamilyData* cfd; |
1642 | | |
1643 | 2.95M | while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { |
1644 | 23 | cfd->Unref(); |
1645 | | // If this asserts, it means that InsertInto failed in |
1646 | | // filtering updates to already-flushed column families |
1647 | 23 | assert(cfd->GetLogNumber() <= log_number); |
1648 | 23 | auto iter = version_edits.find(cfd->GetID()); |
1649 | 23 | assert(iter != version_edits.end()); |
1650 | 23 | VersionEdit* edit = &iter->second; |
1651 | 23 | status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); |
1652 | 23 | if (!status.ok()) { |
1653 | | // Reflect errors immediately so that conditions like full |
1654 | | // file-systems cause the DB::Open() to fail. |
1655 | 0 | return status; |
1656 | 0 | } |
1657 | | |
1658 | 23 | cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), |
1659 | 23 | *max_sequence); |
1660 | 23 | } |
1661 | 2.95M | } |
1662 | 2.95M | } |
1663 | | |
1664 | 9.49k | if (!status.ok()) { |
1665 | 152 | if (db_options_.wal_recovery_mode == |
1666 | 0 | WALRecoveryMode::kSkipAnyCorruptedRecords) { |
1667 | | // We should ignore all errors unconditionally |
1668 | 0 | status = Status::OK(); |
1669 | 152 | } else if (db_options_.wal_recovery_mode == |
1670 | 40 | WALRecoveryMode::kPointInTimeRecovery) { |
1671 | | // We should ignore the error but not continue replaying |
1672 | 40 | status = Status::OK(); |
1673 | 40 | continue_replay_log = false; |
1674 | | |
1675 | 40 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
1676 | 40 | "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, |
1677 | 40 | log_number, *max_sequence); |
1678 | 112 | } else { |
1679 | 112 | assert(db_options_.wal_recovery_mode == |
1680 | 112 | WALRecoveryMode::kTolerateCorruptedTailRecords |
1681 | 112 | || db_options_.wal_recovery_mode == |
1682 | 112 | WALRecoveryMode::kAbsoluteConsistency); |
1683 | 112 | return status; |
1684 | 112 | } |
1685 | 9.38k | } |
1686 | | |
1687 | 9.38k | flush_scheduler_.Clear(); |
1688 | 9.38k | if ((*max_sequence != kMaxSequenceNumber) && (versions_->LastSequence() < *max_sequence)) { |
1689 | 5.64k | versions_->SetLastSequence(*max_sequence); |
1690 | 5.64k | } |
1691 | 9.38k | } |
1692 | | |
1693 | 6.71k | if (!read_only) { |
1694 | | // no need to refcount since client still doesn't have access |
1695 | | // to the DB and can not drop column families while we iterate |
1696 | 6.70k | auto max_log_number = log_numbers.back(); |
1697 | 9.61k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
1698 | 9.61k | auto iter = version_edits.find(cfd->GetID()); |
1699 | 9.61k | assert(iter != version_edits.end()); |
1700 | 9.61k | VersionEdit* edit = &iter->second; |
1701 | | |
1702 | 9.61k | if (cfd->GetLogNumber() > max_log_number) { |
1703 | | // Column family cfd has already flushed the data |
1704 | | // from all logs. Memtable has to be empty because |
1705 | | // we filter the updates based on log_number |
1706 | | // (in WriteBatch::InsertInto) |
1707 | 5 | assert(cfd->mem()->GetFirstSequenceNumber() == 0); |
1708 | 5 | assert(edit->NumEntries() == 0); |
1709 | 5 | continue; |
1710 | 5 | } |
1711 | | |
1712 | | // flush the final memtable (if non-empty) |
1713 | 9.61k | if (cfd->mem()->GetFirstSequenceNumber() != 0) { |
1714 | 3.65k | status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit); |
1715 | 3.65k | if (!status.ok()) { |
1716 | | // Recovery failed |
1717 | 0 | break; |
1718 | 0 | } |
1719 | | |
1720 | 3.65k | cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), |
1721 | 3.65k | *max_sequence); |
1722 | 3.65k | } |
1723 | | |
1724 | | // write MANIFEST with update |
1725 | | // writing log_number in the manifest means that any log file |
1726 | | // with number strongly less than (log_number + 1) is already |
1727 | | // recovered and should be ignored on next reincarnation. |
1728 | | // Since we already recovered max_log_number, we want all logs |
1729 | | // with numbers `<= max_log_number` (includes this one) to be ignored |
1730 | 9.61k | edit->SetLogNumber(max_log_number + 1); |
1731 | | // we must mark the next log number as used, even though it's |
1732 | | // not actually used. that is because VersionSet assumes |
1733 | | // VersionSet::next_file_number_ always to be strictly greater than any |
1734 | | // log number |
1735 | 9.61k | versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); |
1736 | 9.61k | status = versions_->LogAndApply( |
1737 | 9.61k | cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); |
1738 | 9.61k | if (!status.ok()) { |
1739 | | // Recovery failed |
1740 | 1 | break; |
1741 | 1 | } |
1742 | 9.61k | } |
1743 | 6.70k | } |
1744 | | |
1745 | 6.71k | event_logger_.Log() << "job" << job_id << "event" |
1746 | 6.71k | << "recovery_finished"; |
1747 | | |
1748 | 6.71k | return status; |
1749 | 6.82k | } |
1750 | | |
1751 | | Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, |
1752 | 3.68k | MemTable* mem, VersionEdit* edit) { |
1753 | 3.68k | mutex_.AssertHeld(); |
1754 | 3.68k | const uint64_t start_micros = env_->NowMicros(); |
1755 | 3.68k | FileMetaData meta; |
1756 | 3.68k | Status s; |
1757 | 3.68k | { |
1758 | 3.68k | auto file_number_holder = pending_outputs_->NewFileNumber(); |
1759 | 3.68k | meta.fd = FileDescriptor(file_number_holder.Last(), 0, 0, 0); |
1760 | 3.68k | const auto* frontier = mem->Frontiers(); |
1761 | 3.68k | if (frontier) { |
1762 | 0 | meta.smallest.user_frontier = frontier->Smallest().Clone(); |
1763 | 0 | meta.largest.user_frontier = frontier->Largest().Clone(); |
1764 | 0 | } |
1765 | 3.68k | ReadOptions ro; |
1766 | 3.68k | ro.total_order_seek = true; |
1767 | 3.68k | Arena arena; |
1768 | 3.68k | TableProperties table_properties; |
1769 | 3.68k | { |
1770 | 3.68k | ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); |
1771 | 3.68k | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
1772 | 3.68k | "[%s] [WriteLevel0TableForRecovery]" |
1773 | 3.68k | " Level-0 table #%" PRIu64 ": started", |
1774 | 3.68k | cfd->GetName().c_str(), meta.fd.GetNumber()); |
1775 | | |
1776 | 3.68k | bool paranoid_file_checks = |
1777 | 3.68k | cfd->GetLatestMutableCFOptions()->paranoid_file_checks; |
1778 | 3.68k | { |
1779 | 3.68k | mutex_.Unlock(); |
1780 | 3.68k | TableFileCreationInfo info; |
1781 | | |
1782 | 3.68k | SequenceNumber earliest_write_conflict_snapshot; |
1783 | 3.68k | std::vector<SequenceNumber> snapshot_seqs = |
1784 | 3.68k | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
1785 | | |
1786 | 3.68k | s = BuildTable(dbname_, |
1787 | 3.68k | env_, |
1788 | 3.68k | *cfd->ioptions(), |
1789 | 3.68k | env_options_, |
1790 | 3.68k | cfd->table_cache(), |
1791 | 3.68k | iter.get(), |
1792 | 3.68k | &meta, |
1793 | 3.68k | cfd->internal_comparator(), |
1794 | 3.68k | cfd->int_tbl_prop_collector_factories(), |
1795 | 3.68k | cfd->GetID(), |
1796 | 3.68k | snapshot_seqs, |
1797 | 3.68k | earliest_write_conflict_snapshot, |
1798 | 3.68k | GetCompressionFlush(*cfd->ioptions()), |
1799 | 3.68k | cfd->ioptions()->compression_opts, |
1800 | 3.68k | paranoid_file_checks, |
1801 | 3.68k | cfd->internal_stats(), |
1802 | 3.68k | db_options_.boundary_extractor.get(), |
1803 | 3.68k | Env::IO_HIGH, |
1804 | 3.68k | &info.table_properties); |
1805 | 3.68k | LogFlush(db_options_.info_log); |
1806 | 3.68k | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
1807 | 3.68k | "[%s] [WriteLevel0TableForRecovery]" |
1808 | 3.68k | " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", |
1809 | 3.68k | cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetTotalFileSize(), |
1810 | 3.68k | s.ToString().c_str()); |
1811 | | |
1812 | | // output to event logger |
1813 | 3.68k | if (s.ok()) { |
1814 | 3.68k | info.db_name = dbname_; |
1815 | 3.68k | info.cf_name = cfd->GetName(); |
1816 | 3.68k | info.file_path = TableFileName(db_options_.db_paths, |
1817 | 3.68k | meta.fd.GetNumber(), |
1818 | 3.68k | meta.fd.GetPathId()); |
1819 | 3.68k | info.file_size = meta.fd.GetTotalFileSize(); |
1820 | 3.68k | info.job_id = job_id; |
1821 | 3.68k | EventHelpers::LogAndNotifyTableFileCreation( |
1822 | 3.68k | &event_logger_, db_options_.listeners, meta.fd, info); |
1823 | 3.68k | } |
1824 | 3.68k | mutex_.Lock(); |
1825 | 3.68k | } |
1826 | 3.68k | } |
1827 | 3.68k | } |
1828 | | |
1829 | | // Note that if file_size is zero, the file has been deleted and |
1830 | | // should not be added to the manifest. |
1831 | 3.68k | int level = 0; |
1832 | 3.68k | if (s.ok() && meta.fd.GetTotalFileSize() > 0) { |
1833 | 3.68k | edit->AddCleanedFile(level, meta); |
1834 | 3.68k | } |
1835 | | |
1836 | 3.68k | InternalStats::CompactionStats stats(1); |
1837 | 3.68k | stats.micros = env_->NowMicros() - start_micros; |
1838 | 3.68k | stats.bytes_written = meta.fd.GetTotalFileSize(); |
1839 | 3.68k | stats.num_output_files = 1; |
1840 | 3.68k | cfd->internal_stats()->AddCompactionStats(level, stats); |
1841 | 3.68k | cfd->internal_stats()->AddCFStats( |
1842 | 3.68k | InternalStats::BYTES_FLUSHED, meta.fd.GetTotalFileSize()); |
1843 | 3.68k | RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetTotalFileSize()); |
1844 | 3.68k | return s; |
1845 | 3.68k | } |
1846 | | |
1847 | | Result<FileNumbersHolder> DBImpl::FlushMemTableToOutputFile( |
1848 | | ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, |
1849 | 31.8k | bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { |
1850 | 31.8k | mutex_.AssertHeld(); |
1851 | 31.8k | DCHECK_NE(cfd->imm()->NumNotFlushed(), 0); |
1852 | 31.8k | DCHECK(cfd->imm()->IsFlushPending()); |
1853 | | |
1854 | 31.8k | SequenceNumber earliest_write_conflict_snapshot; |
1855 | 31.8k | std::vector<SequenceNumber> snapshot_seqs = |
1856 | 31.8k | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
1857 | | |
1858 | 31.8k | MemTableFilter mem_table_flush_filter; |
1859 | 31.8k | if (db_options_.mem_table_flush_filter_factory) { |
1860 | 9.58k | mem_table_flush_filter = (*db_options_.mem_table_flush_filter_factory)(); |
1861 | 9.58k | } |
1862 | | |
1863 | 31.8k | FlushJob flush_job( |
1864 | 31.8k | dbname_, cfd, db_options_, mutable_cf_options, env_options_, |
1865 | 31.8k | versions_.get(), &mutex_, &shutting_down_, &disable_flush_on_shutdown_, snapshot_seqs, |
1866 | 31.8k | earliest_write_conflict_snapshot, mem_table_flush_filter, pending_outputs_.get(), |
1867 | 31.8k | job_context, log_buffer, directories_.GetDbDir(), directories_.GetDataDir(0U), |
1868 | 31.8k | GetCompressionFlush(*cfd->ioptions()), stats_, &event_logger_); |
1869 | | |
1870 | 31.8k | FileMetaData file_meta; |
1871 | | |
1872 | | // Within flush_job.Run, rocksdb may call event listener to notify |
1873 | | // file creation and deletion. |
1874 | | // |
1875 | | // Note that flush_job.Run will unlock and lock the db_mutex, |
1876 | | // and EventListener callback will be called when the db_mutex |
1877 | | // is unlocked by the current thread. |
1878 | 31.8k | auto file_number_holder = flush_job.Run(&file_meta); |
1879 | | |
1880 | 31.8k | if (file_number_holder.ok()) { |
1881 | 31.8k | InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context, |
1882 | 31.8k | mutable_cf_options); |
1883 | 31.8k | if (made_progress) { |
1884 | 31.8k | *made_progress = 1; |
1885 | 31.8k | } |
1886 | 31.8k | VersionStorageInfo::LevelSummaryStorage tmp; |
1887 | 31.8k | YB_LOG_EVERY_N_SECS(INFO, 1) |
1888 | 1.67k | << "[" << cfd->GetName() << "] Level summary: " |
1889 | 1.67k | << cfd->current()->storage_info()->LevelSummary(&tmp); |
1890 | 31.8k | } |
1891 | | |
1892 | 31.8k | if (!file_number_holder.ok() && !file_number_holder.status().IsShutdownInProgress() |
1893 | 18 | && db_options_.paranoid_checks && bg_error_.ok()) { |
1894 | | // if a bad error happened (not ShutdownInProgress) and paranoid_checks is |
1895 | | // true, mark DB read-only |
1896 | 18 | bg_error_ = file_number_holder.status(); |
1897 | 18 | } |
1898 | 31.8k | RecordFlushIOStats(); |
1899 | 31.8k | RETURN_NOT_OK(file_number_holder); |
1900 | 31.8k | MAYBE_FAULT(FLAGS_fault_crash_after_rocksdb_flush); |
1901 | 31.8k | #ifndef ROCKSDB_LITE |
1902 | | // may temporarily unlock and lock the mutex. |
1903 | 31.8k | NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, |
1904 | 31.8k | job_context->job_id, flush_job.GetTableProperties()); |
1905 | 31.8k | #endif // ROCKSDB_LITE |
1906 | 31.8k | auto sfm = |
1907 | 31.8k | static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); |
1908 | 31.8k | if (sfm) { |
1909 | | // Notify sst_file_manager that a new file was added |
1910 | 80 | std::string file_path = MakeTableFileName(db_options_.db_paths[0].path, |
1911 | 80 | file_meta.fd.GetNumber()); |
1912 | 80 | RETURN_NOT_OK(sfm->OnAddFile(file_path)); |
1913 | 80 | if (cfd->ioptions()->table_factory->IsSplitSstForWriteSupported()) { |
1914 | 80 | RETURN_NOT_OK(sfm->OnAddFile(TableBaseToDataFileName(file_path))); |
1915 | 80 | } |
1916 | 80 | if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { |
1917 | 2 | bg_error_ = STATUS(IOError, "Max allowed space was reached"); |
1918 | 2 | TEST_SYNC_POINT( |
1919 | 2 | "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached"); |
1920 | 2 | } |
1921 | 80 | } |
1922 | 31.8k | return file_number_holder; |
1923 | 31.8k | } |
1924 | | |
1925 | 348k | uint64_t DBImpl::GetCurrentVersionSstFilesSize() { |
1926 | 348k | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1927 | 348k | GetLiveFilesMetaData(&file_metadata); |
1928 | 348k | uint64_t total_sst_file_size = 0; |
1929 | 46.6k | for (const auto& meta : file_metadata) { |
1930 | 46.6k | total_sst_file_size += meta.total_size; |
1931 | 46.6k | } |
1932 | 348k | return total_sst_file_size; |
1933 | 348k | } |
1934 | | |
1935 | 348k | uint64_t DBImpl::GetCurrentVersionSstFilesUncompressedSize() { |
1936 | 348k | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1937 | 348k | GetLiveFilesMetaData(&file_metadata); |
1938 | 348k | uint64_t total_uncompressed_file_size = 0; |
1939 | 46.6k | for (const auto &meta : file_metadata) { |
1940 | 46.6k | total_uncompressed_file_size += meta.uncompressed_size; |
1941 | 46.6k | } |
1942 | 348k | return total_uncompressed_file_size; |
1943 | 348k | } |
1944 | | |
1945 | 163k | std::pair<uint64_t, uint64_t> DBImpl::GetCurrentVersionSstFilesAllSizes() { |
1946 | 163k | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1947 | 163k | GetLiveFilesMetaData(&file_metadata); |
1948 | 163k | uint64_t total_sst_file_size = 0; |
1949 | 163k | uint64_t total_uncompressed_file_size = 0; |
1950 | 23.2k | for (const auto& meta : file_metadata) { |
1951 | 23.2k | total_sst_file_size += meta.total_size; |
1952 | 23.2k | total_uncompressed_file_size += meta.uncompressed_size; |
1953 | 23.2k | } |
1954 | 163k | return std::pair<uint64_t, uint64_t>(total_sst_file_size, total_uncompressed_file_size); |
1955 | 163k | } |
1956 | | |
1957 | 18.4M | uint64_t DBImpl::GetCurrentVersionNumSSTFiles() { |
1958 | 18.4M | InstrumentedMutexLock lock(&mutex_); |
1959 | 18.4M | return default_cf_handle_->cfd()->current()->storage_info()->NumFiles(); |
1960 | 18.4M | } |
1961 | | |
1962 | 395k | void DBImpl::SetSSTFileTickers() { |
1963 | 395k | if (stats_) { |
1964 | 344k | auto sst_files_size = GetCurrentVersionSstFilesSize(); |
1965 | 344k | SetTickerCount(stats_, CURRENT_VERSION_SST_FILES_SIZE, sst_files_size); |
1966 | 344k | auto uncompressed_sst_files_size = GetCurrentVersionSstFilesUncompressedSize(); |
1967 | 344k | SetTickerCount( |
1968 | 344k | stats_, CURRENT_VERSION_SST_FILES_UNCOMPRESSED_SIZE, uncompressed_sst_files_size); |
1969 | 344k | auto num_sst_files = GetCurrentVersionNumSSTFiles(); |
1970 | 344k | SetTickerCount(stats_, CURRENT_VERSION_NUM_SST_FILES, num_sst_files); |
1971 | 344k | } |
1972 | 395k | } |
1973 | | |
1974 | 12 | uint64_t DBImpl::GetCurrentVersionDataSstFilesSize() { |
1975 | 12 | std::vector<rocksdb::LiveFileMetaData> file_metadata; |
1976 | 12 | GetLiveFilesMetaData(&file_metadata); |
1977 | 12 | uint64_t data_sst_file_size = 0; |
1978 | 54 | for (const auto& meta : file_metadata) { |
1979 | | // Each SST has base/metadata SST file (<number>.sst) and at least one data SST file |
1980 | | // (<number>.sst.sblock.0). |
1981 | | // We subtract SST metadata file size from total SST size to get the SST data file(s) size. |
1982 | 54 | data_sst_file_size += meta.total_size - meta.base_size; |
1983 | 54 | } |
1984 | 12 | return data_sst_file_size; |
1985 | 12 | } |
1986 | | |
1987 | | void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, |
1988 | | FileMetaData* file_meta, |
1989 | | const MutableCFOptions& mutable_cf_options, |
1990 | 31.8k | int job_id, TableProperties prop) { |
1991 | 31.8k | #ifndef ROCKSDB_LITE |
1992 | 31.8k | mutex_.AssertHeld(); |
1993 | 31.8k | if (IsShuttingDown()) { |
1994 | 722 | return; |
1995 | 722 | } |
1996 | 31.1k | if (db_options_.listeners.size() > 0) { |
1997 | 2.56k | int num_0level_files = cfd->current()->storage_info()->NumLevelFiles(0); |
1998 | 2.56k | bool triggered_writes_slowdown = |
1999 | 2.56k | num_0level_files >= mutable_cf_options.level0_slowdown_writes_trigger; |
2000 | 2.56k | bool triggered_writes_stop = |
2001 | 2.56k | num_0level_files >= mutable_cf_options.level0_stop_writes_trigger; |
2002 | | // release lock while notifying events |
2003 | 2.56k | mutex_.Unlock(); |
2004 | 2.56k | { |
2005 | 2.56k | FlushJobInfo info; |
2006 | 2.56k | info.cf_name = cfd->GetName(); |
2007 | | // TODO(yhchiang): make db_paths dynamic in case flush does not |
2008 | | // go to L0 in the future. |
2009 | 2.56k | info.file_path = MakeTableFileName(db_options_.db_paths[0].path, |
2010 | 2.56k | file_meta->fd.GetNumber()); |
2011 | 2.56k | info.thread_id = env_->GetThreadID(); |
2012 | 2.56k | info.job_id = job_id; |
2013 | 2.56k | info.triggered_writes_slowdown = triggered_writes_slowdown; |
2014 | 2.56k | info.triggered_writes_stop = triggered_writes_stop; |
2015 | 2.56k | info.smallest_seqno = file_meta->smallest.seqno; |
2016 | 2.56k | info.largest_seqno = file_meta->largest.seqno; |
2017 | 2.56k | info.table_properties = prop; |
2018 | 2.91k | for (auto listener : db_options_.listeners) { |
2019 | 2.91k | listener->OnFlushCompleted(this, info); |
2020 | 2.91k | } |
2021 | 2.56k | } |
2022 | 28.5k | } else { |
2023 | 28.5k | mutex_.Unlock(); |
2024 | 28.5k | } |
2025 | 31.1k | SetSSTFileTickers(); |
2026 | 31.1k | mutex_.Lock(); |
2027 | | // no need to signal bg_cv_ as it will be signaled at the end of the |
2028 | | // flush process. |
2029 | 31.1k | #endif // ROCKSDB_LITE |
2030 | 31.1k | } |
2031 | | |
2032 | | Status DBImpl::CompactRange(const CompactRangeOptions& options, |
2033 | | ColumnFamilyHandle* column_family, |
2034 | 1.63k | const Slice* begin, const Slice* end) { |
2035 | 1.63k | if (options.target_path_id >= db_options_.db_paths.size()) { |
2036 | 4 | return STATUS(InvalidArgument, "Invalid target path ID"); |
2037 | 4 | } |
2038 | | |
2039 | 1.62k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2040 | 1.62k | auto cfd = cfh->cfd(); |
2041 | 1.62k | bool exclusive = options.exclusive_manual_compaction; |
2042 | | |
2043 | 1.62k | Status s = FlushMemTable(cfd, FlushOptions()); |
2044 | 1.62k | if (!s.ok()) { |
2045 | 0 | LogFlush(db_options_.info_log); |
2046 | 0 | return s; |
2047 | 0 | } |
2048 | | |
2049 | 1.62k | int max_level_with_files = 0; |
2050 | 1.62k | { |
2051 | 1.62k | InstrumentedMutexLock l(&mutex_); |
2052 | 1.62k | Version* base = cfd->current(); |
2053 | 3.57k | for (int level = 1; level < base->storage_info()->num_non_empty_levels(); |
2054 | 1.95k | level++) { |
2055 | 1.95k | if (base->storage_info()->OverlapInLevel(level, begin, end)) { |
2056 | 1.44k | max_level_with_files = level; |
2057 | 1.44k | } |
2058 | 1.95k | } |
2059 | 1.62k | } |
2060 | | |
2061 | 1.62k | int final_output_level = 0; |
2062 | 1.62k | if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && |
2063 | 266 | cfd->NumberLevels() > 1) { |
2064 | | // Always compact all files together. |
2065 | 88 | s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, |
2066 | 88 | cfd->NumberLevels() - 1, options.target_path_id, |
2067 | 88 | begin, end, exclusive); |
2068 | 88 | final_output_level = cfd->NumberLevels() - 1; |
2069 | 1.53k | } else { |
2070 | 4.53k | for (int level = 0; level <= max_level_with_files; level++) { |
2071 | 2.99k | int output_level; |
2072 | | // in case the compaction is universal or if we're compacting the |
2073 | | // bottom-most level, the output level will be the same as input one. |
2074 | | // level 0 can never be the bottommost level (i.e. if all files are in |
2075 | | // level 0, we will compact to level 1) |
2076 | 2.99k | if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || |
2077 | 2.82k | cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { |
2078 | 186 | output_level = level; |
2079 | 2.81k | } else if (level == max_level_with_files && level > 0) { |
2080 | 1.03k | if (options.bottommost_level_compaction == |
2081 | 5 | BottommostLevelCompaction::kSkip) { |
2082 | | // Skip bottommost level compaction |
2083 | 5 | continue; |
2084 | 1.03k | } else if (options.bottommost_level_compaction == |
2085 | 1.03k | BottommostLevelCompaction::kIfHaveCompactionFilter && |
2086 | 1.02k | cfd->ioptions()->compaction_filter == nullptr && |
2087 | 1.02k | cfd->ioptions()->compaction_filter_factory == nullptr) { |
2088 | | // Skip bottommost level compaction since we don't have a compaction |
2089 | | // filter |
2090 | 1.00k | continue; |
2091 | 1.00k | } |
2092 | 26 | output_level = level; |
2093 | 1.77k | } else { |
2094 | 1.77k | output_level = level + 1; |
2095 | 1.77k | if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && |
2096 | 1.77k | cfd->ioptions()->level_compaction_dynamic_level_bytes && |
2097 | 32 | level == 0) { |
2098 | 8 | output_level = ColumnFamilyData::kCompactToBaseLevel; |
2099 | 8 | } |
2100 | 1.77k | } |
2101 | 1.98k | s = RunManualCompaction(cfd, level, output_level, options.target_path_id, |
2102 | 1.98k | begin, end, exclusive); |
2103 | 1.98k | if (!s.ok()) { |
2104 | 2 | break; |
2105 | 2 | } |
2106 | 1.98k | if (output_level == ColumnFamilyData::kCompactToBaseLevel) { |
2107 | 8 | final_output_level = cfd->NumberLevels() - 1; |
2108 | 1.97k | } else if (output_level > final_output_level) { |
2109 | 1.74k | final_output_level = output_level; |
2110 | 1.74k | } |
2111 | 1.98k | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); |
2112 | 1.98k | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); |
2113 | 1.98k | } |
2114 | 1.53k | } |
2115 | 1.62k | if (!s.ok()) { |
2116 | 12 | LogFlush(db_options_.info_log); |
2117 | 12 | return s; |
2118 | 12 | } |
2119 | | |
2120 | 1.61k | if (options.change_level) { |
2121 | 42 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2122 | 42 | "[RefitLevel] waiting for background threads to stop"); |
2123 | 42 | s = PauseBackgroundWork(); |
2124 | 42 | if (s.ok()) { |
2125 | 42 | s = ReFitLevel(cfd, final_output_level, options.target_level); |
2126 | 42 | } |
2127 | 42 | CHECK_OK(ContinueBackgroundWork()); |
2128 | 42 | } |
2129 | 1.61k | LogFlush(db_options_.info_log); |
2130 | | |
2131 | 1.61k | { |
2132 | 1.61k | InstrumentedMutexLock lock(&mutex_); |
2133 | | // an automatic compaction that has been scheduled might have been |
2134 | | // preempted by the manual compactions. Need to schedule it back. |
2135 | 1.61k | if (exclusive) { |
2136 | | // all compaction scheduling was stopped so we reschedule for each cf |
2137 | 1.56k | ColumnFamilySet* columnFamilySet = versions_->GetColumnFamilySet(); |
2138 | 3.68k | for (auto it = columnFamilySet->begin(); it != columnFamilySet->end(); ++it) { |
2139 | 2.11k | SchedulePendingCompaction(*it); |
2140 | 2.11k | } |
2141 | 51 | } else { |
2142 | | // only compactions in this column family were stopped |
2143 | 51 | SchedulePendingCompaction(cfd); |
2144 | 51 | } |
2145 | 1.61k | MaybeScheduleFlushOrCompaction(); |
2146 | 1.61k | } |
2147 | | |
2148 | 1.61k | return s; |
2149 | 1.61k | } |
2150 | | |
2151 | | Status DBImpl::CompactFiles( |
2152 | | const CompactionOptions& compact_options, |
2153 | | ColumnFamilyHandle* column_family, |
2154 | | const std::vector<std::string>& input_file_names, |
2155 | 21 | const int output_level, const int output_path_id) { |
2156 | | #ifdef ROCKSDB_LITE |
2157 | | // not supported in lite version |
2158 | | return STATUS(NotSupported, "Not supported in ROCKSDB LITE"); |
2159 | | #else |
2160 | 21 | if (column_family == nullptr) { |
2161 | 0 | return STATUS(InvalidArgument, "ColumnFamilyHandle must be non-null."); |
2162 | 0 | } |
2163 | | |
2164 | 21 | auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
2165 | 21 | assert(cfd); |
2166 | | |
2167 | 21 | Status s; |
2168 | 21 | JobContext job_context(0, true); |
2169 | 21 | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, |
2170 | 21 | db_options_.info_log.get()); |
2171 | | |
2172 | | // Perform CompactFiles |
2173 | 21 | SuperVersion* sv = GetAndRefSuperVersion(cfd); |
2174 | 21 | { |
2175 | 21 | InstrumentedMutexLock l(&mutex_); |
2176 | | |
2177 | 21 | s = CompactFilesImpl(compact_options, cfd, sv->current, |
2178 | 21 | input_file_names, output_level, |
2179 | 21 | output_path_id, &job_context, &log_buffer); |
2180 | 21 | } |
2181 | 21 | ReturnAndCleanupSuperVersion(cfd, sv); |
2182 | | |
2183 | | // Find and delete obsolete files |
2184 | 21 | { |
2185 | 21 | InstrumentedMutexLock l(&mutex_); |
2186 | | // If !s.ok(), this means that Compaction failed. In that case, we want |
2187 | | // to delete all obsolete files we might have created and we force |
2188 | | // FindObsoleteFiles(). This is because job_context does not |
2189 | | // catch all created files if compaction failed. |
2190 | 21 | FindObsoleteFiles(&job_context, !s.ok()); |
2191 | 21 | } // release the mutex |
2192 | | |
2193 | | // delete unnecessary files if any, this is done outside the mutex |
2194 | 21 | if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { |
2195 | | // Have to flush the info logs before bg_compaction_scheduled_-- |
2196 | | // because if bg_flush_scheduled_ becomes 0 and the lock is |
2197 | | // released, the deconstructor of DB can kick in and destroy all the |
2198 | | // states of DB so info_log might not be available after that point. |
2199 | | // It also applies to access other states that DB owns. |
2200 | 21 | log_buffer.FlushBufferToLog(); |
2201 | 21 | if (job_context.HaveSomethingToDelete()) { |
2202 | | // no mutex is locked here. No need to Unlock() and Lock() here. |
2203 | 21 | PurgeObsoleteFiles(job_context); |
2204 | 21 | } |
2205 | 21 | job_context.Clean(); |
2206 | 21 | } |
2207 | | |
2208 | 21 | return s; |
2209 | 21 | #endif // ROCKSDB_LITE |
2210 | 21 | } |
2211 | | |
2212 | | #ifndef ROCKSDB_LITE |
2213 | | Status DBImpl::CompactFilesImpl( |
2214 | | const CompactionOptions& compact_options, ColumnFamilyData* cfd, |
2215 | | Version* version, const std::vector<std::string>& input_file_names, |
2216 | | const int output_level, int output_path_id, JobContext* job_context, |
2217 | 21 | LogBuffer* log_buffer) { |
2218 | 21 | mutex_.AssertHeld(); |
2219 | | |
2220 | 21 | if (IsShuttingDown()) { |
2221 | 0 | return STATUS(ShutdownInProgress, ""); |
2222 | 0 | } |
2223 | | |
2224 | 21 | std::unordered_set<uint64_t> input_set; |
2225 | 63 | for (auto file_name : input_file_names) { |
2226 | 63 | input_set.insert(TableFileNameToNumber(file_name)); |
2227 | 63 | } |
2228 | | |
2229 | 21 | ColumnFamilyMetaData cf_meta; |
2230 | | // TODO(yhchiang): can directly use version here if none of the |
2231 | | // following functions call is pluggable to external developers. |
2232 | 21 | version->GetColumnFamilyMetaData(&cf_meta); |
2233 | | |
2234 | 21 | if (output_path_id < 0) { |
2235 | 21 | if (db_options_.db_paths.size() == 1U) { |
2236 | 21 | output_path_id = 0; |
2237 | 0 | } else { |
2238 | 0 | return STATUS(NotSupported, |
2239 | 0 | "Automatic output path selection is not " |
2240 | 0 | "yet supported in CompactFiles()"); |
2241 | 0 | } |
2242 | 21 | } |
2243 | | |
2244 | 21 | Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( |
2245 | 21 | &input_set, cf_meta, output_level); |
2246 | 21 | if (!s.ok()) { |
2247 | 6 | return s; |
2248 | 6 | } |
2249 | | |
2250 | 15 | std::vector<CompactionInputFiles> input_files; |
2251 | 15 | s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( |
2252 | 15 | &input_files, &input_set, version->storage_info(), compact_options); |
2253 | 15 | if (!s.ok()) { |
2254 | 0 | return s; |
2255 | 0 | } |
2256 | | |
2257 | 15 | for (auto inputs : input_files) { |
2258 | 15 | if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { |
2259 | 0 | return STATUS(Aborted, |
2260 | 0 | "Some of the necessary compaction input " |
2261 | 0 | "files are already being compacted"); |
2262 | 0 | } |
2263 | 15 | } |
2264 | | |
2265 | | // At this point, CompactFiles will be run. |
2266 | 15 | bg_compaction_scheduled_++; |
2267 | | |
2268 | 15 | assert(cfd->compaction_picker()); |
2269 | 15 | unique_ptr<Compaction> c = cfd->compaction_picker()->FormCompaction( |
2270 | 15 | compact_options, input_files, output_level, version->storage_info(), |
2271 | 15 | *cfd->GetLatestMutableCFOptions(), output_path_id); |
2272 | 15 | if (!c) { |
2273 | 0 | return STATUS(Aborted, "Another Level 0 compaction is running or nothing to compact"); |
2274 | 0 | } |
2275 | 15 | c->SetInputVersion(version); |
2276 | | // deletion compaction currently not allowed in CompactFiles. |
2277 | 15 | assert(!c->deletion_compaction()); |
2278 | | |
2279 | 15 | SequenceNumber earliest_write_conflict_snapshot; |
2280 | 15 | std::vector<SequenceNumber> snapshot_seqs = |
2281 | 15 | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
2282 | | |
2283 | 15 | assert(is_snapshot_supported_ || snapshots_.empty()); |
2284 | 15 | CompactionJob compaction_job( |
2285 | 15 | job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), |
2286 | 15 | &shutting_down_, log_buffer, directories_.GetDbDir(), |
2287 | 15 | directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, |
2288 | 15 | snapshot_seqs, earliest_write_conflict_snapshot, pending_outputs_.get(), table_cache_, |
2289 | 15 | &event_logger_, c->mutable_cf_options()->paranoid_file_checks, |
2290 | 15 | c->mutable_cf_options()->compaction_measure_io_stats, dbname_, |
2291 | 15 | nullptr); // Here we pass a nullptr for CompactionJobStats because |
2292 | | // CompactFiles does not trigger OnCompactionCompleted(), |
2293 | | // which is the only place where CompactionJobStats is |
2294 | | // returned. The idea of not triggering OnCompationCompleted() |
2295 | | // is that CompactFiles runs in the caller thread, so the user |
2296 | | // should always know when it completes. As a result, it makes |
2297 | | // less sense to notify the users something they should already |
2298 | | // know. |
2299 | | // |
2300 | | // In the future, if we would like to add CompactionJobStats |
2301 | | // support for CompactFiles, we should have CompactFiles API |
2302 | | // pass a pointer of CompactionJobStats as the out-value |
2303 | | // instead of using EventListener. |
2304 | | |
2305 | | // Creating a compaction influences the compaction score because the score |
2306 | | // takes running compactions into account (by skipping files that are already |
2307 | | // being compacted). Since we just changed compaction score, we recalculate it |
2308 | | // here. |
2309 | 15 | { |
2310 | 15 | CompactionOptionsFIFO dummy_compaction_options_fifo; |
2311 | 15 | version->storage_info()->ComputeCompactionScore( |
2312 | 15 | *c->mutable_cf_options(), dummy_compaction_options_fifo); |
2313 | 15 | } |
2314 | | |
2315 | 15 | compaction_job.Prepare(); |
2316 | | |
2317 | 15 | Status status; |
2318 | 15 | { |
2319 | 15 | mutex_.Unlock(); |
2320 | 3 | for (auto listener : db_options_.listeners) { |
2321 | 3 | listener->OnCompactionStarted(); |
2322 | 3 | } |
2323 | 15 | auto file_numbers_holder = compaction_job.Run(); |
2324 | 15 | TEST_SYNC_POINT("CompactFilesImpl:2"); |
2325 | 15 | TEST_SYNC_POINT("CompactFilesImpl:3"); |
2326 | 15 | mutex_.Lock(); |
2327 | | |
2328 | 15 | status = compaction_job.Install(*c->mutable_cf_options()); |
2329 | 15 | if (status.ok()) { |
2330 | 15 | InstallSuperVersionAndScheduleWorkWrapper( |
2331 | 15 | c->column_family_data(), job_context, *c->mutable_cf_options()); |
2332 | 15 | } |
2333 | 15 | c->ReleaseCompactionFiles(s); |
2334 | 15 | } |
2335 | | |
2336 | 15 | if (status.ok()) { |
2337 | | // Done |
2338 | 0 | } else if (status.IsShutdownInProgress()) { |
2339 | | // Ignore compaction errors found during shutting down |
2340 | 0 | } else { |
2341 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
2342 | 0 | "[%s] [JOB %d] Compaction error: %s", |
2343 | 0 | c->column_family_data()->GetName().c_str(), job_context->job_id, |
2344 | 0 | status.ToString().c_str()); |
2345 | 0 | if (db_options_.paranoid_checks && bg_error_.ok()) { |
2346 | 0 | bg_error_ = status; |
2347 | 0 | } |
2348 | 0 | } |
2349 | | |
2350 | 15 | c.reset(); |
2351 | | |
2352 | 15 | bg_compaction_scheduled_--; |
2353 | 15 | if (bg_compaction_scheduled_ == 0) { |
2354 | 15 | bg_cv_.SignalAll(); |
2355 | 15 | } |
2356 | | |
2357 | 15 | return status; |
2358 | 15 | } |
2359 | | #endif // ROCKSDB_LITE |
2360 | | |
2361 | 43 | Status DBImpl::PauseBackgroundWork() { |
2362 | 43 | InstrumentedMutexLock guard_lock(&mutex_); |
2363 | 43 | bg_compaction_paused_++; |
2364 | 43 | while (CheckBackgroundWorkAndLog("Pause")) { |
2365 | 0 | bg_cv_.Wait(); |
2366 | 0 | } |
2367 | 43 | bg_work_paused_++; |
2368 | 43 | return Status::OK(); |
2369 | 43 | } |
2370 | | |
2371 | 43 | Status DBImpl::ContinueBackgroundWork() { |
2372 | 43 | InstrumentedMutexLock guard_lock(&mutex_); |
2373 | 43 | if (bg_work_paused_ == 0) { |
2374 | 0 | return STATUS(InvalidArgument, ""); |
2375 | 0 | } |
2376 | 43 | assert(bg_work_paused_ > 0); |
2377 | 43 | assert(bg_compaction_paused_ > 0); |
2378 | 43 | bg_compaction_paused_--; |
2379 | 43 | bg_work_paused_--; |
2380 | | // It's sufficient to check just bg_work_paused_ here since |
2381 | | // bg_work_paused_ is always no greater than bg_compaction_paused_ |
2382 | 43 | if (bg_work_paused_ == 0) { |
2383 | 43 | MaybeScheduleFlushOrCompaction(); |
2384 | 43 | } |
2385 | 43 | return Status::OK(); |
2386 | 43 | } |
2387 | | |
2388 | | void DBImpl::NotifyOnCompactionCompleted( |
2389 | | ColumnFamilyData* cfd, Compaction *c, const Status &st, |
2390 | | const CompactionJobStats& compaction_job_stats, |
2391 | 22.9k | const int job_id) { |
2392 | 22.9k | #ifndef ROCKSDB_LITE |
2393 | 22.9k | mutex_.AssertHeld(); |
2394 | 22.9k | if (IsShuttingDown()) { |
2395 | 66 | return; |
2396 | 66 | } |
2397 | 22.8k | VersionPtr current = cfd->current(); |
2398 | | // release lock while notifying events |
2399 | 22.8k | mutex_.Unlock(); |
2400 | 22.8k | if (db_options_.listeners.size() > 0) { |
2401 | 641 | CompactionJobInfo info; |
2402 | 641 | info.cf_name = cfd->GetName(); |
2403 | 641 | info.status = st; |
2404 | 641 | info.thread_id = env_->GetThreadID(); |
2405 | 641 | info.job_id = job_id; |
2406 | 641 | info.base_input_level = c->start_level(); |
2407 | 641 | info.output_level = c->output_level(); |
2408 | 641 | info.stats = compaction_job_stats; |
2409 | 641 | info.table_properties = c->GetOutputTableProperties(); |
2410 | 641 | info.compaction_reason = c->compaction_reason(); |
2411 | 641 | info.is_full_compaction = c->is_full_compaction(); |
2412 | 1.57k | for (size_t i = 0; i < c->num_input_levels(); ++i) { |
2413 | 2.54k | for (const auto fmd : *c->inputs(i)) { |
2414 | 2.54k | auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(), |
2415 | 2.54k | fmd->fd.GetPathId()); |
2416 | 2.54k | info.input_files.push_back(fn); |
2417 | 2.54k | if (info.table_properties.count(fn) == 0) { |
2418 | 2.54k | std::shared_ptr<const TableProperties> tp; |
2419 | 2.54k | auto s = current->GetTableProperties(&tp, fmd, &fn); |
2420 | 2.54k | if (s.ok()) { |
2421 | 2.54k | info.table_properties[fn] = tp; |
2422 | 2.54k | } |
2423 | 2.54k | } |
2424 | 2.54k | } |
2425 | 932 | } |
2426 | 808 | for (const auto& newf : c->edit()->GetNewFiles()) { |
2427 | 808 | info.output_files.push_back( |
2428 | 808 | TableFileName(db_options_.db_paths, |
2429 | 808 | newf.second.fd.GetNumber(), |
2430 | 808 | newf.second.fd.GetPathId())); |
2431 | 808 | } |
2432 | 646 | for (auto listener : db_options_.listeners) { |
2433 | 646 | listener->OnCompactionCompleted(this, info); |
2434 | 646 | } |
2435 | 641 | } |
2436 | 22.8k | SetSSTFileTickers(); |
2437 | 22.8k | mutex_.Lock(); |
2438 | | // no need to signal bg_cv_ as it will be signaled at the end of the |
2439 | | // flush process. |
2440 | 22.8k | #endif // ROCKSDB_LITE |
2441 | 22.8k | } |
2442 | | |
2443 | 310k | void DBImpl::SetDisableFlushOnShutdown(bool disable_flush_on_shutdown) { |
2444 | | // disable_flush_on_shutdown_ can only transition from false to true. This location |
2445 | | // can be called multiple times with arg as false. It is only called once with arg |
2446 | | // as true. Subsequently, the destructor reads this flag. Setting this flag |
2447 | | // to true and the destructor are expected to run on the same thread and hence |
2448 | | // it is not required for disable_flush_on_shutdown_ to be atomic. |
2449 | 310k | if (disable_flush_on_shutdown) { |
2450 | 309k | disable_flush_on_shutdown_ = disable_flush_on_shutdown; |
2451 | 309k | } |
2452 | 310k | } |
2453 | | |
2454 | | Status DBImpl::SetOptions( |
2455 | | ColumnFamilyHandle* column_family, |
2456 | | const std::unordered_map<std::string, std::string>& options_map, |
2457 | 656k | bool dump_options) { |
2458 | | #ifdef ROCKSDB_LITE |
2459 | | return STATUS(NotSupported, "Not supported in ROCKSDB LITE"); |
2460 | | #else |
2461 | 656k | auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
2462 | 656k | if (options_map.empty()) { |
2463 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, |
2464 | 0 | db_options_.info_log, "SetOptions() on column family [%s], empty input", |
2465 | 0 | cfd->GetName().c_str()); |
2466 | 0 | return STATUS(InvalidArgument, "empty input"); |
2467 | 0 | } |
2468 | | |
2469 | 656k | MutableCFOptions new_options; |
2470 | 656k | Status s; |
2471 | 656k | Status persist_options_status; |
2472 | 656k | { |
2473 | 656k | InstrumentedMutexLock l(&mutex_); |
2474 | 656k | s = cfd->SetOptions(options_map); |
2475 | 656k | if (s.ok()) { |
2476 | 656k | new_options = *cfd->GetLatestMutableCFOptions(); |
2477 | 656k | } |
2478 | 656k | if (s.ok()) { |
2479 | | // Persist RocksDB options under the single write thread |
2480 | 656k | WriteThread::Writer w; |
2481 | 656k | write_thread_.EnterUnbatched(&w, &mutex_); |
2482 | | |
2483 | 656k | persist_options_status = WriteOptionsFile(); |
2484 | | |
2485 | 656k | write_thread_.ExitUnbatched(&w); |
2486 | 656k | } |
2487 | 656k | } |
2488 | | |
2489 | 656k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2490 | 656k | "SetOptions() on column family [%s], inputs: %s", |
2491 | 656k | cfd->GetName().c_str(), yb::AsString(options_map).c_str()); |
2492 | 656k | if (s.ok()) { |
2493 | 656k | RLOG(InfoLogLevel::INFO_LEVEL, |
2494 | 656k | db_options_.info_log, "[%s] SetOptions succeeded", |
2495 | 656k | cfd->GetName().c_str()); |
2496 | 656k | if (dump_options) { |
2497 | 28 | new_options.Dump(db_options_.info_log.get()); |
2498 | 28 | } |
2499 | 656k | if (!persist_options_status.ok()) { |
2500 | 0 | if (db_options_.fail_if_options_file_error) { |
2501 | 0 | s = STATUS(IOError, |
2502 | 0 | "SetOptions succeeded, but unable to persist options", |
2503 | 0 | persist_options_status.ToString()); |
2504 | 0 | } |
2505 | 0 | RWARN(db_options_.info_log, |
2506 | 0 | "Unable to persist options in SetOptions() -- %s", |
2507 | 0 | persist_options_status.ToString().c_str()); |
2508 | 0 | } |
2509 | 18.4E | } else { |
2510 | 18.4E | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
2511 | 18.4E | "[%s] SetOptions failed", cfd->GetName().c_str()); |
2512 | 18.4E | } |
2513 | 656k | LogFlush(db_options_.info_log); |
2514 | 656k | return s; |
2515 | 656k | #endif // ROCKSDB_LITE |
2516 | 656k | } |
2517 | | |
2518 | | // return the same level if it cannot be moved |
2519 | | int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, |
2520 | 0 | const MutableCFOptions& mutable_cf_options, int level) { |
2521 | 0 | mutex_.AssertHeld(); |
2522 | 0 | const auto* vstorage = cfd->current()->storage_info(); |
2523 | 0 | int minimum_level = level; |
2524 | 0 | for (int i = level - 1; i > 0; --i) { |
2525 | | // stop if level i is not empty |
2526 | 0 | if (vstorage->NumLevelFiles(i) > 0) break; |
2527 | | // stop if level i is too small (cannot fit the level files) |
2528 | 0 | if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) { |
2529 | 0 | break; |
2530 | 0 | } |
2531 | | |
2532 | 0 | minimum_level = i; |
2533 | 0 | } |
2534 | 0 | return minimum_level; |
2535 | 0 | } |
2536 | | |
2537 | | // REQUIREMENT: block all background work by calling PauseBackgroundWork() |
2538 | | // before calling this function |
2539 | 42 | Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { |
2540 | 42 | assert(level < cfd->NumberLevels()); |
2541 | 42 | if (target_level >= cfd->NumberLevels()) { |
2542 | 0 | return STATUS(InvalidArgument, "Target level exceeds number of levels"); |
2543 | 0 | } |
2544 | | |
2545 | 42 | std::unique_ptr<SuperVersion> superversion_to_free; |
2546 | 42 | std::unique_ptr<SuperVersion> new_superversion(new SuperVersion()); |
2547 | | |
2548 | 42 | Status status; |
2549 | | |
2550 | 42 | InstrumentedMutexLock guard_lock(&mutex_); |
2551 | | |
2552 | | // only allow one thread refitting |
2553 | 42 | if (refitting_level_) { |
2554 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2555 | 0 | "[ReFitLevel] another thread is refitting"); |
2556 | 0 | return STATUS(NotSupported, "another thread is refitting"); |
2557 | 0 | } |
2558 | 42 | refitting_level_ = true; |
2559 | | |
2560 | 42 | const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); |
2561 | | // move to a smaller level |
2562 | 42 | int to_level = target_level; |
2563 | 42 | if (target_level < 0) { |
2564 | 0 | to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); |
2565 | 0 | } |
2566 | | |
2567 | 42 | auto* vstorage = cfd->current()->storage_info(); |
2568 | 42 | if (to_level > level) { |
2569 | 22 | if (level == 0) { |
2570 | 0 | return STATUS(NotSupported, |
2571 | 0 | "Cannot change from level 0 to other levels."); |
2572 | 0 | } |
2573 | | // Check levels are empty for a trivial move |
2574 | 84 | for (int l = level + 1; l <= to_level; l++) { |
2575 | 62 | if (vstorage->NumLevelFiles(l) > 0) { |
2576 | 0 | return STATUS(NotSupported, |
2577 | 0 | "Levels between source and target are not empty for a move."); |
2578 | 0 | } |
2579 | 62 | } |
2580 | 22 | } |
2581 | 42 | if (to_level != level) { |
2582 | 39 | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
2583 | 39 | "[%s] Before refitting:\n%s", cfd->GetName().c_str(), |
2584 | 39 | cfd->current()->DebugString().data()); |
2585 | | |
2586 | 39 | VersionEdit edit; |
2587 | 39 | edit.SetColumnFamily(cfd->GetID()); |
2588 | 75 | for (const auto& f : vstorage->LevelFiles(level)) { |
2589 | 75 | edit.DeleteFile(level, f->fd.GetNumber()); |
2590 | 75 | edit.AddCleanedFile(to_level, *f); |
2591 | 75 | } |
2592 | 39 | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
2593 | 39 | "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), |
2594 | 39 | edit.DebugString().data()); |
2595 | | |
2596 | 39 | status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, |
2597 | 39 | directories_.GetDbDir()); |
2598 | 39 | superversion_to_free = InstallSuperVersionAndScheduleWork( |
2599 | 39 | cfd, new_superversion.release(), mutable_cf_options); |
2600 | | |
2601 | 39 | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
2602 | 39 | "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), |
2603 | 39 | status.ToString().data()); |
2604 | | |
2605 | 39 | if (status.ok()) { |
2606 | 39 | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
2607 | 39 | "[%s] After refitting:\n%s", cfd->GetName().c_str(), |
2608 | 39 | cfd->current()->DebugString().data()); |
2609 | 39 | } |
2610 | 39 | } |
2611 | | |
2612 | 42 | refitting_level_ = false; |
2613 | | |
2614 | 42 | return status; |
2615 | 42 | } |
2616 | | |
2617 | 1.34k | int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { |
2618 | 1.34k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2619 | 1.34k | return cfh->cfd()->NumberLevels(); |
2620 | 1.34k | } |
2621 | | |
2622 | 0 | int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { |
2623 | 0 | return 0; |
2624 | 0 | } |
2625 | | |
2626 | 0 | int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { |
2627 | 0 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2628 | 0 | InstrumentedMutexLock l(&mutex_); |
2629 | 0 | return cfh->cfd()->GetSuperVersion()-> |
2630 | 0 | mutable_cf_options.level0_stop_writes_trigger; |
2631 | 0 | } |
2632 | | |
2633 | | Status DBImpl::Flush(const FlushOptions& flush_options, |
2634 | 252k | ColumnFamilyHandle* column_family) { |
2635 | 252k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2636 | 252k | return FlushMemTable(cfh->cfd(), flush_options); |
2637 | 252k | } |
2638 | | |
2639 | 99 | Status DBImpl::WaitForFlush(ColumnFamilyHandle* column_family) { |
2640 | 99 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2641 | | // Wait until the flush completes. |
2642 | 99 | return WaitForFlushMemTable(cfh->cfd()); |
2643 | 99 | } |
2644 | | |
2645 | 7 | Status DBImpl::SyncWAL() { |
2646 | 7 | autovector<log::Writer*, 1> logs_to_sync; |
2647 | 7 | bool need_log_dir_sync; |
2648 | 7 | uint64_t current_log_number; |
2649 | | |
2650 | 7 | { |
2651 | 7 | InstrumentedMutexLock l(&mutex_); |
2652 | 7 | assert(!logs_.empty()); |
2653 | | |
2654 | | // This SyncWAL() call only cares about logs up to this number. |
2655 | 7 | current_log_number = logfile_number_; |
2656 | | |
2657 | 7 | while (logs_.front().number <= current_log_number && |
2658 | 7 | logs_.front().getting_synced) { |
2659 | 0 | log_sync_cv_.Wait(); |
2660 | 0 | } |
2661 | | // First check that logs are safe to sync in background. |
2662 | 7 | for (auto it = logs_.begin(); |
2663 | 15 | it != logs_.end() && it->number <= current_log_number; ++it) { |
2664 | 9 | if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) { |
2665 | 1 | return STATUS(NotSupported, |
2666 | 1 | "SyncWAL() is not supported for this implementation of WAL file", |
2667 | 1 | db_options_.allow_mmap_writes |
2668 | 1 | ? "try setting Options::allow_mmap_writes to false" |
2669 | 1 | : yb::Slice()); |
2670 | 1 | } |
2671 | 9 | } |
2672 | 6 | for (auto it = logs_.begin(); |
2673 | 14 | it != logs_.end() && it->number <= current_log_number; ++it) { |
2674 | 8 | auto& log = *it; |
2675 | 8 | assert(!log.getting_synced); |
2676 | 8 | log.getting_synced = true; |
2677 | 8 | logs_to_sync.push_back(log.writer); |
2678 | 8 | } |
2679 | | |
2680 | 6 | need_log_dir_sync = !log_dir_synced_; |
2681 | 6 | } |
2682 | | |
2683 | 6 | RecordTick(stats_, WAL_FILE_SYNCED); |
2684 | 6 | Status status; |
2685 | 8 | for (log::Writer* log : logs_to_sync) { |
2686 | 8 | status = log->file()->SyncWithoutFlush(db_options_.use_fsync); |
2687 | 8 | if (!status.ok()) { |
2688 | 0 | break; |
2689 | 0 | } |
2690 | 8 | } |
2691 | 6 | if (status.ok() && need_log_dir_sync) { |
2692 | 5 | status = directories_.GetWalDir()->Fsync(); |
2693 | 5 | } |
2694 | | |
2695 | 6 | TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); |
2696 | 6 | TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); |
2697 | | |
2698 | 6 | { |
2699 | 6 | InstrumentedMutexLock l(&mutex_); |
2700 | 6 | MarkLogsSynced(current_log_number, need_log_dir_sync, status); |
2701 | 6 | } |
2702 | | |
2703 | 6 | return status; |
2704 | 7 | } |
2705 | | |
2706 | | void DBImpl::MarkLogsSynced( |
2707 | 113 | uint64_t up_to, bool synced_dir, const Status& status) { |
2708 | 113 | mutex_.AssertHeld(); |
2709 | 113 | if (synced_dir && |
2710 | 75 | logfile_number_ == up_to && |
2711 | 73 | status.ok()) { |
2712 | 43 | log_dir_synced_ = true; |
2713 | 43 | } |
2714 | 231 | for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { |
2715 | 118 | auto& log = *it; |
2716 | 118 | assert(log.getting_synced); |
2717 | 118 | if (status.ok() && logs_.size() > 1) { |
2718 | 7 | logs_to_free_.push_back(log.ReleaseWriter()); |
2719 | 7 | it = logs_.erase(it); |
2720 | 111 | } else { |
2721 | 111 | log.getting_synced = false; |
2722 | 111 | ++it; |
2723 | 111 | } |
2724 | 118 | } |
2725 | 113 | assert(logs_.empty() || logs_[0].number > up_to || |
2726 | 113 | (logs_.size() == 1 && !logs_[0].getting_synced)); |
2727 | 113 | log_sync_cv_.SignalAll(); |
2728 | 113 | } |
2729 | | |
2730 | 324k | SequenceNumber DBImpl::GetLatestSequenceNumber() const { |
2731 | 324k | return versions_->LastSequence(); |
2732 | 324k | } |
2733 | | |
2734 | 471 | void DBImpl::SubmitCompactionOrFlushTask(std::unique_ptr<ThreadPoolTask> task) { |
2735 | 471 | mutex_.AssertHeld(); |
2736 | 471 | if (task->Type() == BgTaskType::kCompaction) { |
2737 | 440 | compaction_tasks_.insert(down_cast<CompactionTask*>(task.get())); |
2738 | 440 | } |
2739 | 471 | auto status = db_options_.priority_thread_pool_for_compactions_and_flushes->Submit( |
2740 | 471 | task->Priority(), &task); |
2741 | 471 | if (!status.ok()) { |
2742 | 0 | task->AbortedUnlocked(status); |
2743 | 0 | } |
2744 | 471 | } |
2745 | | |
2746 | | Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, |
2747 | | int output_level, uint32_t output_path_id, |
2748 | | const Slice* begin, const Slice* end, |
2749 | 5.68k | bool exclusive, bool disallow_trivial_move) { |
2750 | 5.68k | TEST_SYNC_POINT("DBImpl::RunManualCompaction"); |
2751 | | |
2752 | 5.68k | DCHECK(input_level == ColumnFamilyData::kCompactAllLevels || |
2753 | 5.68k | input_level >= 0); |
2754 | | |
2755 | 5.68k | InternalKey begin_storage, end_storage; |
2756 | 5.68k | CompactionArg* ca; |
2757 | | |
2758 | 5.68k | bool scheduled = false; |
2759 | 5.68k | bool manual_conflict = false; |
2760 | 5.68k | ManualCompaction manual_compaction; |
2761 | 5.68k | manual_compaction.cfd = cfd; |
2762 | 5.68k | manual_compaction.input_level = input_level; |
2763 | 5.68k | manual_compaction.output_level = output_level; |
2764 | 5.68k | manual_compaction.output_path_id = output_path_id; |
2765 | 5.68k | manual_compaction.done = false; |
2766 | 5.68k | manual_compaction.in_progress = false; |
2767 | 5.68k | manual_compaction.incomplete = false; |
2768 | 5.68k | manual_compaction.exclusive = exclusive; |
2769 | 5.68k | manual_compaction.disallow_trivial_move = disallow_trivial_move; |
2770 | | // For universal compaction, we enforce every manual compaction to compact |
2771 | | // all files. |
2772 | 5.68k | if (begin == nullptr || |
2773 | 1.20k | cfd->ioptions()->compaction_style == kCompactionStyleUniversal || |
2774 | 4.54k | cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { |
2775 | 4.54k | manual_compaction.begin = nullptr; |
2776 | 1.13k | } else { |
2777 | 1.13k | begin_storage = InternalKey::MaxPossibleForUserKey(*begin); |
2778 | 1.13k | manual_compaction.begin = &begin_storage; |
2779 | 1.13k | } |
2780 | 5.68k | if (end == nullptr || |
2781 | 1.25k | cfd->ioptions()->compaction_style == kCompactionStyleUniversal || |
2782 | 4.49k | cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { |
2783 | 4.49k | manual_compaction.end = nullptr; |
2784 | 1.18k | } else { |
2785 | 1.18k | end_storage = InternalKey::MinPossibleForUserKey(*end); |
2786 | 1.18k | manual_compaction.end = &end_storage; |
2787 | 1.18k | } |
2788 | | |
2789 | 5.68k | InstrumentedMutexLock l(&mutex_); |
2790 | | |
2791 | | // When a manual compaction arrives, if it is exclusive, run all scheduled |
2792 | | // and unscheduled compactions (from the queue) and then run the manual |
2793 | | // one. This is to ensure that any key range can be compacted without |
2794 | | // conflict. Otherwise, we let the manual compaction conflict until all |
2795 | | // automatic compactions from the same column family have been scheduled |
2796 | | // and run in the background. |
2797 | | // |
2798 | | // HasPendingManualCompaction() is true when at least one thread is inside |
2799 | | // RunManualCompaction(), i.e. during that time no other compaction will |
2800 | | // get scheduled (see MaybeScheduleFlushOrCompaction). |
2801 | | // |
2802 | | // Note that the following loop doesn't stop more that one thread calling |
2803 | | // RunManualCompaction() from getting to the second while loop below. |
2804 | | // However, only one of them will actually schedule compaction, while |
2805 | | // others will wait on a condition variable until it completes. |
2806 | | |
2807 | 5.68k | AddManualCompaction(&manual_compaction); |
2808 | 5.68k | TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); |
2809 | 5.68k | if (exclusive) { |
2810 | 5.86k | while (unscheduled_compactions_ + bg_compaction_scheduled_ + compaction_tasks_.size() > 0) { |
2811 | 262 | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::Conflict"); |
2812 | 262 | MaybeScheduleFlushOrCompaction(); |
2813 | 547 | while (bg_compaction_scheduled_ + compaction_tasks_.size() > 0) { |
2814 | 285 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2815 | 285 | "[%s] Manual compaction waiting for all other scheduled background " |
2816 | 285 | "compactions to finish", |
2817 | 285 | cfd->GetName().c_str()); |
2818 | 285 | bg_cv_.Wait(); |
2819 | 285 | if (IsShuttingDown()) { |
2820 | 0 | return STATUS(ShutdownInProgress, ""); |
2821 | 0 | } |
2822 | 285 | } |
2823 | 262 | } |
2824 | 5.59k | } |
2825 | | |
2826 | 5.68k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
2827 | 5.68k | "[%s] Manual compaction starting", |
2828 | 5.68k | cfd->GetName().c_str()); |
2829 | | |
2830 | 5.68k | size_t compaction_task_serial_no = 0; |
2831 | | // We don't check bg_error_ here, because if we get the error in compaction, |
2832 | | // the compaction will set manual_compaction.status to bg_error_ and set manual_compaction.done to |
2833 | | // true. |
2834 | 16.5k | while (!manual_compaction.done) { |
2835 | 10.8k | DCHECK(HasPendingManualCompaction()); |
2836 | 10.8k | manual_conflict = false; |
2837 | 10.8k | if (ShouldntRunManualCompaction(&manual_compaction) || manual_compaction.in_progress || |
2838 | 5.94k | scheduled || |
2839 | 5.87k | ((manual_compaction.manual_end = &manual_compaction.tmp_storage1) && ( |
2840 | 5.87k | (manual_compaction.compaction = manual_compaction.cfd->CompactRange( |
2841 | 5.87k | *manual_compaction.cfd->GetLatestMutableCFOptions(), |
2842 | 5.87k | manual_compaction.input_level, manual_compaction.output_level, |
2843 | 5.87k | manual_compaction.output_path_id, manual_compaction.begin, manual_compaction.end, |
2844 | 5.87k | &manual_compaction.manual_end, &manual_conflict)) == |
2845 | 5.87k | nullptr) && |
2846 | 4.96k | manual_conflict)) { |
2847 | 0 | DCHECK(!exclusive || !manual_conflict) |
2848 | 0 | << "exclusive manual compactions should not see a conflict during CompactRange"; |
2849 | 4.96k | if (manual_conflict) { |
2850 | 2 | TEST_SYNC_POINT("DBImpl::RunManualCompaction()::Conflict"); |
2851 | 2 | } |
2852 | | // Running either this or some other manual compaction |
2853 | 4.96k | bg_cv_.Wait(); |
2854 | 4.96k | if (IsShuttingDown()) { |
2855 | 3 | if (!scheduled) { |
2856 | 0 | return STATUS(ShutdownInProgress, ""); |
2857 | 0 | } |
2858 | | // If manual compaction is already scheduled, we increase its priority and will wait for it |
2859 | | // to be aborted. We can't just exit, because compaction task can access manual_compaction |
2860 | | // by raw pointer. |
2861 | 3 | if (db_options_.priority_thread_pool_for_compactions_and_flushes) { |
2862 | 3 | mutex_.Unlock(); |
2863 | 3 | db_options_.priority_thread_pool_for_compactions_and_flushes->ChangeTaskPriority( |
2864 | 3 | compaction_task_serial_no, kShuttingDownPriority); |
2865 | 3 | mutex_.Lock(); |
2866 | 3 | } |
2867 | 3 | } |
2868 | | |
2869 | 4.96k | if (scheduled && manual_compaction.incomplete == true) { |
2870 | 190 | DCHECK(!manual_compaction.in_progress); |
2871 | 190 | scheduled = false; |
2872 | 190 | manual_compaction.incomplete = false; |
2873 | 190 | } |
2874 | 5.87k | } else if (!scheduled) { |
2875 | 5.87k | if (manual_compaction.compaction == nullptr) { |
2876 | 1.01k | manual_compaction.done = true; |
2877 | 1.01k | bg_cv_.SignalAll(); |
2878 | 1.01k | continue; |
2879 | 1.01k | } |
2880 | 4.86k | manual_compaction.incomplete = false; |
2881 | 4.86k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
2882 | 94 | FLAGS_use_priority_thread_pool_for_compactions) { |
2883 | 94 | auto compaction_task = std::make_unique<CompactionTask>(this, &manual_compaction); |
2884 | 94 | compaction_task_serial_no = compaction_task->SerialNo(); |
2885 | 94 | SubmitCompactionOrFlushTask(std::move(compaction_task)); |
2886 | 4.76k | } else { |
2887 | 4.76k | bg_compaction_scheduled_++; |
2888 | 4.76k | ca = new CompactionArg; |
2889 | 4.76k | ca->db = this; |
2890 | 4.76k | ca->m = &manual_compaction; |
2891 | 4.76k | env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, |
2892 | 4.76k | &DBImpl::UnscheduleCallback); |
2893 | 4.76k | } |
2894 | 4.86k | scheduled = true; |
2895 | 4.86k | } |
2896 | 10.8k | } |
2897 | | |
2898 | 5.68k | DCHECK(!manual_compaction.in_progress); |
2899 | 5.68k | DCHECK(HasPendingManualCompaction()); |
2900 | 5.68k | RemoveManualCompaction(&manual_compaction); |
2901 | 5.68k | bg_cv_.SignalAll(); |
2902 | 5.68k | return manual_compaction.status; |
2903 | 5.68k | } |
2904 | | |
2905 | | InternalIterator* DBImpl::NewInternalIterator( |
2906 | 404 | Arena* arena, ColumnFamilyHandle* column_family) { |
2907 | 404 | ColumnFamilyData* cfd; |
2908 | 404 | if (column_family == nullptr) { |
2909 | 1 | cfd = default_cf_handle_->cfd(); |
2910 | 403 | } else { |
2911 | 403 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
2912 | 403 | cfd = cfh->cfd(); |
2913 | 403 | } |
2914 | | |
2915 | 404 | mutex_.Lock(); |
2916 | 404 | SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); |
2917 | 404 | mutex_.Unlock(); |
2918 | 404 | ReadOptions roptions; |
2919 | 404 | return NewInternalIterator(roptions, cfd, super_version, arena); |
2920 | 404 | } |
2921 | | |
2922 | 2.38k | int DBImpl::GetCfdImmNumNotFlushed() { |
2923 | 2.38k | auto cfd = down_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd(); |
2924 | 2.38k | InstrumentedMutexLock guard_lock(&mutex_); |
2925 | 2.38k | return cfd->imm()->NumNotFlushed(); |
2926 | 2.38k | } |
2927 | | |
2928 | 5.44M | FlushAbility DBImpl::GetFlushAbility() { |
2929 | 5.44M | auto cfd = down_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd(); |
2930 | 5.44M | InstrumentedMutexLock guard_lock(&mutex_); |
2931 | 5.44M | if (cfd->imm()->NumNotFlushed() != 0) { |
2932 | 6.35k | return FlushAbility::kAlreadyFlushing; |
2933 | 6.35k | } |
2934 | 5.44M | return cfd->mem()->IsEmpty() ? FlushAbility::kNoNewData : FlushAbility::kHasNewData; |
2935 | 5.44M | } |
2936 | | |
2937 | | Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, |
2938 | 257k | const FlushOptions& flush_options) { |
2939 | 257k | Status s; |
2940 | 257k | { |
2941 | 257k | WriteContext context; |
2942 | 257k | InstrumentedMutexLock guard_lock(&mutex_); |
2943 | | |
2944 | 257k | if (last_flush_at_tick_ > flush_options.ignore_if_flushed_after_tick) { |
2945 | 0 | return STATUS(AlreadyPresent, "Mem table already flushed"); |
2946 | 0 | } |
2947 | | |
2948 | 257k | if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) { |
2949 | | // Nothing to flush |
2950 | 245k | return Status::OK(); |
2951 | 245k | } |
2952 | | |
2953 | 12.4k | last_flush_at_tick_ = FlushTick(); |
2954 | | |
2955 | 12.4k | WriteThread::Writer w; |
2956 | 12.4k | write_thread_.EnterUnbatched(&w, &mutex_); |
2957 | | |
2958 | | // SwitchMemtable() will release and reacquire mutex |
2959 | | // during execution |
2960 | 12.4k | s = SwitchMemtable(cfd, &context); |
2961 | 12.4k | write_thread_.ExitUnbatched(&w); |
2962 | | |
2963 | 12.4k | cfd->imm()->FlushRequested(); |
2964 | | |
2965 | | // schedule flush |
2966 | 12.4k | SchedulePendingFlush(cfd); |
2967 | 12.4k | MaybeScheduleFlushOrCompaction(); |
2968 | 12.4k | } |
2969 | | |
2970 | 12.4k | if (s.ok() && flush_options.wait) { |
2971 | | // Wait until the compaction completes |
2972 | 12.2k | s = WaitForFlushMemTable(cfd); |
2973 | 12.2k | } |
2974 | 12.4k | return s; |
2975 | 12.4k | } |
2976 | | |
2977 | 20.6k | Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { |
2978 | 20.6k | Status s; |
2979 | | // Wait until the flush completes |
2980 | 20.6k | InstrumentedMutexLock l(&mutex_); |
2981 | 38.3k | while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) { |
2982 | 17.7k | if (IsShuttingDown() && disable_flush_on_shutdown_) { |
2983 | 0 | return STATUS(ShutdownInProgress, ""); |
2984 | 0 | } |
2985 | 17.7k | bg_cv_.Wait(); |
2986 | 17.7k | } |
2987 | 20.6k | if (!bg_error_.ok()) { |
2988 | 10 | s = bg_error_; |
2989 | 10 | } |
2990 | 20.6k | return s; |
2991 | 20.6k | } |
2992 | | |
2993 | | Status DBImpl::EnableAutoCompaction( |
2994 | 328k | const std::vector<ColumnFamilyHandle*>& column_family_handles) { |
2995 | 328k | TEST_SYNC_POINT("DBImpl::EnableAutoCompaction"); |
2996 | 328k | Status s; |
2997 | 328k | for (auto cf_ptr : column_family_handles) { |
2998 | 328k | Status status = |
2999 | 328k | this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}}, false); |
3000 | 328k | if (status.ok()) { |
3001 | 328k | ColumnFamilyData* cfd = down_cast<ColumnFamilyHandleImpl*>(cf_ptr)->cfd(); |
3002 | 328k | InstrumentedMutexLock guard_lock(&mutex_); |
3003 | 328k | InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
3004 | 18.4E | } else { |
3005 | 18.4E | s = status; |
3006 | 18.4E | } |
3007 | 328k | } |
3008 | | |
3009 | 328k | return s; |
3010 | 328k | } |
3011 | | |
3012 | 1.25M | void DBImpl::MaybeScheduleFlushOrCompaction() { |
3013 | 1.25M | mutex_.AssertHeld(); |
3014 | 1.25M | if (!opened_successfully_) { |
3015 | | // Compaction may introduce data race to DB open |
3016 | 344k | return; |
3017 | 344k | } |
3018 | 911k | if (bg_work_paused_ > 0) { |
3019 | | // we paused the background work |
3020 | 41 | return; |
3021 | 911k | } else if (IsShuttingDown() && disable_flush_on_shutdown_) { |
3022 | | // DB is being deleted; no more background compactions and flushes. |
3023 | 3 | return; |
3024 | 3 | } |
3025 | | |
3026 | 943k | while (unscheduled_flushes_ > 0 && |
3027 | 37.3k | bg_flush_scheduled_ < db_options_.max_background_flushes) { |
3028 | 31.7k | unscheduled_flushes_--; |
3029 | 31.7k | bg_flush_scheduled_++; |
3030 | 31.7k | env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); |
3031 | 31.7k | } |
3032 | | |
3033 | 911k | size_t bg_compactions_allowed = BGCompactionsAllowed(); |
3034 | | |
3035 | | // special case -- if max_background_flushes == 0, then schedule flush on a |
3036 | | // compaction thread |
3037 | 911k | if (db_options_.max_background_flushes == 0) { |
3038 | 968 | while (unscheduled_flushes_ > 0 && |
3039 | 130 | bg_flush_scheduled_ + bg_compaction_scheduled_ + compaction_tasks_.size() < |
3040 | 130 | bg_compactions_allowed) { |
3041 | 130 | unscheduled_flushes_--; |
3042 | 130 | bg_flush_scheduled_++; |
3043 | 130 | env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); |
3044 | 130 | } |
3045 | 838 | } |
3046 | | |
3047 | 911k | if (IsShuttingDown()) { |
3048 | 2.82k | return; |
3049 | 2.82k | } |
3050 | | |
3051 | 908k | if (bg_compaction_paused_ > 0) { |
3052 | | // we paused the background compaction |
3053 | 0 | return; |
3054 | 0 | } |
3055 | | |
3056 | 991k | while (bg_compaction_scheduled_ + compaction_tasks_.size() < bg_compactions_allowed && |
3057 | 849k | unscheduled_compactions_ > 0) { |
3058 | 82.9k | bg_compaction_scheduled_++; |
3059 | 82.9k | unscheduled_compactions_--; |
3060 | 82.9k | CompactionArg* ca = new CompactionArg; |
3061 | 82.9k | ca->db = this; |
3062 | 82.9k | ca->m = nullptr; |
3063 | 82.9k | env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, |
3064 | 82.9k | &DBImpl::UnscheduleCallback); |
3065 | 82.9k | } |
3066 | 908k | } |
3067 | | |
3068 | 978k | int DBImpl::BGCompactionsAllowed() const { |
3069 | 978k | if (write_controller_.NeedSpeedupCompaction()) { |
3070 | 977k | return db_options_.max_background_compactions; |
3071 | 560 | } else { |
3072 | 560 | return db_options_.base_background_compactions; |
3073 | 560 | } |
3074 | 978k | } |
3075 | | |
3076 | 0 | bool DBImpl::IsEmptyCompactionQueue() { |
3077 | 0 | return small_compaction_queue_.empty() && large_compaction_queue_.empty(); |
3078 | 0 | } |
3079 | | |
3080 | 35.4k | bool DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { |
3081 | 35.4k | mutex_.AssertHeld(); |
3082 | | |
3083 | 35.4k | assert(!cfd->pending_compaction()); |
3084 | | |
3085 | 35.4k | const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
3086 | 35.4k | std::unique_ptr<Compaction> c; |
3087 | | |
3088 | 35.4k | if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped() |
3089 | 34.2k | && !(HasExclusiveManualCompaction() || HaveManualCompaction(cfd))) { |
3090 | 33.8k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3091 | 33.8k | c = cfd->PickCompaction(*cfd->GetLatestMutableCFOptions(), &log_buffer); |
3092 | 33.8k | log_buffer.FlushBufferToLog(); |
3093 | 33.8k | if (c) { |
3094 | 18.1k | cfd->Ref(); |
3095 | 18.1k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
3096 | 346 | FLAGS_use_priority_thread_pool_for_compactions) { |
3097 | 346 | SubmitCompactionOrFlushTask(std::make_unique<CompactionTask>(this, std::move(c))); |
3098 | | // True means that we need to schedule one more compaction, since it is already scheduled |
3099 | | // one line above we return false. |
3100 | 346 | return false; |
3101 | 17.7k | } else if (!IsLargeCompaction(*c)) { |
3102 | 17.7k | small_compaction_queue_.push_back(std::move(c)); |
3103 | 6 | } else { |
3104 | 6 | large_compaction_queue_.push_back(std::move(c)); |
3105 | 6 | } |
3106 | 17.7k | cfd->set_pending_compaction(true); |
3107 | 17.7k | return true; |
3108 | 17.2k | } |
3109 | 33.8k | } |
3110 | | |
3111 | 17.2k | return false; |
3112 | 17.2k | } |
3113 | | |
3114 | 17.7k | std::unique_ptr<Compaction> DBImpl::PopFirstFromSmallCompactionQueue() { |
3115 | 17.7k | return PopFirstFromCompactionQueue(&small_compaction_queue_); |
3116 | 17.7k | } |
3117 | | |
3118 | 6 | std::unique_ptr<Compaction> DBImpl::PopFirstFromLargeCompactionQueue() { |
3119 | 6 | return PopFirstFromCompactionQueue(&large_compaction_queue_); |
3120 | 6 | } |
3121 | | |
3122 | 20.9k | bool DBImpl::IsLargeCompaction(const Compaction& compaction) { |
3123 | 20.9k | return compaction.CalculateTotalInputSize() >= db_options_.compaction_size_threshold_bytes; |
3124 | 20.9k | } |
3125 | | |
3126 | 31.8k | void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { |
3127 | 31.8k | assert(!cfd->pending_flush()); |
3128 | 31.8k | cfd->Ref(); |
3129 | 31.8k | flush_queue_.push_back(cfd); |
3130 | 31.8k | cfd->set_pending_flush(true); |
3131 | 31.8k | } |
3132 | | |
3133 | 31.8k | ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { |
3134 | 31.8k | assert(!flush_queue_.empty()); |
3135 | 31.8k | auto cfd = *flush_queue_.begin(); |
3136 | 31.8k | flush_queue_.pop_front(); |
3137 | 31.8k | assert(cfd->pending_flush()); |
3138 | 31.8k | cfd->set_pending_flush(false); |
3139 | 31.8k | return cfd; |
3140 | 31.8k | } |
3141 | | |
3142 | 774k | void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) { |
3143 | 774k | if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) { |
3144 | 3.43k | for (auto listener : db_options_.listeners) { |
3145 | 3.43k | listener->OnFlushScheduled(this); |
3146 | 3.43k | } |
3147 | 31.8k | if (db_options_.priority_thread_pool_for_compactions_and_flushes && |
3148 | 9.61k | FLAGS_use_priority_thread_pool_for_flushes) { |
3149 | 31 | ++bg_flush_scheduled_; |
3150 | 31 | cfd->Ref(); |
3151 | 31 | cfd->set_pending_flush(true); |
3152 | 31 | SubmitCompactionOrFlushTask(std::make_unique<FlushTask>(this, cfd)); |
3153 | 31.8k | } else { |
3154 | 31.8k | AddToFlushQueue(cfd); |
3155 | 31.8k | ++unscheduled_flushes_; |
3156 | 31.8k | } |
3157 | 31.8k | } |
3158 | 774k | } |
3159 | | |
3160 | 803k | void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { |
3161 | 803k | mutex_.AssertHeld(); |
3162 | | |
3163 | 803k | if (!cfd->pending_compaction() && cfd->NeedsCompaction() && !IsShuttingDown()) { |
3164 | 35.3k | if (AddToCompactionQueue(cfd)) { |
3165 | 17.7k | ++unscheduled_compactions_; |
3166 | 17.7k | } |
3167 | 35.3k | } |
3168 | 803k | TEST_SYNC_POINT("DBImpl::SchedulePendingCompaction:Done"); |
3169 | 803k | } |
3170 | | |
3171 | 63.7k | void DBImpl::RecordFlushIOStats() { |
3172 | 63.7k | RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); |
3173 | 63.7k | IOSTATS_RESET(bytes_written); |
3174 | 63.7k | } |
3175 | | |
3176 | 31.8k | void DBImpl::BGWorkFlush(void* db) { |
3177 | 31.8k | IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH); |
3178 | 31.8k | TEST_SYNC_POINT("DBImpl::BGWorkFlush"); |
3179 | 31.8k | reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush(nullptr /* cfd */); |
3180 | 31.8k | TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); |
3181 | 31.8k | } |
3182 | | |
3183 | 87.7k | void DBImpl::BGWorkCompaction(void* arg) { |
3184 | 87.7k | CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg)); |
3185 | 87.7k | delete reinterpret_cast<CompactionArg*>(arg); |
3186 | 87.7k | IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW); |
3187 | 87.7k | TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); |
3188 | 87.7k | reinterpret_cast<DBImpl*>(ca.db)->BackgroundCallCompaction(ca.m); |
3189 | 87.7k | } |
3190 | | |
3191 | 0 | void DBImpl::UnscheduleCallback(void* arg) { |
3192 | 0 | CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg)); |
3193 | 0 | delete reinterpret_cast<CompactionArg*>(arg); |
3194 | 0 | if (ca.m != nullptr) { |
3195 | 0 | ca.m->compaction.reset(); |
3196 | 0 | } |
3197 | 0 | TEST_SYNC_POINT("DBImpl::UnscheduleCallback"); |
3198 | 0 | } |
3199 | | |
3200 | | Result<FileNumbersHolder> DBImpl::BackgroundFlush( |
3201 | 31.8k | bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, ColumnFamilyData* cfd) { |
3202 | 31.8k | mutex_.AssertHeld(); |
3203 | | |
3204 | 31.8k | auto scope_exit = yb::ScopeExit([&cfd] { |
3205 | 31.8k | if (cfd && cfd->Unref()) { |
3206 | 0 | delete cfd; |
3207 | 0 | } |
3208 | 31.8k | }); |
3209 | | |
3210 | 31.8k | if (cfd) { |
3211 | | // cfd is not nullptr when we get here from DBImpl::FlushTask and in this case we need to reset |
3212 | | // pending flush flag. |
3213 | | // In other cases (getting here from DBImpl::BGWorkFlush) this is done by |
3214 | | // DBImpl::PopFirstFromFlushQueue called below. |
3215 | 31 | cfd->set_pending_flush(false); |
3216 | 31 | } |
3217 | | |
3218 | 31.8k | Status status = bg_error_; |
3219 | 31.8k | if (status.ok() && IsShuttingDown() && disable_flush_on_shutdown_) { |
3220 | 1 | status = STATUS(ShutdownInProgress, ""); |
3221 | 1 | } |
3222 | | |
3223 | 31.8k | if (!status.ok()) { |
3224 | 1 | return status; |
3225 | 1 | } |
3226 | | |
3227 | 31.8k | if (cfd == nullptr) { |
3228 | 31.8k | while (!flush_queue_.empty()) { |
3229 | | // This cfd is already referenced |
3230 | 31.8k | auto first_cfd = PopFirstFromFlushQueue(); |
3231 | | |
3232 | 31.8k | if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { |
3233 | | // can't flush this CF, try next one |
3234 | 1 | if (first_cfd->Unref()) { |
3235 | 0 | delete first_cfd; |
3236 | 0 | } |
3237 | 1 | continue; |
3238 | 1 | } |
3239 | | |
3240 | | // found a flush! |
3241 | 31.8k | cfd = first_cfd; |
3242 | 31.8k | break; |
3243 | 31.8k | } |
3244 | 31.8k | } |
3245 | | |
3246 | 31.8k | if (cfd == nullptr) { |
3247 | 1 | return FileNumbersHolder(); |
3248 | 1 | } |
3249 | 31.8k | const MutableCFOptions mutable_cf_options = |
3250 | 31.8k | *cfd->GetLatestMutableCFOptions(); |
3251 | 1.69k | YB_LOG_WITH_PREFIX_EVERY_N_SECS(INFO, 1) |
3252 | 1.69k | << "Calling FlushMemTableToOutputFile with column " |
3253 | 1.69k | << "family [" << cfd->GetName() << "], " |
3254 | 1.69k | << "flush slots scheduled " << bg_flush_scheduled_ << ", " |
3255 | 1.69k | << "total flush slots " << db_options_.max_background_flushes << ", " |
3256 | 1.69k | << "compaction slots scheduled " << bg_compaction_scheduled_ << ", " |
3257 | 1.69k | << "compaction tasks " << yb::ToString(compaction_tasks_) << ", " |
3258 | 1.69k | << "total compaction slots " << BGCompactionsAllowed(); |
3259 | 31.8k | return FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, |
3260 | 31.8k | job_context, log_buffer); |
3261 | 31.8k | } |
3262 | | |
3263 | | void DBImpl::WaitAfterBackgroundError( |
3264 | 120k | const Status& s, const char* job_name, LogBuffer* log_buffer) { |
3265 | 120k | if (!s.ok() && !s.IsShutdownInProgress()) { |
3266 | | // Wait a little bit before retrying background job in |
3267 | | // case this is an environmental problem and we do not want to |
3268 | | // chew up resources for failed jobs for the duration of |
3269 | | // the problem. |
3270 | 74 | uint64_t error_cnt = default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); |
3271 | 74 | bg_cv_.SignalAll(); // In case a waiter can proceed despite the error |
3272 | 74 | mutex_.Unlock(); |
3273 | 74 | log_buffer->FlushBufferToLog(); |
3274 | 74 | RLOG( |
3275 | 74 | InfoLogLevel::ERROR_LEVEL, db_options_.info_log, Format( |
3276 | 74 | "Waiting after background $0 error: $1, Accumulated background error counts: $2", |
3277 | 74 | job_name, s, error_cnt).c_str()); |
3278 | 74 | LogFlush(db_options_.info_log); |
3279 | 74 | env_->SleepForMicroseconds(1000000); |
3280 | 74 | mutex_.Lock(); |
3281 | 74 | } |
3282 | 120k | } |
3283 | | |
3284 | | void DBImpl::BackgroundJobComplete( |
3285 | 120k | const Status& s, JobContext* job_context, LogBuffer* log_buffer) { |
3286 | 120k | mutex_.AssertHeld(); |
3287 | | |
3288 | 120k | TaskPriorityUpdater task_priority_updater(this); |
3289 | 120k | task_priority_updater.Prepare(); |
3290 | | |
3291 | | // If flush or compaction failed, we want to delete all temporary files that we might have |
3292 | | // created. Thus, we force full scan in FindObsoleteFiles() |
3293 | 120k | FindObsoleteFiles(job_context, !s.ok() && !s.IsShutdownInProgress()); |
3294 | | |
3295 | | // delete unnecessary files if any, this is done outside the mutex |
3296 | 120k | if (job_context->HaveSomethingToDelete() || !log_buffer->IsEmpty() || |
3297 | 120k | !task_priority_updater.Empty() || HasFilesChangedListener()) { |
3298 | 120k | mutex_.Unlock(); |
3299 | | // Have to flush the info logs before bg_flush_scheduled_-- |
3300 | | // because if bg_flush_scheduled_ becomes 0 and the lock is |
3301 | | // released, the destructor of DB can kick in and destroy all the |
3302 | | // state of DB so info_log might not be available after that point. |
3303 | | // It also applies to access to other state that DB owns. |
3304 | 120k | log_buffer->FlushBufferToLog(); |
3305 | 120k | if (job_context->HaveSomethingToDelete()) { |
3306 | 120k | PurgeObsoleteFiles(*job_context); |
3307 | 120k | } |
3308 | 120k | job_context->Clean(); |
3309 | | |
3310 | 120k | task_priority_updater.Apply(); |
3311 | | |
3312 | 120k | FilesChanged(); |
3313 | | |
3314 | 120k | mutex_.Lock(); |
3315 | 120k | } |
3316 | 120k | } |
3317 | | |
3318 | 31.8k | void DBImpl::BackgroundCallFlush(ColumnFamilyData* cfd) { |
3319 | 31.8k | bool made_progress = false; |
3320 | 31.8k | JobContext job_context(next_job_id_.fetch_add(1), true); |
3321 | | |
3322 | 31.8k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3323 | | |
3324 | 31.8k | InstrumentedMutexLock l(&mutex_); |
3325 | 31.8k | assert(bg_flush_scheduled_); |
3326 | 31.8k | num_running_flushes_++; |
3327 | | |
3328 | 31.8k | Status s; |
3329 | 31.8k | { |
3330 | 31.8k | auto file_number_holder = BackgroundFlush(&made_progress, &job_context, &log_buffer, cfd); |
3331 | 31.8k | s = yb::ResultToStatus(file_number_holder); |
3332 | 31.8k | WaitAfterBackgroundError(s, "flush", &log_buffer); |
3333 | 31.8k | } |
3334 | | |
3335 | 31.8k | BackgroundJobComplete(s, &job_context, &log_buffer); |
3336 | | |
3337 | 31.8k | assert(num_running_flushes_ > 0); |
3338 | 31.8k | num_running_flushes_--; |
3339 | 31.8k | bg_flush_scheduled_--; |
3340 | | // See if there's more work to be done |
3341 | 31.8k | MaybeScheduleFlushOrCompaction(); |
3342 | 31.8k | RecordFlushIOStats(); |
3343 | 31.8k | bg_cv_.SignalAll(); |
3344 | | // IMPORTANT: there should be no code after calling SignalAll. This call may |
3345 | | // signal the DB destructor that it's OK to proceed with destruction. In |
3346 | | // that case, all DB variables will be dealloacated and referencing them |
3347 | | // will cause trouble. |
3348 | 31.8k | } |
3349 | | |
3350 | | void DBImpl::BackgroundCallCompaction(ManualCompaction* m, std::unique_ptr<Compaction> compaction, |
3351 | 88.1k | CompactionTask* compaction_task) { |
3352 | 88.1k | bool made_progress = false; |
3353 | 88.1k | JobContext job_context(next_job_id_.fetch_add(1), true); |
3354 | 88.1k | LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); |
3355 | 88.1k | if (compaction_task) { |
3356 | 439 | compaction_task->SetJobID(&job_context); |
3357 | 439 | } |
3358 | 88.1k | InstrumentedMutexLock l(&mutex_); |
3359 | 88.1k | num_total_running_compactions_++; |
3360 | | |
3361 | 88.1k | if (compaction_task) { |
3362 | 0 | LOG_IF_WITH_PREFIX(DFATAL, compaction_tasks_.count(compaction_task) != 1) |
3363 | 0 | << "Running compaction for unknown task: " << compaction_task; |
3364 | 87.7k | } else { |
3365 | 0 | LOG_IF_WITH_PREFIX(DFATAL, bg_compaction_scheduled_ == 0) |
3366 | 0 | << "Running compaction while no compactions were scheduled"; |
3367 | 87.7k | } |
3368 | | |
3369 | 88.1k | Status s; |
3370 | 88.1k | { |
3371 | 88.1k | auto file_numbers_holder = BackgroundCompaction( |
3372 | 88.1k | &made_progress, &job_context, &log_buffer, m, std::move(compaction)); |
3373 | | |
3374 | 88.1k | if (compaction_task) { |
3375 | 436 | compaction_task->Complete(); |
3376 | 436 | } |
3377 | | |
3378 | 88.1k | s = yb::ResultToStatus(file_numbers_holder); |
3379 | 88.1k | TEST_SYNC_POINT("BackgroundCallCompaction:1"); |
3380 | 88.1k | WaitAfterBackgroundError(s, "compaction", &log_buffer); |
3381 | 88.1k | } |
3382 | | |
3383 | 88.1k | BackgroundJobComplete(s, &job_context, &log_buffer); |
3384 | | |
3385 | 88.1k | assert(num_total_running_compactions_ > 0); |
3386 | 88.1k | num_total_running_compactions_--; |
3387 | 88.1k | if (compaction_task) { |
3388 | 0 | LOG_IF_WITH_PREFIX(DFATAL, compaction_tasks_.erase(compaction_task) != 1) |
3389 | 0 | << "Finished compaction with unknown task serial no: " << yb::ToString(compaction_task); |
3390 | 87.7k | } else { |
3391 | 87.7k | bg_compaction_scheduled_--; |
3392 | 87.7k | } |
3393 | | |
3394 | 88.1k | versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); |
3395 | | |
3396 | | // See if there's more work to be done |
3397 | 88.1k | MaybeScheduleFlushOrCompaction(); |
3398 | 88.1k | if (made_progress || (bg_compaction_scheduled_ + compaction_tasks_.size()) == 0 || |
3399 | 65.2k | HasPendingManualCompaction()) { |
3400 | | // signal if |
3401 | | // * made_progress -- need to wakeup DelayWrite |
3402 | | // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl |
3403 | | // * HasPendingManualCompaction -- need to wakeup RunManualCompaction |
3404 | | // If none of this is true, there is no need to signal since nobody is |
3405 | | // waiting for it |
3406 | 22.9k | bg_cv_.SignalAll(); |
3407 | 22.9k | } |
3408 | | // IMPORTANT: there should be no code after calling SignalAll. This call may |
3409 | | // signal the DB destructor that it's OK to proceed with destruction. In |
3410 | | // that case, all DB variables will be dealloacated and referencing them |
3411 | | // will cause trouble. |
3412 | 88.1k | } |
3413 | | |
3414 | | Result<FileNumbersHolder> DBImpl::BackgroundCompaction( |
3415 | | bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, |
3416 | 88.1k | ManualCompaction* manual_compaction, std::unique_ptr<Compaction> compaction) { |
3417 | 88.1k | *made_progress = false; |
3418 | 88.1k | mutex_.AssertHeld(); |
3419 | | |
3420 | 88.1k | bool is_manual = (manual_compaction != nullptr); |
3421 | 88.1k | if (is_manual && compaction) { |
3422 | 0 | return STATUS( |
3423 | 0 | InvalidArgument, |
3424 | 0 | "Both is_manual and compaction are specified in BackgroundCompaction, only one of them is " |
3425 | 0 | "allowed"); |
3426 | 0 | } |
3427 | 88.1k | DCHECK(!is_manual || !compaction); |
3428 | 88.1k | bool is_large_compaction = false; |
3429 | | |
3430 | | // (manual_compaction->in_progress == false); |
3431 | 88.1k | bool trivial_move_disallowed = |
3432 | 88.1k | is_manual && manual_compaction->disallow_trivial_move; |
3433 | | |
3434 | 88.1k | CompactionJobStats compaction_job_stats; |
3435 | 88.1k | Status status = bg_error_; |
3436 | 88.1k | if (status.ok() && IsShuttingDown()) { |
3437 | 1 | status = STATUS(ShutdownInProgress, ""); |
3438 | 1 | } |
3439 | | |
3440 | 88.1k | if (!status.ok()) { |
3441 | 4 | if (is_manual) { |
3442 | 1 | manual_compaction->status = status; |
3443 | 1 | manual_compaction->done = true; |
3444 | 1 | manual_compaction->in_progress = false; |
3445 | 1 | manual_compaction->compaction.reset(); |
3446 | 1 | manual_compaction = nullptr; |
3447 | 1 | } |
3448 | 4 | if (compaction && compaction->column_family_data()->Unref()) { |
3449 | 0 | delete compaction->column_family_data(); |
3450 | 0 | } |
3451 | 4 | return status; |
3452 | 4 | } |
3453 | | |
3454 | 88.1k | if (is_manual) { |
3455 | | // another thread cannot pick up the same work |
3456 | 4.85k | manual_compaction->in_progress = true; |
3457 | 4.85k | } |
3458 | | |
3459 | 88.1k | unique_ptr<Compaction> c; |
3460 | | // InternalKey manual_end_storage; |
3461 | | // InternalKey* manual_end = &manual_end_storage; |
3462 | 88.1k | if (is_manual) { |
3463 | 4.85k | ManualCompaction* m = manual_compaction; |
3464 | 4.85k | assert(m->in_progress); |
3465 | 4.85k | c = std::move(m->compaction); |
3466 | 4.85k | if (!c) { |
3467 | 0 | m->done = true; |
3468 | 0 | m->manual_end = nullptr; |
3469 | 0 | LOG_TO_BUFFER(log_buffer, |
3470 | 0 | "[%s] Manual compaction from level-%d from %s .. " |
3471 | 0 | "%s; nothing to do\n", |
3472 | 0 | m->cfd->GetName().c_str(), m->input_level, |
3473 | 0 | (m->begin ? m->begin->DebugString().c_str() : "(begin)"), |
3474 | 0 | (m->end ? m->end->DebugString().c_str() : "(end)")); |
3475 | 4.85k | } else { |
3476 | 4.85k | LOG_TO_BUFFER(log_buffer, |
3477 | 4.85k | "[%s] Manual compaction from level-%d to level-%d from %s .. " |
3478 | 4.85k | "%s; will stop at %s\n", |
3479 | 4.85k | m->cfd->GetName().c_str(), m->input_level, c->output_level(), |
3480 | 4.85k | (m->begin ? m->begin->DebugString().c_str() : "(begin)"), |
3481 | 4.85k | (m->end ? m->end->DebugString().c_str() : "(end)"), |
3482 | 4.85k | ((m->done || m->manual_end == nullptr) |
3483 | 4.85k | ? "(end)" |
3484 | 4.85k | : m->manual_end->DebugString().c_str())); |
3485 | 4.85k | } |
3486 | 83.2k | } else { |
3487 | | // cfd is referenced here |
3488 | 83.2k | if (compaction) { |
3489 | 346 | c = std::move(compaction); |
3490 | 346 | is_large_compaction = IsLargeCompaction(*c); |
3491 | 82.9k | } else if (!large_compaction_queue_.empty() && BGCompactionsAllowed() > |
3492 | 6 | num_running_large_compactions() + db_options_.num_reserved_small_compaction_threads) { |
3493 | 6 | c = PopFirstFromLargeCompactionQueue(); |
3494 | 6 | is_large_compaction = true; |
3495 | 82.9k | } else if (!small_compaction_queue_.empty()) { |
3496 | 17.7k | c = PopFirstFromSmallCompactionQueue(); |
3497 | 17.7k | is_large_compaction = false; |
3498 | 65.2k | } else { |
3499 | 1 | LOG_IF(DFATAL, large_compaction_queue_.empty()) |
3500 | 1 | << "Don't have compactions in BackgroundCompaction"; |
3501 | 65.2k | LOG_TO_BUFFER(log_buffer, "No small compactions in queue. Large compaction threads busy."); |
3502 | 65.2k | unscheduled_compactions_++; |
3503 | 65.2k | return FileNumbersHolder(); |
3504 | 65.2k | } |
3505 | | |
3506 | 18.0k | ColumnFamilyData* cfd = c->column_family_data(); |
3507 | | |
3508 | | // We unreference here because the following code will take a Ref() on |
3509 | | // this cfd if it is going to use it (Compaction class holds a |
3510 | | // reference). |
3511 | | // This will all happen under a mutex so we don't have to be afraid of |
3512 | | // somebody else deleting it. |
3513 | 18.0k | if (cfd->Unref()) { |
3514 | 0 | delete cfd; |
3515 | | // This was the last reference of the column family, so no need to |
3516 | | // compact. |
3517 | 0 | return FileNumbersHolder(); |
3518 | 0 | } |
3519 | | |
3520 | 18.0k | if (is_large_compaction) { |
3521 | 6 | num_running_large_compactions_++; |
3522 | 6 | TEST_SYNC_POINT("DBImpl:BackgroundCompaction:LargeCompaction"); |
3523 | 18.0k | } else { |
3524 | 18.0k | TEST_SYNC_POINT("DBImpl:BackgroundCompaction:SmallCompaction"); |
3525 | 18.0k | } |
3526 | | |
3527 | 18.0k | if (c != nullptr) { |
3528 | | // update statistics |
3529 | 18.0k | MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, |
3530 | 18.0k | c->inputs(0)->size()); |
3531 | | // There are three things that can change compaction score: |
3532 | | // 1) When flush or compaction finish. This case is covered by |
3533 | | // InstallSuperVersionAndScheduleWork |
3534 | | // 2) When MutableCFOptions changes. This case is also covered by |
3535 | | // InstallSuperVersionAndScheduleWork, because this is when the new |
3536 | | // options take effect. |
3537 | | // 3) When we Pick a new compaction, we "remove" those files being |
3538 | | // compacted from the calculation, which then influences compaction |
3539 | | // score. Here we check if we need the new compaction even without the |
3540 | | // files that are currently being compacted. If we need another |
3541 | | // compaction, we might be able to execute it in parallel, so we add it |
3542 | | // to the queue and schedule a new thread. |
3543 | | |
3544 | 18.0k | SchedulePendingCompaction(cfd); |
3545 | 18.0k | MaybeScheduleFlushOrCompaction(); |
3546 | 18.0k | } |
3547 | 18.0k | } |
3548 | | |
3549 | 22.9k | Result<FileNumbersHolder> result = FileNumbersHolder(); |
3550 | 649 | for (auto listener : db_options_.listeners) { |
3551 | 649 | listener->OnCompactionStarted(); |
3552 | 649 | } |
3553 | 22.9k | if (c->deletion_compaction()) { |
3554 | | // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old |
3555 | | // file if there is alive snapshot pointing to it |
3556 | 12 | assert(c->num_input_files(1) == 0); |
3557 | 12 | assert(c->level() == 0); |
3558 | 12 | assert(c->column_family_data()->ioptions()->compaction_style == |
3559 | 12 | kCompactionStyleFIFO); |
3560 | | |
3561 | 12 | compaction_job_stats.num_input_files = c->num_input_files(0); |
3562 | | |
3563 | 16 | for (const auto& f : *c->inputs(0)) { |
3564 | 16 | c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); |
3565 | 16 | } |
3566 | 12 | status = versions_->LogAndApply(c->column_family_data(), |
3567 | 12 | *c->mutable_cf_options(), c->edit(), |
3568 | 12 | &mutex_, directories_.GetDbDir()); |
3569 | 12 | InstallSuperVersionAndScheduleWorkWrapper( |
3570 | 12 | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3571 | 12 | LOG_TO_BUFFER(log_buffer, "[%s] Deleted %d files\n", |
3572 | 12 | c->column_family_data()->GetName().c_str(), |
3573 | 12 | c->num_input_files(0)); |
3574 | 12 | *made_progress = true; |
3575 | 22.9k | } else if (!trivial_move_disallowed && c->IsTrivialMove()) { |
3576 | 12.2k | TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); |
3577 | | |
3578 | 12.2k | compaction_job_stats.num_input_files = c->num_input_files(0); |
3579 | | |
3580 | | // Move files to next level |
3581 | 12.2k | int32_t moved_files = 0; |
3582 | 12.2k | int64_t moved_bytes = 0; |
3583 | 26.6k | for (unsigned int l = 0; l < c->num_input_levels(); l++) { |
3584 | 14.3k | if (c->level(l) == c->output_level()) { |
3585 | 224 | continue; |
3586 | 224 | } |
3587 | 27.8k | for (size_t i = 0; i < c->num_input_files(l); i++) { |
3588 | 13.7k | FileMetaData* f = c->input(l, i); |
3589 | 13.7k | c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); |
3590 | 13.7k | c->edit()->AddCleanedFile(c->output_level(), *f); |
3591 | | |
3592 | 13.7k | LOG_TO_BUFFER(log_buffer, |
3593 | 13.7k | "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", |
3594 | 13.7k | c->column_family_data()->GetName().c_str(), |
3595 | 13.7k | f->fd.GetNumber(), c->output_level(), f->fd.GetTotalFileSize()); |
3596 | 13.7k | ++moved_files; |
3597 | 13.7k | moved_bytes += f->fd.GetTotalFileSize(); |
3598 | 13.7k | } |
3599 | 14.1k | } |
3600 | | |
3601 | 12.2k | status = versions_->LogAndApply(c->column_family_data(), |
3602 | 12.2k | *c->mutable_cf_options(), c->edit(), |
3603 | 12.2k | &mutex_, directories_.GetDbDir()); |
3604 | | // Use latest MutableCFOptions |
3605 | 12.2k | InstallSuperVersionAndScheduleWorkWrapper( |
3606 | 12.2k | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3607 | | |
3608 | 12.2k | VersionStorageInfo::LevelSummaryStorage tmp; |
3609 | 12.2k | c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), |
3610 | 12.2k | moved_bytes); |
3611 | 12.2k | { |
3612 | 12.2k | event_logger_.LogToBuffer(log_buffer) |
3613 | 12.2k | << "job" << job_context->job_id << "event" |
3614 | 12.2k | << "trivial_move" |
3615 | 12.2k | << "destination_level" << c->output_level() << "files" << moved_files |
3616 | 12.2k | << "total_files_size" << moved_bytes; |
3617 | 12.2k | } |
3618 | 12.2k | LOG_TO_BUFFER( |
3619 | 12.2k | log_buffer, |
3620 | 12.2k | "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n", |
3621 | 12.2k | c->column_family_data()->GetName().c_str(), moved_files, |
3622 | 12.2k | c->output_level(), moved_bytes, status.ToString().c_str(), |
3623 | 12.2k | c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); |
3624 | 12.2k | *made_progress = true; |
3625 | 10.6k | } else { |
3626 | 10.6k | int output_level __attribute__((unused)) = c->output_level(); |
3627 | 10.6k | TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", |
3628 | 10.6k | &output_level); |
3629 | | |
3630 | 10.6k | SequenceNumber earliest_write_conflict_snapshot; |
3631 | 10.6k | std::vector<SequenceNumber> snapshot_seqs = |
3632 | 10.6k | snapshots_.GetAll(&earliest_write_conflict_snapshot); |
3633 | | |
3634 | 10.6k | assert(is_snapshot_supported_ || snapshots_.empty()); |
3635 | 10.6k | CompactionJob compaction_job( |
3636 | 10.6k | job_context->job_id, c.get(), db_options_, env_options_, |
3637 | 10.6k | versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), |
3638 | 10.6k | directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, |
3639 | 10.6k | &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot, |
3640 | 10.6k | pending_outputs_.get(), table_cache_, &event_logger_, |
3641 | 10.6k | c->mutable_cf_options()->paranoid_file_checks, |
3642 | 10.6k | c->mutable_cf_options()->compaction_measure_io_stats, dbname_, |
3643 | 10.6k | &compaction_job_stats); |
3644 | 10.6k | compaction_job.Prepare(); |
3645 | | |
3646 | 10.6k | mutex_.Unlock(); |
3647 | 10.6k | result = compaction_job.Run(); |
3648 | 10.6k | TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); |
3649 | 10.6k | mutex_.Lock(); |
3650 | | |
3651 | 10.6k | status = compaction_job.Install(*c->mutable_cf_options()); |
3652 | 10.6k | if (status.ok()) { |
3653 | 10.5k | InstallSuperVersionAndScheduleWorkWrapper( |
3654 | 10.5k | c->column_family_data(), job_context, *c->mutable_cf_options()); |
3655 | 10.5k | } |
3656 | 10.6k | *made_progress = true; |
3657 | 10.6k | } |
3658 | | |
3659 | 22.9k | NotifyOnCompactionCompleted( |
3660 | 22.9k | c->column_family_data(), c.get(), status, |
3661 | 22.9k | compaction_job_stats, job_context->job_id); |
3662 | | |
3663 | 22.9k | c->ReleaseCompactionFiles(status); |
3664 | | |
3665 | | // It is possible that a compaction was needed in the column family but we could not |
3666 | | // add it to the queue when this compaction was popped because of L0 conflicts |
3667 | | // or other picker internals, so we try to schedule again. |
3668 | 22.9k | SchedulePendingCompaction(c->column_family_data()); |
3669 | | |
3670 | 22.9k | *made_progress = true; |
3671 | | // this will unref its input_version and column_family_data |
3672 | 22.9k | c.reset(); |
3673 | | |
3674 | 22.9k | if (status.ok()) { |
3675 | | // Done |
3676 | 103 | } else if (status.IsShutdownInProgress()) { |
3677 | | // Ignore compaction errors found during shutting down |
3678 | 56 | } else { |
3679 | 56 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", |
3680 | 56 | status.ToString().c_str()); |
3681 | 56 | if (db_options_.paranoid_checks && bg_error_.ok()) { |
3682 | 24 | bg_error_ = status; |
3683 | 24 | } |
3684 | 56 | } |
3685 | | |
3686 | 22.9k | if (is_manual) { |
3687 | 4.85k | ManualCompaction* m = manual_compaction; |
3688 | 4.85k | if (!status.ok()) { |
3689 | 32 | m->status = status; |
3690 | 32 | m->done = true; |
3691 | 32 | } |
3692 | | // For universal compaction: |
3693 | | // Because universal compaction always happens at level 0, so one |
3694 | | // compaction will pick up all overlapped files. No files will be |
3695 | | // filtered out due to size limit and left for a successive compaction. |
3696 | | // So we can safely conclude the current compaction. |
3697 | | // |
3698 | | // Also note that, if we don't stop here, then the current compaction |
3699 | | // writes a new file back to level 0, which will be used in successive |
3700 | | // compaction. Hence the manual compaction will never finish. |
3701 | | // |
3702 | | // Stop the compaction if manual_end points to nullptr -- this means |
3703 | | // that we compacted the whole range. manual_end should always point |
3704 | | // to nullptr in case of universal compaction |
3705 | 4.85k | if (m->manual_end == nullptr) { |
3706 | 4.66k | m->done = true; |
3707 | 4.66k | } |
3708 | 4.85k | if (!m->done) { |
3709 | | // We only compacted part of the requested range. Update *m |
3710 | | // to the range that is left to be compacted. |
3711 | | // Universal and FIFO compactions should always compact the whole range |
3712 | 190 | assert(m->cfd->ioptions()->compaction_style != |
3713 | 190 | kCompactionStyleUniversal || |
3714 | 190 | m->cfd->ioptions()->num_levels > 1); |
3715 | 190 | assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); |
3716 | 190 | m->tmp_storage = *m->manual_end; |
3717 | 190 | m->begin = &m->tmp_storage; |
3718 | 190 | m->incomplete = true; |
3719 | 190 | } |
3720 | 4.85k | m->in_progress = false; // not being processed anymore |
3721 | 4.85k | } |
3722 | | |
3723 | 22.9k | if (is_large_compaction) { |
3724 | 6 | num_running_large_compactions_--; |
3725 | 6 | } |
3726 | | |
3727 | 22.9k | RETURN_NOT_OK(status); |
3728 | | |
3729 | 22.8k | return result; |
3730 | 22.9k | } |
3731 | | |
3732 | 81.7k | bool DBImpl::HasPendingManualCompaction() { |
3733 | 81.7k | return (!manual_compaction_dequeue_.empty()); |
3734 | 81.7k | } |
3735 | | |
3736 | 5.68k | void DBImpl::AddManualCompaction(DBImpl::ManualCompaction* m) { |
3737 | 5.68k | manual_compaction_dequeue_.push_back(m); |
3738 | 5.68k | } |
3739 | | |
3740 | 5.68k | void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) { |
3741 | | // Remove from queue |
3742 | 5.68k | std::deque<ManualCompaction*>::iterator it = |
3743 | 5.68k | manual_compaction_dequeue_.begin(); |
3744 | 5.68k | while (it != manual_compaction_dequeue_.end()) { |
3745 | 5.68k | if (m == (*it)) { |
3746 | 5.68k | it = manual_compaction_dequeue_.erase(it); |
3747 | 5.68k | return; |
3748 | 5.68k | } |
3749 | 1 | it++; |
3750 | 1 | } |
3751 | 0 | assert(false); |
3752 | 0 | return; |
3753 | 5.68k | } |
3754 | | |
3755 | 10.8k | bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) { |
3756 | 10.8k | if (m->exclusive) { |
3757 | 10.6k | return (bg_compaction_scheduled_ + compaction_tasks_.size() > 0); |
3758 | 10.6k | } |
3759 | 187 | std::deque<ManualCompaction*>::iterator it = |
3760 | 187 | manual_compaction_dequeue_.begin(); |
3761 | 187 | bool seen = false; |
3762 | 377 | while (it != manual_compaction_dequeue_.end()) { |
3763 | 190 | if (m == (*it)) { |
3764 | 187 | it++; |
3765 | 187 | seen = true; |
3766 | 187 | continue; |
3767 | 3 | } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { |
3768 | | // Consider the other manual compaction *it, conflicts if: |
3769 | | // overlaps with m |
3770 | | // and (*it) is ahead in the queue and is not yet in progress |
3771 | 0 | return true; |
3772 | 0 | } |
3773 | 3 | it++; |
3774 | 3 | } |
3775 | 187 | return false; |
3776 | 187 | } |
3777 | | |
3778 | 33.8k | bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { |
3779 | | // Remove from priority queue |
3780 | 33.8k | std::deque<ManualCompaction*>::iterator it = |
3781 | 33.8k | manual_compaction_dequeue_.begin(); |
3782 | 33.8k | while (it != manual_compaction_dequeue_.end()) { |
3783 | 17 | if ((*it)->exclusive) { |
3784 | 0 | return true; |
3785 | 0 | } |
3786 | 17 | if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) { |
3787 | | // Allow automatic compaction if manual compaction is |
3788 | | // is in progress |
3789 | 2 | return true; |
3790 | 2 | } |
3791 | 15 | it++; |
3792 | 15 | } |
3793 | 33.8k | return false; |
3794 | 33.8k | } |
3795 | | |
3796 | 34.2k | bool DBImpl::HasExclusiveManualCompaction() { |
3797 | | // Remove from priority queue |
3798 | 34.2k | std::deque<ManualCompaction*>::iterator it = |
3799 | 34.2k | manual_compaction_dequeue_.begin(); |
3800 | 34.2k | while (it != manual_compaction_dequeue_.end()) { |
3801 | 369 | if ((*it)->exclusive) { |
3802 | 352 | return true; |
3803 | 352 | } |
3804 | 17 | it++; |
3805 | 17 | } |
3806 | 33.8k | return false; |
3807 | 34.2k | } |
3808 | | |
3809 | 3 | bool DBImpl::MCOverlap(ManualCompaction* m, ManualCompaction* m1) { |
3810 | 3 | if ((m->exclusive) || (m1->exclusive)) { |
3811 | 0 | return true; |
3812 | 0 | } |
3813 | 3 | if (m->cfd != m1->cfd) { |
3814 | 3 | return false; |
3815 | 3 | } |
3816 | 0 | return true; |
3817 | 0 | } |
3818 | | |
3819 | | namespace { |
3820 | | struct IterState { |
3821 | | IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version) |
3822 | 15.3M | : db(_db), mu(_mu), super_version(_super_version) {} |
3823 | | |
3824 | | DBImpl* db; |
3825 | | InstrumentedMutex* mu; |
3826 | | SuperVersion* super_version; |
3827 | | }; |
3828 | | |
3829 | 15.3M | static void CleanupIteratorState(void* arg1, void* arg2) { |
3830 | 15.3M | IterState* state = reinterpret_cast<IterState*>(arg1); |
3831 | | |
3832 | 15.3M | if (state->super_version->Unref()) { |
3833 | | // Job id == 0 means that this is not our background process, but rather |
3834 | | // user thread |
3835 | 1.93k | JobContext job_context(0); |
3836 | | |
3837 | 1.93k | state->mu->Lock(); |
3838 | 1.93k | state->super_version->Cleanup(); |
3839 | 1.93k | state->db->FindObsoleteFiles(&job_context, false, true); |
3840 | 1.93k | state->mu->Unlock(); |
3841 | | |
3842 | 1.93k | delete state->super_version; |
3843 | 1.93k | if (job_context.HaveSomethingToDelete()) { |
3844 | 151 | state->db->PurgeObsoleteFiles(job_context); |
3845 | 151 | } |
3846 | 1.93k | job_context.Clean(); |
3847 | 1.93k | } |
3848 | | |
3849 | 15.3M | delete state; |
3850 | 15.3M | } |
3851 | | } // namespace |
3852 | | |
3853 | | InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, |
3854 | | ColumnFamilyData* cfd, |
3855 | | SuperVersion* super_version, |
3856 | 15.3M | Arena* arena) { |
3857 | 15.3M | InternalIterator* internal_iter; |
3858 | 15.3M | assert(arena != nullptr); |
3859 | | // Need to create internal iterator from the arena. |
3860 | 15.3M | MergeIteratorBuilder merge_iter_builder(cfd->internal_comparator().get(), arena); |
3861 | | // Collect iterator for mutable mem |
3862 | 15.3M | merge_iter_builder.AddIterator( |
3863 | 15.3M | super_version->mem->NewIterator(read_options, arena)); |
3864 | | // Collect all needed child iterators for immutable memtables |
3865 | 15.3M | super_version->imm->AddIterators(read_options, &merge_iter_builder); |
3866 | | // Collect iterators for files in L0 - Ln |
3867 | 15.3M | super_version->current->AddIterators(read_options, env_options_, |
3868 | 15.3M | &merge_iter_builder); |
3869 | 15.3M | internal_iter = merge_iter_builder.Finish(); |
3870 | 15.3M | IterState* cleanup = new IterState(this, &mutex_, super_version); |
3871 | 15.3M | internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); |
3872 | | |
3873 | 15.3M | return internal_iter; |
3874 | 15.3M | } |
3875 | | |
3876 | 42.0M | ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { |
3877 | 42.0M | return default_cf_handle_; |
3878 | 42.0M | } |
3879 | | |
3880 | | Status DBImpl::Get(const ReadOptions& read_options, |
3881 | | ColumnFamilyHandle* column_family, const Slice& key, |
3882 | 6.53M | std::string* value) { |
3883 | 6.53M | return GetImpl(read_options, column_family, key, value); |
3884 | 6.53M | } |
3885 | | |
3886 | | // JobContext gets created and destructed outside of the lock -- |
3887 | | // we |
3888 | | // use this convinently to: |
3889 | | // * malloc one SuperVersion() outside of the lock -- new_superversion |
3890 | | // * delete SuperVersion()s outside of the lock -- superversions_to_free |
3891 | | // |
3892 | | // However, if InstallSuperVersionAndScheduleWork() gets called twice with the |
3893 | | // same job_context, we can't reuse the SuperVersion() that got |
3894 | | // malloced because |
3895 | | // first call already used it. In that rare case, we take a hit and create a |
3896 | | // new SuperVersion() inside of the mutex. We do similar thing |
3897 | | // for superversion_to_free |
3898 | | void DBImpl::InstallSuperVersionAndScheduleWorkWrapper( |
3899 | | ColumnFamilyData* cfd, JobContext* job_context, |
3900 | 54.8k | const MutableCFOptions& mutable_cf_options) { |
3901 | 54.8k | mutex_.AssertHeld(); |
3902 | 54.8k | auto old_superversion = InstallSuperVersionAndScheduleWork( |
3903 | 54.8k | cfd, job_context->new_superversion, mutable_cf_options); |
3904 | 54.8k | job_context->new_superversion = nullptr; |
3905 | 54.8k | job_context->superversions_to_free.push_back(old_superversion.release()); |
3906 | 54.8k | } |
3907 | | |
3908 | | std::unique_ptr<SuperVersion> DBImpl::InstallSuperVersionAndScheduleWork( |
3909 | | ColumnFamilyData* cfd, SuperVersion* new_sv, |
3910 | 760k | const MutableCFOptions& mutable_cf_options) { |
3911 | 760k | mutex_.AssertHeld(); |
3912 | | |
3913 | | // Update max_total_in_memory_state_ |
3914 | 760k | size_t old_memtable_size = 0; |
3915 | 760k | auto* old_sv = cfd->GetSuperVersion(); |
3916 | 760k | if (old_sv) { |
3917 | 414k | old_memtable_size = old_sv->mutable_cf_options.write_buffer_size * |
3918 | 414k | old_sv->mutable_cf_options.max_write_buffer_number; |
3919 | 414k | } |
3920 | | |
3921 | 760k | auto old = cfd->InstallSuperVersion( |
3922 | 680k | new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options); |
3923 | | |
3924 | | // Whenever we install new SuperVersion, we might need to issue new flushes or |
3925 | | // compactions. |
3926 | 760k | SchedulePendingFlush(cfd); |
3927 | 760k | SchedulePendingCompaction(cfd); |
3928 | 760k | MaybeScheduleFlushOrCompaction(); |
3929 | | |
3930 | | // Update max_total_in_memory_state_ |
3931 | 760k | max_total_in_memory_state_ = |
3932 | 760k | max_total_in_memory_state_ - old_memtable_size + |
3933 | 760k | mutable_cf_options.write_buffer_size * |
3934 | 760k | mutable_cf_options.max_write_buffer_number; |
3935 | 760k | return old; |
3936 | 760k | } |
3937 | | |
3938 | | Status DBImpl::GetImpl(const ReadOptions& read_options, |
3939 | | ColumnFamilyHandle* column_family, const Slice& key, |
3940 | 6.53M | std::string* value, bool* value_found) { |
3941 | 6.53M | StopWatch sw(env_, stats_, DB_GET); |
3942 | 6.53M | PERF_TIMER_GUARD(get_snapshot_time); |
3943 | | |
3944 | 6.53M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
3945 | 6.53M | auto cfd = cfh->cfd(); |
3946 | | |
3947 | 6.53M | SequenceNumber snapshot; |
3948 | 6.53M | if (read_options.snapshot != nullptr) { |
3949 | 905 | snapshot = reinterpret_cast<const SnapshotImpl*>( |
3950 | 905 | read_options.snapshot)->number_; |
3951 | 6.53M | } else { |
3952 | 6.53M | snapshot = versions_->LastSequence(); |
3953 | 6.53M | } |
3954 | | // Acquire SuperVersion |
3955 | 6.53M | SuperVersion* sv = GetAndRefSuperVersion(cfd); |
3956 | | // Prepare to store a list of merge operations if merge occurs. |
3957 | 6.53M | MergeContext merge_context; |
3958 | | |
3959 | 6.53M | Status s; |
3960 | | // First look in the memtable, then in the immutable memtable (if any). |
3961 | | // s is both in/out. When in, s could either be OK or MergeInProgress. |
3962 | | // merge_operands will contain the sequence of merges in the latter case. |
3963 | 6.53M | LookupKey lkey(key, snapshot); |
3964 | 6.53M | PERF_TIMER_STOP(get_snapshot_time); |
3965 | | |
3966 | 6.53M | bool skip_memtable = |
3967 | 6.53M | (read_options.read_tier == kPersistedTier && has_unpersisted_data_); |
3968 | 6.53M | bool done = false; |
3969 | 6.53M | if (!skip_memtable) { |
3970 | 6.53M | if (sv->mem->Get(lkey, value, &s, &merge_context)) { |
3971 | 28.6k | done = true; |
3972 | 28.6k | RecordTick(stats_, MEMTABLE_HIT); |
3973 | 6.51M | } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { |
3974 | 77 | done = true; |
3975 | 77 | RecordTick(stats_, MEMTABLE_HIT); |
3976 | 77 | } |
3977 | 6.53M | } |
3978 | 6.53M | if (!done) { |
3979 | 6.51M | PERF_TIMER_GUARD(get_from_output_files_time); |
3980 | 6.51M | sv->current->Get(read_options, lkey, value, &s, &merge_context, |
3981 | 6.51M | value_found); |
3982 | 6.51M | RecordTick(stats_, MEMTABLE_MISS); |
3983 | 6.51M | } |
3984 | | |
3985 | 6.53M | { |
3986 | 6.53M | PERF_TIMER_GUARD(get_post_process_time); |
3987 | | |
3988 | 6.53M | ReturnAndCleanupSuperVersion(cfd, sv); |
3989 | | |
3990 | 6.53M | RecordTick(stats_, NUMBER_KEYS_READ); |
3991 | 6.53M | RecordTick(stats_, BYTES_READ, value->size()); |
3992 | 6.53M | MeasureTime(stats_, BYTES_PER_READ, value->size()); |
3993 | 6.53M | } |
3994 | 6.53M | return s; |
3995 | 6.53M | } |
3996 | | |
3997 | | std::vector<Status> DBImpl::MultiGet( |
3998 | | const ReadOptions& read_options, |
3999 | | const std::vector<ColumnFamilyHandle*>& column_family, |
4000 | 1.48M | const std::vector<Slice>& keys, std::vector<std::string>* values) { |
4001 | | |
4002 | 1.48M | StopWatch sw(env_, stats_, DB_MULTIGET); |
4003 | 1.48M | PERF_TIMER_GUARD(get_snapshot_time); |
4004 | | |
4005 | 1.48M | struct MultiGetColumnFamilyData { |
4006 | 1.48M | ColumnFamilyData* cfd; |
4007 | 1.48M | SuperVersion* super_version; |
4008 | 1.48M | }; |
4009 | 1.48M | std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data; |
4010 | | // fill up and allocate outside of mutex |
4011 | 14.7M | for (auto cf : column_family) { |
4012 | 14.7M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(cf); |
4013 | 14.7M | auto cfd = cfh->cfd(); |
4014 | 14.7M | if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { |
4015 | 14.7M | auto mgcfd = new MultiGetColumnFamilyData(); |
4016 | 14.7M | mgcfd->cfd = cfd; |
4017 | 14.7M | multiget_cf_data.insert({cfd->GetID(), mgcfd}); |
4018 | 14.7M | } |
4019 | 14.7M | } |
4020 | | |
4021 | 1.48M | mutex_.Lock(); |
4022 | 1.48M | SequenceNumber snapshot; |
4023 | 1.48M | if (read_options.snapshot != nullptr) { |
4024 | 0 | snapshot = reinterpret_cast<const SnapshotImpl*>( |
4025 | 0 | read_options.snapshot)->number_; |
4026 | 1.48M | } else { |
4027 | 1.48M | snapshot = versions_->LastSequence(); |
4028 | 1.48M | } |
4029 | 14.8M | for (auto mgd_iter : multiget_cf_data) { |
4030 | 14.8M | mgd_iter.second->super_version = |
4031 | 14.8M | mgd_iter.second->cfd->GetSuperVersion()->Ref(); |
4032 | 14.8M | } |
4033 | 1.48M | mutex_.Unlock(); |
4034 | | |
4035 | | // Contain a list of merge operations if merge occurs. |
4036 | 1.48M | MergeContext merge_context; |
4037 | | |
4038 | | // Note: this always resizes the values array |
4039 | 1.48M | size_t num_keys = keys.size(); |
4040 | 1.48M | std::vector<Status> stat_list(num_keys); |
4041 | 1.48M | values->resize(num_keys); |
4042 | | |
4043 | | // Keep track of bytes that we read for statistics-recording later |
4044 | 1.48M | uint64_t bytes_read = 0; |
4045 | 1.48M | PERF_TIMER_STOP(get_snapshot_time); |
4046 | | |
4047 | | // For each of the given keys, apply the entire "get" process as follows: |
4048 | | // First look in the memtable, then in the immutable memtable (if any). |
4049 | | // s is both in/out. When in, s could either be OK or MergeInProgress. |
4050 | | // merge_operands will contain the sequence of merges in the latter case. |
4051 | 16.2M | for (size_t i = 0; i < num_keys; ++i) { |
4052 | 14.7M | merge_context.Clear(); |
4053 | 14.7M | Status& s = stat_list[i]; |
4054 | 14.7M | std::string* value = &(*values)[i]; |
4055 | | |
4056 | 14.7M | LookupKey lkey(keys[i], snapshot); |
4057 | 14.7M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family[i]); |
4058 | 14.7M | auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); |
4059 | 14.7M | assert(mgd_iter != multiget_cf_data.end()); |
4060 | 14.7M | auto mgd = mgd_iter->second; |
4061 | 14.7M | auto super_version = mgd->super_version; |
4062 | 14.7M | bool skip_memtable = |
4063 | 14.7M | (read_options.read_tier == kPersistedTier && has_unpersisted_data_); |
4064 | 14.7M | bool done = false; |
4065 | 14.8M | if (!skip_memtable) { |
4066 | 14.8M | if (super_version->mem->Get(lkey, value, &s, &merge_context)) { |
4067 | 13.6M | done = true; |
4068 | | // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? |
4069 | 1.12M | } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { |
4070 | 288k | done = true; |
4071 | | // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? |
4072 | 288k | } |
4073 | 14.8M | } |
4074 | 14.7M | if (!done) { |
4075 | 833k | PERF_TIMER_GUARD(get_from_output_files_time); |
4076 | 833k | super_version->current->Get(read_options, lkey, value, &s, |
4077 | 833k | &merge_context); |
4078 | | // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? |
4079 | 833k | } |
4080 | | |
4081 | 14.7M | if (s.ok()) { |
4082 | 14.5M | bytes_read += value->size(); |
4083 | 14.5M | } |
4084 | 14.7M | } |
4085 | | |
4086 | | // Post processing (decrement reference counts and record statistics) |
4087 | 1.48M | PERF_TIMER_GUARD(get_post_process_time); |
4088 | 1.48M | autovector<SuperVersion*> superversions_to_delete; |
4089 | | |
4090 | | // TODO(icanadi) do we need lock here or just around Cleanup()? |
4091 | 1.48M | mutex_.Lock(); |
4092 | 14.8M | for (auto mgd_iter : multiget_cf_data) { |
4093 | 14.8M | auto mgd = mgd_iter.second; |
4094 | 14.8M | if (mgd->super_version->Unref()) { |
4095 | 1.30k | mgd->super_version->Cleanup(); |
4096 | 1.30k | superversions_to_delete.push_back(mgd->super_version); |
4097 | 1.30k | } |
4098 | 14.8M | } |
4099 | 1.48M | mutex_.Unlock(); |
4100 | | |
4101 | 1.30k | for (auto td : superversions_to_delete) { |
4102 | 1.30k | delete td; |
4103 | 1.30k | } |
4104 | 14.8M | for (auto mgd : multiget_cf_data) { |
4105 | 14.8M | delete mgd.second; |
4106 | 14.8M | } |
4107 | | |
4108 | 1.48M | RecordTick(stats_, NUMBER_MULTIGET_CALLS); |
4109 | 1.48M | RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); |
4110 | 1.48M | RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); |
4111 | 1.48M | MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); |
4112 | 1.48M | PERF_TIMER_STOP(get_post_process_time); |
4113 | | |
4114 | 1.48M | return stat_list; |
4115 | 1.48M | } |
4116 | | |
4117 | | #ifndef ROCKSDB_LITE |
4118 | | Status DBImpl::AddFile(ColumnFamilyHandle* column_family, |
4119 | 565 | const std::string& file_path, bool move_file) { |
4120 | 565 | Status status; |
4121 | 565 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4122 | 565 | ColumnFamilyData* cfd = cfh->cfd(); |
4123 | | |
4124 | 565 | ExternalSstFileInfo file_info; |
4125 | 565 | file_info.file_path = file_path; |
4126 | 565 | status = env_->GetFileSize(file_path, &file_info.base_file_size); |
4127 | 565 | if (!status.ok()) { |
4128 | 0 | return status; |
4129 | 0 | } |
4130 | | |
4131 | | // Access the file using TableReader to extract |
4132 | | // version, number of entries, smallest user key, largest user key |
4133 | 565 | std::unique_ptr<RandomAccessFile> base_sst_file; |
4134 | 565 | status = env_->NewRandomAccessFile(file_path, &base_sst_file, env_options_); |
4135 | 565 | if (!status.ok()) { |
4136 | 0 | return status; |
4137 | 0 | } |
4138 | 565 | std::unique_ptr<RandomAccessFileReader> base_sst_file_reader; |
4139 | 565 | base_sst_file_reader.reset(new RandomAccessFileReader(std::move(base_sst_file))); |
4140 | | |
4141 | 565 | std::unique_ptr<TableReader> table_reader; |
4142 | 565 | status = cfd->ioptions()->table_factory->NewTableReader( |
4143 | 565 | TableReaderOptions(*cfd->ioptions(), env_options_, |
4144 | 565 | cfd->internal_comparator()), |
4145 | 565 | std::move(base_sst_file_reader), file_info.base_file_size, |
4146 | 565 | &table_reader); |
4147 | 565 | if (!status.ok()) { |
4148 | 0 | return status; |
4149 | 0 | } |
4150 | | |
4151 | | // Get the external sst file version from table properties |
4152 | 565 | const UserCollectedProperties& user_collected_properties = |
4153 | 565 | table_reader->GetTableProperties()->user_collected_properties; |
4154 | 565 | UserCollectedProperties::const_iterator external_sst_file_version_iter = |
4155 | 565 | user_collected_properties.find(ExternalSstFilePropertyNames::kVersion); |
4156 | 565 | if (external_sst_file_version_iter == user_collected_properties.end()) { |
4157 | 0 | return STATUS(InvalidArgument, "Generated table version not found"); |
4158 | 0 | } |
4159 | | |
4160 | 565 | file_info.is_split_sst = table_reader->IsSplitSst(); |
4161 | 565 | if (file_info.is_split_sst) { |
4162 | 561 | std::unique_ptr<RandomAccessFile> data_sst_file; |
4163 | 561 | status = env_->NewRandomAccessFile(TableBaseToDataFileName(file_path), &data_sst_file, |
4164 | 561 | env_options_); |
4165 | 561 | if (!status.ok()) { |
4166 | 0 | return status; |
4167 | 0 | } |
4168 | 561 | std::unique_ptr<RandomAccessFileReader> data_sst_file_reader; |
4169 | 561 | data_sst_file_reader.reset(new RandomAccessFileReader(std::move(data_sst_file))); |
4170 | 561 | table_reader->SetDataFileReader(std::move(data_sst_file_reader)); |
4171 | 561 | } |
4172 | | |
4173 | 565 | file_info.file_size = file_info.base_file_size + |
4174 | 18.4E | (file_info.is_split_sst ? table_reader->GetTableProperties()->data_size : 0); |
4175 | | |
4176 | 565 | file_info.version = |
4177 | 565 | DecodeFixed32(external_sst_file_version_iter->second.c_str()); |
4178 | 568 | if (file_info.version == 1) { |
4179 | | // version 1 imply that all sequence numbers in table equal 0 |
4180 | 568 | file_info.sequence_number = 0; |
4181 | 18.4E | } else { |
4182 | 18.4E | return STATUS(InvalidArgument, "Generated table version is not supported"); |
4183 | 18.4E | } |
4184 | | |
4185 | | // Get number of entries in table |
4186 | 568 | file_info.num_entries = table_reader->GetTableProperties()->num_entries; |
4187 | | |
4188 | 568 | ParsedInternalKey key; |
4189 | 568 | std::unique_ptr<InternalIterator> iter( |
4190 | 568 | table_reader->NewIterator(ReadOptions())); |
4191 | | |
4192 | | // Get first (smallest) key from file |
4193 | 568 | iter->SeekToFirst(); |
4194 | 568 | if (!ParseInternalKey(iter->key(), &key)) { |
4195 | 0 | return STATUS(Corruption, "Generated table have corrupted keys"); |
4196 | 0 | } |
4197 | 568 | if (key.sequence != 0) { |
4198 | 0 | return STATUS(Corruption, "Generated table have non zero sequence number"); |
4199 | 0 | } |
4200 | 568 | file_info.smallest_key = key.user_key.ToString(); |
4201 | | |
4202 | | // Get last (largest) key from file |
4203 | 568 | iter->SeekToLast(); |
4204 | 568 | if (!ParseInternalKey(iter->key(), &key)) { |
4205 | 0 | return STATUS(Corruption, "Generated table have corrupted keys"); |
4206 | 0 | } |
4207 | 568 | if (key.sequence != 0) { |
4208 | 0 | return STATUS(Corruption, "Generated table have non zero sequence number"); |
4209 | 0 | } |
4210 | 568 | file_info.largest_key = key.user_key.ToString(); |
4211 | | |
4212 | 568 | return AddFile(column_family, &file_info, move_file); |
4213 | 568 | } |
4214 | | |
4215 | | namespace { |
4216 | | |
4217 | | // Helper function for copying file from src_path to dst_path. If try_hard_link is true it tries |
4218 | | // to make a hard link instead of copyging if possible. |
4219 | | Status AddFile(Env* env, const std::string& src_path, const std::string& dst_path, |
4220 | 19.7k | bool try_hard_link) { |
4221 | 19.7k | Status status; |
4222 | 19.7k | if (try_hard_link) { |
4223 | 326 | status = env->LinkFile(src_path, dst_path); |
4224 | 326 | if (status.IsNotSupported()) { |
4225 | | // Original file is on a different FS, use copy instead of hard linking |
4226 | 0 | status = CopyFile(env, src_path, dst_path, 0); |
4227 | 0 | } |
4228 | 19.4k | } else { |
4229 | 19.4k | status = CopyFile(env, src_path, dst_path, 0); |
4230 | 19.4k | } |
4231 | 19.7k | return status; |
4232 | 19.7k | } |
4233 | | |
4234 | | // Deletes file and logs error message in case of failure. error_format should have format |
4235 | | // specifications exactly for 2 string arguments: path and status. |
4236 | | void DeleteFile(Env* env, const std::string& path, const shared_ptr<Logger>& info_log, |
4237 | 9.19k | const char* error_format) { |
4238 | 9.19k | Status s = env->DeleteFile(path); |
4239 | 9.19k | if (!s.ok()) { |
4240 | 0 | RLOG(InfoLogLevel::WARN_LEVEL, info_log, error_format, path.c_str(), s.ToString().c_str()); |
4241 | 0 | } |
4242 | 9.19k | } |
4243 | | |
4244 | | } // namespace |
4245 | | |
4246 | | Status DBImpl::AddFile(ColumnFamilyHandle* column_family, |
4247 | 9.86k | const ExternalSstFileInfo* file_info, bool move_file) { |
4248 | 9.86k | Status status; |
4249 | 9.86k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4250 | 9.86k | ColumnFamilyData* cfd = cfh->cfd(); |
4251 | | |
4252 | 9.86k | if (file_info->num_entries == 0) { |
4253 | 0 | return STATUS(InvalidArgument, "File contain no entries"); |
4254 | 0 | } |
4255 | 9.86k | if (file_info->version != 1) { |
4256 | 0 | return STATUS(InvalidArgument, "Generated table version is not supported"); |
4257 | 0 | } |
4258 | | // version 1 imply that file have only Put Operations with Sequence Number = 0 |
4259 | | |
4260 | 9.86k | FileMetaData meta; |
4261 | 9.86k | meta.smallest.key = InternalKey(file_info->smallest_key, |
4262 | 9.86k | file_info->sequence_number, |
4263 | 9.86k | ValueType::kTypeValue); |
4264 | 9.86k | meta.largest.key = InternalKey(file_info->largest_key, |
4265 | 9.86k | file_info->sequence_number, |
4266 | 9.86k | ValueType::kTypeValue); |
4267 | 9.86k | if (!meta.smallest.key.Valid() || !meta.largest.key.Valid()) { |
4268 | 0 | return STATUS(Corruption, "Generated table have corrupted keys"); |
4269 | 0 | } |
4270 | 9.86k | meta.smallest.seqno = file_info->sequence_number; |
4271 | 9.86k | meta.largest.seqno = file_info->sequence_number; |
4272 | 9.86k | if (meta.smallest.seqno != 0 || meta.largest.seqno != 0) { |
4273 | 0 | return STATUS(InvalidArgument, |
4274 | 0 | "Non zero sequence numbers are not supported"); |
4275 | 0 | } |
4276 | | |
4277 | 9.86k | std::string db_base_fname; |
4278 | 9.86k | std::string db_data_fname; |
4279 | 9.86k | std::string data_file_path; |
4280 | 9.86k | { |
4281 | | // Generate a location for the new table |
4282 | 9.86k | auto file_number_holder = pending_outputs_->NewFileNumber(); |
4283 | 9.86k | meta.fd = FileDescriptor(file_number_holder.Last(), 0, file_info->file_size, |
4284 | 9.86k | file_info->base_file_size); |
4285 | | |
4286 | 9.86k | db_base_fname = TableFileName( |
4287 | 9.86k | db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); |
4288 | 9.86k | status = ::rocksdb::AddFile(env_, file_info->file_path, db_base_fname, move_file); |
4289 | | |
4290 | 9.86k | if (status.ok() && file_info->is_split_sst) { |
4291 | 9.86k | data_file_path = TableBaseToDataFileName(file_info->file_path); |
4292 | 9.86k | db_data_fname = TableBaseToDataFileName(db_base_fname); |
4293 | 9.86k | status = ::rocksdb::AddFile(env_, data_file_path, db_data_fname, move_file); |
4294 | 9.86k | if (!status.ok()) { |
4295 | 0 | ::rocksdb::DeleteFile(env_, db_base_fname, db_options_.info_log, |
4296 | 0 | "AddFile() clean up for file %s failed : %s"); |
4297 | 0 | } |
4298 | 9.86k | } |
4299 | | |
4300 | 9.86k | TEST_SYNC_POINT("DBImpl::AddFile:FileCopied"); |
4301 | 9.86k | if (!status.ok()) { |
4302 | 0 | return status; |
4303 | 0 | } |
4304 | | |
4305 | 9.86k | { |
4306 | 9.86k | InstrumentedMutexLock l(&mutex_); |
4307 | 9.86k | const MutableCFOptions mutable_cf_options = |
4308 | 9.86k | *cfd->GetLatestMutableCFOptions(); |
4309 | | |
4310 | 9.86k | WriteThread::Writer w; |
4311 | 9.86k | write_thread_.EnterUnbatched(&w, &mutex_); |
4312 | | |
4313 | 9.86k | if (!snapshots_.empty()) { |
4314 | | // Check that no snapshots are being held |
4315 | 23 | status = |
4316 | 23 | STATUS(NotSupported, "Cannot add a file while holding snapshots"); |
4317 | 23 | } |
4318 | | |
4319 | 9.86k | if (status.ok()) { |
4320 | | // Verify that added file key range dont overlap with any keys in DB |
4321 | 9.84k | SuperVersion* sv = cfd->GetSuperVersion()->Ref(); |
4322 | 9.84k | Arena arena; |
4323 | 9.84k | ReadOptions ro; |
4324 | 9.84k | ro.total_order_seek = true; |
4325 | 9.84k | ScopedArenaIterator iter(NewInternalIterator(ro, cfd, sv, &arena)); |
4326 | | |
4327 | 9.84k | InternalKey range_start(file_info->smallest_key, kMaxSequenceNumber, kTypeValue); |
4328 | 9.84k | iter->Seek(range_start.Encode()); |
4329 | 9.84k | status = iter->status(); |
4330 | | |
4331 | 9.84k | if (status.ok() && iter->Valid()) { |
4332 | 9.53k | ParsedInternalKey seek_result; |
4333 | 9.53k | if (ParseInternalKey(iter->key(), &seek_result)) { |
4334 | 9.53k | auto* vstorage = cfd->current()->storage_info(); |
4335 | 9.53k | if (vstorage->InternalComparator()->user_comparator()->Compare( |
4336 | 4.50k | seek_result.user_key, file_info->largest_key) <= 0) { |
4337 | 4.50k | status = STATUS(NotSupported, "Cannot add overlapping range"); |
4338 | 4.50k | } |
4339 | 0 | } else { |
4340 | 0 | status = STATUS(Corruption, "DB have corrupted keys"); |
4341 | 0 | } |
4342 | 9.53k | } |
4343 | 9.84k | } |
4344 | | |
4345 | 9.86k | if (status.ok()) { |
4346 | | // Add file to L0 |
4347 | 5.34k | VersionEdit edit; |
4348 | 5.34k | edit.SetColumnFamily(cfd->GetID()); |
4349 | 5.34k | edit.AddCleanedFile(0, meta); |
4350 | | |
4351 | 5.34k | status = versions_->LogAndApply( |
4352 | 5.34k | cfd, mutable_cf_options, &edit, &mutex_, directories_.GetDbDir()); |
4353 | 5.34k | } |
4354 | 9.86k | write_thread_.ExitUnbatched(&w); |
4355 | | |
4356 | 9.86k | if (status.ok()) { |
4357 | 5.34k | InstallSuperVersionAndScheduleWork(cfd, nullptr, mutable_cf_options); |
4358 | 5.34k | } |
4359 | 9.86k | } |
4360 | 9.86k | } |
4361 | | |
4362 | 9.86k | if (!status.ok()) { |
4363 | | // We failed to add the file to the database |
4364 | 4.53k | const char* error_format = "AddFile() clean up for file %s failed : %s"; |
4365 | 4.53k | ::rocksdb::DeleteFile(env_, db_base_fname, db_options_.info_log, error_format); |
4366 | 4.53k | if (file_info->is_split_sst) { |
4367 | 4.53k | ::rocksdb::DeleteFile(env_, db_data_fname, db_options_.info_log, error_format); |
4368 | 4.53k | } |
4369 | 5.34k | } else if (status.ok()) { |
4370 | 5.34k | if (move_file) { |
4371 | | // The file was moved and added successfully, remove original file link |
4372 | 66 | const char* error_format = |
4373 | 66 | "%s was added to DB successfully but failed to remove original file link : %s"; |
4374 | 66 | ::rocksdb::DeleteFile(env_, file_info->file_path, db_options_.info_log, error_format); |
4375 | 66 | if (file_info->is_split_sst) { |
4376 | 66 | ::rocksdb::DeleteFile(env_, data_file_path, db_options_.info_log, error_format); |
4377 | 66 | } |
4378 | 66 | } |
4379 | 5.34k | FilesChanged(); |
4380 | 5.34k | } |
4381 | 9.86k | return status; |
4382 | 9.86k | } |
4383 | | #endif // ROCKSDB_LITE |
4384 | | |
4385 | 125k | std::function<void()> DBImpl::GetFilesChangedListener() const { |
4386 | 125k | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4387 | 125k | return files_changed_listener_; |
4388 | 125k | } |
4389 | | |
4390 | 0 | bool DBImpl::HasFilesChangedListener() const { |
4391 | 0 | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4392 | 0 | return files_changed_listener_ != nullptr; |
4393 | 0 | } |
4394 | | |
4395 | 430k | void DBImpl::ListenFilesChanged(std::function<void()> files_changed_listener) { |
4396 | 430k | std::lock_guard<std::mutex> lock(files_changed_listener_mutex_); |
4397 | 430k | files_changed_listener_ = std::move(files_changed_listener); |
4398 | 430k | } |
4399 | | |
4400 | 125k | void DBImpl::FilesChanged() { |
4401 | 125k | auto files_changed_listener = GetFilesChangedListener(); |
4402 | 125k | if (files_changed_listener) { |
4403 | 10.1k | files_changed_listener(); |
4404 | 10.1k | } |
4405 | 125k | } |
4406 | | |
4407 | | Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
4408 | | const std::string& column_family_name, |
4409 | 1.71k | ColumnFamilyHandle** handle) { |
4410 | 1.71k | Status s; |
4411 | 1.71k | Status persist_options_status; |
4412 | 1.71k | *handle = nullptr; |
4413 | | |
4414 | 1.71k | s = CheckCompressionSupported(cf_options); |
4415 | 1.71k | if (s.ok() && db_options_.allow_concurrent_memtable_write) { |
4416 | 34 | s = CheckConcurrentWritesSupported(cf_options); |
4417 | 34 | } |
4418 | 1.71k | if (!s.ok()) { |
4419 | 2 | return s; |
4420 | 2 | } |
4421 | | |
4422 | 1.71k | { |
4423 | 1.71k | InstrumentedMutexLock l(&mutex_); |
4424 | | |
4425 | 1.71k | if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != |
4426 | 0 | nullptr) { |
4427 | 0 | return STATUS(InvalidArgument, "Column family already exists"); |
4428 | 0 | } |
4429 | 1.71k | VersionEdit edit; |
4430 | 1.71k | edit.AddColumnFamily(column_family_name); |
4431 | 1.71k | uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); |
4432 | 1.71k | edit.SetColumnFamily(new_id); |
4433 | 1.71k | edit.SetLogNumber(logfile_number_); |
4434 | 1.71k | edit.SetComparatorName(cf_options.comparator->Name()); |
4435 | | |
4436 | | // LogAndApply will both write the creation in MANIFEST and create |
4437 | | // ColumnFamilyData object |
4438 | 1.71k | Options opt(db_options_, cf_options); |
4439 | 1.71k | { // write thread |
4440 | 1.71k | WriteThread::Writer w; |
4441 | 1.71k | write_thread_.EnterUnbatched(&w, &mutex_); |
4442 | | // LogAndApply will both write the creation in MANIFEST and create |
4443 | | // ColumnFamilyData object |
4444 | 1.71k | s = versions_->LogAndApply( |
4445 | 1.71k | nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, |
4446 | 1.71k | &mutex_, directories_.GetDbDir(), false, &cf_options); |
4447 | | |
4448 | 1.71k | if (s.ok()) { |
4449 | | // If the column family was created successfully, we then persist |
4450 | | // the updated RocksDB options under the same single write thread |
4451 | 1.71k | persist_options_status = WriteOptionsFile(); |
4452 | 1.71k | } |
4453 | 1.71k | write_thread_.ExitUnbatched(&w); |
4454 | 1.71k | } |
4455 | 1.71k | if (s.ok()) { |
4456 | 1.71k | single_column_family_mode_ = false; |
4457 | 1.71k | auto* cfd = |
4458 | 1.71k | versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); |
4459 | 1.71k | assert(cfd != nullptr); |
4460 | 1.71k | InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
4461 | | |
4462 | 1.71k | if (!cfd->mem()->IsSnapshotSupported()) { |
4463 | 30 | is_snapshot_supported_ = false; |
4464 | 30 | } |
4465 | | |
4466 | 1.71k | *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); |
4467 | 1.71k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4468 | 1.71k | "Created column family [%s] (ID %u)", |
4469 | 1.71k | column_family_name.c_str(), (unsigned)cfd->GetID()); |
4470 | 0 | } else { |
4471 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
4472 | 0 | "Creating column family [%s] FAILED -- %s", |
4473 | 0 | column_family_name.c_str(), s.ToString().c_str()); |
4474 | 0 | } |
4475 | 1.71k | } // InstrumentedMutexLock l(&mutex_) |
4476 | | |
4477 | | // this is outside the mutex |
4478 | 1.71k | if (s.ok()) { |
4479 | 1.71k | if (!persist_options_status.ok()) { |
4480 | 0 | if (db_options_.fail_if_options_file_error) { |
4481 | 0 | s = STATUS(IOError, |
4482 | 0 | "ColumnFamily has been created, but unable to persist" |
4483 | 0 | "options in CreateColumnFamily()", |
4484 | 0 | persist_options_status.ToString().c_str()); |
4485 | 0 | } |
4486 | 0 | RWARN(db_options_.info_log, |
4487 | 0 | "Unable to persist options in CreateColumnFamily() -- %s", |
4488 | 0 | persist_options_status.ToString().c_str()); |
4489 | 0 | } |
4490 | 1.71k | } |
4491 | 1.71k | return s; |
4492 | 1.71k | } |
4493 | | |
4494 | 26 | Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { |
4495 | 26 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4496 | 26 | auto cfd = cfh->cfd(); |
4497 | 26 | if (cfd->GetID() == 0) { |
4498 | 0 | return STATUS(InvalidArgument, "Can't drop default column family"); |
4499 | 0 | } |
4500 | | |
4501 | 26 | bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); |
4502 | | |
4503 | 26 | VersionEdit edit; |
4504 | 26 | edit.DropColumnFamily(); |
4505 | 26 | edit.SetColumnFamily(cfd->GetID()); |
4506 | | |
4507 | 26 | Status s; |
4508 | 26 | Status options_persist_status; |
4509 | 26 | { |
4510 | 26 | InstrumentedMutexLock l(&mutex_); |
4511 | 26 | if (cfd->IsDropped()) { |
4512 | 0 | s = STATUS(InvalidArgument, "Column family already dropped!\n"); |
4513 | 0 | } |
4514 | 26 | if (s.ok()) { |
4515 | | // we drop column family from a single write thread |
4516 | 26 | WriteThread::Writer w; |
4517 | 26 | write_thread_.EnterUnbatched(&w, &mutex_); |
4518 | 26 | s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), |
4519 | 26 | &edit, &mutex_); |
4520 | 26 | if (s.ok()) { |
4521 | | // If the column family was dropped successfully, we then persist |
4522 | | // the updated RocksDB options under the same single write thread |
4523 | 26 | options_persist_status = WriteOptionsFile(); |
4524 | 26 | } |
4525 | 26 | write_thread_.ExitUnbatched(&w); |
4526 | 26 | } |
4527 | | |
4528 | 26 | if (!cf_support_snapshot) { |
4529 | | // Dropped Column Family doesn't support snapshot. Need to recalculate |
4530 | | // is_snapshot_supported_. |
4531 | 0 | bool new_is_snapshot_supported = true; |
4532 | 0 | for (auto c : *versions_->GetColumnFamilySet()) { |
4533 | 0 | if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) { |
4534 | 0 | new_is_snapshot_supported = false; |
4535 | 0 | break; |
4536 | 0 | } |
4537 | 0 | } |
4538 | 0 | is_snapshot_supported_ = new_is_snapshot_supported; |
4539 | 0 | } |
4540 | 26 | } |
4541 | | |
4542 | 26 | if (s.ok()) { |
4543 | | // Note that here we erase the associated cf_info of the to-be-dropped |
4544 | | // cfd before its ref-count goes to zero to avoid having to erase cf_info |
4545 | | // later inside db_mutex. |
4546 | 26 | assert(cfd->IsDropped()); |
4547 | 26 | auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); |
4548 | 26 | max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * |
4549 | 26 | mutable_cf_options->max_write_buffer_number; |
4550 | 26 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4551 | 26 | "Dropped column family with id %u\n", cfd->GetID()); |
4552 | | |
4553 | 26 | if (!options_persist_status.ok()) { |
4554 | 0 | if (db_options_.fail_if_options_file_error) { |
4555 | 0 | s = STATUS(IOError, |
4556 | 0 | "ColumnFamily has been dropped, but unable to persist " |
4557 | 0 | "options in DropColumnFamily()", |
4558 | 0 | options_persist_status.ToString().c_str()); |
4559 | 0 | } |
4560 | 0 | RWARN(db_options_.info_log, |
4561 | 0 | "Unable to persist options in DropColumnFamily() -- %s", |
4562 | 0 | options_persist_status.ToString().c_str()); |
4563 | 0 | } |
4564 | 0 | } else { |
4565 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
4566 | 0 | "Dropping column family with id %u FAILED -- %s\n", |
4567 | 0 | cfd->GetID(), s.ToString().c_str()); |
4568 | 0 | } |
4569 | | |
4570 | 26 | return s; |
4571 | 26 | } |
4572 | | |
4573 | | bool DBImpl::KeyMayExist(const ReadOptions& read_options, |
4574 | | ColumnFamilyHandle* column_family, const Slice& key, |
4575 | 385 | std::string* value, bool* value_found) { |
4576 | 385 | if (value_found != nullptr) { |
4577 | | // falsify later if key-may-exist but can't fetch value |
4578 | 146 | *value_found = true; |
4579 | 146 | } |
4580 | 385 | ReadOptions roptions = read_options; |
4581 | 385 | roptions.read_tier = kBlockCacheTier; // read from block cache only |
4582 | 385 | auto s = GetImpl(roptions, column_family, key, value, value_found); |
4583 | | |
4584 | | // If block_cache is enabled and the index block of the table was |
4585 | | // not present in block_cache, the return value will be Status::Incomplete. |
4586 | | // In this case, key may still exist in the table. |
4587 | 385 | return s.ok() || s.IsIncomplete(); |
4588 | 385 | } |
4589 | | |
4590 | | Iterator* DBImpl::NewIterator(const ReadOptions& read_options, |
4591 | 15.3M | ColumnFamilyHandle* column_family) { |
4592 | 15.3M | if (read_options.read_tier == kPersistedTier) { |
4593 | 1 | return NewErrorIterator(STATUS(NotSupported, |
4594 | 1 | "ReadTier::kPersistedData is not yet supported in iterators.")); |
4595 | 1 | } |
4596 | 15.3M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4597 | 15.3M | auto cfd = cfh->cfd(); |
4598 | | |
4599 | 15.3M | XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new, |
4600 | 15.3M | reinterpret_cast<DBImpl*>(this), |
4601 | 15.3M | const_cast<ReadOptions*>(&read_options), is_snapshot_supported_); |
4602 | 15.3M | if (read_options.managed) { |
4603 | | #ifdef ROCKSDB_LITE |
4604 | | // not supported in lite version |
4605 | | return NewErrorIterator(STATUS(InvalidArgument, |
4606 | | "Managed Iterators not supported in RocksDBLite.")); |
4607 | | #else |
4608 | 76 | if ((read_options.tailing) || (read_options.snapshot != nullptr) || |
4609 | 76 | (is_snapshot_supported_)) { |
4610 | 76 | return new ManagedIterator(this, read_options, cfd); |
4611 | 76 | } |
4612 | | // Managed iter not supported |
4613 | 0 | return NewErrorIterator(STATUS(InvalidArgument, |
4614 | 0 | "Managed Iterators not supported without snapshots.")); |
4615 | 0 | #endif |
4616 | 15.3M | } else if (read_options.tailing) { |
4617 | | #ifdef ROCKSDB_LITE |
4618 | | // not supported in lite version |
4619 | | return nullptr; |
4620 | | #else |
4621 | 23 | SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); |
4622 | 23 | auto iter = new ForwardIterator(this, read_options, cfd, sv); |
4623 | 23 | return NewDBIterator( |
4624 | 23 | env_, *cfd->ioptions(), cfd->user_comparator(), iter, |
4625 | 23 | kMaxSequenceNumber, |
4626 | 23 | sv->mutable_cf_options.max_sequential_skip_in_iterations, |
4627 | 23 | sv->version_number, read_options.iterate_upper_bound, |
4628 | 23 | read_options.prefix_same_as_start, read_options.pin_data); |
4629 | 23 | #endif |
4630 | 15.3M | } else { |
4631 | 15.3M | SequenceNumber latest_snapshot = versions_->LastSequence(); |
4632 | 15.3M | SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); |
4633 | | |
4634 | 15.3M | auto snapshot = |
4635 | 15.3M | read_options.snapshot != nullptr |
4636 | 2.67k | ? reinterpret_cast<const SnapshotImpl*>( |
4637 | 2.67k | read_options.snapshot)->number_ |
4638 | 15.3M | : latest_snapshot; |
4639 | | |
4640 | | // Try to generate a DB iterator tree in continuous memory area to be |
4641 | | // cache friendly. Here is an example of result: |
4642 | | // +-------------------------------+ |
4643 | | // | | |
4644 | | // | ArenaWrappedDBIter | |
4645 | | // | + | |
4646 | | // | +---> Inner Iterator ------------+ |
4647 | | // | | | | |
4648 | | // | | +-- -- -- -- -- -- -- --+ | |
4649 | | // | +--- | Arena | | |
4650 | | // | | | | |
4651 | | // | Allocated Memory: | | |
4652 | | // | | +-------------------+ | |
4653 | | // | | | DBIter | <---+ |
4654 | | // | | + | |
4655 | | // | | | +-> iter_ ------------+ |
4656 | | // | | | | | |
4657 | | // | | +-------------------+ | |
4658 | | // | | | MergingIterator | <---+ |
4659 | | // | | + | |
4660 | | // | | | +->child iter1 ------------+ |
4661 | | // | | | | | | |
4662 | | // | | +->child iter2 ----------+ | |
4663 | | // | | | | | | | |
4664 | | // | | | +->child iter3 --------+ | | |
4665 | | // | | | | | | |
4666 | | // | | +-------------------+ | | | |
4667 | | // | | | Iterator1 | <--------+ |
4668 | | // | | +-------------------+ | | |
4669 | | // | | | Iterator2 | <------+ |
4670 | | // | | +-------------------+ | |
4671 | | // | | | Iterator3 | <----+ |
4672 | | // | | +-------------------+ |
4673 | | // | | | |
4674 | | // +-------+-----------------------+ |
4675 | | // |
4676 | | // ArenaWrappedDBIter inlines an arena area where all the iterators in |
4677 | | // the iterator tree are allocated in the order of being accessed when |
4678 | | // querying. |
4679 | | // Laying out the iterators in the order of being accessed makes it more |
4680 | | // likely that any iterator pointer is close to the iterator it points to so |
4681 | | // that they are likely to be in the same cache line and/or page. |
4682 | 15.3M | ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( |
4683 | 15.3M | env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, |
4684 | 15.3M | sv->mutable_cf_options.max_sequential_skip_in_iterations, |
4685 | 15.3M | sv->version_number, read_options.iterate_upper_bound, |
4686 | 15.3M | read_options.prefix_same_as_start, read_options.pin_data); |
4687 | | |
4688 | 15.3M | InternalIterator* internal_iter = |
4689 | 15.3M | NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); |
4690 | 15.3M | db_iter->SetIterUnderDBIter(internal_iter); |
4691 | | |
4692 | 15.3M | if (yb::GetAtomicFlag(&FLAGS_rocksdb_use_logging_iterator)) { |
4693 | 0 | return new TransitionLoggingIteratorWrapper(db_iter, LogPrefix()); |
4694 | 0 | } |
4695 | 15.3M | return db_iter; |
4696 | 15.3M | } |
4697 | | // To stop compiler from complaining |
4698 | 0 | return nullptr; |
4699 | 0 | } |
4700 | | |
4701 | | Status DBImpl::NewIterators( |
4702 | | const ReadOptions& read_options, |
4703 | | const std::vector<ColumnFamilyHandle*>& column_families, |
4704 | 19 | std::vector<Iterator*>* iterators) { |
4705 | 19 | if (read_options.read_tier == kPersistedTier) { |
4706 | 1 | return STATUS(NotSupported, |
4707 | 1 | "ReadTier::kPersistedData is not yet supported in iterators."); |
4708 | 1 | } |
4709 | 18 | iterators->clear(); |
4710 | 18 | iterators->reserve(column_families.size()); |
4711 | 18 | XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new, |
4712 | 18 | reinterpret_cast<DBImpl*>(this), |
4713 | 18 | const_cast<ReadOptions*>(&read_options), is_snapshot_supported_); |
4714 | 18 | if (read_options.managed) { |
4715 | | #ifdef ROCKSDB_LITE |
4716 | | return STATUS(InvalidArgument, |
4717 | | "Managed interator not supported in RocksDB lite"); |
4718 | | #else |
4719 | 0 | if ((!read_options.tailing) && (read_options.snapshot == nullptr) && |
4720 | 0 | (!is_snapshot_supported_)) { |
4721 | 0 | return STATUS(InvalidArgument, |
4722 | 0 | "Managed interator not supported without snapshots"); |
4723 | 0 | } |
4724 | 0 | for (auto cfh : column_families) { |
4725 | 0 | auto cfd = down_cast<ColumnFamilyHandleImpl*>(cfh)->cfd(); |
4726 | 0 | auto iter = new ManagedIterator(this, read_options, cfd); |
4727 | 0 | iterators->push_back(iter); |
4728 | 0 | } |
4729 | 0 | #endif |
4730 | 18 | } else if (read_options.tailing) { |
4731 | | #ifdef ROCKSDB_LITE |
4732 | | return STATUS(InvalidArgument, |
4733 | | "Tailing interator not supported in RocksDB lite"); |
4734 | | #else |
4735 | 3 | for (auto cfh : column_families) { |
4736 | 3 | auto cfd = down_cast<ColumnFamilyHandleImpl*>(cfh)->cfd(); |
4737 | 3 | SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); |
4738 | 3 | auto iter = new ForwardIterator(this, read_options, cfd, sv); |
4739 | 3 | iterators->push_back(NewDBIterator( |
4740 | 3 | env_, *cfd->ioptions(), cfd->user_comparator(), iter, |
4741 | 3 | kMaxSequenceNumber, |
4742 | 3 | sv->mutable_cf_options.max_sequential_skip_in_iterations, |
4743 | 3 | sv->version_number, nullptr, false, read_options.pin_data)); |
4744 | 3 | } |
4745 | 1 | #endif |
4746 | 17 | } else { |
4747 | 17 | SequenceNumber latest_snapshot = versions_->LastSequence(); |
4748 | | |
4749 | 53 | for (size_t i = 0; i < column_families.size(); ++i) { |
4750 | 36 | auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_families[i])->cfd(); |
4751 | 36 | SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); |
4752 | | |
4753 | 36 | auto snapshot = |
4754 | 36 | read_options.snapshot != nullptr |
4755 | 0 | ? reinterpret_cast<const SnapshotImpl*>( |
4756 | 0 | read_options.snapshot)->number_ |
4757 | 36 | : latest_snapshot; |
4758 | | |
4759 | 36 | ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( |
4760 | 36 | env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, |
4761 | 36 | sv->mutable_cf_options.max_sequential_skip_in_iterations, |
4762 | 36 | sv->version_number, nullptr, false, read_options.pin_data); |
4763 | 36 | InternalIterator* internal_iter = |
4764 | 36 | NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); |
4765 | 36 | db_iter->SetIterUnderDBIter(internal_iter); |
4766 | 36 | iterators->push_back(db_iter); |
4767 | 36 | } |
4768 | 17 | } |
4769 | | |
4770 | 18 | return Status::OK(); |
4771 | 18 | } |
4772 | | |
4773 | 3.20k | const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); } |
4774 | | |
4775 | | #ifndef ROCKSDB_LITE |
4776 | 71 | const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { |
4777 | 71 | return GetSnapshotImpl(true); |
4778 | 71 | } |
4779 | | #endif // ROCKSDB_LITE |
4780 | | |
4781 | 3.27k | const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { |
4782 | 3.27k | int64_t unix_time = 0; |
4783 | 3.27k | WARN_NOT_OK(env_->GetCurrentTime(&unix_time), "Failed to get current time"); |
4784 | 3.27k | SnapshotImpl* s = new SnapshotImpl; |
4785 | | |
4786 | 3.27k | InstrumentedMutexLock l(&mutex_); |
4787 | | // returns null if the underlying memtable does not support snapshot. |
4788 | 3.27k | if (!is_snapshot_supported_) { |
4789 | 0 | delete s; |
4790 | 0 | return nullptr; |
4791 | 0 | } |
4792 | 3.27k | return snapshots_.New(s, versions_->LastSequence(), unix_time, |
4793 | 3.27k | is_write_conflict_boundary); |
4794 | 3.27k | } |
4795 | | |
4796 | 3.27k | void DBImpl::ReleaseSnapshot(const Snapshot* s) { |
4797 | 3.27k | const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s); |
4798 | 3.27k | { |
4799 | 3.27k | InstrumentedMutexLock l(&mutex_); |
4800 | 3.27k | snapshots_.Delete(casted_s); |
4801 | 3.27k | } |
4802 | 3.27k | delete casted_s; |
4803 | 3.27k | } |
4804 | | |
4805 | | // Convenience methods |
4806 | | Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, |
4807 | 14.8M | const Slice& key, const Slice& val) { |
4808 | 14.8M | return DB::Put(o, column_family, key, val); |
4809 | 14.8M | } |
4810 | | |
4811 | | Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, |
4812 | 89.9k | const Slice& key, const Slice& val) { |
4813 | 89.9k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
4814 | 89.9k | if (!cfh->cfd()->ioptions()->merge_operator) { |
4815 | 1 | return STATUS(NotSupported, "Provide a merge_operator when opening DB"); |
4816 | 89.9k | } else { |
4817 | 89.9k | return DB::Merge(o, column_family, key, val); |
4818 | 89.9k | } |
4819 | 89.9k | } |
4820 | | |
4821 | | Status DBImpl::Delete(const WriteOptions& write_options, |
4822 | 543k | ColumnFamilyHandle* column_family, const Slice& key) { |
4823 | 543k | return DB::Delete(write_options, column_family, key); |
4824 | 543k | } |
4825 | | |
4826 | | Status DBImpl::SingleDelete(const WriteOptions& write_options, |
4827 | | ColumnFamilyHandle* column_family, |
4828 | 185 | const Slice& key) { |
4829 | 185 | return DB::SingleDelete(write_options, column_family, key); |
4830 | 185 | } |
4831 | | |
4832 | 24.0M | Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { |
4833 | 24.0M | return WriteImpl(write_options, my_batch, nullptr); |
4834 | 24.0M | } |
4835 | | |
4836 | | #ifndef ROCKSDB_LITE |
4837 | | Status DBImpl::WriteWithCallback(const WriteOptions& write_options, |
4838 | | WriteBatch* my_batch, |
4839 | 353 | WriteCallback* callback) { |
4840 | 353 | return WriteImpl(write_options, my_batch, callback); |
4841 | 353 | } |
4842 | | #endif // ROCKSDB_LITE |
4843 | | |
4844 | | Status DBImpl::WriteImpl(const WriteOptions& write_options, |
4845 | 24.0M | WriteBatch* my_batch, WriteCallback* callback) { |
4846 | | |
4847 | 24.0M | if (my_batch == nullptr) { |
4848 | 0 | return STATUS(Corruption, "Batch is nullptr!"); |
4849 | 0 | } |
4850 | 24.0M | if (write_options.timeout_hint_us != 0) { |
4851 | 1 | return STATUS(InvalidArgument, "timeout_hint_us is deprecated"); |
4852 | 1 | } |
4853 | | |
4854 | 24.0M | Status status; |
4855 | | |
4856 | 24.0M | bool xfunc_attempted_write = false; |
4857 | 24.0M | XFUNC_TEST("transaction", "transaction_xftest_write_impl", |
4858 | 24.0M | xf_transaction_write1, xf_transaction_write, write_options, |
4859 | 24.0M | db_options_, my_batch, callback, this, &status, |
4860 | 24.0M | &xfunc_attempted_write); |
4861 | 24.0M | if (xfunc_attempted_write) { |
4862 | | // Test already did the write |
4863 | 0 | return status; |
4864 | 0 | } |
4865 | | |
4866 | 24.0M | PERF_TIMER_GUARD(write_pre_and_post_process_time); |
4867 | 24.0M | WriteThread::Writer w; |
4868 | 24.0M | w.batch = my_batch; |
4869 | 24.0M | w.sync = write_options.sync; |
4870 | 24.0M | w.disableWAL = write_options.disableWAL; |
4871 | 24.0M | w.in_batch_group = false; |
4872 | 24.0M | w.callback = callback; |
4873 | | |
4874 | 24.0M | if (!write_options.disableWAL) { |
4875 | 17.4M | RecordTick(stats_, WRITE_WITH_WAL); |
4876 | 17.4M | } |
4877 | | |
4878 | 24.0M | StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE); |
4879 | | |
4880 | 24.0M | #ifndef NDEBUG |
4881 | 24.0M | auto num_write_waiters = write_waiters_.fetch_add(1, std::memory_order_acq_rel); |
4882 | 24.0M | #endif |
4883 | | |
4884 | 24.0M | write_thread_.JoinBatchGroup(&w); |
4885 | | |
4886 | 24.0M | #ifndef NDEBUG |
4887 | 24.0M | write_waiters_.fetch_sub(1, std::memory_order_acq_rel); |
4888 | 24.0M | DCHECK_LE(num_write_waiters, FLAGS_TEST_max_write_waiters); |
4889 | 24.0M | #endif |
4890 | | |
4891 | 24.0M | if (w.state == WriteThread::STATE_PARALLEL_FOLLOWER) { |
4892 | | // we are a non-leader in a parallel group |
4893 | 40.7k | PERF_TIMER_GUARD(write_memtable_time); |
4894 | | |
4895 | 41.0k | if (!w.CallbackFailed()) { |
4896 | 41.0k | ColumnFamilyMemTablesImpl column_family_memtables( |
4897 | 41.0k | versions_->GetColumnFamilySet()); |
4898 | 41.0k | WriteBatchInternal::SetSequence(w.batch, w.sequence); |
4899 | 41.0k | InsertFlags insert_flags{InsertFlag::kConcurrentMemtableWrites}; |
4900 | 41.0k | w.status = WriteBatchInternal::InsertInto( |
4901 | 41.0k | w.batch, &column_family_memtables, &flush_scheduler_, |
4902 | 41.0k | write_options.ignore_missing_column_families, 0 /*log_number*/, this, insert_flags); |
4903 | 41.0k | } |
4904 | | |
4905 | 40.7k | if (write_thread_.CompleteParallelWorker(&w)) { |
4906 | | // we're responsible for early exit |
4907 | 11.8k | auto last_sequence = w.parallel_group->last_sequence; |
4908 | 11.8k | SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); |
4909 | 11.8k | versions_->SetLastSequence(last_sequence); |
4910 | 11.8k | write_thread_.EarlyExitParallelGroup(&w); |
4911 | 11.8k | } |
4912 | 40.7k | assert(w.state == WriteThread::STATE_COMPLETED); |
4913 | | // STATE_COMPLETED conditional below handles exit |
4914 | | |
4915 | 40.7k | status = w.FinalStatus(); |
4916 | 40.7k | } |
4917 | 24.0M | if (w.state == WriteThread::STATE_COMPLETED) { |
4918 | | // write is complete and leader has updated sequence |
4919 | 1.18M | RecordTick(stats_, WRITE_DONE_BY_OTHER); |
4920 | 1.18M | return w.FinalStatus(); |
4921 | 1.18M | } |
4922 | | // else we are the leader of the write batch group |
4923 | 22.8M | assert(w.state == WriteThread::STATE_GROUP_LEADER); |
4924 | | |
4925 | 22.8M | WriteContext context; |
4926 | 22.8M | mutex_.Lock(); |
4927 | | |
4928 | 22.8M | if (!write_options.disableWAL) { |
4929 | 16.1M | default_cf_internal_stats_->AddDBStats(InternalDBStatsType::WRITE_WITH_WAL, 1); |
4930 | 16.1M | } |
4931 | | |
4932 | 22.8M | RecordTick(stats_, WRITE_DONE_BY_SELF); |
4933 | 22.8M | default_cf_internal_stats_->AddDBStats(InternalDBStatsType::WRITE_DONE_BY_SELF, 1); |
4934 | | |
4935 | | // Once reaches this point, the current writer "w" will try to do its write |
4936 | | // job. It may also pick up some of the remaining writers in the "writers_" |
4937 | | // when it finds suitable, and finish them in the same write batch. |
4938 | | // This is how a write job could be done by the other writer. |
4939 | 22.8M | assert(!single_column_family_mode_ || |
4940 | 22.8M | versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); |
4941 | | |
4942 | 22.8M | uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) |
4943 | 22.7M | ? 4 * max_total_in_memory_state_ |
4944 | 106k | : db_options_.max_total_wal_size; |
4945 | 22.8M | if (UNLIKELY(!single_column_family_mode_ && |
4946 | 22.8M | alive_log_files_.begin()->getting_flushed == false && |
4947 | 17 | total_log_size() > max_total_wal_size)) { |
4948 | 17 | uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; |
4949 | 17 | alive_log_files_.begin()->getting_flushed = true; |
4950 | 17 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4951 | 17 | "Flushing all column families with data in WAL number %" PRIu64 |
4952 | 17 | ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, |
4953 | 17 | flush_column_family_if_log_file, total_log_size(), max_total_wal_size); |
4954 | | // no need to refcount because drop is happening in write thread, so can't |
4955 | | // happen while we're in the write thread |
4956 | 37 | for (auto cfd : *versions_->GetColumnFamilySet()) { |
4957 | 37 | if (cfd->IsDropped()) { |
4958 | 1 | continue; |
4959 | 1 | } |
4960 | 36 | if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { |
4961 | 18 | status = SwitchMemtable(cfd, &context); |
4962 | 18 | if (!status.ok()) { |
4963 | 0 | break; |
4964 | 0 | } |
4965 | 18 | cfd->imm()->FlushRequested(); |
4966 | 18 | SchedulePendingFlush(cfd); |
4967 | 18 | } |
4968 | 36 | } |
4969 | 17 | MaybeScheduleFlushOrCompaction(); |
4970 | 22.8M | } else if (UNLIKELY(write_buffer_.ShouldFlush())) { |
4971 | 1.39k | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
4972 | 1.39k | "Flushing column family with largest mem table size. Write buffer is " |
4973 | 1.39k | "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", |
4974 | 1.39k | write_buffer_.memory_usage(), write_buffer_.buffer_size()); |
4975 | | // no need to refcount because drop is happening in write thread, so can't |
4976 | | // happen while we're in the write thread |
4977 | 1.39k | ColumnFamilyData* largest_cfd = nullptr; |
4978 | 1.39k | size_t largest_cfd_size = 0; |
4979 | | |
4980 | 1.40k | for (auto cfd : *versions_->GetColumnFamilySet()) { |
4981 | 1.40k | if (cfd->IsDropped()) { |
4982 | 0 | continue; |
4983 | 0 | } |
4984 | 1.40k | if (!cfd->mem()->IsEmpty()) { |
4985 | | // We only consider active mem table, hoping immutable memtable is |
4986 | | // already in the process of flushing. |
4987 | 1.40k | size_t cfd_size = cfd->mem()->ApproximateMemoryUsage(); |
4988 | 1.40k | if (largest_cfd == nullptr || cfd_size > largest_cfd_size) { |
4989 | 1.40k | largest_cfd = cfd; |
4990 | 1.40k | largest_cfd_size = cfd_size; |
4991 | 1.40k | } |
4992 | 1.40k | } |
4993 | 1.40k | } |
4994 | 1.39k | if (largest_cfd != nullptr) { |
4995 | 1.39k | status = SwitchMemtable(largest_cfd, &context); |
4996 | 1.39k | if (status.ok()) { |
4997 | 1.39k | largest_cfd->imm()->FlushRequested(); |
4998 | 1.39k | SchedulePendingFlush(largest_cfd); |
4999 | 1.39k | MaybeScheduleFlushOrCompaction(); |
5000 | 1.39k | } |
5001 | 1.39k | } |
5002 | 1.39k | } |
5003 | | |
5004 | 22.8M | if (UNLIKELY(status.ok() && !bg_error_.ok())) { |
5005 | 828k | status = bg_error_; |
5006 | 828k | } |
5007 | | |
5008 | 22.8M | if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { |
5009 | 10.9k | status = ScheduleFlushes(&context); |
5010 | 10.9k | } |
5011 | | |
5012 | 22.8M | if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || |
5013 | 15.9k | write_controller_.NeedsDelay()))) { |
5014 | 15.9k | PERF_TIMER_STOP(write_pre_and_post_process_time); |
5015 | 15.9k | PERF_TIMER_GUARD(write_delay_time); |
5016 | | // We don't know size of curent batch so that we always use the size |
5017 | | // for previous one. It might create a fairness issue that expiration |
5018 | | // might happen for smaller writes but larger writes can go through. |
5019 | | // Can optimize it if it is an issue. |
5020 | 15.9k | status = DelayWrite(last_batch_group_size_); |
5021 | 15.9k | PERF_TIMER_START(write_pre_and_post_process_time); |
5022 | 15.9k | } |
5023 | | |
5024 | 22.8M | uint64_t last_sequence = versions_->LastSequence(); |
5025 | 22.8M | WriteThread::Writer* last_writer = &w; |
5026 | 22.8M | autovector<WriteThread::Writer*> write_group; |
5027 | 22.8M | bool need_log_sync = !write_options.disableWAL && write_options.sync; |
5028 | 22.8M | bool need_log_dir_sync = need_log_sync && !log_dir_synced_; |
5029 | | |
5030 | 22.8M | if (status.ok()) { |
5031 | 21.9M | if (need_log_sync) { |
5032 | 107 | while (logs_.front().getting_synced) { |
5033 | 0 | log_sync_cv_.Wait(); |
5034 | 0 | } |
5035 | 110 | for (auto& log : logs_) { |
5036 | 110 | assert(!log.getting_synced); |
5037 | 110 | log.getting_synced = true; |
5038 | 110 | } |
5039 | 107 | } |
5040 | | |
5041 | | // Add to log and apply to memtable. We can release the lock |
5042 | | // during this phase since &w is currently responsible for logging |
5043 | | // and protects against concurrent loggers and concurrent writes |
5044 | | // into memtables |
5045 | 21.9M | } |
5046 | | |
5047 | 22.8M | mutex_.Unlock(); |
5048 | | |
5049 | | // At this point the mutex is unlocked |
5050 | | |
5051 | 22.8M | bool exit_completed_early = false; |
5052 | 22.8M | last_batch_group_size_ = |
5053 | 22.8M | write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); |
5054 | | |
5055 | 22.8M | if (status.ok()) { |
5056 | | // Rules for when we can update the memtable concurrently |
5057 | | // 1. supported by memtable |
5058 | | // 2. Puts are not okay if inplace_update_support |
5059 | | // 3. Deletes or SingleDeletes are not okay if filtering deletes |
5060 | | // (controlled by both batch and memtable setting) |
5061 | | // 4. Merges are not okay |
5062 | | // 5. YugaByte-specific user-specified sequence numbers are currently not compatible with |
5063 | | // parallel memtable writes. |
5064 | | // |
5065 | | // Rules 1..3 are enforced by checking the options |
5066 | | // during startup (CheckConcurrentWritesSupported), so if |
5067 | | // options.allow_concurrent_memtable_write is true then they can be |
5068 | | // assumed to be true. Rule 4 is checked for each batch. We could |
5069 | | // relax rules 2 and 3 if we could prevent write batches from referring |
5070 | | // more than once to a particular key. |
5071 | 21.9M | bool parallel = |
5072 | 21.9M | db_options_.allow_concurrent_memtable_write && write_group.size() > 1; |
5073 | 21.9M | size_t total_count = 0; |
5074 | 21.9M | uint64_t total_byte_size = 0; |
5075 | 23.2M | for (auto writer : write_group) { |
5076 | 23.2M | if (writer->CheckCallback(this)) { |
5077 | 23.2M | total_count += WriteBatchInternal::Count(writer->batch); |
5078 | 23.2M | total_byte_size = WriteBatchInternal::AppendedByteSize( |
5079 | 23.2M | total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); |
5080 | 23.2M | parallel = parallel && !writer->batch->HasMerge(); |
5081 | 23.2M | } |
5082 | 23.2M | } |
5083 | | |
5084 | 21.9M | const SequenceNumber current_sequence = last_sequence + 1; |
5085 | | |
5086 | 21.9M | #ifndef NDEBUG |
5087 | 21.9M | if (current_sequence <= last_sequence) { |
5088 | 0 | RLOG(InfoLogLevel::FATAL_LEVEL, db_options_.info_log, |
5089 | 0 | "Current sequence number %" PRIu64 " is <= last sequence number %" PRIu64, |
5090 | 0 | current_sequence, last_sequence); |
5091 | 0 | } |
5092 | 21.9M | #endif |
5093 | | |
5094 | | // Reserve sequence numbers for all individual updates in this batch group. |
5095 | 21.9M | last_sequence += total_count; |
5096 | | |
5097 | | // Record statistics |
5098 | 21.9M | RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); |
5099 | 21.9M | RecordTick(stats_, BYTES_WRITTEN, total_byte_size); |
5100 | 21.9M | MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); |
5101 | 21.9M | PERF_TIMER_STOP(write_pre_and_post_process_time); |
5102 | | |
5103 | 21.9M | if (write_options.disableWAL) { |
5104 | 6.62M | has_unpersisted_data_ = true; |
5105 | 6.62M | } |
5106 | | |
5107 | 21.9M | uint64_t log_size = 0; |
5108 | 21.9M | if (!write_options.disableWAL) { |
5109 | 15.2M | PERF_TIMER_GUARD(write_wal_time); |
5110 | | |
5111 | 15.2M | WriteBatch* merged_batch = nullptr; |
5112 | 15.2M | if (write_group.size() == 1 && !write_group[0]->CallbackFailed()) { |
5113 | 14.9M | merged_batch = write_group[0]->batch; |
5114 | 340k | } else { |
5115 | | // WAL needs all of the batches flattened into a single batch. |
5116 | | // We could avoid copying here with an iov-like AddRecord |
5117 | | // interface |
5118 | 340k | merged_batch = &tmp_batch_; |
5119 | 1.62M | for (auto writer : write_group) { |
5120 | 1.62M | if (!writer->CallbackFailed()) { |
5121 | 1.62M | WriteBatchInternal::Append(merged_batch, writer->batch); |
5122 | 1.62M | } |
5123 | 1.62M | } |
5124 | 340k | } |
5125 | 15.2M | WriteBatchInternal::SetSequence(merged_batch, current_sequence); |
5126 | | |
5127 | 15.2M | CHECK_EQ(WriteBatchInternal::Count(merged_batch), total_count); |
5128 | | |
5129 | 15.2M | Slice log_entry = WriteBatchInternal::Contents(merged_batch); |
5130 | 15.2M | log::Writer* log_writer; |
5131 | 15.2M | LogFileNumberSize* last_alive_log_file; |
5132 | 15.2M | { |
5133 | 15.2M | InstrumentedMutexLock l(&mutex_); |
5134 | 15.2M | log_writer = logs_.back().writer; |
5135 | 15.2M | last_alive_log_file = &alive_log_files_.back(); |
5136 | 15.2M | } |
5137 | 15.2M | status = log_writer->AddRecord(log_entry); |
5138 | 15.2M | total_log_size_.fetch_add(static_cast<int64_t>(log_entry.size())); |
5139 | 15.2M | last_alive_log_file->AddSize(log_entry.size()); |
5140 | 15.2M | log_empty_ = false; |
5141 | 15.2M | log_size = log_entry.size(); |
5142 | 15.2M | RecordTick(stats_, WAL_FILE_BYTES, log_size); |
5143 | 15.2M | if (status.ok() && need_log_sync) { |
5144 | 107 | RecordTick(stats_, WAL_FILE_SYNCED); |
5145 | 107 | StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); |
5146 | | // It's safe to access logs_ with unlocked mutex_ here because: |
5147 | | // - we've set getting_synced=true for all logs, |
5148 | | // so other threads won't pop from logs_ while we're here, |
5149 | | // - only writer thread can push to logs_, and we're in |
5150 | | // writer thread, so no one will push to logs_, |
5151 | | // - as long as other threads don't modify it, it's safe to read |
5152 | | // from std::deque from multiple threads concurrently. |
5153 | 110 | for (auto& log : logs_) { |
5154 | 110 | status = log.writer->file()->Sync(db_options_.use_fsync); |
5155 | 110 | if (!status.ok()) { |
5156 | 0 | break; |
5157 | 0 | } |
5158 | 110 | } |
5159 | 107 | if (status.ok() && need_log_dir_sync) { |
5160 | | // We only sync WAL directory the first time WAL syncing is |
5161 | | // requested, so that in case users never turn on WAL sync, |
5162 | | // we can avoid the disk I/O in the write code path. |
5163 | 70 | status = directories_.GetWalDir()->Fsync(); |
5164 | 70 | } |
5165 | 107 | } |
5166 | | |
5167 | 15.2M | if (merged_batch == &tmp_batch_) { |
5168 | 340k | tmp_batch_.Clear(); |
5169 | 340k | } |
5170 | 15.2M | } |
5171 | 21.9M | if (status.ok()) { |
5172 | 21.9M | PERF_TIMER_GUARD(write_memtable_time); |
5173 | | |
5174 | 21.9M | { |
5175 | | // Update stats while we are an exclusive group leader, so we know |
5176 | | // that nobody else can be writing to these particular stats. |
5177 | | // We're optimistic, updating the stats before we successfully |
5178 | | // commit. That lets us release our leader status early in |
5179 | | // some cases. |
5180 | 21.9M | auto stats = default_cf_internal_stats_; |
5181 | 21.9M | stats->AddDBStats(InternalDBStatsType::BYTES_WRITTEN, total_byte_size); |
5182 | 21.9M | stats->AddDBStats(InternalDBStatsType::NUMBER_KEYS_WRITTEN, total_count); |
5183 | 21.9M | if (!write_options.disableWAL) { |
5184 | 15.2M | if (write_options.sync) { |
5185 | 107 | stats->AddDBStats(InternalDBStatsType::WAL_FILE_SYNCED, 1); |
5186 | 107 | } |
5187 | 15.2M | stats->AddDBStats(InternalDBStatsType::WAL_FILE_BYTES, log_size); |
5188 | 15.2M | } |
5189 | 21.9M | uint64_t for_other = write_group.size() - 1; |
5190 | 21.9M | if (for_other > 0) { |
5191 | 340k | stats->AddDBStats(InternalDBStatsType::WRITE_DONE_BY_OTHER, for_other); |
5192 | 340k | if (!write_options.disableWAL) { |
5193 | 340k | stats->AddDBStats(InternalDBStatsType::WRITE_WITH_WAL, for_other); |
5194 | 340k | } |
5195 | 340k | } |
5196 | 21.9M | } |
5197 | | |
5198 | 21.9M | if (!parallel) { |
5199 | 21.9M | InsertFlags insert_flags{InsertFlag::kFilterDeletes}; |
5200 | 21.9M | status = WriteBatchInternal::InsertInto( |
5201 | 21.9M | write_group, current_sequence, column_family_memtables_.get(), |
5202 | 21.9M | &flush_scheduler_, write_options.ignore_missing_column_families, |
5203 | 21.9M | 0 /*log_number*/, this, insert_flags); |
5204 | | |
5205 | 21.9M | if (status.ok()) { |
5206 | | // There were no write failures. Set leader's status |
5207 | | // in case the write callback returned a non-ok status. |
5208 | 21.9M | status = w.FinalStatus(); |
5209 | 21.9M | } |
5210 | 23.1M | for (const auto& writer : write_group) { |
5211 | 23.1M | last_sequence += writer->batch->DirectEntries(); |
5212 | 23.1M | } |
5213 | | |
5214 | 9.52k | } else { |
5215 | 9.52k | WriteThread::ParallelGroup pg; |
5216 | 9.52k | pg.leader = &w; |
5217 | 9.52k | pg.last_writer = last_writer; |
5218 | 9.52k | pg.last_sequence = last_sequence; |
5219 | 9.52k | pg.early_exit_allowed = !need_log_sync; |
5220 | 9.52k | pg.running.store(static_cast<uint32_t>(write_group.size()), |
5221 | 9.52k | std::memory_order_relaxed); |
5222 | 9.52k | write_thread_.LaunchParallelFollowers(&pg, current_sequence); |
5223 | | |
5224 | 12.4k | if (!w.CallbackFailed()) { |
5225 | | // do leader write |
5226 | 12.4k | ColumnFamilyMemTablesImpl column_family_memtables( |
5227 | 12.4k | versions_->GetColumnFamilySet()); |
5228 | 12.4k | assert(w.sequence == current_sequence); |
5229 | 12.4k | WriteBatchInternal::SetSequence(w.batch, w.sequence); |
5230 | 12.4k | InsertFlags insert_flags{InsertFlag::kConcurrentMemtableWrites}; |
5231 | 12.4k | w.status = WriteBatchInternal::InsertInto( |
5232 | 12.4k | w.batch, &column_family_memtables, &flush_scheduler_, |
5233 | 12.4k | write_options.ignore_missing_column_families, 0 /*log_number*/, |
5234 | 12.4k | this, insert_flags); |
5235 | 12.4k | } |
5236 | | |
5237 | | // CompleteParallelWorker returns true if this thread should |
5238 | | // handle exit, false means somebody else did |
5239 | 9.52k | exit_completed_early = !write_thread_.CompleteParallelWorker(&w); |
5240 | 9.52k | status = w.FinalStatus(); |
5241 | 9.52k | } |
5242 | | |
5243 | 21.9M | if (!exit_completed_early && w.status.ok()) { |
5244 | 21.9M | SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); |
5245 | 21.9M | versions_->SetLastSequence(last_sequence); |
5246 | 21.9M | if (!need_log_sync) { |
5247 | 21.9M | write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); |
5248 | 21.9M | exit_completed_early = true; |
5249 | 21.9M | } |
5250 | 21.9M | } |
5251 | | |
5252 | | // A non-OK status here indicates that the state implied by the |
5253 | | // WAL has diverged from the in-memory state. This could be |
5254 | | // because of a corrupt write_batch (very bad), or because the |
5255 | | // client specified an invalid column family and didn't specify |
5256 | | // ignore_missing_column_families. |
5257 | | // |
5258 | | // Is setting bg_error_ enough here? This will at least stop |
5259 | | // compaction and fail any further writes. |
5260 | 21.9M | if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) { |
5261 | 1 | bg_error_ = status; |
5262 | 1 | } |
5263 | 21.9M | } |
5264 | 21.9M | } |
5265 | 22.8M | PERF_TIMER_START(write_pre_and_post_process_time); |
5266 | | |
5267 | 22.8M | if (db_options_.paranoid_checks && !status.ok() && !w.CallbackFailed() && !status.IsBusy()) { |
5268 | 828k | mutex_.Lock(); |
5269 | 828k | if (bg_error_.ok()) { |
5270 | 8 | bg_error_ = status; // stop compaction & fail any further writes |
5271 | 8 | } |
5272 | 828k | mutex_.Unlock(); |
5273 | 828k | } |
5274 | | |
5275 | 22.8M | if (need_log_sync) { |
5276 | 107 | mutex_.Lock(); |
5277 | 107 | MarkLogsSynced(logfile_number_, need_log_dir_sync, status); |
5278 | 107 | mutex_.Unlock(); |
5279 | 107 | } |
5280 | | |
5281 | 22.8M | if (!exit_completed_early) { |
5282 | 828k | write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); |
5283 | 828k | } |
5284 | | |
5285 | 22.8M | return status; |
5286 | 22.8M | } |
5287 | | |
5288 | | // REQUIRES: mutex_ is held |
5289 | | // REQUIRES: this thread is currently at the front of the writer queue |
5290 | 15.9k | Status DBImpl::DelayWrite(uint64_t num_bytes) { |
5291 | 15.9k | uint64_t time_delayed = 0; |
5292 | 15.9k | bool delayed = false; |
5293 | 15.9k | { |
5294 | 15.9k | auto delay = write_controller_.GetDelay(env_, num_bytes); |
5295 | 15.9k | if (delay > 0) { |
5296 | 2.33k | mutex_.Unlock(); |
5297 | 2.33k | delayed = true; |
5298 | 2.33k | TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); |
5299 | | // hopefully we don't have to sleep more than 2 billion microseconds |
5300 | 2.33k | env_->SleepForMicroseconds(static_cast<int>(delay)); |
5301 | 2.33k | mutex_.Lock(); |
5302 | 2.33k | } |
5303 | | |
5304 | | // If we are shutting down, background job that make WriteController stopped could be aborted |
5305 | | // and never release WriteControllerToken, so we need to check IsShuttingDown to not stuck here |
5306 | | // in this case. |
5307 | 18.1k | while (bg_error_.ok() && write_controller_.IsStopped() && !IsShuttingDown()) { |
5308 | 2.20k | delayed = true; |
5309 | 2.20k | TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); |
5310 | 2.20k | bg_cv_.Wait(); |
5311 | 2.20k | } |
5312 | 15.9k | } |
5313 | 15.9k | if (delayed) { |
5314 | 4.14k | RecordTick(stats_, STALL_MICROS, time_delayed); |
5315 | 4.14k | } |
5316 | | |
5317 | 15.9k | return bg_error_; |
5318 | 15.9k | } |
5319 | | |
5320 | 10.9k | Status DBImpl::ScheduleFlushes(WriteContext* context) { |
5321 | 10.9k | ColumnFamilyData* cfd; |
5322 | 22.5k | while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { |
5323 | 11.6k | auto status = SwitchMemtable(cfd, context); |
5324 | 11.6k | if (cfd->Unref()) { |
5325 | 0 | delete cfd; |
5326 | 0 | } |
5327 | 11.6k | if (!status.ok()) { |
5328 | 6 | return status; |
5329 | 6 | } |
5330 | 11.6k | } |
5331 | 10.9k | return Status::OK(); |
5332 | 10.9k | } |
5333 | | |
5334 | | // REQUIRES: mutex_ is held |
5335 | | // REQUIRES: this thread is currently at the front of the writer queue |
5336 | 25.4k | Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { |
5337 | 25.4k | mutex_.AssertHeld(); |
5338 | 25.4k | unique_ptr<WritableFile> lfile; |
5339 | 25.4k | log::Writer* new_log = nullptr; |
5340 | 25.4k | MemTable* new_mem = nullptr; |
5341 | | |
5342 | | // Attempt to switch to a new memtable and trigger flush of old. |
5343 | | // Do this without holding the dbmutex lock. |
5344 | 25.4k | assert(versions_->prev_log_number() == 0); |
5345 | 25.4k | bool creating_new_log = !log_empty_; |
5346 | 25.4k | uint64_t recycle_log_number = 0; |
5347 | 25.4k | if (creating_new_log && db_options_.recycle_log_file_num && |
5348 | 83 | !log_recycle_files.empty()) { |
5349 | 57 | recycle_log_number = log_recycle_files.front(); |
5350 | 57 | log_recycle_files.pop_front(); |
5351 | 57 | } |
5352 | 25.4k | uint64_t new_log_number = |
5353 | 20.3k | creating_new_log ? versions_->NewFileNumber() : logfile_number_; |
5354 | 25.4k | SuperVersion* new_superversion = nullptr; |
5355 | 25.4k | const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); |
5356 | 25.4k | mutex_.Unlock(); |
5357 | 25.4k | Status s; |
5358 | 25.4k | { |
5359 | 25.4k | if (creating_new_log) { |
5360 | 20.3k | EnvOptions opt_env_opt = |
5361 | 20.3k | env_->OptimizeForLogWrite(env_options_, db_options_); |
5362 | 20.3k | if (recycle_log_number) { |
5363 | 57 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
5364 | 57 | "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); |
5365 | 57 | s = env_->ReuseWritableFile( |
5366 | 57 | LogFileName(db_options_.wal_dir, new_log_number), |
5367 | 57 | LogFileName(db_options_.wal_dir, recycle_log_number), &lfile, |
5368 | 57 | opt_env_opt); |
5369 | 20.2k | } else { |
5370 | 20.2k | s = NewWritableFile(env_, |
5371 | 20.2k | LogFileName(db_options_.wal_dir, new_log_number), |
5372 | 20.2k | &lfile, opt_env_opt); |
5373 | 20.2k | } |
5374 | 20.3k | if (s.ok()) { |
5375 | | // Our final size should be less than write_buffer_size |
5376 | | // (compression, etc) but err on the side of caution. |
5377 | 20.3k | lfile->SetPreallocationBlockSize( |
5378 | 20.3k | mutable_cf_options.write_buffer_size / 10 + |
5379 | 20.3k | mutable_cf_options.write_buffer_size); |
5380 | 20.3k | unique_ptr<WritableFileWriter> file_writer( |
5381 | 20.3k | new WritableFileWriter(std::move(lfile), opt_env_opt)); |
5382 | 20.3k | new_log = new log::Writer(std::move(file_writer), new_log_number, |
5383 | 20.3k | db_options_.recycle_log_file_num > 0); |
5384 | 20.3k | } |
5385 | 20.3k | } |
5386 | | |
5387 | 25.4k | if (s.ok()) { |
5388 | 25.4k | SequenceNumber seq = versions_->LastSequence(); |
5389 | 25.4k | new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); |
5390 | 25.4k | new_superversion = new SuperVersion(); |
5391 | 25.4k | } |
5392 | 25.4k | } |
5393 | 25.4k | RLOG(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, |
5394 | 25.4k | "[%s] New memtable created with log file: #%" PRIu64 "\n", |
5395 | 25.4k | cfd->GetName().c_str(), new_log_number); |
5396 | 25.4k | mutex_.Lock(); |
5397 | 25.4k | if (!s.ok()) { |
5398 | | // how do we fail if we're not creating new log? |
5399 | 7 | assert(creating_new_log); |
5400 | 7 | assert(!new_mem); |
5401 | 7 | assert(!new_log); |
5402 | 7 | return s; |
5403 | 7 | } |
5404 | 25.4k | if (creating_new_log) { |
5405 | 20.3k | logfile_number_ = new_log_number; |
5406 | 20.3k | assert(new_log != nullptr); |
5407 | 20.3k | log_empty_ = true; |
5408 | 20.3k | log_dir_synced_ = false; |
5409 | 20.3k | logs_.emplace_back(logfile_number_, new_log); |
5410 | 20.3k | alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); |
5411 | 29.4k | for (auto loop_cfd : *versions_->GetColumnFamilySet()) { |
5412 | | // all this is just optimization to delete logs that |
5413 | | // are no longer needed -- if CF is empty, that means it |
5414 | | // doesn't need that particular log to stay alive, so we just |
5415 | | // advance the log number. no need to persist this in the manifest |
5416 | 29.4k | if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && |
5417 | 7.54k | loop_cfd->imm()->NumNotFlushed() == 0) { |
5418 | 7.54k | loop_cfd->SetLogNumber(logfile_number_); |
5419 | 7.54k | } |
5420 | 29.4k | } |
5421 | 20.3k | } |
5422 | 25.4k | cfd->mem()->SetFlushStartTime(std::chrono::steady_clock::now()); |
5423 | 25.4k | cfd->mem()->SetNextLogNumber(logfile_number_); |
5424 | 25.4k | cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); |
5425 | 25.4k | new_mem->Ref(); |
5426 | 25.4k | cfd->SetMemtable(new_mem); |
5427 | 25.4k | context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork( |
5428 | 25.4k | cfd, new_superversion, mutable_cf_options)); |
5429 | | |
5430 | 25.4k | return s; |
5431 | 25.4k | } |
5432 | | |
5433 | | #ifndef ROCKSDB_LITE |
5434 | | Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, |
5435 | 54 | TablePropertiesCollection* props) { |
5436 | 54 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5437 | 54 | auto cfd = cfh->cfd(); |
5438 | | |
5439 | | // Increment the ref count |
5440 | 54 | mutex_.Lock(); |
5441 | 54 | auto version = cfd->current(); |
5442 | 54 | version->Ref(); |
5443 | 54 | mutex_.Unlock(); |
5444 | | |
5445 | 54 | auto s = version->GetPropertiesOfAllTables(props); |
5446 | | |
5447 | | // Decrement the ref count |
5448 | 54 | mutex_.Lock(); |
5449 | 54 | version->Unref(); |
5450 | 54 | mutex_.Unlock(); |
5451 | | |
5452 | 54 | return s; |
5453 | 54 | } |
5454 | | |
5455 | | Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, |
5456 | | const Range* range, std::size_t n, |
5457 | 0 | TablePropertiesCollection* props) { |
5458 | 0 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5459 | 0 | auto cfd = cfh->cfd(); |
5460 | | |
5461 | | // Increment the ref count |
5462 | 0 | mutex_.Lock(); |
5463 | 0 | auto version = cfd->current(); |
5464 | 0 | version->Ref(); |
5465 | 0 | mutex_.Unlock(); |
5466 | |
|
5467 | 0 | auto s = version->GetPropertiesOfTablesInRange(range, n, props); |
5468 | | |
5469 | | // Decrement the ref count |
5470 | 0 | mutex_.Lock(); |
5471 | 0 | version->Unref(); |
5472 | 0 | mutex_.Unlock(); |
5473 | |
|
5474 | 0 | return s; |
5475 | 0 | } |
5476 | | |
5477 | | void DBImpl::GetColumnFamiliesOptions( |
5478 | | std::vector<std::string>* column_family_names, |
5479 | 9 | std::vector<ColumnFamilyOptions>* column_family_options) { |
5480 | 9 | DCHECK(column_family_names); |
5481 | 9 | DCHECK(column_family_options); |
5482 | 9 | InstrumentedMutexLock lock(&mutex_); |
5483 | 9 | GetColumnFamiliesOptionsUnlocked(column_family_names, column_family_options); |
5484 | 9 | } |
5485 | | |
5486 | | void DBImpl::GetColumnFamiliesOptionsUnlocked( |
5487 | | std::vector<std::string>* column_family_names, |
5488 | 999k | std::vector<ColumnFamilyOptions>* column_family_options) { |
5489 | 1.00M | for (auto cfd : *versions_->GetColumnFamilySet()) { |
5490 | 1.00M | if (cfd->IsDropped()) { |
5491 | 29 | continue; |
5492 | 29 | } |
5493 | 1.00M | column_family_names->push_back(cfd->GetName()); |
5494 | 1.00M | column_family_options->push_back( |
5495 | 1.00M | BuildColumnFamilyOptions(*cfd->options(), *cfd->GetLatestMutableCFOptions())); |
5496 | 1.00M | } |
5497 | 999k | } |
5498 | | #endif // ROCKSDB_LITE |
5499 | | |
5500 | 5.52M | const std::string& DBImpl::GetName() const { |
5501 | 5.52M | return dbname_; |
5502 | 5.52M | } |
5503 | | |
5504 | 4.01M | Env* DBImpl::GetEnv() const { |
5505 | 4.01M | return env_; |
5506 | 4.01M | } |
5507 | | |
5508 | 13.4k | Env* DBImpl::GetCheckpointEnv() const { |
5509 | 13.4k | return checkpoint_env_; |
5510 | 13.4k | } |
5511 | | |
5512 | 9.39M | const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { |
5513 | 9.39M | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5514 | 9.39M | return *cfh->cfd()->options(); |
5515 | 9.39M | } |
5516 | | |
5517 | 1.00M | const DBOptions& DBImpl::GetDBOptions() const { return db_options_; } |
5518 | | |
5519 | | bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, |
5520 | 10.4k | const Slice& property, std::string* value) { |
5521 | 10.4k | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5522 | 10.4k | value->clear(); |
5523 | 10.4k | auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
5524 | 10.4k | if (property_info == nullptr) { |
5525 | 0 | return false; |
5526 | 10.4k | } else if (property_info->handle_int) { |
5527 | 426 | uint64_t int_value; |
5528 | 426 | bool ret_value = |
5529 | 426 | GetIntPropertyInternal(cfd, *property_info, false, &int_value); |
5530 | 426 | if (ret_value) { |
5531 | 426 | *value = ToString(int_value); |
5532 | 426 | } |
5533 | 426 | return ret_value; |
5534 | 10.0k | } else if (property_info->handle_string) { |
5535 | 10.0k | InstrumentedMutexLock l(&mutex_); |
5536 | 10.0k | return cfd->internal_stats()->GetStringProperty(*property_info, property, |
5537 | 10.0k | value); |
5538 | 10.0k | } |
5539 | | // Shouldn't reach here since exactly one of handle_string and handle_int |
5540 | | // should be non-nullptr. |
5541 | 0 | assert(false); |
5542 | 0 | return false; |
5543 | 0 | } |
5544 | | |
5545 | | bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, |
5546 | 583 | const Slice& property, uint64_t* value) { |
5547 | 583 | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5548 | 583 | if (property_info == nullptr || property_info->handle_int == nullptr) { |
5549 | 0 | return false; |
5550 | 0 | } |
5551 | 583 | auto cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
5552 | 583 | return GetIntPropertyInternal(cfd, *property_info, false, value); |
5553 | 583 | } |
5554 | | |
5555 | | bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, |
5556 | | const DBPropertyInfo& property_info, |
5557 | 1.06k | bool is_locked, uint64_t* value) { |
5558 | 1.06k | assert(property_info.handle_int != nullptr); |
5559 | 1.06k | if (!property_info.need_out_of_mutex) { |
5560 | 911 | if (is_locked) { |
5561 | 5 | mutex_.AssertHeld(); |
5562 | 5 | return cfd->internal_stats()->GetIntProperty(property_info, value, this); |
5563 | 906 | } else { |
5564 | 906 | InstrumentedMutexLock l(&mutex_); |
5565 | 906 | return cfd->internal_stats()->GetIntProperty(property_info, value, this); |
5566 | 906 | } |
5567 | 153 | } else { |
5568 | 153 | SuperVersion* sv = nullptr; |
5569 | 153 | if (!is_locked) { |
5570 | 103 | sv = GetAndRefSuperVersion(cfd); |
5571 | 50 | } else { |
5572 | 50 | sv = cfd->GetSuperVersion(); |
5573 | 50 | } |
5574 | | |
5575 | 153 | bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( |
5576 | 153 | property_info, sv->current, value); |
5577 | | |
5578 | 153 | if (!is_locked) { |
5579 | 103 | ReturnAndCleanupSuperVersion(cfd, sv); |
5580 | 103 | } |
5581 | | |
5582 | 153 | return ret; |
5583 | 153 | } |
5584 | 1.06k | } |
5585 | | |
5586 | | bool DBImpl::GetAggregatedIntProperty(const Slice& property, |
5587 | 12 | uint64_t* aggregated_value) { |
5588 | 12 | const DBPropertyInfo* property_info = GetPropertyInfo(property); |
5589 | 12 | if (property_info == nullptr || property_info->handle_int == nullptr) { |
5590 | 1 | return false; |
5591 | 1 | } |
5592 | | |
5593 | 11 | uint64_t sum = 0; |
5594 | 11 | { |
5595 | | // Needs mutex to protect the list of column families. |
5596 | 11 | InstrumentedMutexLock l(&mutex_); |
5597 | 11 | uint64_t value; |
5598 | 55 | for (auto* cfd : *versions_->GetColumnFamilySet()) { |
5599 | 55 | if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { |
5600 | 55 | sum += value; |
5601 | 0 | } else { |
5602 | 0 | return false; |
5603 | 0 | } |
5604 | 55 | } |
5605 | 11 | } |
5606 | 11 | *aggregated_value = sum; |
5607 | 11 | return true; |
5608 | 11 | } |
5609 | | |
5610 | 6.55M | SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { |
5611 | | // TODO(ljin): consider using GetReferencedSuperVersion() directly |
5612 | 6.55M | return cfd->GetThreadLocalSuperVersion(&mutex_); |
5613 | 6.55M | } |
5614 | | |
5615 | | // REQUIRED: this function should only be called on the write thread or if the |
5616 | | // mutex is held. |
5617 | 59 | SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { |
5618 | 59 | auto column_family_set = versions_->GetColumnFamilySet(); |
5619 | 59 | auto cfd = column_family_set->GetColumnFamily(column_family_id); |
5620 | 59 | if (!cfd) { |
5621 | 0 | return nullptr; |
5622 | 0 | } |
5623 | | |
5624 | 59 | return GetAndRefSuperVersion(cfd); |
5625 | 59 | } |
5626 | | |
5627 | | // REQUIRED: mutex is NOT held |
5628 | 0 | SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) { |
5629 | 0 | ColumnFamilyData* cfd; |
5630 | 0 | { |
5631 | 0 | InstrumentedMutexLock l(&mutex_); |
5632 | 0 | auto column_family_set = versions_->GetColumnFamilySet(); |
5633 | 0 | cfd = column_family_set->GetColumnFamily(column_family_id); |
5634 | 0 | } |
5635 | |
|
5636 | 0 | if (!cfd) { |
5637 | 0 | return nullptr; |
5638 | 0 | } |
5639 | | |
5640 | 0 | return GetAndRefSuperVersion(cfd); |
5641 | 0 | } |
5642 | | |
5643 | | void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, |
5644 | 6.55M | SuperVersion* sv) { |
5645 | 6.55M | bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); |
5646 | | |
5647 | 6.55M | if (unref_sv) { |
5648 | | // Release SuperVersion |
5649 | 149 | if (sv->Unref()) { |
5650 | 122 | { |
5651 | 122 | InstrumentedMutexLock l(&mutex_); |
5652 | 122 | sv->Cleanup(); |
5653 | 122 | } |
5654 | 122 | delete sv; |
5655 | 122 | RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); |
5656 | 122 | } |
5657 | 149 | RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); |
5658 | 149 | } |
5659 | 6.55M | } |
5660 | | |
5661 | | // REQUIRED: this function should only be called on the write thread. |
5662 | | void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id, |
5663 | 59 | SuperVersion* sv) { |
5664 | 59 | auto column_family_set = versions_->GetColumnFamilySet(); |
5665 | 59 | auto cfd = column_family_set->GetColumnFamily(column_family_id); |
5666 | | |
5667 | | // If SuperVersion is held, and we successfully fetched a cfd using |
5668 | | // GetAndRefSuperVersion(), it must still exist. |
5669 | 59 | assert(cfd != nullptr); |
5670 | 59 | ReturnAndCleanupSuperVersion(cfd, sv); |
5671 | 59 | } |
5672 | | |
5673 | | // REQUIRED: Mutex should NOT be held. |
5674 | | void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id, |
5675 | 0 | SuperVersion* sv) { |
5676 | 0 | ColumnFamilyData* cfd; |
5677 | 0 | { |
5678 | 0 | InstrumentedMutexLock l(&mutex_); |
5679 | 0 | auto column_family_set = versions_->GetColumnFamilySet(); |
5680 | 0 | cfd = column_family_set->GetColumnFamily(column_family_id); |
5681 | 0 | } |
5682 | | |
5683 | | // If SuperVersion is held, and we successfully fetched a cfd using |
5684 | | // GetAndRefSuperVersion(), it must still exist. |
5685 | 0 | assert(cfd != nullptr); |
5686 | 0 | ReturnAndCleanupSuperVersion(cfd, sv); |
5687 | 0 | } |
5688 | | |
5689 | | // REQUIRED: this function should only be called on the write thread or if the |
5690 | | // mutex is held. |
5691 | 0 | ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { |
5692 | 0 | ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); |
5693 | |
|
5694 | 0 | if (!cf_memtables->Seek(column_family_id)) { |
5695 | 0 | return nullptr; |
5696 | 0 | } |
5697 | | |
5698 | 0 | return cf_memtables->GetColumnFamilyHandle(); |
5699 | 0 | } |
5700 | | |
5701 | | // REQUIRED: mutex is NOT held. |
5702 | | ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( |
5703 | 0 | uint32_t column_family_id) { |
5704 | 0 | ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); |
5705 | |
|
5706 | 0 | InstrumentedMutexLock l(&mutex_); |
5707 | |
|
5708 | 0 | if (!cf_memtables->Seek(column_family_id)) { |
5709 | 0 | return nullptr; |
5710 | 0 | } |
5711 | | |
5712 | 0 | return cf_memtables->GetColumnFamilyHandle(); |
5713 | 0 | } |
5714 | | |
5715 | 0 | Status DBImpl::Import(const std::string& source_dir) { |
5716 | 0 | const auto seqno = versions_->LastSequence(); |
5717 | 0 | FlushOptions options; |
5718 | 0 | RETURN_NOT_OK(Flush(options)); |
5719 | 0 | VersionEdit edit; |
5720 | 0 | auto status = versions_->Import(source_dir, seqno, &edit); |
5721 | 0 | if (!status.ok()) { |
5722 | 0 | return status; |
5723 | 0 | } |
5724 | 0 | return ApplyVersionEdit(&edit); |
5725 | 0 | } |
5726 | | |
5727 | 0 | bool DBImpl::AreWritesStopped() { |
5728 | 0 | return write_controller_.IsStopped(); |
5729 | 0 | } |
5730 | | |
5731 | 2.93M | bool DBImpl::NeedsDelay() { |
5732 | 2.93M | return write_controller_.NeedsDelay(); |
5733 | 2.93M | } |
5734 | | |
5735 | 47 | Result<std::string> DBImpl::GetMiddleKey() { |
5736 | 47 | InstrumentedMutexLock lock(&mutex_); |
5737 | 47 | return default_cf_handle_->cfd()->current()->GetMiddleKey(); |
5738 | 47 | } |
5739 | | |
5740 | 0 | void DBImpl::TEST_SwitchMemtable() { |
5741 | 0 | std::lock_guard<InstrumentedMutex> lock(mutex_); |
5742 | 0 | WriteContext context; |
5743 | 0 | CHECK_OK(SwitchMemtable(default_cf_handle_->cfd(), &context)); |
5744 | 0 | } |
5745 | | |
5746 | | void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, |
5747 | | const Range* range, int n, uint64_t* sizes, |
5748 | 14.0k | bool include_memtable) { |
5749 | 14.0k | Version* v; |
5750 | 14.0k | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5751 | 14.0k | auto cfd = cfh->cfd(); |
5752 | 14.0k | SuperVersion* sv = GetAndRefSuperVersion(cfd); |
5753 | 14.0k | v = sv->current; |
5754 | | |
5755 | 28.1k | for (int i = 0; i < n; i++) { |
5756 | | // Convert user_key into a corresponding internal key. |
5757 | 14.0k | InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); |
5758 | 14.0k | InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); |
5759 | 14.0k | sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); |
5760 | 14.0k | if (include_memtable) { |
5761 | 9 | sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode()); |
5762 | 9 | sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode()); |
5763 | 9 | } |
5764 | 14.0k | } |
5765 | | |
5766 | 14.0k | ReturnAndCleanupSuperVersion(cfd, sv); |
5767 | 14.0k | } |
5768 | | |
5769 | | #ifndef ROCKSDB_LITE |
5770 | | Status DBImpl::GetUpdatesSince( |
5771 | | SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter, |
5772 | 48 | const TransactionLogIterator::ReadOptions& read_options) { |
5773 | | |
5774 | 48 | RecordTick(stats_, GET_UPDATES_SINCE_CALLS); |
5775 | 48 | if (seq > versions_->LastSequence()) { |
5776 | 0 | return STATUS(NotFound, "Requested sequence not yet written in the db"); |
5777 | 0 | } |
5778 | 48 | return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); |
5779 | 48 | } |
5780 | | |
5781 | 157 | Status DBImpl::DeleteFile(std::string name) { |
5782 | 157 | uint64_t number; |
5783 | 157 | FileType type; |
5784 | 157 | WalFileType log_type; |
5785 | 157 | if (!ParseFileName(name, &number, &type, &log_type) || |
5786 | 157 | (type != kTableFile && type != kLogFile)) { |
5787 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5788 | 0 | "DeleteFile %s failed.\n", name.c_str()); |
5789 | 0 | return STATUS(InvalidArgument, "Invalid file name"); |
5790 | 0 | } |
5791 | | |
5792 | 157 | Status status; |
5793 | 157 | if (type == kLogFile) { |
5794 | | // Only allow deleting archived log files |
5795 | 2 | if (log_type != kArchivedLogFile) { |
5796 | 1 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5797 | 1 | "DeleteFile %s failed - not archived log.\n", |
5798 | 1 | name.c_str()); |
5799 | 1 | return STATUS(NotSupported, "Delete only supported for archived logs"); |
5800 | 1 | } |
5801 | 1 | status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); |
5802 | 1 | if (!status.ok()) { |
5803 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
5804 | 0 | "DeleteFile %s failed -- %s.\n", |
5805 | 0 | name.c_str(), status.ToString().c_str()); |
5806 | 0 | } |
5807 | 1 | return status; |
5808 | 1 | } |
5809 | | |
5810 | 155 | int level; |
5811 | 155 | FileMetaData* metadata; |
5812 | 155 | ColumnFamilyData* cfd; |
5813 | 155 | VersionEdit edit; |
5814 | 155 | JobContext job_context(next_job_id_.fetch_add(1), true); |
5815 | 155 | { |
5816 | 155 | InstrumentedMutexLock l(&mutex_); |
5817 | | // Delete file is infrequent operation, so could just busy wait here. |
5818 | 155 | while (versions_->has_manifest_writers()) { |
5819 | 0 | mutex_.unlock(); |
5820 | 0 | std::this_thread::sleep_for(10ms); |
5821 | 0 | mutex_.lock(); |
5822 | 0 | } |
5823 | | |
5824 | 155 | status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd); |
5825 | 155 | if (!status.ok()) { |
5826 | 1 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5827 | 1 | "DeleteFile %s failed. File not found\n", name.c_str()); |
5828 | 1 | job_context.Clean(); |
5829 | 1 | return STATUS(InvalidArgument, "File not found"); |
5830 | 1 | } |
5831 | 154 | assert(level < cfd->NumberLevels()); |
5832 | | |
5833 | | // If the file is being compacted no need to delete. |
5834 | 154 | if (metadata->being_compacted) { |
5835 | 0 | RLOG(InfoLogLevel::INFO_LEVEL, db_options_.info_log, |
5836 | 0 | "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); |
5837 | 0 | job_context.Clean(); |
5838 | 0 | return Status::OK(); |
5839 | 0 | } |
5840 | | |
5841 | | // Only the files in the last level can be deleted externally. |
5842 | | // This is to make sure that any deletion tombstones are not |
5843 | | // lost. Check that the level passed is the last level. |
5844 | 154 | auto* vstoreage = cfd->current()->storage_info(); |
5845 | 246 | for (int i = level + 1; i < cfd->NumberLevels(); i++) { |
5846 | 93 | if (vstoreage->NumLevelFiles(i) != 0) { |
5847 | 1 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5848 | 1 | "DeleteFile %s FAILED. File not in last level\n", name.c_str()); |
5849 | 1 | job_context.Clean(); |
5850 | 1 | return STATUS(InvalidArgument, "File not in last level"); |
5851 | 1 | } |
5852 | 93 | } |
5853 | | // if level == 0, it has to be the oldest file |
5854 | 153 | if (level == 0 && |
5855 | 101 | vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { |
5856 | 7 | RLOG(InfoLogLevel::WARN_LEVEL, db_options_.info_log, |
5857 | 7 | "DeleteFile %s failed ---" |
5858 | 7 | " target file in level 0 must be the oldest. Expected: %" PRIu64, name.c_str(), number); |
5859 | 7 | job_context.Clean(); |
5860 | 7 | return STATUS(InvalidArgument, "File in level 0, but not oldest"); |
5861 | 7 | } |
5862 | | |
5863 | 146 | TEST_SYNC_POINT("DBImpl::DeleteFile:DecidedToDelete"); |
5864 | | |
5865 | 146 | metadata->being_deleted = true; |
5866 | | |
5867 | 146 | edit.SetColumnFamily(cfd->GetID()); |
5868 | 146 | edit.DeleteFile(level, number); |
5869 | 146 | status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), |
5870 | 146 | &edit, &mutex_, directories_.GetDbDir()); |
5871 | 146 | if (status.ok()) { |
5872 | 146 | InstallSuperVersionAndScheduleWorkWrapper( |
5873 | 146 | cfd, &job_context, *cfd->GetLatestMutableCFOptions()); |
5874 | 146 | } |
5875 | 146 | FindObsoleteFiles(&job_context, false); |
5876 | 146 | } // lock released here |
5877 | | |
5878 | 146 | LogFlush(db_options_.info_log); |
5879 | | // remove files outside the db-lock |
5880 | 146 | if (job_context.HaveSomethingToDelete()) { |
5881 | | // Call PurgeObsoleteFiles() without holding mutex. |
5882 | 146 | PurgeObsoleteFiles(job_context); |
5883 | 146 | } |
5884 | 146 | job_context.Clean(); |
5885 | | |
5886 | 146 | FilesChanged(); |
5887 | | |
5888 | 146 | return status; |
5889 | 146 | } |
5890 | | |
5891 | | Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, |
5892 | 3 | const Slice* begin, const Slice* end) { |
5893 | 3 | Status status; |
5894 | 3 | auto cfh = down_cast<ColumnFamilyHandleImpl*>(column_family); |
5895 | 3 | ColumnFamilyData* cfd = cfh->cfd(); |
5896 | 3 | VersionEdit edit; |
5897 | 3 | std::vector<FileMetaData*> deleted_files; |
5898 | 3 | JobContext job_context(next_job_id_.fetch_add(1), true); |
5899 | 3 | { |
5900 | 3 | InstrumentedMutexLock l(&mutex_); |
5901 | 3 | Version* input_version = cfd->current(); |
5902 | | |
5903 | 3 | auto* vstorage = input_version->storage_info(); |
5904 | 12 | for (int i = 1; i < cfd->NumberLevels(); i++) { |
5905 | 9 | if (vstorage->LevelFiles(i).empty() || |
5906 | 9 | !vstorage->OverlapInLevel(i, begin, end)) { |
5907 | 3 | continue; |
5908 | 3 | } |
5909 | 6 | std::vector<FileMetaData*> level_files; |
5910 | 6 | InternalKey begin_storage, end_storage, *begin_key, *end_key; |
5911 | 6 | if (begin == nullptr) { |
5912 | 3 | begin_key = nullptr; |
5913 | 3 | } else { |
5914 | 3 | begin_storage = InternalKey::MaxPossibleForUserKey(*begin); |
5915 | 3 | begin_key = &begin_storage; |
5916 | 3 | } |
5917 | 6 | if (end == nullptr) { |
5918 | 3 | end_key = nullptr; |
5919 | 3 | } else { |
5920 | 3 | end_storage = InternalKey::MinPossibleForUserKey(*end); |
5921 | 3 | end_key = &end_storage; |
5922 | 3 | } |
5923 | | |
5924 | 6 | vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, |
5925 | 6 | nullptr, false); |
5926 | 6 | FileMetaData* level_file; |
5927 | 38 | for (uint32_t j = 0; j < level_files.size(); j++) { |
5928 | 32 | level_file = level_files[j]; |
5929 | 32 | if (((begin == nullptr) || |
5930 | 7 | (cfd->internal_comparator()->user_comparator()->Compare( |
5931 | 7 | level_file->smallest.key.user_key(), *begin) >= 0)) && |
5932 | 31 | ((end == nullptr) || |
5933 | 6 | (cfd->internal_comparator()->user_comparator()->Compare( |
5934 | 29 | level_file->largest.key.user_key(), *end) <= 0))) { |
5935 | 29 | if (level_file->being_compacted) { |
5936 | 0 | continue; |
5937 | 0 | } |
5938 | 29 | edit.SetColumnFamily(cfd->GetID()); |
5939 | 29 | edit.DeleteFile(i, level_file->fd.GetNumber()); |
5940 | 29 | deleted_files.push_back(level_file); |
5941 | 29 | level_file->being_compacted = true; |
5942 | 29 | } |
5943 | 32 | } |
5944 | 6 | } |
5945 | 3 | if (edit.GetDeletedFiles().empty()) { |
5946 | 1 | job_context.Clean(); |
5947 | 1 | return Status::OK(); |
5948 | 1 | } |
5949 | 2 | input_version->Ref(); |
5950 | 2 | status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), |
5951 | 2 | &edit, &mutex_, directories_.GetDbDir()); |
5952 | 2 | if (status.ok()) { |
5953 | 2 | InstallSuperVersionAndScheduleWorkWrapper( |
5954 | 2 | cfd, &job_context, *cfd->GetLatestMutableCFOptions()); |
5955 | 2 | } |
5956 | 29 | for (auto* deleted_file : deleted_files) { |
5957 | 29 | deleted_file->being_compacted = false; |
5958 | 29 | } |
5959 | 2 | input_version->Unref(); |
5960 | 2 | FindObsoleteFiles(&job_context, false); |
5961 | 2 | } // lock released here |
5962 | | |
5963 | 2 | LogFlush(db_options_.info_log); |
5964 | | // remove files outside the db-lock |
5965 | 2 | if (job_context.HaveSomethingToDelete()) { |
5966 | | // Call PurgeObsoleteFiles() without holding mutex. |
5967 | 2 | PurgeObsoleteFiles(job_context); |
5968 | 2 | } |
5969 | 2 | job_context.Clean(); |
5970 | 2 | return status; |
5971 | 2 | } |
5972 | | |
5973 | 954k | void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) { |
5974 | 954k | InstrumentedMutexLock l(&mutex_); |
5975 | 954k | versions_->GetLiveFilesMetaData(metadata); |
5976 | 954k | } |
5977 | | |
5978 | 2.38M | UserFrontierPtr DBImpl::GetFlushedFrontier() { |
5979 | 2.38M | InstrumentedMutexLock l(&mutex_); |
5980 | 2.38M | auto result = versions_->FlushedFrontier(); |
5981 | 2.38M | if (result) { |
5982 | 578k | return result->Clone(); |
5983 | 578k | } |
5984 | 1.80M | std::vector<LiveFileMetaData> files; |
5985 | 1.80M | versions_->GetLiveFilesMetaData(&files); |
5986 | 1.80M | UserFrontierPtr accumulated; |
5987 | 0 | for (const auto& file : files) { |
5988 | 0 | if (!file.imported) { |
5989 | 0 | UserFrontier::Update( |
5990 | 0 | file.largest.user_frontier.get(), UpdateUserValueType::kLargest, &accumulated); |
5991 | 0 | } |
5992 | 0 | } |
5993 | 1.80M | return accumulated; |
5994 | 1.80M | } |
5995 | | |
5996 | 1.72M | UserFrontierPtr DBImpl::GetMutableMemTableFrontier(UpdateUserValueType type) { |
5997 | 1.72M | InstrumentedMutexLock l(&mutex_); |
5998 | 1.72M | UserFrontierPtr accumulated; |
5999 | 1.72M | for (auto cfd : *versions_->GetColumnFamilySet()) { |
6000 | 1.72M | if (cfd) { |
6001 | 1.72M | const auto* mem = cfd->mem(); |
6002 | 1.72M | if (mem) { |
6003 | 1.72M | if (!cfd->IsDropped() && cfd->imm()->NumNotFlushed() == 0 && !mem->IsEmpty()) { |
6004 | 427k | auto frontier = mem->GetFrontier(type); |
6005 | 427k | if (frontier) { |
6006 | 427k | UserFrontier::Update(frontier.get(), type, &accumulated); |
6007 | 0 | } else { |
6008 | 0 | YB_LOG_EVERY_N_SECS(DFATAL, 5) |
6009 | 0 | << db_options_.log_prefix << "[" << cfd->GetName() |
6010 | 0 | << "] " << ToString(type) << " frontier is not initialized for non-empty MemTable"; |
6011 | 0 | } |
6012 | 427k | } |
6013 | 0 | } else { |
6014 | 0 | YB_LOG_EVERY_N_SECS(WARNING, 5) << db_options_.log_prefix |
6015 | 0 | << "[" << cfd->GetName() |
6016 | 0 | << "] mem is expected to be non-nullptr here"; |
6017 | 0 | } |
6018 | 0 | } else { |
6019 | 0 | YB_LOG_EVERY_N_SECS(WARNING, 5) << db_options_.log_prefix |
6020 | 0 | << "cfd is expected to be non-nullptr here"; |
6021 | 0 | } |
6022 | 1.72M | } |
6023 | 1.72M | return accumulated; |
6024 | 1.72M | } |
6025 | | |
6026 | 242k | Status DBImpl::ApplyVersionEdit(VersionEdit* edit) { |
6027 | 242k | auto cfd = versions_->GetColumnFamilySet()->GetDefault(); |
6028 | 242k | std::unique_ptr<SuperVersion> superversion_to_free_after_unlock_because_of_install; |
6029 | 242k | std::unique_ptr<SuperVersion> superversion_to_free_after_unlock_because_of_unref; |
6030 | 242k | InstrumentedMutexLock lock(&mutex_); |
6031 | 242k | auto current_sv = cfd->GetSuperVersion()->Ref(); |
6032 | 242k | auto se = yb::ScopeExit([&superversion_to_free_after_unlock_because_of_unref, current_sv]() { |
6033 | 242k | if (current_sv->Unref()) { |
6034 | 242k | current_sv->Cleanup(); |
6035 | 242k | superversion_to_free_after_unlock_because_of_unref.reset(current_sv); |
6036 | 242k | } |
6037 | 242k | }); |
6038 | 242k | auto status = versions_->LogAndApply(cfd, current_sv->mutable_cf_options, edit, &mutex_); |
6039 | 242k | if (!status.ok()) { |
6040 | 0 | return status; |
6041 | 0 | } |
6042 | 242k | superversion_to_free_after_unlock_because_of_install = cfd->InstallSuperVersion( |
6043 | 242k | new SuperVersion(), &mutex_); |
6044 | | |
6045 | 242k | return Status::OK(); |
6046 | 242k | } |
6047 | | |
6048 | 242k | Status DBImpl::ModifyFlushedFrontier(UserFrontierPtr frontier, FrontierModificationMode mode) { |
6049 | 242k | VersionEdit edit; |
6050 | 242k | edit.ModifyFlushedFrontier(std::move(frontier), mode); |
6051 | 242k | return ApplyVersionEdit(&edit); |
6052 | 242k | } |
6053 | | |
6054 | | void DBImpl::GetColumnFamilyMetaData( |
6055 | | ColumnFamilyHandle* column_family, |
6056 | 514 | ColumnFamilyMetaData* cf_meta) { |
6057 | 514 | assert(column_family); |
6058 | 514 | auto* cfd = down_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); |
6059 | 514 | auto* sv = GetAndRefSuperVersion(cfd); |
6060 | 514 | sv->current->GetColumnFamilyMetaData(cf_meta); |
6061 | 514 | ReturnAndCleanupSuperVersion(cfd, sv); |
6062 | 514 | } |
6063 | | |
6064 | | #endif // ROCKSDB_LITE |
6065 | | |
6066 | 341k | Status DBImpl::CheckConsistency() { |
6067 | 341k | mutex_.AssertHeld(); |
6068 | 341k | std::vector<LiveFileMetaData> metadata; |
6069 | 341k | versions_->GetLiveFilesMetaData(&metadata); |
6070 | | |
6071 | 341k | std::string corruption_messages; |
6072 | 18.9k | for (const auto& md : metadata) { |
6073 | | // md.name has a leading "/". |
6074 | 18.9k | std::string base_file_path = md.db_path + md.name; |
6075 | 18.9k | uint64_t base_fsize = 0; |
6076 | 18.9k | Status s = env_->GetFileSize(base_file_path, &base_fsize); |
6077 | 18.9k | if (!s.ok() && |
6078 | 7 | env_->GetFileSize(Rocks2LevelTableFileName(base_file_path), &base_fsize).ok()) { |
6079 | 5 | s = Status::OK(); |
6080 | 5 | } |
6081 | 18.9k | if (!s.ok()) { |
6082 | 2 | corruption_messages += |
6083 | 2 | "Can't access " + md.name + ": " + s.ToString() + "\n"; |
6084 | 18.9k | } else if (base_fsize != md.base_size) { |
6085 | 2 | corruption_messages += "Sst base file size mismatch: " + base_file_path + |
6086 | 2 | ". Size recorded in manifest " + |
6087 | 2 | ToString(md.base_size) + ", actual size " + |
6088 | 2 | ToString(base_fsize) + "\n"; |
6089 | 2 | } |
6090 | 18.9k | if (md.total_size > md.base_size) { |
6091 | 17.8k | const std::string data_file_path = TableBaseToDataFileName(base_file_path); |
6092 | 17.8k | uint64_t data_fsize = 0; |
6093 | 17.8k | s = env_->GetFileSize(data_file_path, &data_fsize); |
6094 | 17.8k | const uint64_t md_data_size = md.total_size - md.base_size; |
6095 | 17.8k | if (!s.ok()) { |
6096 | 0 | corruption_messages += |
6097 | 0 | "Can't access " + TableBaseToDataFileName(md.name) + ": " + s.ToString() + "\n"; |
6098 | 17.8k | } else if (data_fsize != md_data_size) { |
6099 | 0 | corruption_messages += "Sst data file size mismatch: " + data_file_path + |
6100 | 0 | ". Data size based on total and base size recorded in manifest " + |
6101 | 0 | ToString(md_data_size) + ", actual data size " + |
6102 | 0 | ToString(data_fsize) + "\n"; |
6103 | 0 | } |
6104 | 17.8k | } |
6105 | 18.9k | } |
6106 | 341k | if (corruption_messages.size() == 0) { |
6107 | 341k | return Status::OK(); |
6108 | 56 | } else { |
6109 | 56 | return STATUS(Corruption, corruption_messages); |
6110 | 56 | } |
6111 | 341k | } |
6112 | | |
6113 | 15 | Status DBImpl::GetDbIdentity(std::string* identity) const { |
6114 | 15 | std::string idfilename = IdentityFileName(dbname_); |
6115 | 15 | const EnvOptions soptions; |
6116 | 15 | unique_ptr<SequentialFileReader> id_file_reader; |
6117 | 15 | Status s; |
6118 | 15 | { |
6119 | 15 | unique_ptr<SequentialFile> idfile; |
6120 | 15 | s = env_->NewSequentialFile(idfilename, &idfile, soptions); |
6121 | 15 | if (!s.ok()) { |
6122 | 0 | return s; |
6123 | 0 | } |
6124 | 15 | id_file_reader.reset(new SequentialFileReader(std::move(idfile))); |
6125 | 15 | } |
6126 | | |
6127 | 15 | uint64_t file_size; |
6128 | 15 | s = env_->GetFileSize(idfilename, &file_size); |
6129 | 15 | if (!s.ok()) { |
6130 | 0 | return s; |
6131 | 0 | } |
6132 | 15 | uint8_t* buffer = reinterpret_cast<uint8_t*>(alloca(file_size)); |
6133 | 15 | Slice id; |
6134 | 15 | s = id_file_reader->Read(static_cast<size_t>(file_size), &id, buffer); |
6135 | 15 | if (!s.ok()) { |
6136 | 0 | return s; |
6137 | 0 | } |
6138 | 15 | identity->assign(id.cdata(), id.size()); |
6139 | | // If last character is '\n' remove it from identity |
6140 | 15 | if (!identity->empty() && identity->back() == '\n') { |
6141 | 0 | identity->pop_back(); |
6142 | 0 | } |
6143 | 15 | return s; |
6144 | 15 | } |
6145 | | |
6146 | | // Default implementations of convenience methods that subclasses of DB |
6147 | | // can call if they wish |
6148 | | Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6149 | 14.8M | const Slice& key, const Slice& value) { |
6150 | | // Pre-allocate size of write batch conservatively. |
6151 | | // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, |
6152 | | // and we allocate 11 extra bytes for key length, as well as value length. |
6153 | 14.8M | WriteBatch batch(key.size() + value.size() + 24); |
6154 | 14.8M | batch.Put(column_family, key, value); |
6155 | 14.8M | return Write(opt, &batch); |
6156 | 14.8M | } |
6157 | | |
6158 | | Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6159 | 543k | const Slice& key) { |
6160 | 543k | WriteBatch batch; |
6161 | 543k | batch.Delete(column_family, key); |
6162 | 543k | return Write(opt, &batch); |
6163 | 543k | } |
6164 | | |
6165 | | Status DB::SingleDelete(const WriteOptions& opt, |
6166 | 185 | ColumnFamilyHandle* column_family, const Slice& key) { |
6167 | 185 | WriteBatch batch; |
6168 | 185 | batch.SingleDelete(column_family, key); |
6169 | 185 | return Write(opt, &batch); |
6170 | 185 | } |
6171 | | |
6172 | | Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, |
6173 | 89.9k | const Slice& key, const Slice& value) { |
6174 | 89.9k | WriteBatch batch; |
6175 | 89.9k | batch.Merge(column_family, key, value); |
6176 | 89.9k | return Write(opt, &batch); |
6177 | 89.9k | } |
6178 | | |
6179 | | // Default implementation -- returns not supported status |
6180 | | Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, |
6181 | | const std::string& column_family_name, |
6182 | 0 | ColumnFamilyHandle** handle) { |
6183 | 0 | return STATUS(NotSupported, ""); |
6184 | 0 | } |
6185 | 0 | Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { |
6186 | 0 | return STATUS(NotSupported, ""); |
6187 | 0 | } |
6188 | | |
6189 | 323k | DB::~DB() { } |
6190 | | |
6191 | 339k | Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { |
6192 | 339k | DBOptions db_options(options); |
6193 | 339k | ColumnFamilyOptions cf_options(options); |
6194 | 339k | std::vector<ColumnFamilyDescriptor> column_families; |
6195 | 339k | column_families.push_back( |
6196 | 339k | ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); |
6197 | 339k | std::vector<ColumnFamilyHandle*> handles; |
6198 | 339k | Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); |
6199 | 339k | if (s.ok()) { |
6200 | 339k | assert(handles.size() == 1); |
6201 | | // i can delete the handle since DBImpl is always holding a reference to |
6202 | | // default column family |
6203 | 339k | delete handles[0]; |
6204 | 339k | } |
6205 | 339k | return s; |
6206 | 339k | } |
6207 | | |
6208 | | Status DB::Open(const DBOptions& db_options, const std::string& dbname, |
6209 | | const std::vector<ColumnFamilyDescriptor>& column_families, |
6210 | 341k | std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) { |
6211 | 341k | Status s = SanitizeOptionsByTable(db_options, column_families); |
6212 | 341k | if (!s.ok()) { |
6213 | 1 | return s; |
6214 | 1 | } |
6215 | | |
6216 | 344k | for (auto& cfd : column_families) { |
6217 | 344k | s = CheckCompressionSupported(cfd.options); |
6218 | 344k | if (s.ok() && db_options.allow_concurrent_memtable_write) { |
6219 | 352 | s = CheckConcurrentWritesSupported(cfd.options); |
6220 | 352 | } |
6221 | 344k | if (!s.ok()) { |
6222 | 2 | return s; |
6223 | 2 | } |
6224 | 344k | if (db_options.db_paths.size() > 1) { |
6225 | 90 | if ((cfd.options.compaction_style != kCompactionStyleUniversal) && |
6226 | 38 | (cfd.options.compaction_style != kCompactionStyleLevel)) { |
6227 | 0 | return STATUS(NotSupported, |
6228 | 0 | "More than one DB paths are only supported in " |
6229 | 0 | "universal and level compaction styles. "); |
6230 | 0 | } |
6231 | 90 | } |
6232 | 344k | } |
6233 | | |
6234 | 341k | if (db_options.db_paths.size() > 4) { |
6235 | 1 | return STATUS(NotSupported, |
6236 | 1 | "More than four DB paths are not supported yet. "); |
6237 | 1 | } |
6238 | | |
6239 | 341k | *dbptr = nullptr; |
6240 | 341k | handles->clear(); |
6241 | | |
6242 | 341k | size_t max_write_buffer_size = 0; |
6243 | 344k | for (auto cf : column_families) { |
6244 | 344k | max_write_buffer_size = |
6245 | 344k | std::max(max_write_buffer_size, cf.options.write_buffer_size); |
6246 | 344k | } |
6247 | | |
6248 | 341k | DBImpl* impl = new DBImpl(db_options, dbname); |
6249 | 341k | for (auto db_path : impl->db_options_.db_paths) { |
6250 | 341k | s = impl->env_->CreateDirIfMissing(db_path.path); |
6251 | 341k | if (!s.ok()) { |
6252 | 0 | break; |
6253 | 0 | } |
6254 | 341k | } |
6255 | | // WAL dir could be inside other paths, so we create it after. |
6256 | 341k | if (s.ok()) { |
6257 | 341k | s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); |
6258 | 341k | } |
6259 | | |
6260 | 341k | if (!s.ok()) { |
6261 | 0 | delete impl; |
6262 | 0 | return s; |
6263 | 0 | } |
6264 | | |
6265 | 341k | s = impl->CreateArchivalDirectory(); |
6266 | 341k | if (!s.ok()) { |
6267 | 0 | delete impl; |
6268 | 0 | return s; |
6269 | 0 | } |
6270 | 341k | impl->mutex_.Lock(); |
6271 | | // Handles create_if_missing, error_if_exists |
6272 | 341k | s = impl->Recover(column_families); |
6273 | 341k | if (s.ok()) { |
6274 | 341k | uint64_t new_log_number = impl->versions_->NewFileNumber(); |
6275 | 341k | unique_ptr<WritableFile> lfile; |
6276 | 341k | EnvOptions soptions(db_options); |
6277 | 341k | EnvOptions opt_env_options = |
6278 | 341k | impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_); |
6279 | 341k | s = NewWritableFile(impl->db_options_.env, |
6280 | 341k | LogFileName(impl->db_options_.wal_dir, new_log_number), |
6281 | 341k | &lfile, opt_env_options); |
6282 | 341k | if (s.ok()) { |
6283 | 341k | lfile->SetPreallocationBlockSize((max_write_buffer_size / 10) + max_write_buffer_size); |
6284 | 341k | impl->logfile_number_ = new_log_number; |
6285 | 341k | unique_ptr<WritableFileWriter> file_writer( |
6286 | 341k | new WritableFileWriter(std::move(lfile), opt_env_options)); |
6287 | 341k | impl->logs_.emplace_back( |
6288 | 341k | new_log_number, |
6289 | 341k | new log::Writer(std::move(file_writer), new_log_number, |
6290 | 341k | impl->db_options_.recycle_log_file_num > 0)); |
6291 | | |
6292 | | // set column family handles |
6293 | 344k | for (auto cf : column_families) { |
6294 | 344k | auto cfd = |
6295 | 344k | impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); |
6296 | 344k | if (cfd != nullptr) { |
6297 | 344k | handles->push_back( |
6298 | 344k | new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); |
6299 | 18.4E | } else { |
6300 | 18.4E | if (db_options.create_missing_column_families) { |
6301 | | // missing column family, create it |
6302 | 17 | ColumnFamilyHandle* handle; |
6303 | 17 | impl->mutex_.Unlock(); |
6304 | 17 | s = impl->CreateColumnFamily(cf.options, cf.name, &handle); |
6305 | 17 | impl->mutex_.Lock(); |
6306 | 17 | if (s.ok()) { |
6307 | 17 | handles->push_back(handle); |
6308 | 0 | } else { |
6309 | 0 | break; |
6310 | 0 | } |
6311 | 18.4E | } else { |
6312 | 18.4E | s = STATUS(InvalidArgument, "Column family not found: ", cf.name); |
6313 | 18.4E | break; |
6314 | 18.4E | } |
6315 | 18.4E | } |
6316 | 344k | } |
6317 | 341k | } |
6318 | 341k | if (s.ok()) { |
6319 | 344k | for (auto cfd : *impl->versions_->GetColumnFamilySet()) { |
6320 | 344k | impl->InstallSuperVersionAndScheduleWork(cfd, nullptr, *cfd->GetLatestMutableCFOptions()); |
6321 | 344k | } |
6322 | 341k | impl->alive_log_files_.push_back( |
6323 | 341k | DBImpl::LogFileNumberSize(impl->logfile_number_)); |
6324 | 341k | impl->DeleteObsoleteFiles(); |
6325 | 341k | s = impl->directories_.GetDbDir()->Fsync(); |
6326 | 341k | } |
6327 | 341k | } |
6328 | | |
6329 | 341k | if (s.ok()) { |
6330 | 344k | for (auto cfd : *impl->versions_->GetColumnFamilySet()) { |
6331 | 344k | if (cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { |
6332 | 257 | auto* vstorage = cfd->current()->storage_info(); |
6333 | 257 | for (int i = 1; i < vstorage->num_levels(); ++i) { |
6334 | 0 | int num_files = vstorage->NumLevelFiles(i); |
6335 | 0 | if (num_files > 0) { |
6336 | 0 | s = STATUS(InvalidArgument, |
6337 | 0 | "Not all files are at level 0. Cannot " |
6338 | 0 | "open with FIFO compaction style."); |
6339 | 0 | break; |
6340 | 0 | } |
6341 | 0 | } |
6342 | 257 | } |
6343 | 344k | if (!cfd->mem()->IsSnapshotSupported()) { |
6344 | 60 | impl->is_snapshot_supported_ = false; |
6345 | 60 | } |
6346 | 344k | if (cfd->ioptions()->merge_operator != nullptr && |
6347 | 530 | !cfd->mem()->IsMergeOperatorSupported()) { |
6348 | 0 | s = STATUS(InvalidArgument, |
6349 | 0 | "The memtable of column family %s does not support merge operator " |
6350 | 0 | "its options.merge_operator is non-null", cfd->GetName().c_str()); |
6351 | 0 | } |
6352 | 344k | if (!s.ok()) { |
6353 | 0 | break; |
6354 | 0 | } |
6355 | 344k | } |
6356 | 341k | } |
6357 | 341k | TEST_SYNC_POINT("DBImpl::Open:Opened"); |
6358 | 341k | Status persist_options_status; |
6359 | 341k | if (s.ok()) { |
6360 | | // Persist RocksDB Options before scheduling the compaction. |
6361 | | // The WriteOptionsFile() will release and lock the mutex internally. |
6362 | 341k | persist_options_status = impl->WriteOptionsFile(); |
6363 | | |
6364 | 341k | *dbptr = impl; |
6365 | 341k | impl->opened_successfully_ = true; |
6366 | 341k | impl->MaybeScheduleFlushOrCompaction(); |
6367 | 341k | } |
6368 | 341k | impl->mutex_.Unlock(); |
6369 | | |
6370 | 341k | auto sfm = static_cast<SstFileManagerImpl*>( |
6371 | 341k | impl->db_options_.sst_file_manager.get()); |
6372 | 341k | if (s.ok() && sfm) { |
6373 | | // Notify SstFileManager about all sst files that already exist in |
6374 | | // db_paths[0] when the DB is opened. |
6375 | 11 | auto& db_path = impl->db_options_.db_paths[0]; |
6376 | 11 | std::vector<std::string> existing_files; |
6377 | 11 | RETURN_NOT_OK(impl->db_options_.env->GetChildren(db_path.path, &existing_files)); |
6378 | 112 | for (auto& file_name : existing_files) { |
6379 | 112 | uint64_t file_number; |
6380 | 112 | FileType file_type; |
6381 | 112 | std::string file_path = db_path.path + "/" + file_name; |
6382 | 112 | if (ParseFileName(file_name, &file_number, &file_type) && |
6383 | 76 | (file_type == kTableFile || file_type == kTableSBlockFile)) { |
6384 | 8 | RETURN_NOT_OK(sfm->OnAddFile(file_path)); |
6385 | 8 | } |
6386 | 112 | } |
6387 | 11 | } |
6388 | | |
6389 | 341k | if (s.ok()) { |
6390 | 341k | LogFlush(impl->db_options_.info_log); |
6391 | 341k | if (!persist_options_status.ok()) { |
6392 | 56 | if (db_options.fail_if_options_file_error) { |
6393 | 0 | s = STATUS(IOError, |
6394 | 0 | "DB::Open() failed --- Unable to persist Options file", |
6395 | 0 | persist_options_status.ToString()); |
6396 | 0 | } |
6397 | 56 | RWARN(impl->db_options_.info_log, |
6398 | 56 | "Unable to persist options in DB::Open() -- %s", |
6399 | 56 | persist_options_status.ToString().c_str()); |
6400 | 56 | } |
6401 | 341k | } |
6402 | 341k | if (!s.ok()) { |
6403 | 0 | for (auto* h : *handles) { |
6404 | 0 | delete h; |
6405 | 0 | } |
6406 | 170 | handles->clear(); |
6407 | 170 | delete impl; |
6408 | 170 | *dbptr = nullptr; |
6409 | 341k | } else if (impl) { |
6410 | 341k | impl->SetSSTFileTickers(); |
6411 | 341k | } |
6412 | | |
6413 | 341k | return s; |
6414 | 341k | } |
6415 | | |
6416 | 144 | yb::Result<std::unique_ptr<DB>> DB::Open(const Options& options, const std::string& name) { |
6417 | 144 | DB* db = nullptr; |
6418 | 144 | Status status = Open(options, name, &db); |
6419 | 144 | if (!status.ok()) { |
6420 | 0 | delete db; |
6421 | 0 | return status; |
6422 | 0 | } |
6423 | 144 | return std::unique_ptr<DB>(db); |
6424 | 144 | } |
6425 | | |
6426 | | Status DB::ListColumnFamilies(const DBOptions& db_options, |
6427 | | const std::string& name, |
6428 | 8 | std::vector<std::string>* column_families) { |
6429 | 8 | return VersionSet::ListColumnFamilies(column_families, |
6430 | 8 | name, |
6431 | 8 | db_options.boundary_extractor.get(), |
6432 | 8 | db_options.env); |
6433 | 8 | } |
6434 | | |
6435 | 329k | Snapshot::~Snapshot() { |
6436 | 329k | } |
6437 | | |
6438 | 345k | Status DestroyDB(const std::string& dbname, const Options& options) { |
6439 | 345k | const InternalKeyComparator comparator(options.comparator); |
6440 | 345k | const Options& soptions(SanitizeOptions(dbname, &comparator, options)); |
6441 | 345k | Env* env = soptions.env; |
6442 | 345k | std::vector<std::string> filenames; |
6443 | | |
6444 | | // Ignore error in case directory does not exist |
6445 | 345k | env->GetChildrenWarnNotOk(dbname, &filenames); |
6446 | | |
6447 | 345k | FileLock* lock; |
6448 | 345k | const std::string lockname = LockFileName(dbname); |
6449 | 345k | Status result = env->LockFile(lockname, &lock); |
6450 | 345k | if (result.ok()) { |
6451 | 345k | uint64_t number; |
6452 | 345k | FileType type; |
6453 | 345k | InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname); |
6454 | 3.28M | for (size_t i = 0; i < filenames.size(); i++) { |
6455 | 2.94M | if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) && |
6456 | 2.24M | type != kDBLockFile) { // Lock file will be deleted at end |
6457 | 1.93M | Status del; |
6458 | 1.93M | std::string path_to_delete = dbname + "/" + filenames[i]; |
6459 | 1.93M | if (type == kMetaDatabase) { |
6460 | 2 | del = DestroyDB(path_to_delete, options); |
6461 | 1.93M | } else if (type == kTableFile || type == kTableSBlockFile) { |
6462 | 19.0k | del = DeleteSSTFile(&options, path_to_delete, 0); |
6463 | 1.91M | } else { |
6464 | 1.91M | del = env->DeleteFile(path_to_delete); |
6465 | 1.91M | } |
6466 | 1.93M | if (result.ok() && !del.ok()) { |
6467 | 0 | result = del; |
6468 | 0 | } |
6469 | 1.93M | } |
6470 | 2.94M | } |
6471 | | |
6472 | 348k | for (size_t path_id = 0; path_id < options.db_paths.size(); path_id++) { |
6473 | 2.26k | const auto& db_path = options.db_paths[path_id]; |
6474 | 2.26k | env->GetChildrenWarnNotOk(db_path.path, &filenames); |
6475 | 4.39k | for (size_t i = 0; i < filenames.size(); i++) { |
6476 | 2.13k | if (ParseFileName(filenames[i], &number, &type) && |
6477 | | // Lock file will be deleted at end |
6478 | 742 | (type == kTableFile || type == kTableSBlockFile)) { |
6479 | 170 | std::string table_path = db_path.path + "/" + filenames[i]; |
6480 | 170 | Status del = DeleteSSTFile(&options, table_path, |
6481 | 170 | static_cast<uint32_t>(path_id)); |
6482 | 170 | if (result.ok() && !del.ok()) { |
6483 | 0 | result = del; |
6484 | 0 | } |
6485 | 170 | } |
6486 | 2.13k | } |
6487 | 2.26k | } |
6488 | | |
6489 | 345k | std::vector<std::string> walDirFiles; |
6490 | 345k | std::string archivedir = ArchivalDirectory(dbname); |
6491 | 345k | if (dbname != soptions.wal_dir) { |
6492 | 710 | env->GetChildrenWarnNotOk(soptions.wal_dir, &walDirFiles); |
6493 | 710 | archivedir = ArchivalDirectory(soptions.wal_dir); |
6494 | 710 | } |
6495 | | |
6496 | | // Delete log files in the WAL dir |
6497 | 424 | for (const auto& file : walDirFiles) { |
6498 | 424 | if (ParseFileName(file, &number, &type) && type == kLogFile) { |
6499 | 139 | Status del = env->DeleteFile(soptions.wal_dir + "/" + file); |
6500 | 139 | if (result.ok() && !del.ok()) { |
6501 | 0 | result = del; |
6502 | 0 | } |
6503 | 139 | } |
6504 | 424 | } |
6505 | | |
6506 | 345k | std::vector<std::string> archiveFiles; |
6507 | 345k | env->GetChildrenWarnNotOk(archivedir, &archiveFiles); |
6508 | | // Delete archival files. |
6509 | 345k | for (size_t i = 0; i < archiveFiles.size(); ++i) { |
6510 | 183 | if (ParseFileName(archiveFiles[i], &number, &type) && |
6511 | 93 | type == kLogFile) { |
6512 | 93 | Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]); |
6513 | 93 | if (result.ok() && !del.ok()) { |
6514 | 0 | result = del; |
6515 | 0 | } |
6516 | 93 | } |
6517 | 183 | } |
6518 | | |
6519 | | // ignore case where no archival directory is present. |
6520 | 345k | if (env->FileExists(archivedir).ok()) { |
6521 | 45 | WARN_NOT_OK(env->DeleteDir(archivedir), "Failed to cleanup dir " + archivedir); |
6522 | 45 | } |
6523 | 345k | WARN_NOT_OK(env->UnlockFile(lock), "Unlock file failed"); |
6524 | 345k | env->CleanupFile(lockname); |
6525 | 345k | if (env->FileExists(dbname).ok()) { |
6526 | 345k | WARN_NOT_OK(env->DeleteDir(dbname), "Failed to cleanup dir " + dbname); |
6527 | 345k | } |
6528 | 345k | if (env->FileExists(soptions.wal_dir).ok()) { |
6529 | 1.22k | WARN_NOT_OK(env->DeleteDir(soptions.wal_dir), |
6530 | 1.22k | "Failed to cleanup wal dir " + soptions.wal_dir); |
6531 | 1.22k | } |
6532 | 345k | } |
6533 | 345k | return result; |
6534 | 345k | } |
6535 | | |
6536 | 1.00M | Status DBImpl::WriteOptionsFile() { |
6537 | 1.00M | #ifndef ROCKSDB_LITE |
6538 | 1.00M | mutex_.AssertHeld(); |
6539 | | |
6540 | 1.00M | std::vector<std::string> cf_names; |
6541 | 1.00M | std::vector<ColumnFamilyOptions> cf_opts; |
6542 | | |
6543 | | // This part requires mutex to protect the column family options |
6544 | 1.00M | GetColumnFamiliesOptionsUnlocked(&cf_names, &cf_opts); |
6545 | | |
6546 | | // Unlock during expensive operations. New writes cannot get here |
6547 | | // because the single write thread ensures all new writes get queued. |
6548 | 1.00M | mutex_.Unlock(); |
6549 | | |
6550 | 1.00M | std::string file_name = |
6551 | 1.00M | TempOptionsFileName(GetName(), versions_->NewFileNumber()); |
6552 | 1.00M | Status s = PersistRocksDBOptions(GetDBOptions(), cf_names, cf_opts, file_name, |
6553 | 1.00M | GetEnv()); |
6554 | | |
6555 | 1.00M | if (s.ok()) { |
6556 | 999k | s = RenameTempFileToOptionsFile(file_name); |
6557 | 999k | } |
6558 | 1.00M | mutex_.Lock(); |
6559 | 1.00M | return s; |
6560 | | #else |
6561 | | return Status::OK(); |
6562 | | #endif // !ROCKSDB_LITE |
6563 | 1.00M | } |
6564 | | |
6565 | | #ifndef ROCKSDB_LITE |
6566 | | namespace { |
6567 | | void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames, |
6568 | | const size_t num_files_to_keep, |
6569 | | const std::shared_ptr<Logger>& info_log, |
6570 | 1.00M | Env* env) { |
6571 | 1.00M | if (filenames.size() <= num_files_to_keep) { |
6572 | 665k | return; |
6573 | 665k | } |
6574 | 335k | for (auto iter = std::next(filenames.begin(), num_files_to_keep); |
6575 | 670k | iter != filenames.end(); ++iter) { |
6576 | 335k | if (!env->DeleteFile(iter->second).ok()) { |
6577 | 0 | RWARN(info_log, "Unable to delete options file %s", iter->second.c_str()); |
6578 | 0 | } |
6579 | 335k | } |
6580 | 335k | } |
6581 | | } // namespace |
6582 | | #endif // !ROCKSDB_LITE |
6583 | | |
6584 | 1.00M | Status DBImpl::DeleteObsoleteOptionsFiles() { |
6585 | 1.00M | #ifndef ROCKSDB_LITE |
6586 | 1.00M | std::vector<std::string> filenames; |
6587 | | // use ordered map to store keep the filenames sorted from the newest |
6588 | | // to the oldest. |
6589 | 1.00M | std::map<uint64_t, std::string> options_filenames; |
6590 | 1.00M | Status s; |
6591 | 1.00M | s = GetEnv()->GetChildren(GetName(), &filenames); |
6592 | 1.00M | if (!s.ok()) { |
6593 | 0 | return s; |
6594 | 0 | } |
6595 | 9.72M | for (auto& filename : filenames) { |
6596 | 9.72M | uint64_t file_number; |
6597 | 9.72M | FileType type; |
6598 | 9.72M | if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) { |
6599 | 2.00M | options_filenames.insert( |
6600 | 2.00M | {std::numeric_limits<uint64_t>::max() - file_number, |
6601 | 2.00M | GetName() + "/" + filename}); |
6602 | 2.00M | } |
6603 | 9.72M | } |
6604 | | |
6605 | | // Keeps the latest 2 Options file |
6606 | 1.00M | const size_t kNumOptionsFilesKept = 2; |
6607 | 1.00M | DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept, |
6608 | 1.00M | db_options_.info_log, GetEnv()); |
6609 | 1.00M | return Status::OK(); |
6610 | | #else |
6611 | | return Status::OK(); |
6612 | | #endif // !ROCKSDB_LITE |
6613 | 1.00M | } |
6614 | | |
6615 | 999k | Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { |
6616 | 999k | #ifndef ROCKSDB_LITE |
6617 | 999k | Status s; |
6618 | 999k | std::string options_file_name = |
6619 | 999k | OptionsFileName(GetName(), versions_->NewFileNumber()); |
6620 | | // Retry if the file name happen to conflict with an existing one. |
6621 | 999k | s = GetEnv()->RenameFile(file_name, options_file_name); |
6622 | | |
6623 | 999k | WARN_NOT_OK(DeleteObsoleteOptionsFiles(), "Failed to cleanup obsolete options file"); |
6624 | 999k | return s; |
6625 | | #else |
6626 | | return Status::OK(); |
6627 | | #endif // !ROCKSDB_LITE |
6628 | 999k | } |
6629 | | |
6630 | | #ifndef ROCKSDB_LITE |
6631 | | SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, |
6632 | 135 | bool include_history) { |
6633 | | // Find the earliest sequence number that we know we can rely on reading |
6634 | | // from the memtable without needing to check sst files. |
6635 | 135 | SequenceNumber earliest_seq = |
6636 | 135 | sv->imm->GetEarliestSequenceNumber(include_history); |
6637 | 135 | if (earliest_seq == kMaxSequenceNumber) { |
6638 | 112 | earliest_seq = sv->mem->GetEarliestSequenceNumber(); |
6639 | 112 | } |
6640 | 135 | assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq); |
6641 | | |
6642 | 135 | return earliest_seq; |
6643 | 135 | } |
6644 | | #endif // ROCKSDB_LITE |
6645 | | |
6646 | | #ifndef ROCKSDB_LITE |
6647 | | Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, |
6648 | | bool cache_only, SequenceNumber* seq, |
6649 | 138 | bool* found_record_for_key) { |
6650 | 138 | Status s; |
6651 | 138 | MergeContext merge_context; |
6652 | | |
6653 | 138 | SequenceNumber current_seq = versions_->LastSequence(); |
6654 | 138 | LookupKey lkey(key, current_seq); |
6655 | | |
6656 | 138 | *seq = kMaxSequenceNumber; |
6657 | 138 | *found_record_for_key = false; |
6658 | | |
6659 | | // Check if there is a record for this key in the latest memtable |
6660 | 138 | sv->mem->Get(lkey, nullptr, &s, &merge_context, seq); |
6661 | | |
6662 | 138 | if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { |
6663 | | // unexpected error reading memtable. |
6664 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6665 | 0 | "Unexpected status returned from MemTable::Get: %s\n", |
6666 | 0 | s.ToString().c_str()); |
6667 | |
|
6668 | 0 | return s; |
6669 | 0 | } |
6670 | | |
6671 | 138 | if (*seq != kMaxSequenceNumber) { |
6672 | | // Found a sequence number, no need to check immutable memtables |
6673 | 81 | *found_record_for_key = true; |
6674 | 81 | return Status::OK(); |
6675 | 81 | } |
6676 | | |
6677 | | // Check if there is a record for this key in the immutable memtables |
6678 | 57 | sv->imm->Get(lkey, nullptr, &s, &merge_context, seq); |
6679 | | |
6680 | 57 | if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { |
6681 | | // unexpected error reading memtable. |
6682 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6683 | 0 | "Unexpected status returned from MemTableList::Get: %s\n", |
6684 | 0 | s.ToString().c_str()); |
6685 | |
|
6686 | 0 | return s; |
6687 | 0 | } |
6688 | | |
6689 | 57 | if (*seq != kMaxSequenceNumber) { |
6690 | | // Found a sequence number, no need to check memtable history |
6691 | 0 | *found_record_for_key = true; |
6692 | 0 | return Status::OK(); |
6693 | 0 | } |
6694 | | |
6695 | | // Check if there is a record for this key in the immutable memtables |
6696 | 57 | sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, seq); |
6697 | | |
6698 | 57 | if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { |
6699 | | // unexpected error reading memtable. |
6700 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6701 | 0 | "Unexpected status returned from MemTableList::GetFromHistory: %s\n", |
6702 | 0 | s.ToString().c_str()); |
6703 | |
|
6704 | 0 | return s; |
6705 | 0 | } |
6706 | | |
6707 | 57 | if (*seq != kMaxSequenceNumber) { |
6708 | | // Found a sequence number, no need to check SST files |
6709 | 1 | *found_record_for_key = true; |
6710 | 1 | return Status::OK(); |
6711 | 1 | } |
6712 | | |
6713 | | // TODO(agiardullo): possible optimization: consider checking cached |
6714 | | // SST files if cache_only=true? |
6715 | 56 | if (!cache_only) { |
6716 | | // Check tables |
6717 | 21 | ReadOptions read_options; |
6718 | | |
6719 | 21 | sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, |
6720 | 21 | nullptr /* value_found */, found_record_for_key, seq); |
6721 | | |
6722 | 21 | if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { |
6723 | | // unexpected error reading SST files |
6724 | 0 | RLOG(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, |
6725 | 0 | "Unexpected status returned from Version::Get: %s\n", |
6726 | 0 | s.ToString().c_str()); |
6727 | |
|
6728 | 0 | return s; |
6729 | 0 | } |
6730 | 56 | } |
6731 | | |
6732 | 56 | return Status::OK(); |
6733 | 56 | } |
6734 | | #endif // ROCKSDB_LITE |
6735 | | |
6736 | 366k | const std::string& DBImpl::LogPrefix() const { |
6737 | 366k | static const std::string kEmptyString; |
6738 | 366k | return db_options_.info_log ? db_options_.info_log->Prefix() : kEmptyString; |
6739 | 366k | } |
6740 | | |
6741 | | } // namespace rocksdb |