/Users/deen/code/yugabyte-db/src/yb/tserver/tablet_memory_manager.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include "yb/tserver/tablet_memory_manager.h" |
15 | | |
16 | | #include "yb/consensus/log_cache.h" |
17 | | #include "yb/consensus/raft_consensus.h" |
18 | | |
19 | | #include "yb/gutil/casts.h" |
20 | | #include "yb/gutil/strings/human_readable.h" |
21 | | |
22 | | #include "yb/rocksdb/cache.h" |
23 | | #include "yb/rocksdb/memory_monitor.h" |
24 | | |
25 | | #include "yb/tablet/tablet.h" |
26 | | #include "yb/tablet/tablet_options.h" |
27 | | #include "yb/tablet/tablet_peer.h" |
28 | | |
29 | | #include "yb/util/background_task.h" |
30 | | #include "yb/util/flag_tags.h" |
31 | | #include "yb/util/logging.h" |
32 | | #include "yb/util/mem_tracker.h" |
33 | | #include "yb/util/status_log.h" |
34 | | |
35 | | using namespace std::literals; |
36 | | using namespace std::placeholders; |
37 | | |
38 | | DEFINE_bool(enable_log_cache_gc, true, |
39 | | "Set to true to enable log cache garbage collector."); |
40 | | |
41 | | DEFINE_bool(log_cache_gc_evict_only_over_allocated, true, |
42 | | "If set to true, log cache garbage collection would evict only memory that was " |
43 | | "allocated over limit for log cache. Otherwise it will try to evict requested number " |
44 | | "of bytes."); |
45 | | DEFINE_int64(global_memstore_size_percentage, 10, |
46 | | "Percentage of total available memory to use for the global memstore. " |
47 | | "Default is 10. See also memstore_size_mb and " |
48 | | "global_memstore_size_mb_max."); |
49 | | DEFINE_int64(global_memstore_size_mb_max, 2048, |
50 | | "Global memstore size is determined as a percentage of the available " |
51 | | "memory. However, this flag limits it in absolute size. Value of 0 " |
52 | | "means no limit on the value obtained by the percentage. Default is 2048."); |
53 | | |
54 | | namespace { |
55 | | constexpr int kDbCacheSizeUsePercentage = -1; |
56 | | constexpr int kDbCacheSizeCacheDisabled = -2; |
57 | | constexpr int kDbCacheSizeUseDefault = -3; |
58 | | } |
59 | | |
60 | | DEFINE_bool(enable_block_based_table_cache_gc, false, |
61 | | "Set to true to enable block based table garbage collector."); |
62 | | |
63 | | DEFINE_int64(db_block_cache_size_bytes, kDbCacheSizeUsePercentage, |
64 | | "Size of RocksDB block cache (in bytes). " |
65 | | "This defaults to -1 for system auto-generated default, which would use " |
66 | | "FLAGS_db_block_cache_size_percentage to select a percentage of the total " |
67 | | "memory as the default size for the shared block cache. Value of -2 disables " |
68 | | "block cache."); |
69 | | |
70 | | DEFINE_int32(db_block_cache_size_percentage, kDbCacheSizeUseDefault, |
71 | | "Default percentage of total available memory to use as block cache size, if not " |
72 | | "asking for a raw number, through FLAGS_db_block_cache_size_bytes. " |
73 | | "Defaults to -3 (use default percentage as defined by master or tserver)."); |
74 | | |
75 | | DEFINE_int32(db_block_cache_num_shard_bits, 4, |
76 | | "Number of bits to use for sharding the block cache (defaults to 4 bits)"); |
77 | | TAG_FLAG(db_block_cache_num_shard_bits, advanced); |
78 | | |
79 | | DEFINE_test_flag(bool, pretend_memory_exceeded_enforce_flush, false, |
80 | | "Always pretend memory has been exceeded to enforce background flush."); |
81 | | |
82 | | namespace yb { |
83 | | namespace tserver { |
84 | | |
85 | | using strings::Substitute; |
86 | | |
87 | | namespace { |
88 | | |
89 | | class FunctorGC : public GarbageCollector { |
90 | | public: |
91 | 17.2k | explicit FunctorGC(std::function<void(size_t)> impl) : impl_(std::move(impl)) {} |
92 | | |
93 | 0 | void CollectGarbage(size_t required) { |
94 | 0 | impl_(required); |
95 | 0 | } |
96 | | |
97 | 245 | virtual ~FunctorGC() = default; |
98 | | |
99 | | private: |
100 | | std::function<void(size_t)> impl_; |
101 | | }; |
102 | | |
103 | | class LRUCacheGC : public GarbageCollector { |
104 | | public: |
105 | 17.2k | explicit LRUCacheGC(std::shared_ptr<rocksdb::Cache> cache) : cache_(std::move(cache)) {} |
106 | | |
107 | 0 | void CollectGarbage(size_t required) { |
108 | 0 | if (!FLAGS_enable_block_based_table_cache_gc) { |
109 | 0 | return; |
110 | 0 | } |
111 | | |
112 | 0 | auto evicted = cache_->Evict(required); |
113 | 0 | LOG(INFO) << "Evicted from table cache: " << HumanReadableNumBytes::ToString(evicted) |
114 | 0 | << ", new usage: " << HumanReadableNumBytes::ToString(cache_->GetUsage()) |
115 | 0 | << ", required: " << HumanReadableNumBytes::ToString(required); |
116 | 0 | } |
117 | | |
118 | 245 | virtual ~LRUCacheGC() = default; |
119 | | |
120 | | private: |
121 | | std::shared_ptr<rocksdb::Cache> cache_; |
122 | | }; |
123 | | |
124 | | // Evaluates the target block cache size based on the db_block_cache_size_percentage and |
125 | | // db_block_cache_size_bytes flags, as well as the passed default_block_cache_size_percentage. |
126 | 17.2k | int64_t GetTargetBlockCacheSize(const int32_t default_block_cache_size_percentage) { |
127 | 17.2k | int32_t target_block_cache_size_percentage = |
128 | 17.2k | (FLAGS_db_block_cache_size_percentage == kDbCacheSizeUseDefault) ? |
129 | 17.2k | default_block_cache_size_percentage : FLAGS_db_block_cache_size_percentage; |
130 | | |
131 | | // If we aren't assigning block cache sized based on percentage, then the size is determined by |
132 | | // db_block_cache_size_bytes. |
133 | 17.2k | int64_t target_block_cache_size_bytes = FLAGS_db_block_cache_size_bytes; |
134 | | // Auto-compute size of block cache based on percentage of memory available if asked to. |
135 | 17.2k | if (target_block_cache_size_bytes == kDbCacheSizeUsePercentage) { |
136 | | // Check some bounds. |
137 | 17.2k | CHECK(target_block_cache_size_percentage > 0 && target_block_cache_size_percentage <= 100) |
138 | 0 | << Substitute( |
139 | 0 | "Flag tablet_block_cache_size_percentage must be between 0 and 100. Current value: " |
140 | 0 | "$0", |
141 | 0 | target_block_cache_size_percentage); |
142 | | |
143 | 17.2k | const int64_t total_ram_avail = MemTracker::GetRootTracker()->limit(); |
144 | 17.2k | target_block_cache_size_bytes = total_ram_avail * target_block_cache_size_percentage / 100; |
145 | 17.2k | } |
146 | 17.2k | return target_block_cache_size_bytes; |
147 | 17.2k | } |
148 | | |
149 | 0 | size_t GetLogCacheSize(tablet::TabletPeer* peer) { |
150 | 0 | return down_cast<consensus::RaftConsensus*>(peer->consensus())->LogCacheSize(); |
151 | 0 | } |
152 | | |
153 | | } // namespace |
154 | | |
155 | | TabletMemoryManager::TabletMemoryManager( |
156 | | tablet::TabletOptions* options, |
157 | | const std::shared_ptr<MemTracker>& mem_tracker, |
158 | | const int32_t default_block_cache_size_percentage, |
159 | | const scoped_refptr<MetricEntity>& metrics, |
160 | 17.2k | const std::function<std::vector<tablet::TabletPeerPtr>()>& peers_fn) { |
161 | 17.2k | server_mem_tracker_ = mem_tracker; |
162 | 17.2k | peers_fn_ = peers_fn; |
163 | | |
164 | 17.2k | InitBlockCache(metrics, default_block_cache_size_percentage, options); |
165 | 17.2k | InitLogCacheGC(); |
166 | | // Assign background_task_ if necessary. |
167 | 17.2k | ConfigureBackgroundTask(options); |
168 | 17.2k | } |
169 | | |
170 | 16.6k | CHECKED_STATUS TabletMemoryManager::Init() { |
171 | 16.6k | if (background_task_) { |
172 | 16.6k | RETURN_NOT_OK(background_task_->Init()); |
173 | 16.6k | } |
174 | 16.6k | return Status::OK(); |
175 | 16.6k | } |
176 | | |
177 | 317 | void TabletMemoryManager::Shutdown() { |
178 | 317 | if (background_task_) { |
179 | 317 | background_task_->Shutdown(); |
180 | 317 | } |
181 | 317 | } |
182 | | |
183 | 150k | std::shared_ptr<MemTracker> TabletMemoryManager::block_based_table_mem_tracker() { |
184 | 150k | return block_based_table_mem_tracker_; |
185 | 150k | } |
186 | | |
187 | | void TabletMemoryManager::InitBlockCache( |
188 | | const scoped_refptr<MetricEntity>& metrics, |
189 | | const int32_t default_block_cache_size_percentage, |
190 | 17.2k | tablet::TabletOptions* options) { |
191 | 17.2k | int64_t block_cache_size_bytes = GetTargetBlockCacheSize(default_block_cache_size_percentage); |
192 | | |
193 | 17.2k | block_based_table_mem_tracker_ = MemTracker::FindOrCreateTracker( |
194 | 17.2k | block_cache_size_bytes, |
195 | 17.2k | "BlockBasedTable", |
196 | 17.2k | server_mem_tracker_); |
197 | | |
198 | 17.2k | if (block_cache_size_bytes != kDbCacheSizeCacheDisabled) { |
199 | 17.2k | options->block_cache = rocksdb::NewLRUCache(block_cache_size_bytes, |
200 | 17.2k | FLAGS_db_block_cache_num_shard_bits); |
201 | 17.2k | options->block_cache->SetMetrics(metrics); |
202 | 17.2k | block_based_table_gc_ = std::make_shared<LRUCacheGC>(options->block_cache); |
203 | 17.2k | block_based_table_mem_tracker_->AddGarbageCollector(block_based_table_gc_); |
204 | 17.2k | } |
205 | 17.2k | } |
206 | | |
207 | 17.2k | void TabletMemoryManager::InitLogCacheGC() { |
208 | 17.2k | auto log_cache_mem_tracker = consensus::LogCache::GetServerMemTracker(server_mem_tracker_); |
209 | 17.2k | log_cache_gc_ = std::make_shared<FunctorGC>( |
210 | 17.2k | std::bind(&TabletMemoryManager::LogCacheGC, this, log_cache_mem_tracker.get(), _1)); |
211 | 17.2k | log_cache_mem_tracker->AddGarbageCollector(log_cache_gc_); |
212 | 17.2k | } |
213 | | |
214 | 17.2k | void TabletMemoryManager::ConfigureBackgroundTask(tablet::TabletOptions* options) { |
215 | | // Calculate memstore_size_bytes based on total RAM available and global percentage. |
216 | 17.2k | CHECK(FLAGS_global_memstore_size_percentage > 0 && FLAGS_global_memstore_size_percentage <= 100) |
217 | 0 | << Substitute( |
218 | 0 | "Flag tablet_block_cache_size_percentage must be between 0 and 100. Current value: " |
219 | 0 | "$0", |
220 | 0 | FLAGS_global_memstore_size_percentage); |
221 | 17.2k | int64_t total_ram_avail = MemTracker::GetRootTracker()->limit(); |
222 | 17.2k | size_t memstore_size_bytes = total_ram_avail * FLAGS_global_memstore_size_percentage / 100; |
223 | | |
224 | 17.2k | if (FLAGS_global_memstore_size_mb_max != 0) { |
225 | 17.2k | memstore_size_bytes = std::min(memstore_size_bytes, |
226 | 17.2k | static_cast<size_t>(FLAGS_global_memstore_size_mb_max << 20)); |
227 | 17.2k | } |
228 | | |
229 | | // Add memory monitor and background thread for flushing. |
230 | | // TODO(zhaoalex): replace task with Poller |
231 | 17.2k | background_task_.reset(new BackgroundTask( |
232 | 17.2k | std::function<void()>([this]() { FlushTabletIfLimitExceeded(); }), |
233 | 17.2k | "tablet manager", |
234 | 17.2k | "flush scheduler bgtask")); |
235 | 17.2k | options->memory_monitor = std::make_shared<rocksdb::MemoryMonitor>( |
236 | 17.2k | memstore_size_bytes, |
237 | 228k | std::function<void()>([this](){ |
238 | 228k | YB_WARN_NOT_OK(background_task_->Wake(), "Wakeup error"); })); |
239 | | |
240 | | // Must assign memory_monitor_ after configuring the background task. |
241 | 17.2k | memory_monitor_ = options->memory_monitor; |
242 | 17.2k | } |
243 | | |
244 | 0 | void TabletMemoryManager::LogCacheGC(MemTracker* log_cache_mem_tracker, size_t bytes_to_evict) { |
245 | 0 | if (!FLAGS_enable_log_cache_gc) { |
246 | 0 | return; |
247 | 0 | } |
248 | | |
249 | 0 | if (FLAGS_log_cache_gc_evict_only_over_allocated) { |
250 | 0 | if (!log_cache_mem_tracker->has_limit()) { |
251 | 0 | return; |
252 | 0 | } |
253 | 0 | auto limit = log_cache_mem_tracker->limit(); |
254 | 0 | auto consumption = log_cache_mem_tracker->consumption(); |
255 | 0 | if (consumption <= limit) { |
256 | 0 | return; |
257 | 0 | } |
258 | 0 | bytes_to_evict = std::min<size_t>(bytes_to_evict, consumption - limit); |
259 | 0 | } |
260 | | |
261 | 0 | auto peers = peers_fn_(); |
262 | | // Sort by inverse log size. |
263 | 0 | std::sort(peers.begin(), peers.end(), [](const auto& lhs, const auto& rhs) { |
264 | 0 | return GetLogCacheSize(lhs.get()) > GetLogCacheSize(rhs.get()); |
265 | 0 | }); |
266 | | |
267 | 0 | size_t total_evicted = 0; |
268 | 0 | for (const auto& peer : peers) { |
269 | 0 | if (GetLogCacheSize(peer.get()) <= 0) { |
270 | 0 | continue; |
271 | 0 | } |
272 | 0 | size_t evicted = down_cast<consensus::RaftConsensus*>( |
273 | 0 | peer->consensus())->EvictLogCache(bytes_to_evict - total_evicted); |
274 | 0 | total_evicted += evicted; |
275 | 0 | if (total_evicted >= bytes_to_evict) { |
276 | 0 | break; |
277 | 0 | } |
278 | 0 | } |
279 | | |
280 | | |
281 | 0 | LOG(INFO) << "Evicted from log cache: " << HumanReadableNumBytes::ToString(total_evicted) |
282 | 0 | << ", required: " << HumanReadableNumBytes::ToString(bytes_to_evict); |
283 | 0 | } |
284 | | |
285 | 691 | void TabletMemoryManager::FlushTabletIfLimitExceeded() { |
286 | 691 | int iteration = 0; |
287 | 1.11k | while (memory_monitor_->Exceeded() || |
288 | 1.11k | (iteration++ == 0 && FLAGS_TEST_pretend_memory_exceeded_enforce_flush)) { |
289 | 419 | YB_LOG_EVERY_N_SECS(INFO, 5) << Format("Memstore global limit of $0 bytes reached, looking for " |
290 | 216 | "tablet to flush", memory_monitor_->limit()); |
291 | 419 | auto flush_tick = rocksdb::FlushTick(); |
292 | 419 | tablet::TabletPeerPtr peer_to_flush = TabletToFlush(); |
293 | 419 | if (peer_to_flush) { |
294 | 416 | auto tablet_to_flush = peer_to_flush->shared_tablet(); |
295 | | // TODO(bojanserafimov): If peer_to_flush flushes now because of other reasons, |
296 | | // we will schedule a second flush, which will unnecessarily stall writes for a short time. |
297 | | // This will not happen often, but should be fixed. |
298 | 416 | if (tablet_to_flush) { |
299 | 416 | LOG(INFO) |
300 | 416 | << LogPrefix(peer_to_flush) |
301 | 416 | << "Flushing tablet with oldest memstore write at " |
302 | 416 | << tablet_to_flush->OldestMutableMemtableWriteHybridTime(); |
303 | 416 | WARN_NOT_OK( |
304 | 416 | tablet_to_flush->Flush( |
305 | 416 | tablet::FlushMode::kAsync, tablet::FlushFlags::kAllDbs, flush_tick), |
306 | 416 | Substitute("Flush failed on $0", peer_to_flush->tablet_id())); |
307 | 416 | for (auto listener : TEST_listeners) { |
308 | 0 | listener->StartedFlush(peer_to_flush->tablet_id()); |
309 | 0 | } |
310 | 416 | } |
311 | 416 | } |
312 | 419 | } |
313 | 691 | } |
314 | | |
315 | | // Return the tablet with the oldest write in memstore, or nullptr if all tablet memstores are |
316 | | // empty or about to flush. |
317 | 419 | tablet::TabletPeerPtr TabletMemoryManager::TabletToFlush() { |
318 | 419 | HybridTime oldest_write_in_memstores = HybridTime::kMax; |
319 | 419 | tablet::TabletPeerPtr tablet_to_flush; |
320 | 17.3k | for (const tablet::TabletPeerPtr& peer : peers_fn_()) { |
321 | 17.3k | const auto tablet = peer->shared_tablet(); |
322 | 17.3k | if (tablet) { |
323 | 17.3k | const auto ht = tablet->OldestMutableMemtableWriteHybridTime(); |
324 | 17.3k | if (ht.ok()) { |
325 | 17.3k | if (*ht < oldest_write_in_memstores) { |
326 | 1.25k | oldest_write_in_memstores = *ht; |
327 | 1.25k | tablet_to_flush = peer; |
328 | 1.25k | } |
329 | 17.3k | } else { |
330 | 0 | YB_LOG_EVERY_N_SECS(WARNING, 5) << Format( |
331 | 0 | "Failed to get oldest mutable memtable write ht for tablet $0: $1", |
332 | 0 | tablet->tablet_id(), ht.status()); |
333 | 0 | } |
334 | 17.3k | } |
335 | 17.3k | } |
336 | 419 | return tablet_to_flush; |
337 | 419 | } |
338 | | |
339 | 416 | std::string TabletMemoryManager::LogPrefix(const tablet::TabletPeerPtr& peer) const { |
340 | 416 | return Substitute("T $0 P $1 : ", |
341 | 416 | peer->tablet_id(), |
342 | 416 | peer->permanent_uuid()); |
343 | 416 | } |
344 | | |
345 | | } // namespace tserver |
346 | | } // namespace yb |