YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/tserver/tablet_memory_manager.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/tserver/tablet_memory_manager.h"
15
16
#include "yb/consensus/log_cache.h"
17
#include "yb/consensus/raft_consensus.h"
18
19
#include "yb/gutil/casts.h"
20
#include "yb/gutil/strings/human_readable.h"
21
22
#include "yb/rocksdb/cache.h"
23
#include "yb/rocksdb/memory_monitor.h"
24
25
#include "yb/tablet/tablet.h"
26
#include "yb/tablet/tablet_options.h"
27
#include "yb/tablet/tablet_peer.h"
28
29
#include "yb/util/background_task.h"
30
#include "yb/util/flag_tags.h"
31
#include "yb/util/logging.h"
32
#include "yb/util/mem_tracker.h"
33
#include "yb/util/status_log.h"
34
35
using namespace std::literals;
36
using namespace std::placeholders;
37
38
// Master switch for the log cache garbage collector (see TabletMemoryManager::LogCacheGC).
DEFINE_bool(
    enable_log_cache_gc,
    true,
    "Set to true to enable log cache garbage collector.");

// Restricts log cache GC to the amount by which the log cache tracker exceeds its limit.
DEFINE_bool(
    log_cache_gc_evict_only_over_allocated,
    true,
    "If set to true, log cache garbage collection would evict only memory that was "
    "allocated over limit for log cache. Otherwise it will try to evict requested number "
    "of bytes.");

// Share of the root memory limit that the global memstore may use, in percent.
DEFINE_int64(
    global_memstore_size_percentage,
    10,
    "Percentage of total available memory to use for the global memstore. "
    "Default is 10. See also memstore_size_mb and "
    "global_memstore_size_mb_max.");

// Absolute upper bound (in MB) applied on top of the percentage-based memstore size.
DEFINE_int64(
    global_memstore_size_mb_max,
    2048,
    "Global memstore size is determined as a percentage of the available "
    "memory. However, this flag limits it in absolute size. Value of 0 "
    "means no limit on the value obtained by the percentage. Default is 2048.");
53
54
namespace {

// Sentinel values accepted by the block cache sizing flags defined below.

// db_block_cache_size_bytes: derive the size from a percentage of total memory.
constexpr int kDbCacheSizeUsePercentage = -1;

// db_block_cache_size_bytes: do not create a block cache at all.
constexpr int kDbCacheSizeCacheDisabled = -2;

// db_block_cache_size_percentage: use the default percentage supplied by the server
// (master or tserver).
constexpr int kDbCacheSizeUseDefault = -3;

}  // namespace
59
60
DEFINE_bool(enable_block_based_table_cache_gc, false,
61
            "Set to true to enable block based table garbage collector.");
62
63
DEFINE_int64(db_block_cache_size_bytes, kDbCacheSizeUsePercentage,
64
             "Size of RocksDB block cache (in bytes). "
65
             "This defaults to -1 for system auto-generated default, which would use "
66
             "FLAGS_db_block_cache_size_percentage to select a percentage of the total "
67
             "memory as the default size for the shared block cache. Value of -2 disables "
68
             "block cache.");
69
70
DEFINE_int32(db_block_cache_size_percentage, kDbCacheSizeUseDefault,
71
             "Default percentage of total available memory to use as block cache size, if not "
72
             "asking for a raw number, through FLAGS_db_block_cache_size_bytes. "
73
             "Defaults to -3 (use default percentage as defined by master or tserver).");
74
75
DEFINE_int32(db_block_cache_num_shard_bits, 4,
76
             "Number of bits to use for sharding the block cache (defaults to 4 bits)");
77
TAG_FLAG(db_block_cache_num_shard_bits, advanced);
78
79
DEFINE_test_flag(bool, pretend_memory_exceeded_enforce_flush, false,
80
                  "Always pretend memory has been exceeded to enforce background flush.");
81
82
namespace yb {
83
namespace tserver {
84
85
using strings::Substitute;
86
87
namespace {
88
89
class FunctorGC : public GarbageCollector {
90
 public:
91
17.2k
  explicit FunctorGC(std::function<void(size_t)> impl) : impl_(std::move(impl)) {}
92
93
0
  void CollectGarbage(size_t required) {
94
0
    impl_(required);
95
0
  }
96
97
245
  virtual ~FunctorGC() = default;
98
99
 private:
100
  std::function<void(size_t)> impl_;
101
};
102
103
class LRUCacheGC : public GarbageCollector {
104
 public:
105
17.2k
  explicit LRUCacheGC(std::shared_ptr<rocksdb::Cache> cache) : cache_(std::move(cache)) {}
106
107
0
  void CollectGarbage(size_t required) {
108
0
    if (!FLAGS_enable_block_based_table_cache_gc) {
109
0
      return;
110
0
    }
111
112
0
    auto evicted = cache_->Evict(required);
113
0
    LOG(INFO) << "Evicted from table cache: " << HumanReadableNumBytes::ToString(evicted)
114
0
              << ", new usage: " << HumanReadableNumBytes::ToString(cache_->GetUsage())
115
0
              << ", required: " << HumanReadableNumBytes::ToString(required);
116
0
  }
117
118
245
  virtual ~LRUCacheGC() = default;
119
120
 private:
121
  std::shared_ptr<rocksdb::Cache> cache_;
122
};
123
124
// Evaluates the target block cache size based on the db_block_cache_size_percentage and
125
// db_block_cache_size_bytes flags, as well as the passed default_block_cache_size_percentage.
126
17.2k
int64_t GetTargetBlockCacheSize(const int32_t default_block_cache_size_percentage) {
127
17.2k
  int32_t target_block_cache_size_percentage =
128
17.2k
      (FLAGS_db_block_cache_size_percentage == kDbCacheSizeUseDefault) ?
129
17.2k
      default_block_cache_size_percentage : 
FLAGS_db_block_cache_size_percentage0
;
130
131
  // If we aren't assigning block cache sized based on percentage, then the size is determined by
132
  // db_block_cache_size_bytes.
133
17.2k
  int64_t target_block_cache_size_bytes = FLAGS_db_block_cache_size_bytes;
134
  // Auto-compute size of block cache based on percentage of memory available if asked to.
135
17.2k
  if (target_block_cache_size_bytes == kDbCacheSizeUsePercentage) {
136
    // Check some bounds.
137
17.2k
    CHECK(target_block_cache_size_percentage > 0 && target_block_cache_size_percentage <= 100)
138
0
        << Substitute(
139
0
               "Flag tablet_block_cache_size_percentage must be between 0 and 100. Current value: "
140
0
               "$0",
141
0
               target_block_cache_size_percentage);
142
143
17.2k
    const int64_t total_ram_avail = MemTracker::GetRootTracker()->limit();
144
17.2k
    target_block_cache_size_bytes = total_ram_avail * target_block_cache_size_percentage / 100;
145
17.2k
  }
146
17.2k
  return target_block_cache_size_bytes;
147
17.2k
}
148
149
0
// Returns the log cache size reported by the peer's consensus object, which is down-cast to
// RaftConsensus to reach LogCacheSize().
size_t GetLogCacheSize(tablet::TabletPeer* peer) {
  auto* const raft_consensus = down_cast<consensus::RaftConsensus*>(peer->consensus());
  return raft_consensus->LogCacheSize();
}
152
153
}  // namespace
154
155
// Stores the server-level tracker and the peer enumeration callback, then wires up the block
// cache, the log cache garbage collector and the background flush task.
//
// options receives the block cache and the memory monitor created here.
TabletMemoryManager::TabletMemoryManager(
    tablet::TabletOptions* options,
    const std::shared_ptr<MemTracker>& mem_tracker,
    const int32_t default_block_cache_size_percentage,
    const scoped_refptr<MetricEntity>& metrics,
    const std::function<std::vector<tablet::TabletPeerPtr>()>& peers_fn) {
  // Both Init* helpers below read server_mem_tracker_, so it has to be set first.
  server_mem_tracker_ = mem_tracker;
  peers_fn_ = peers_fn;

  InitBlockCache(metrics, default_block_cache_size_percentage, options);

  InitLogCacheGC();

  // Assign background_task_ if necessary.
  ConfigureBackgroundTask(options);
}
169
170
16.6k
// Initializes the background flush task, if one has been configured.
// Returns the task's initialization error, or OK when there is no task or it initialized fine.
CHECKED_STATUS TabletMemoryManager::Init() {
  if (!background_task_) {
    return Status::OK();
  }

  RETURN_NOT_OK(background_task_->Init());
  return Status::OK();
}
176
177
317
void TabletMemoryManager::Shutdown() {
178
317
  if (background_task_) {
179
317
    background_task_->Shutdown();
180
317
  }
181
317
}
182
183
150k
std::shared_ptr<MemTracker> TabletMemoryManager::block_based_table_mem_tracker() {
184
150k
  return block_based_table_mem_tracker_;
185
150k
}
186
187
void TabletMemoryManager::InitBlockCache(
188
    const scoped_refptr<MetricEntity>& metrics,
189
    const int32_t default_block_cache_size_percentage,
190
17.2k
    tablet::TabletOptions* options) {
191
17.2k
  int64_t block_cache_size_bytes = GetTargetBlockCacheSize(default_block_cache_size_percentage);
192
193
17.2k
  block_based_table_mem_tracker_ = MemTracker::FindOrCreateTracker(
194
17.2k
      block_cache_size_bytes,
195
17.2k
      "BlockBasedTable",
196
17.2k
      server_mem_tracker_);
197
198
17.2k
  if (block_cache_size_bytes != kDbCacheSizeCacheDisabled) {
199
17.2k
    options->block_cache = rocksdb::NewLRUCache(block_cache_size_bytes,
200
17.2k
                                                FLAGS_db_block_cache_num_shard_bits);
201
17.2k
    options->block_cache->SetMetrics(metrics);
202
17.2k
    block_based_table_gc_ = std::make_shared<LRUCacheGC>(options->block_cache);
203
17.2k
    block_based_table_mem_tracker_->AddGarbageCollector(block_based_table_gc_);
204
17.2k
  }
205
17.2k
}
206
207
17.2k
void TabletMemoryManager::InitLogCacheGC() {
208
17.2k
  auto log_cache_mem_tracker = consensus::LogCache::GetServerMemTracker(server_mem_tracker_);
209
17.2k
  log_cache_gc_ = std::make_shared<FunctorGC>(
210
17.2k
      std::bind(&TabletMemoryManager::LogCacheGC, this, log_cache_mem_tracker.get(), _1));
211
17.2k
  log_cache_mem_tracker->AddGarbageCollector(log_cache_gc_);
212
17.2k
}
213
214
17.2k
// Computes the global memstore limit, creates the background task that flushes tablets, and
// installs a memory monitor in `options` whose callback wakes that task.
//
// The limit is FLAGS_global_memstore_size_percentage percent of the root MemTracker limit,
// capped at FLAGS_global_memstore_size_mb_max megabytes unless that flag is 0.
// Aborts via CHECK if the percentage flag is not in the range (0, 100].
void TabletMemoryManager::ConfigureBackgroundTask(tablet::TabletOptions* options) {
  // Calculate memstore_size_bytes based on total RAM available and global percentage.
  // The message names the flag that is actually being validated; the previous text was copied
  // from the block cache check and referred to an unrelated flag.
  CHECK(FLAGS_global_memstore_size_percentage > 0 && FLAGS_global_memstore_size_percentage <= 100)
    << Substitute(
        "Flag global_memstore_size_percentage must be greater than 0 and at most 100. "
        "Current value: $0",
        FLAGS_global_memstore_size_percentage);
  int64_t total_ram_avail = MemTracker::GetRootTracker()->limit();
  size_t memstore_size_bytes = total_ram_avail * FLAGS_global_memstore_size_percentage / 100;

  if (FLAGS_global_memstore_size_mb_max != 0) {
    // The flag is in megabytes; shift by 20 bits to convert to bytes.
    memstore_size_bytes = std::min(memstore_size_bytes,
                                   static_cast<size_t>(FLAGS_global_memstore_size_mb_max << 20));
  }

  // Add memory monitor and background thread for flushing.
  // TODO(zhaoalex): replace task with Poller
  background_task_.reset(new BackgroundTask(
    std::function<void()>([this]() { FlushTabletIfLimitExceeded(); }),
    "tablet manager",
    "flush scheduler bgtask"));
  options->memory_monitor = std::make_shared<rocksdb::MemoryMonitor>(
      memstore_size_bytes,
      std::function<void()>([this](){
                              YB_WARN_NOT_OK(background_task_->Wake(), "Wakeup error"); }));

  // Must assign memory_monitor_ after configuring the background task.
  memory_monitor_ = options->memory_monitor;
}
243
244
0
// Evicts entries from the log caches of the tablet peers, largest cache first, until
// bytes_to_evict bytes have been released or every peer has been visited.
//
// Does nothing when FLAGS_enable_log_cache_gc is false. When
// FLAGS_log_cache_gc_evict_only_over_allocated is set, the request is limited to the amount by
// which log_cache_mem_tracker's consumption exceeds its limit, and nothing is evicted if the
// tracker has no limit or is not over it.
void TabletMemoryManager::LogCacheGC(MemTracker* log_cache_mem_tracker, size_t bytes_to_evict) {
  if (!FLAGS_enable_log_cache_gc) {
    return;
  }

  if (FLAGS_log_cache_gc_evict_only_over_allocated) {
    if (!log_cache_mem_tracker->has_limit()) {
      return;
    }
    auto limit = log_cache_mem_tracker->limit();
    auto consumption = log_cache_mem_tracker->consumption();
    if (consumption <= limit) {
      return;
    }
    bytes_to_evict = std::min<size_t>(bytes_to_evict, consumption - limit);
  }

  // Snapshot every peer's log cache size exactly once. Sorting on values that are re-read on
  // each comparison is only valid if they cannot change during the sort; if they do, the
  // comparator stops being a strict weak ordering and std::sort has undefined behaviour.
  // NOTE(review): log cache sizes are presumably updated by other threads - confirm.
  struct Candidate {
    tablet::TabletPeerPtr peer;
    size_t log_cache_size;
  };
  std::vector<Candidate> candidates;
  {
    auto peers = peers_fn_();
    candidates.reserve(peers.size());
    for (auto& peer : peers) {
      const size_t log_cache_size = GetLogCacheSize(peer.get());
      // Peers with an empty log cache have nothing to evict.
      if (log_cache_size > 0) {
        candidates.push_back({std::move(peer), log_cache_size});
      }
    }
  }

  // Sort by inverse log size.
  std::sort(candidates.begin(), candidates.end(),
            [](const Candidate& lhs, const Candidate& rhs) {
    return lhs.log_cache_size > rhs.log_cache_size;
  });

  size_t total_evicted = 0;
  for (const auto& candidate : candidates) {
    // Only ask for what is still missing after the previous peers.
    size_t evicted = down_cast<consensus::RaftConsensus*>(
        candidate.peer->consensus())->EvictLogCache(bytes_to_evict - total_evicted);
    total_evicted += evicted;
    if (total_evicted >= bytes_to_evict) {
      break;
    }
  }

  LOG(INFO) << "Evicted from log cache: " << HumanReadableNumBytes::ToString(total_evicted)
            << ", required: " << HumanReadableNumBytes::ToString(bytes_to_evict);
}
284
285
691
void TabletMemoryManager::FlushTabletIfLimitExceeded() {
286
691
  int iteration = 0;
287
1.11k
  while (memory_monitor_->Exceeded() ||
288
1.11k
         
(694
iteration++ == 0694
&&
FLAGS_TEST_pretend_memory_exceeded_enforce_flush691
)) {
289
419
    YB_LOG_EVERY_N_SECS(INFO, 5) << Format("Memstore global limit of $0 bytes reached, looking for "
290
216
                                           "tablet to flush", memory_monitor_->limit());
291
419
    auto flush_tick = rocksdb::FlushTick();
292
419
    tablet::TabletPeerPtr peer_to_flush = TabletToFlush();
293
419
    if (peer_to_flush) {
294
416
      auto tablet_to_flush = peer_to_flush->shared_tablet();
295
      // TODO(bojanserafimov): If peer_to_flush flushes now because of other reasons,
296
      // we will schedule a second flush, which will unnecessarily stall writes for a short time.
297
      // This will not happen often, but should be fixed.
298
416
      if (tablet_to_flush) {
299
416
        LOG(INFO)
300
416
            << LogPrefix(peer_to_flush)
301
416
            << "Flushing tablet with oldest memstore write at "
302
416
            << tablet_to_flush->OldestMutableMemtableWriteHybridTime();
303
416
        WARN_NOT_OK(
304
416
            tablet_to_flush->Flush(
305
416
                tablet::FlushMode::kAsync, tablet::FlushFlags::kAllDbs, flush_tick),
306
416
            Substitute("Flush failed on $0", peer_to_flush->tablet_id()));
307
416
        for (auto listener : TEST_listeners) {
308
0
          listener->StartedFlush(peer_to_flush->tablet_id());
309
0
        }
310
416
      }
311
416
    }
312
419
  }
313
691
}
314
315
// Return the tablet with the oldest write in memstore, or nullptr if all tablet memstores are
316
// empty or about to flush.
317
419
tablet::TabletPeerPtr TabletMemoryManager::TabletToFlush() {
318
419
  HybridTime oldest_write_in_memstores = HybridTime::kMax;
319
419
  tablet::TabletPeerPtr tablet_to_flush;
320
17.3k
  for (const tablet::TabletPeerPtr& peer : peers_fn_()) {
321
17.3k
    const auto tablet = peer->shared_tablet();
322
17.3k
    if (tablet) {
323
17.3k
      const auto ht = tablet->OldestMutableMemtableWriteHybridTime();
324
17.3k
      if (ht.ok()) {
325
17.3k
        if (*ht < oldest_write_in_memstores) {
326
1.25k
          oldest_write_in_memstores = *ht;
327
1.25k
          tablet_to_flush = peer;
328
1.25k
        }
329
17.3k
      } else {
330
0
        YB_LOG_EVERY_N_SECS(WARNING, 5) << Format(
331
0
            "Failed to get oldest mutable memtable write ht for tablet $0: $1",
332
0
            tablet->tablet_id(), ht.status());
333
0
      }
334
17.3k
    }
335
17.3k
  }
336
419
  return tablet_to_flush;
337
419
}
338
339
416
// Builds the log line prefix "T <tablet id> P <permanent uuid> : " for the given peer.
std::string TabletMemoryManager::LogPrefix(const tablet::TabletPeerPtr& peer) const {
  const auto& tablet_id = peer->tablet_id();
  const auto& peer_uuid = peer->permanent_uuid();
  return Substitute("T $0 P $1 : ", tablet_id, peer_uuid);
}
344
345
}  // namespace tserver
346
}  // namespace yb