YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/tserver/service_util.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/tserver/service_util.h"
15
16
#include "yb/common/wire_protocol.h"
17
18
#include "yb/consensus/consensus.h"
19
#include "yb/consensus/consensus_error.h"
20
#include "yb/consensus/raft_consensus.h"
21
22
#include "yb/tablet/tablet.h"
23
#include "yb/tablet/tablet_metadata.h"
24
#include "yb/tablet/tablet_metrics.h"
25
26
#include "yb/tserver/tserver_error.h"
27
28
#include "yb/util/flag_tags.h"
29
#include "yb/util/mem_tracker.h"
30
#include "yb/util/metrics.h"
31
32
DEFINE_test_flag(bool, assert_reads_from_follower_rejected_because_of_staleness, false,
33
                 "If set, we verify that the consistency level is CONSISTENT_PREFIX, and that "
34
                 "a follower receives the request, but that it gets rejected because it's a stale "
35
                 "follower");
36
37
DEFINE_uint64(
38
    max_stale_read_bound_time_ms, 60000,
39
    "If we are allowed to read from followers, specify the maximum time a follower can be behind "
40
    "by using the last message received from the leader. If set to zero, a read can be served by a "
41
    "follower regardless of when was the last time it received a message from the leader or how "
42
    "far behind this follower is.");
43
TAG_FLAG(max_stale_read_bound_time_ms, evolving);
44
TAG_FLAG(max_stale_read_bound_time_ms, runtime);
45
46
DEFINE_uint64(sst_files_soft_limit, 24,
47
              "When majority SST files number is greater that this limit, we will start rejecting "
48
              "part of write requests. The higher the number of SST files, the higher probability "
49
              "of rejection.");
50
TAG_FLAG(sst_files_soft_limit, runtime);
51
52
DEFINE_uint64(sst_files_hard_limit, 48,
53
              "When majority SST files number is greater that this limit, we will reject all write "
54
              "requests.");
55
TAG_FLAG(sst_files_hard_limit, runtime);
56
57
DEFINE_test_flag(int32, write_rejection_percentage, 0,
58
                 "Reject specified percentage of writes.");
59
60
DEFINE_uint64(min_rejection_delay_ms, 100,
61
              "Minimal delay for rejected write to be retried in milliseconds.");
62
TAG_FLAG(min_rejection_delay_ms, runtime);
63
64
DEFINE_uint64(max_rejection_delay_ms, 5000, ""
65
              "Maximal delay for rejected write to be retried in milliseconds.");
66
TAG_FLAG(max_rejection_delay_ms, runtime);
67
68
DECLARE_int32(memory_limit_warn_threshold_percentage);
69
70
namespace yb {
71
namespace tserver {
72
73
void SetupErrorAndRespond(TabletServerErrorPB* error,
74
                          const Status& s,
75
                          TabletServerErrorPB::Code code,
76
311k
                          rpc::RpcContext* context) {
77
  // Generic "service unavailable" errors will cause the client to retry later.
78
311k
  if (code == TabletServerErrorPB::UNKNOWN_ERROR) {
79
177k
    if (s.IsServiceUnavailable()) {
80
122
      TabletServerDelay delay(s);
81
122
      if (!delay.value().Initialized()) {
82
116
        context->RespondRpcFailure(rpc::ErrorStatusPB::ERROR_SERVER_TOO_BUSY, s);
83
116
        return;
84
116
      }
85
177k
    }
86
177k
    consensus::ConsensusError consensus_error(s);
87
177k
    if (consensus_error.value() == consensus::ConsensusErrorPB::TABLET_SPLIT) {
88
13
      code = TabletServerErrorPB::TABLET_SPLIT;
89
13
    }
90
177k
  }
91
92
311k
  StatusToPB(s, error->mutable_status());
93
311k
  error->set_code(code);
94
311k
  context->RespondSuccess();
95
311k
}
96
97
void SetupErrorAndRespond(TabletServerErrorPB* error,
98
                          const Status& s,
99
305k
                          rpc::RpcContext* context) {
100
305k
  auto ts_error = TabletServerError::FromStatus(s);
101
305k
  SetupErrorAndRespond(
102
177k
      error, s, ts_error ? ts_error->value() : TabletServerErrorPB::UNKNOWN_ERROR, context);
103
305k
}
104
105
0
void SetupError(TabletServerErrorPB* error, const Status& s) {
106
0
  auto ts_error = TabletServerError::FromStatus(s);
107
0
  auto code = ts_error ? ts_error->value() : TabletServerErrorPB::UNKNOWN_ERROR;
108
0
  if (code == TabletServerErrorPB::UNKNOWN_ERROR) {
109
0
    consensus::ConsensusError consensus_error(s);
110
0
    if (consensus_error.value() == consensus::ConsensusErrorPB::TABLET_SPLIT) {
111
0
      code = TabletServerErrorPB::TABLET_SPLIT;
112
0
    }
113
0
  }
114
0
  StatusToPB(s, error->mutable_status());
115
0
  error->set_code(code);
116
0
}
117
118
6.80M
Result<int64_t> LeaderTerm(const tablet::TabletPeer& tablet_peer) {
119
6.80M
  std::shared_ptr<consensus::Consensus> consensus = tablet_peer.shared_consensus();
120
6.80M
  if (!consensus) {
121
0
    auto state = tablet_peer.state();
122
0
    if (state != tablet::RaftGroupStatePB::SHUTDOWN) {
123
      // Should not happen.
124
0
      return STATUS(IllegalState, "Tablet peer does not have consensus, but in $0 state",
125
0
                    tablet::RaftGroupStatePB_Name(state));
126
0
    }
127
0
    return STATUS(Aborted, "Tablet peer was closed");
128
0
  }
129
6.80M
  auto leader_state = consensus->GetLeaderState();
130
131
12.2k
  VLOG(1) << Format(
132
12.2k
      "Check for tablet $0 peer $1. Peer role is $2. Leader status is $3.",
133
12.2k
      tablet_peer.tablet_id(), tablet_peer.permanent_uuid(),
134
12.2k
      consensus->role(), to_underlying(leader_state.status));
135
136
6.80M
  if (!leader_state.ok()) {
137
83.6k
    typedef consensus::LeaderStatus LeaderStatus;
138
83.6k
    auto status = leader_state.CreateStatus();
139
83.6k
    switch (leader_state.status) {
140
76.9k
      case LeaderStatus::NOT_LEADER: FALLTHROUGH_INTENDED;
141
78.8k
      case LeaderStatus::LEADER_BUT_NO_MAJORITY_REPLICATED_LEASE:
142
        // We are returning a NotTheLeader as opposed to LeaderNotReady, because there is a chance
143
        // that we're a partitioned-away leader, and the client needs to do another leader lookup.
144
78.8k
        return status.CloneAndAddErrorCode(TabletServerError(TabletServerErrorPB::NOT_THE_LEADER));
145
2.11k
      case LeaderStatus::LEADER_BUT_NO_OP_NOT_COMMITTED: FALLTHROUGH_INTENDED;
146
4.72k
      case LeaderStatus::LEADER_BUT_OLD_LEADER_MAY_HAVE_LEASE:
147
4.72k
        return status.CloneAndAddErrorCode(TabletServerError(
148
4.72k
            TabletServerErrorPB::LEADER_NOT_READY_TO_SERVE));
149
0
      case LeaderStatus::LEADER_AND_READY:
150
0
        LOG(FATAL) << "Unexpected status: " << to_underlying(leader_state.status);
151
83.6k
    }
152
0
    FATAL_INVALID_ENUM_VALUE(LeaderStatus, leader_state.status);
153
0
  }
154
155
6.72M
  return leader_state.term;
156
6.80M
}
157
158
3.43M
void LeaderTabletPeer::FillTabletPeer(TabletPeerTablet source) {
159
3.43M
  peer = std::move(source.tablet_peer);
160
3.43M
  tablet = std::move(source.tablet);
161
3.43M
}
162
163
2.88M
CHECKED_STATUS LeaderTabletPeer::FillTerm() {
164
2.88M
  auto leader_term_result = LeaderTerm(*peer);
165
2.88M
  if (!leader_term_result.ok()) {
166
61.5k
    auto tablet = peer->shared_tablet();
167
61.6k
    if (tablet) {
168
      // It could happen that tablet becomes nullptr due to shutdown.
169
61.6k
      tablet->metrics()->not_leader_rejections->Increment();
170
61.6k
    }
171
61.5k
    return leader_term_result.status();
172
61.5k
  }
173
2.82M
  leader_term = *leader_term_result;
174
175
2.82M
  return Status::OK();
176
2.82M
}
177
178
Result<LeaderTabletPeer> LookupLeaderTablet(
179
    TabletPeerLookupIf* tablet_manager,
180
    const std::string& tablet_id,
181
2.88M
    TabletPeerTablet peer) {
182
2.88M
  if (peer.tablet_peer) {
183
0
    LOG_IF(DFATAL, peer.tablet_peer->tablet_id() != tablet_id)
184
0
        << "Mismatching table ids: peer " << peer.tablet_peer->tablet_id()
185
0
        << " vs " << tablet_id;
186
0
    LOG_IF(DFATAL, !peer.tablet) << "Empty tablet pointer for tablet id : " << tablet_id;
187
2.88M
  } else {
188
2.88M
    peer = VERIFY_RESULT(LookupTabletPeer(tablet_manager, tablet_id));
189
2.88M
  }
190
2.88M
  LeaderTabletPeer result;
191
2.88M
  result.FillTabletPeer(std::move(peer));
192
193
2.88M
  RETURN_NOT_OK(result.FillTerm());
194
2.82M
  return result;
195
2.88M
}
196
197
Status CheckPeerIsReady(
198
3.92M
    const tablet::TabletPeer& tablet_peer, AllowSplitTablet allow_split_tablet) {
199
3.92M
  auto consensus = tablet_peer.shared_consensus();
200
3.92M
  if (!consensus) {
201
0
    return STATUS(
202
0
        IllegalState, Format("Consensus not available for tablet $0.", tablet_peer.tablet_id()),
203
0
        Slice(), TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
204
0
  }
205
206
3.92M
  Status s = tablet_peer.CheckRunning();
207
3.92M
  if (!s.ok()) {
208
0
    return s.CloneAndAddErrorCode(TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
209
0
  }
210
211
3.92M
  auto* tablet = tablet_peer.tablet();
212
3.92M
  SCHECK(tablet != nullptr, IllegalState, "Expected tablet peer to have a tablet");
213
3.92M
  const auto tablet_data_state = tablet->metadata()->tablet_data_state();
214
3.92M
  if (!allow_split_tablet &&
215
3.92M
      tablet_data_state == tablet::TabletDataState::TABLET_DATA_SPLIT_COMPLETED) {
216
6
    auto split_child_tablet_ids = tablet->metadata()->split_child_tablet_ids();
217
6
    return STATUS(
218
6
               IllegalState,
219
6
               Format("The tablet $0 is in $1 state",
220
6
                      tablet->tablet_id(),
221
6
                      TabletDataState_Name(tablet_data_state)),
222
6
               TabletServerError(TabletServerErrorPB::TABLET_SPLIT))
223
6
        .CloneAndAddErrorCode(SplitChildTabletIdsData(
224
6
            std::vector<TabletId>(split_child_tablet_ids.begin(), split_child_tablet_ids.end())));
225
    // TODO(tsplit): If we get FS corruption on 1 node, we can just delete that tablet copy and
226
    // bootstrap from a good leader. If there's a way that all peers replicated the SPLIT and
227
    // modified their data state, but all had some failures (code bug?).
228
    // Perhaps we should consider a tool for editing the data state?
229
6
  }
230
3.92M
  return Status::OK();
231
3.92M
}
232
233
234
3.92M
CHECKED_STATUS CheckPeerIsLeader(const tablet::TabletPeer& tablet_peer) {
235
3.92M
  return ResultToStatus(LeaderTerm(tablet_peer));
236
3.92M
}
237
238
Result<TabletPeerTablet> LookupTabletPeer(
239
    TabletPeerLookupIf* tablet_manager,
240
17.9M
    const TabletId& tablet_id) {
241
17.9M
  TabletPeerTablet result;
242
17.9M
  auto status = tablet_manager->GetTabletPeer(tablet_id, &result.tablet_peer);
243
17.9M
  if (PREDICT_FALSE(!status.ok())) {
244
100
    auto code = status.IsServiceUnavailable() ? TabletServerErrorPB::UNKNOWN_ERROR
245
6.34k
                                              : TabletServerErrorPB::TABLET_NOT_FOUND;
246
6.44k
    return status.CloneAndAddErrorCode(TabletServerError(code));
247
6.44k
  }
248
249
  // Check RUNNING state.
250
17.9M
  tablet::RaftGroupStatePB state = result.tablet_peer->state();
251
17.9M
  if (PREDICT_FALSE(state != tablet::RUNNING)) {
252
49.2k
    Status s = STATUS(IllegalState, "Tablet not RUNNING", tablet::RaftGroupStateError(state))
253
49.2k
        .CloneAndAddErrorCode(TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
254
49.2k
    return s;
255
49.2k
  }
256
257
17.9M
  result.tablet = result.tablet_peer->shared_tablet();
258
17.9M
  if (!result.tablet) {
259
5
    Status s = STATUS(IllegalState,
260
5
                      "Tablet not running",
261
5
                      TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
262
5
    return s;
263
5
  }
264
17.9M
  return result;
265
17.9M
}
266
267
Result<std::shared_ptr<tablet::AbstractTablet>> GetTablet(
268
    TabletPeerLookupIf* tablet_manager, const TabletId& tablet_id,
269
    tablet::TabletPeerPtr tablet_peer, YBConsistencyLevel consistency_level,
270
3.92M
    AllowSplitTablet allow_split_tablet) {
271
3.92M
  tablet::TabletPtr tablet_ptr = nullptr;
272
3.92M
  if (tablet_peer) {
273
1.58k
    DCHECK_EQ(tablet_peer->tablet_id(), tablet_id);
274
1.58k
    tablet_ptr = tablet_peer->shared_tablet();
275
0
    LOG_IF(DFATAL, tablet_ptr == nullptr)
276
0
        << "Empty tablet pointer for tablet id: " << tablet_id;
277
3.91M
  } else {
278
3.91M
    auto tablet_peer_result = VERIFY_RESULT(LookupTabletPeer(tablet_manager, tablet_id));
279
280
3.91M
    tablet_peer = std::move(tablet_peer_result.tablet_peer);
281
3.91M
    tablet_ptr = std::move(tablet_peer_result.tablet);
282
3.91M
  }
283
284
3.92M
  RETURN_NOT_OK(CheckPeerIsReady(*tablet_peer, allow_split_tablet));
285
286
  // Check for leader only in strong consistency level.
287
3.92M
  if (consistency_level == YBConsistencyLevel::STRONG) {
288
3.90M
    if (PREDICT_FALSE(FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
289
0
      LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true but "
290
0
                    "consistency level is invalid: YBConsistencyLevel::STRONG";
291
0
    }
292
293
3.90M
    RETURN_NOT_OK(CheckPeerIsLeader(*tablet_peer));
294
14.6k
  } else {
295
14.6k
    auto s = CheckPeerIsLeader(*tablet_peer.get());
296
297
    // Peer is not the leader, so check that the time since it last heard from the leader is less
298
    // than FLAGS_max_stale_read_bound_time_ms.
299
14.6k
    if (PREDICT_FALSE(!s.ok())) {
300
11.3k
      if (FLAGS_max_stale_read_bound_time_ms > 0) {
301
11.3k
        auto consensus = tablet_peer->shared_consensus();
302
        // TODO(hector): This safe time could be reused by the read operation.
303
11.3k
        auto safe_time_micros = tablet_peer->tablet()->mvcc_manager()->SafeTimeForFollower(
304
11.3k
            HybridTime::kMin, CoarseTimePoint::min()).GetPhysicalValueMicros();
305
11.3k
        auto now_micros = tablet_peer->clock_ptr()->Now().GetPhysicalValueMicros();
306
11.3k
        auto follower_staleness_us = now_micros - safe_time_micros;
307
11.3k
        if (follower_staleness_us > FLAGS_max_stale_read_bound_time_ms * 1000) {
308
0
          VLOG(1) << "Rejecting stale read with staleness "
309
0
                     << follower_staleness_us << "us";
310
2.00k
          return STATUS(
311
2.00k
              IllegalState, "Stale follower",
312
2.00k
              TabletServerError(TabletServerErrorPB::STALE_FOLLOWER));
313
9.34k
        } else if (PREDICT_FALSE(
314
0
            FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
315
0
          LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true,"
316
0
                     << " but peer " << tablet_peer->permanent_uuid()
317
0
                     << " for tablet: " << tablet_id
318
0
                     << " is not stale. Time since last update from leader: "
319
0
                     << follower_staleness_us << "us";
320
9.34k
        } else {
321
1
          VLOG(3) << "Reading from follower with staleness: " << follower_staleness_us << "us";
322
9.34k
        }
323
11.3k
      }
324
3.34k
    } else {
325
      // We are here because we are the leader.
326
3.34k
      if (PREDICT_FALSE(FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
327
0
        LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true but "
328
0
                   << " peer " << tablet_peer->permanent_uuid()
329
0
                   << " is the leader for tablet " << tablet_id;
330
0
      }
331
3.34k
    }
332
14.6k
  }
333
334
3.91M
  auto tablet = tablet_peer->shared_tablet();
335
3.91M
  if (PREDICT_FALSE(!tablet)) {
336
0
    return STATUS_EC_FORMAT(
337
0
        IllegalState, TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING),
338
0
        "Tablet $0 is not running", tablet_id);
339
0
  }
340
341
3.91M
  return tablet;
342
3.91M
}
343
344
// overlimit - we have 2 bounds, value and random score.
345
// overlimit is calculated as:
346
// score + (value - lower_bound) / (upper_bound - lower_bound).
347
// And it will be >= 1.0 when this function is invoked.
348
CHECKED_STATUS RejectWrite(
349
6
    tablet::TabletPeer* tablet_peer, const std::string& message, double overlimit) {
350
6
  int64_t delay_ms = fit_bounds<int64_t>((overlimit - 1.0) * FLAGS_max_rejection_delay_ms,
351
6
                                         FLAGS_min_rejection_delay_ms,
352
6
                                         FLAGS_max_rejection_delay_ms);
353
6
  auto status = STATUS(
354
6
      ServiceUnavailable, message, TabletServerDelay(std::chrono::milliseconds(delay_ms)));
355
6
  YB_LOG_EVERY_N_SECS(WARNING, 1)
356
2
      << "T " << tablet_peer->tablet_id() << " P " << tablet_peer->permanent_uuid()
357
2
      << ": Rejecting Write request, " << status << THROTTLE_MSG;
358
6
  return status;
359
6
}
360
361
1.48M
CHECKED_STATUS CheckWriteThrottling(double score, tablet::TabletPeer* tablet_peer) {
362
  // Check for memory pressure; don't bother doing any additional work if we've
363
  // exceeded the limit.
364
1.48M
  auto tablet = tablet_peer->tablet();
365
1.48M
  auto soft_limit_exceeded_result = tablet->mem_tracker()->AnySoftLimitExceeded(score);
366
1.48M
  if (soft_limit_exceeded_result.exceeded) {
367
16
    tablet->metrics()->leader_memory_pressure_rejections->Increment();
368
16
    string msg = StringPrintf(
369
16
        "Soft memory limit exceeded (at %.2f%% of capacity), score: %.2f",
370
16
        soft_limit_exceeded_result.current_capacity_pct, score);
371
16
    if (soft_limit_exceeded_result.current_capacity_pct >=
372
0
            FLAGS_memory_limit_warn_threshold_percentage) {
373
0
      YB_LOG_EVERY_N_SECS(WARNING, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG;
374
16
    } else {
375
16
      YB_LOG_EVERY_N_SECS(INFO, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG;
376
16
    }
377
16
    return STATUS(ServiceUnavailable, msg);
378
16
  }
379
380
1.48M
  const uint64_t num_sst_files = tablet_peer->raft_consensus()->MajorityNumSSTFiles();
381
1.48M
  const auto sst_files_soft_limit = FLAGS_sst_files_soft_limit;
382
1.48M
  const int64_t sst_files_used_delta = num_sst_files - sst_files_soft_limit;
383
1.48M
  if (sst_files_used_delta >= 0) {
384
43
    const auto sst_files_hard_limit = FLAGS_sst_files_hard_limit;
385
43
    const auto sst_files_full_delta = sst_files_hard_limit - sst_files_soft_limit;
386
43
    if (sst_files_used_delta >= sst_files_full_delta * (1 - score)) {
387
6
      tablet->metrics()->majority_sst_files_rejections->Increment();
388
6
      auto message = Format("SST files limit exceeded $0 against ($1, $2), score: $3",
389
6
                            num_sst_files, sst_files_soft_limit, sst_files_hard_limit, score);
390
6
      auto overlimit = sst_files_full_delta > 0
391
6
          ? score + static_cast<double>(sst_files_used_delta) / sst_files_full_delta
392
0
          : 2.0;
393
6
      return RejectWrite(tablet_peer, message, overlimit);
394
6
    }
395
1.48M
  }
396
397
1.48M
  if (FLAGS_TEST_write_rejection_percentage != 0 &&
398
0
      score >= 1.0 - FLAGS_TEST_write_rejection_percentage * 0.01) {
399
0
    auto status = Format("TEST: Write request rejected, desired percentage: $0, score: $1",
400
0
                         FLAGS_TEST_write_rejection_percentage, score);
401
0
    return RejectWrite(tablet_peer, status, score + FLAGS_TEST_write_rejection_percentage * 0.01);
402
0
  }
403
404
1.48M
  return Status::OK();
405
1.48M
}
406
407
} // namespace tserver
408
} // namespace yb