YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/yb/tserver/service_util.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include "yb/tserver/service_util.h"
15
16
#include "yb/common/wire_protocol.h"
17
18
#include "yb/consensus/consensus.h"
19
#include "yb/consensus/consensus_error.h"
20
#include "yb/consensus/raft_consensus.h"
21
22
#include "yb/tablet/tablet.h"
23
#include "yb/tablet/tablet_metadata.h"
24
#include "yb/tablet/tablet_metrics.h"
25
26
#include "yb/tserver/tserver_error.h"
27
28
#include "yb/util/flag_tags.h"
29
#include "yb/util/mem_tracker.h"
30
#include "yb/util/metrics.h"
31
32
DEFINE_test_flag(bool, assert_reads_from_follower_rejected_because_of_staleness, false,
33
                 "If set, we verify that the consistency level is CONSISTENT_PREFIX, and that "
34
                 "a follower receives the request, but that it gets rejected because it's a stale "
35
                 "follower");
36
37
DEFINE_uint64(
38
    max_stale_read_bound_time_ms, 60000,
39
    "If we are allowed to read from followers, specify the maximum time a follower can be behind "
40
    "by using the last message received from the leader. If set to zero, a read can be served by a "
41
    "follower regardless of when was the last time it received a message from the leader or how "
42
    "far behind this follower is.");
43
TAG_FLAG(max_stale_read_bound_time_ms, evolving);
44
TAG_FLAG(max_stale_read_bound_time_ms, runtime);
45
46
DEFINE_uint64(sst_files_soft_limit, 24,
47
              "When majority SST files number is greater that this limit, we will start rejecting "
48
              "part of write requests. The higher the number of SST files, the higher probability "
49
              "of rejection.");
50
TAG_FLAG(sst_files_soft_limit, runtime);
51
52
DEFINE_uint64(sst_files_hard_limit, 48,
53
              "When majority SST files number is greater that this limit, we will reject all write "
54
              "requests.");
55
TAG_FLAG(sst_files_hard_limit, runtime);
56
57
DEFINE_test_flag(int32, write_rejection_percentage, 0,
58
                 "Reject specified percentage of writes.");
59
60
DEFINE_uint64(min_rejection_delay_ms, 100,
61
              "Minimal delay for rejected write to be retried in milliseconds.");
62
TAG_FLAG(min_rejection_delay_ms, runtime);
63
64
DEFINE_uint64(max_rejection_delay_ms, 5000, ""
65
              "Maximal delay for rejected write to be retried in milliseconds.");
66
TAG_FLAG(max_rejection_delay_ms, runtime);
67
68
DECLARE_int32(memory_limit_warn_threshold_percentage);
69
70
namespace yb {
71
namespace tserver {
72
73
void SetupErrorAndRespond(TabletServerErrorPB* error,
74
                          const Status& s,
75
                          TabletServerErrorPB::Code code,
76
435k
                          rpc::RpcContext* context) {
77
  // Generic "service unavailable" errors will cause the client to retry later.
78
435k
  if (code == TabletServerErrorPB::UNKNOWN_ERROR) {
79
250k
    if (s.IsServiceUnavailable()) {
80
127
      TabletServerDelay delay(s);
81
127
      if (!delay.value().Initialized()) {
82
127
        context->RespondRpcFailure(rpc::ErrorStatusPB::ERROR_SERVER_TOO_BUSY, s);
83
127
        return;
84
127
      }
85
127
    }
86
250k
    consensus::ConsensusError consensus_error(s);
87
250k
    if (consensus_error.value() == consensus::ConsensusErrorPB::TABLET_SPLIT) {
88
23
      code = TabletServerErrorPB::TABLET_SPLIT;
89
23
    }
90
250k
  }
91
92
435k
  StatusToPB(s, error->mutable_status());
93
435k
  error->set_code(code);
94
435k
  context->RespondSuccess();
95
435k
}
96
97
void SetupErrorAndRespond(TabletServerErrorPB* error,
98
                          const Status& s,
99
422k
                          rpc::RpcContext* context) {
100
422k
  auto ts_error = TabletServerError::FromStatus(s);
101
422k
  SetupErrorAndRespond(
102
422k
      error, s, ts_error ? 
ts_error->value()172k
:
TabletServerErrorPB::UNKNOWN_ERROR250k
, context);
103
422k
}
104
105
0
// Fills `error` from status `s` without responding to any RPC.
//
// The code is the TabletServerError attached to `s` if there is one. Otherwise it is
// TABLET_SPLIT when `s` carries the TABLET_SPLIT consensus error, and UNKNOWN_ERROR in all
// remaining cases.
void SetupError(TabletServerErrorPB* error, const Status& s) {
  TabletServerErrorPB::Code code = TabletServerErrorPB::UNKNOWN_ERROR;
  if (auto attached_error = TabletServerError::FromStatus(s)) {
    code = attached_error->value();
  }

  if (code == TabletServerErrorPB::UNKNOWN_ERROR) {
    consensus::ConsensusError error_from_consensus(s);
    const bool is_tablet_split =
        error_from_consensus.value() == consensus::ConsensusErrorPB::TABLET_SPLIT;
    if (is_tablet_split) {
      code = TabletServerErrorPB::TABLET_SPLIT;
    }
  }

  StatusToPB(s, error->mutable_status());
  error->set_code(code);
}
117
118
12.8M
// Returns the leader term reported by the consensus of `tablet_peer`, or a non-OK status when
// the peer cannot serve as a leader right now:
//   - Aborted: the peer has no consensus instance and is in SHUTDOWN state.
//   - IllegalState: the peer has no consensus instance but is in some other state.
//   - The leader-state status tagged with NOT_THE_LEADER: the peer is not the leader, or is a
//     leader without a majority-replicated lease.
//   - The leader-state status tagged with LEADER_NOT_READY_TO_SERVE: the peer is a leader whose
//     no-op is not committed yet, or an old leader may still hold a lease.
Result<int64_t> LeaderTerm(const tablet::TabletPeer& tablet_peer) {
  std::shared_ptr<consensus::Consensus> consensus = tablet_peer.shared_consensus();
  if (!consensus) {
    auto state = tablet_peer.state();
    if (state != tablet::RaftGroupStatePB::SHUTDOWN) {
      // Should not happen.
      // STATUS_FORMAT (not STATUS) is required so that $0 is substituted with the state name.
      return STATUS_FORMAT(IllegalState, "Tablet peer does not have consensus, but in $0 state",
                           tablet::RaftGroupStatePB_Name(state));
    }
    return STATUS(Aborted, "Tablet peer was closed");
  }
  auto leader_state = consensus->GetLeaderState();

  VLOG(1) << Format(
      "Check for tablet $0 peer $1. Peer role is $2. Leader status is $3.",
      tablet_peer.tablet_id(), tablet_peer.permanent_uuid(),
      consensus->role(), to_underlying(leader_state.status));

  if (!leader_state.ok()) {
    using LeaderStatus = consensus::LeaderStatus;
    auto status = leader_state.CreateStatus();
    switch (leader_state.status) {
      case LeaderStatus::NOT_LEADER: FALLTHROUGH_INTENDED;
      case LeaderStatus::LEADER_BUT_NO_MAJORITY_REPLICATED_LEASE:
        // We are returning a NotTheLeader as opposed to LeaderNotReady, because there is a chance
        // that we're a partitioned-away leader, and the client needs to do another leader lookup.
        return status.CloneAndAddErrorCode(TabletServerError(TabletServerErrorPB::NOT_THE_LEADER));
      case LeaderStatus::LEADER_BUT_NO_OP_NOT_COMMITTED: FALLTHROUGH_INTENDED;
      case LeaderStatus::LEADER_BUT_OLD_LEADER_MAY_HAVE_LEASE:
        return status.CloneAndAddErrorCode(TabletServerError(
            TabletServerErrorPB::LEADER_NOT_READY_TO_SERVE));
      case LeaderStatus::LEADER_AND_READY:
        // Contradicts !leader_state.ok() checked above.
        LOG(FATAL) << "Unexpected status: " << to_underlying(leader_state.status);
    }
    FATAL_INVALID_ENUM_VALUE(LeaderStatus, leader_state.status);
  }

  return leader_state.term;
}
157
158
6.05M
// Takes over the tablet peer and tablet pointers held by `source`.
void LeaderTabletPeer::FillTabletPeer(TabletPeerTablet source) {
  // The two members are independent of each other, so the order of the moves does not matter.
  tablet = std::move(source.tablet);
  peer = std::move(source.tablet_peer);
}
162
163
5.12M
// Stores the current leader term of `peer` into `leader_term`.
//
// When the term cannot be obtained, the not_leader_rejections metric of the peer's tablet is
// incremented (if the tablet is still available) and the failure status is returned.
CHECKED_STATUS LeaderTabletPeer::FillTerm() {
  auto term_or_error = LeaderTerm(*peer);
  if (term_or_error.ok()) {
    leader_term = *term_or_error;
    return Status::OK();
  }

  // It could happen that the tablet becomes nullptr due to shutdown, so the pointer is checked
  // before the metric is touched.
  if (auto current_tablet = peer->shared_tablet()) {
    current_tablet->metrics()->not_leader_rejections->Increment();
  }
  return term_or_error.status();
}
177
178
// Resolves `tablet_id` to a LeaderTabletPeer, i.e. a tablet peer plus its tablet and leader term.
//
// When `peer.tablet_peer` is already set it is reused; mismatching ids or a missing tablet
// pointer are reported via LOG(DFATAL). Otherwise the peer is looked up through
// `tablet_manager`. A non-OK status is returned when the lookup fails or when the leader term
// cannot be obtained (see LeaderTabletPeer::FillTerm).
Result<LeaderTabletPeer> LookupLeaderTablet(
    TabletPeerLookupIf* tablet_manager,
    const std::string& tablet_id,
    TabletPeerTablet peer) {
  if (peer.tablet_peer) {
    LOG_IF(DFATAL, peer.tablet_peer->tablet_id() != tablet_id)
        << "Mismatching tablet ids: peer " << peer.tablet_peer->tablet_id()
        << " vs " << tablet_id;
    LOG_IF(DFATAL, !peer.tablet) << "Empty tablet pointer for tablet id: " << tablet_id;
  } else {
    peer = VERIFY_RESULT(LookupTabletPeer(tablet_manager, tablet_id));
  }
  LeaderTabletPeer result;
  result.FillTabletPeer(std::move(peer));

  RETURN_NOT_OK(result.FillTerm());
  return result;
}
196
197
// Verifies that `tablet_peer` can serve requests.
//
// Returns a non-OK status when:
//   - the peer has no consensus instance (IllegalState, TABLET_NOT_RUNNING);
//   - CheckRunning() fails (its status, tagged with TABLET_NOT_RUNNING);
//   - the peer has no tablet (IllegalState);
//   - `allow_split_tablet` is false and the tablet data state is TABLET_DATA_SPLIT_COMPLETED
//     (IllegalState, TABLET_SPLIT, with the split child tablet ids attached).
Status CheckPeerIsReady(
    const tablet::TabletPeer& tablet_peer, AllowSplitTablet allow_split_tablet) {
  auto consensus = tablet_peer.shared_consensus();
  if (!consensus) {
    return STATUS(
        IllegalState, Format("Consensus not available for tablet $0.", tablet_peer.tablet_id()),
        Slice(), TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
  }

  Status running_status = tablet_peer.CheckRunning();
  if (!running_status.ok()) {
    return running_status.CloneAndAddErrorCode(
        TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
  }

  auto* tablet = tablet_peer.tablet();
  SCHECK(tablet != nullptr, IllegalState, "Expected tablet peer to have a tablet");

  const auto data_state = tablet->metadata()->tablet_data_state();
  const bool split_completed =
      data_state == tablet::TabletDataState::TABLET_DATA_SPLIT_COMPLETED;
  if (!allow_split_tablet && split_completed) {
    // TODO(tsplit): If we get FS corruption on 1 node, we can just delete that tablet copy and
    // bootstrap from a good leader. If there's a way that all peers replicated the SPLIT and
    // modified their data state, but all had some failures (code bug?).
    // Perhaps we should consider a tool for editing the data state?
    auto child_ids = tablet->metadata()->split_child_tablet_ids();
    auto split_status = STATUS(
        IllegalState,
        Format("The tablet $0 is in $1 state",
               tablet->tablet_id(),
               TabletDataState_Name(data_state)),
        TabletServerError(TabletServerErrorPB::TABLET_SPLIT));
    return split_status.CloneAndAddErrorCode(SplitChildTabletIdsData(
        std::vector<TabletId>(child_ids.begin(), child_ids.end())));
  }

  return Status::OK();
}
232
233
234
7.73M
// Returns OK when LeaderTerm() succeeds for `tablet_peer`, otherwise the status describing why
// the leader term could not be obtained. The term itself is discarded.
CHECKED_STATUS CheckPeerIsLeader(const tablet::TabletPeer& tablet_peer) {
  const auto leader_term = LeaderTerm(tablet_peer);
  return ResultToStatus(leader_term);
}
237
238
// Looks up the tablet peer for `tablet_id` through `tablet_manager` and returns it together
// with its tablet.
//
// Returns a non-OK status when:
//   - the lookup itself fails: the lookup status tagged with UNKNOWN_ERROR if it is
//     ServiceUnavailable, and with TABLET_NOT_FOUND otherwise;
//   - the peer is not in RUNNING state (IllegalState, TABLET_NOT_RUNNING);
//   - the peer has no tablet (IllegalState, TABLET_NOT_RUNNING).
Result<TabletPeerTablet> LookupTabletPeer(
    TabletPeerLookupIf* tablet_manager,
    const TabletId& tablet_id) {
  TabletPeerTablet found;

  auto lookup_status = tablet_manager->GetTabletPeer(tablet_id, &found.tablet_peer);
  if (PREDICT_FALSE(!lookup_status.ok())) {
    auto error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
    if (lookup_status.IsServiceUnavailable()) {
      error_code = TabletServerErrorPB::UNKNOWN_ERROR;
    }
    return lookup_status.CloneAndAddErrorCode(TabletServerError(error_code));
  }

  // Check RUNNING state.
  const tablet::RaftGroupStatePB peer_state = found.tablet_peer->state();
  if (PREDICT_FALSE(peer_state != tablet::RUNNING)) {
    return STATUS(IllegalState, "Tablet not RUNNING", tablet::RaftGroupStateError(peer_state))
        .CloneAndAddErrorCode(TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
  }

  found.tablet = found.tablet_peer->shared_tablet();
  if (!found.tablet) {
    return STATUS(IllegalState,
                  "Tablet not running",
                  TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING));
  }

  return found;
}
266
267
// Returns the tablet that should serve a read for `tablet_id` at the given consistency level.
//
// `tablet_peer` may be passed in by the caller; when it is null the peer is looked up through
// `tablet_manager`. The peer must pass CheckPeerIsReady(). After that:
//   - STRONG consistency: the peer must be the leader (see CheckPeerIsLeader).
//   - Any other level: a non-leader peer is accepted unless --max_stale_read_bound_time_ms is
//     non-zero and the follower's staleness exceeds it, in which case IllegalState with
//     STALE_FOLLOWER is returned.
// --TEST_assert_reads_from_follower_rejected_because_of_staleness turns every outcome other
// than a staleness rejection into LOG(FATAL).
Result<std::shared_ptr<tablet::AbstractTablet>> GetTablet(
    TabletPeerLookupIf* tablet_manager, const TabletId& tablet_id,
    tablet::TabletPeerPtr tablet_peer, YBConsistencyLevel consistency_level,
    AllowSplitTablet allow_split_tablet) {
  tablet::TabletPtr tablet_ptr = nullptr;
  if (tablet_peer) {
    DCHECK_EQ(tablet_peer->tablet_id(), tablet_id);
    tablet_ptr = tablet_peer->shared_tablet();
    LOG_IF(DFATAL, tablet_ptr == nullptr)
        << "Empty tablet pointer for tablet id: " << tablet_id;
  } else {
    auto tablet_peer_result = VERIFY_RESULT(LookupTabletPeer(tablet_manager, tablet_id));

    tablet_peer = std::move(tablet_peer_result.tablet_peer);
    tablet_ptr = std::move(tablet_peer_result.tablet);
  }

  RETURN_NOT_OK(CheckPeerIsReady(*tablet_peer, allow_split_tablet));

  // Check for leader only in strong consistency level.
  if (consistency_level == YBConsistencyLevel::STRONG) {
    if (PREDICT_FALSE(FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
      LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true but "
                    "consistency level is invalid: YBConsistencyLevel::STRONG";
    }

    RETURN_NOT_OK(CheckPeerIsLeader(*tablet_peer));
  } else {
    auto s = CheckPeerIsLeader(*tablet_peer);

    // Peer is not the leader, so check that the time since it last heard from the leader is less
    // than FLAGS_max_stale_read_bound_time_ms.
    if (PREDICT_FALSE(!s.ok())) {
      // The flag is runtime-modifiable, so read it exactly once: otherwise it could be changed
      // to zero between the "enabled" check and the comparison below, which would reject every
      // follower read.
      const uint64_t max_stale_read_bound_time_ms = FLAGS_max_stale_read_bound_time_ms;
      if (max_stale_read_bound_time_ms > 0) {
        // TODO(hector): This safe time could be reused by the read operation.
        auto safe_time_micros = tablet_peer->tablet()->mvcc_manager()->SafeTimeForFollower(
            HybridTime::kMin, CoarseTimePoint::min()).GetPhysicalValueMicros();
        auto now_micros = tablet_peer->clock_ptr()->Now().GetPhysicalValueMicros();
        // NOTE(review): if these values are unsigned and the safe time is ever ahead of the
        // local clock, this subtraction wraps around and the read is rejected as stale --
        // confirm that this cannot happen.
        auto follower_staleness_us = now_micros - safe_time_micros;
        if (follower_staleness_us > max_stale_read_bound_time_ms * 1000) {
          VLOG(1) << "Rejecting stale read with staleness "
                     << follower_staleness_us << "us";
          return STATUS(
              IllegalState, "Stale follower",
              TabletServerError(TabletServerErrorPB::STALE_FOLLOWER));
        } else if (PREDICT_FALSE(
            FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
          LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true,"
                     << " but peer " << tablet_peer->permanent_uuid()
                     << " for tablet: " << tablet_id
                     << " is not stale. Time since last update from leader: "
                     << follower_staleness_us << "us";
        } else {
          VLOG(3) << "Reading from follower with staleness: " << follower_staleness_us << "us";
        }
      }
    } else {
      // We are here because we are the leader.
      if (PREDICT_FALSE(FLAGS_TEST_assert_reads_from_follower_rejected_because_of_staleness)) {
        LOG(FATAL) << "--TEST_assert_reads_from_follower_rejected_because_of_staleness is true but "
                   << " peer " << tablet_peer->permanent_uuid()
                   << " is the leader for tablet " << tablet_id;
      }
    }
  }

  // Fetch the tablet again: the result reflects the peer's tablet as of this point.
  auto tablet = tablet_peer->shared_tablet();
  if (PREDICT_FALSE(!tablet)) {
    return STATUS_EC_FORMAT(
        IllegalState, TabletServerError(TabletServerErrorPB::TABLET_NOT_RUNNING),
        "Tablet $0 is not running", tablet_id);
  }

  return tablet;
}
343
344
// overlimit - we have 2 bounds, value and random score.
345
// overlimit is calculated as:
346
// score + (value - lower_bound) / (upper_bound - lower_bound).
347
// And it will be >= 1.0 when this function is invoked.
348
CHECKED_STATUS RejectWrite(
349
0
    tablet::TabletPeer* tablet_peer, const std::string& message, double overlimit) {
350
0
  int64_t delay_ms = fit_bounds<int64_t>((overlimit - 1.0) * FLAGS_max_rejection_delay_ms,
351
0
                                         FLAGS_min_rejection_delay_ms,
352
0
                                         FLAGS_max_rejection_delay_ms);
353
0
  auto status = STATUS(
354
0
      ServiceUnavailable, message, TabletServerDelay(std::chrono::milliseconds(delay_ms)));
355
0
  YB_LOG_EVERY_N_SECS(WARNING, 1)
356
0
      << "T " << tablet_peer->tablet_id() << " P " << tablet_peer->permanent_uuid()
357
0
      << ": Rejecting Write request, " << status << THROTTLE_MSG;
358
0
  return status;
359
0
}
360
361
2.87M
// Decides whether a write to the tablet of `tablet_peer` should be rejected for throttling.
//
// `score` is compared against the thresholds below; the higher the score, the more likely the
// write is to be rejected. Returns OK when the write may proceed, or ServiceUnavailable when:
//   - a soft memory limit is exceeded for this score (no retry delay attached);
//   - the majority SST file count is at or above --sst_files_soft_limit and far enough towards
//     --sst_files_hard_limit for this score (retry delay attached, see RejectWrite);
//   - --TEST_write_rejection_percentage selects this score for rejection.
CHECKED_STATUS CheckWriteThrottling(double score, tablet::TabletPeer* tablet_peer) {
  // Check for memory pressure; don't bother doing any additional work if we've
  // exceeded the limit.
  auto tablet = tablet_peer->tablet();
  auto soft_limit_exceeded_result = tablet->mem_tracker()->AnySoftLimitExceeded(score);
  if (soft_limit_exceeded_result.exceeded) {
    tablet->metrics()->leader_memory_pressure_rejections->Increment();
    const std::string msg = StringPrintf(
        "Soft memory limit exceeded (at %.2f%% of capacity), score: %.2f",
        soft_limit_exceeded_result.current_capacity_pct, score);
    if (soft_limit_exceeded_result.current_capacity_pct >=
            FLAGS_memory_limit_warn_threshold_percentage) {
      YB_LOG_EVERY_N_SECS(WARNING, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG;
    } else {
      YB_LOG_EVERY_N_SECS(INFO, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG;
    }
    return STATUS(ServiceUnavailable, msg);
  }

  const uint64_t num_sst_files = tablet_peer->raft_consensus()->MajorityNumSSTFiles();
  const auto sst_files_soft_limit = FLAGS_sst_files_soft_limit;
  const int64_t sst_files_used_delta = num_sst_files - sst_files_soft_limit;
  if (sst_files_used_delta >= 0) {
    const auto sst_files_hard_limit = FLAGS_sst_files_hard_limit;
    // Both limits are unsigned. If the hard limit is configured below the soft limit, a plain
    // subtraction would wrap around to a huge value and effectively disable rejection, so the
    // range is clamped to zero. A zero range rejects every write once the soft limit is
    // reached, exactly as when both limits are equal.
    const uint64_t sst_files_full_delta = sst_files_hard_limit > sst_files_soft_limit
        ? sst_files_hard_limit - sst_files_soft_limit
        : 0;
    if (sst_files_used_delta >= sst_files_full_delta * (1 - score)) {
      tablet->metrics()->majority_sst_files_rejections->Increment();
      auto message = Format("SST files limit exceeded $0 against ($1, $2), score: $3",
                            num_sst_files, sst_files_soft_limit, sst_files_hard_limit, score);
      auto overlimit = sst_files_full_delta > 0
          ? score + static_cast<double>(sst_files_used_delta) / sst_files_full_delta
          : 2.0;
      return RejectWrite(tablet_peer, message, overlimit);
    }
  }

  if (FLAGS_TEST_write_rejection_percentage != 0 &&
      score >= 1.0 - FLAGS_TEST_write_rejection_percentage * 0.01) {
    auto message = Format("TEST: Write request rejected, desired percentage: $0, score: $1",
                          FLAGS_TEST_write_rejection_percentage, score);
    return RejectWrite(tablet_peer, message, score + FLAGS_TEST_write_rejection_percentage * 0.01);
  }

  return Status::OK();
}
406
407
} // namespace tserver
408
} // namespace yb