YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/yb/tools/run-with-timeout.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) YugaByte, Inc.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4
// in compliance with the License.  You may obtain a copy of the License at
5
//
6
// http://www.apache.org/licenses/LICENSE-2.0
7
//
8
// Unless required by applicable law or agreed to in writing, software distributed under the License
9
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10
// or implied.  See the License for the specific language governing permissions and limitations
11
// under the License.
12
//
13
14
#include <signal.h>
15
16
#include <chrono>
17
#include <condition_variable>
18
#include <iostream>
19
#include <mutex>
20
#include <thread>
21
#include <vector>
22
23
#include "yb/gutil/stringprintf.h"
24
#include "yb/util/subprocess.h"
25
#include "yb/util/logging.h"
26
#include "yb/util/status.h"
27
#include "yb/util/status_log.h"
28
#include "yb/util/result.h"
29
#include "yb/util/net/net_util.h"
30
31
#ifdef __linux__
32
#include <sys/resource.h>
33
#endif
34
35
constexpr int kWaitSecAfterSigQuit = 30;
36
constexpr int kWaitSecAfterSigSegv = 30;
37
38
using std::cerr;
39
using std::condition_variable;
40
using std::cout;
41
using std::endl;
42
using std::invalid_argument;
43
using std::mutex;
44
using std::string;
45
using std::thread;
46
using std::unique_lock;
47
using std::vector;
48
using std::chrono::steady_clock;
49
using std::chrono::duration_cast;
50
using std::chrono::milliseconds;
51
52
using yb::Subprocess;
53
using yb::Status;
54
using yb::Result;
55
using yb::GetHostname;
56
57
0
bool SendSignalAndWait(Subprocess* subprocess, int signal, const string& signal_str, int wait_sec) {
58
0
  LOG(ERROR) << "Timeout reached, trying to stop the child process with " << signal_str;
59
0
  const Status signal_status = subprocess->KillNoCheckIfRunning(signal);
60
0
  if (!signal_status.ok()) {
61
0
    LOG(ERROR) << "Failed to send " << signal_str << " to child process: " << signal_status;
62
0
  }
63
64
0
  if (wait_sec > 0) {
65
0
    int sleep_ms = 5;
66
0
    auto second_wait_until = steady_clock::now() + std::chrono::seconds(wait_sec);
67
0
    while (subprocess->IsRunning() && steady_clock::now() < second_wait_until) {
68
0
      std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
69
      // Sleep for 5 ms, 10 ms, 15 ms, etc. capped at 100 ms.
70
0
      sleep_ms = std::min(sleep_ms + 5, 100);
71
0
    }
72
0
  }
73
74
0
  bool is_running = subprocess->IsRunning();
75
0
  if (!is_running) {
76
0
    LOG(INFO) << "Child process terminated after we sent " << signal_str;
77
0
  }
78
0
  return !is_running;
79
0
}
80
81
2.89k
int main(int argc, char** argv) {
82
2.89k
  yb::InitGoogleLoggingSafeBasic(argv[0]);
83
84
2.89k
  vector<string> args;
85
2.89k
  args.assign(argv + 1, argv + argc);
86
2.89k
  if (args.size() < 2) {
87
0
    cerr << "Usage: run-with-timeout <timeout_sec> <program> [<arguments>]" << endl;
88
0
    cerr << endl;
89
0
    cerr << "Runs the given program with the given timeout. If the program is still" << endl;
90
0
    cerr << "running after the given amount of time, it gets sent the SIGQUIT signal" << endl;
91
0
    cerr << "to make it create a core dump and terminate. If it is still running after" << endl;
92
0
    cerr << "that, we send SIGSEGV, and finally a SIGKILL." << endl;
93
0
    cerr << endl;
94
0
    cerr << "A timeout value of 0 means no timeout." << endl;
95
0
    return EXIT_FAILURE;
96
0
  }
97
98
2.89k
  const string& timeout_arg = args[0];
99
2.89k
  int timeout_sec;
100
2.89k
  try {
101
2.89k
    timeout_sec = std::stoi(timeout_arg);
102
0
  } catch (const invalid_argument& ex) {
103
0
    LOG(ERROR) << "Invalid timeout: " << timeout_arg;
104
0
    return EXIT_FAILURE;
105
0
  }
106
107
2.89k
  if (timeout_sec < 0) {
108
0
    LOG(ERROR) << "Invalid timeout: " << timeout_sec;
109
0
    return EXIT_FAILURE;
110
0
  }
111
2.89k
  const string program = args[1];
112
113
#ifdef __linux__
114
  struct rlimit64 core_limits;
115
  if (getrlimit64(RLIMIT_CORE, &core_limits) == 0) {
116
    if (core_limits.rlim_cur != RLIM_INFINITY || core_limits.rlim_max != RLIM_INFINITY) {
117
      // Only print the debug message if core file limits are not infinite already.
118
      LOG(INFO) << "Core file size limits set by parent process: "
119
                << "current=" << core_limits.rlim_cur
120
                << ", max=" << core_limits.rlim_max << ". Trying to set both to infinity.";
121
    }
122
  } else {
123
    perror("getrlimit64 failed");
124
  }
125
  core_limits.rlim_cur = RLIM_INFINITY;
126
  core_limits.rlim_max = RLIM_INFINITY;
127
  if (setrlimit64(RLIMIT_CORE, &core_limits) != 0) {
128
    perror("setrlimit64 failed");
129
  }
130
#endif
131
132
2.89k
  {
133
2.89k
    auto host_name = GetHostname();
134
2.89k
    if (host_name.ok()) {
135
2.89k
      LOG(INFO) << "Running on host " << *host_name;
136
0
    } else {
137
0
      LOG(WARNING) << "Failed to get current host name: " << host_name.status();
138
0
    }
139
2.89k
  }
140
141
2.89k
  const char* path_env_var = getenv("PATH");
142
2.89k
  LOG(INFO) << "PATH environment variable: " << (path_env_var != NULL ? path_env_var : "N/A");
143
144
  // Arguments include the program name, so we only erase the first argument from our own args: the
145
  // timeout value.
146
2.89k
  args.erase(args.begin());
147
2.89k
  Subprocess subprocess(program, args);
148
2.89k
  auto start_time = steady_clock::now();
149
2.89k
  if (timeout_sec > 0) {
150
2.89k
    LOG(INFO) << "Starting child process with a timeout of " << timeout_sec << " seconds";
151
0
  } else {
152
0
    LOG(INFO) << "Starting child process with no timeout";
153
0
  }
154
2.89k
  const Status start_status = subprocess.Start();
155
2.89k
  if (!start_status.ok()) {
156
0
    LOG(ERROR) << "Failed to start child process: " << start_status.ToString();
157
0
    return EXIT_FAILURE;
158
0
  }
159
2.89k
  LOG(INFO) << "Child process pid: " << subprocess.pid();
160
2.89k
  mutex lock;
161
2.89k
  condition_variable finished_cond;
162
2.89k
  bool finished = false;
163
2.89k
  bool timed_out = false;
164
165
2.89k
  thread reaper_thread([timeout_sec, &subprocess, &lock, &finished_cond, &finished, &timed_out] {
166
2.89k
    if (timeout_sec == 0) {
167
0
      return; // no timeout
168
0
    }
169
170
    // Wait until the main thread notifies us that the process has finished running, or the
171
    // timeout is reached.
172
2.89k
    auto wait_until_when = steady_clock::now() + std::chrono::seconds(timeout_sec);
173
174
2.89k
    bool finished_local = false;
175
2.89k
    {
176
2.89k
      unique_lock<mutex> l(lock);
177
5.79k
      finished_cond.wait_until(l, wait_until_when, [&finished]{ return finished; });
178
2.89k
      finished_local = finished;
179
2.89k
    }
180
181
2.89k
    const bool subprocess_is_running = subprocess.IsRunning();
182
2.89k
    LOG(INFO) << "Reaper thread: finished=" << finished_local
183
2.89k
              << ", subprocess_is_running=" << subprocess_is_running
184
2.89k
              << ", timeout_sec=" << timeout_sec;
185
186
2.89k
    if (!finished_local && subprocess_is_running) {
187
0
      timed_out = true;
188
0
      if (!SendSignalAndWait(&subprocess, SIGQUIT, "SIGQUIT", kWaitSecAfterSigQuit) &&
189
0
          !SendSignalAndWait(&subprocess, SIGSEGV, "SIGSEGV", kWaitSecAfterSigSegv) &&
190
0
          !SendSignalAndWait(&subprocess, SIGKILL, "SIGKILL", /* wait_sec */ 0)) {
191
0
        LOG(ERROR) << "Failed to kill the process with SIGKILL (should not happen).";
192
0
      }
193
0
    }
194
2.89k
  });
195
196
2.89k
  int waitpid_ret_val = 0;
197
2.89k
  CHECK_OK(subprocess.Wait(&waitpid_ret_val));
198
2.89k
  LOG(INFO) << "subprocess.Wait finished, waitpid() returned " << waitpid_ret_val;
199
2.89k
  {
200
2.89k
    unique_lock<mutex> l(lock);
201
2.89k
    finished = true;
202
2.89k
    finished_cond.notify_one();
203
2.89k
  }
204
205
2.89k
  LOG(INFO) << "Waiting for reaper thread to join";
206
2.89k
  reaper_thread.join();
207
448
  int exit_code = WIFEXITED(waitpid_ret_val) ? WEXITSTATUS(waitpid_ret_val) : 0;
208
2.45k
  int term_sig = WIFSIGNALED(waitpid_ret_val) ? WTERMSIG(waitpid_ret_val) : 0;
209
2.89k
  LOG(INFO) << "Child process returned exit code " << exit_code;
210
2.89k
  if (timed_out) {
211
0
    if (exit_code == 0) {
212
0
      LOG(ERROR) << "Child process timed out and we had to kill it";
213
0
    }
214
0
    exit_code = EXIT_FAILURE;
215
0
  }
216
217
2.89k
  if (term_sig != 0) {
218
448
    if (exit_code == 0) {
219
448
      exit_code = EXIT_FAILURE;
220
448
    }
221
448
    LOG(ERROR) << "Child process terminated due to signal " << term_sig;
222
448
  }
223
224
2.89k
  LOG(INFO) << "Returning exit code " << exit_code;
225
2.89k
  auto duration_ms = duration_cast<milliseconds>(steady_clock::now() - start_time).count();
226
2.89k
  LOG(INFO) << "Total time taken: " << StringPrintf("%.3f", duration_ms / 1000.0) << " sec";
227
2.89k
  return exit_code;
228
2.89k
}