/Users/deen/code/yugabyte-db/src/yb/tools/run-with-timeout.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) YugaByte, Inc. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
4 | | // in compliance with the License. You may obtain a copy of the License at |
5 | | // |
6 | | // http://www.apache.org/licenses/LICENSE-2.0 |
7 | | // |
8 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
9 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
10 | | // or implied. See the License for the specific language governing permissions and limitations |
11 | | // under the License. |
12 | | // |
13 | | |
14 | | #include <signal.h> |
15 | | |
16 | | #include <chrono> |
17 | | #include <condition_variable> |
18 | | #include <iostream> |
19 | | #include <mutex> |
20 | | #include <thread> |
21 | | #include <vector> |
22 | | |
23 | | #include "yb/gutil/stringprintf.h" |
24 | | #include "yb/util/subprocess.h" |
25 | | #include "yb/util/logging.h" |
26 | | #include "yb/util/status.h" |
27 | | #include "yb/util/status_log.h" |
28 | | #include "yb/util/result.h" |
29 | | #include "yb/util/net/net_util.h" |
30 | | |
31 | | #ifdef __linux__ |
32 | | #include <sys/resource.h> |
33 | | #endif |
34 | | |
35 | | constexpr int kWaitSecAfterSigQuit = 30; |
36 | | constexpr int kWaitSecAfterSigSegv = 30; |
37 | | |
38 | | using std::cerr; |
39 | | using std::condition_variable; |
40 | | using std::cout; |
41 | | using std::endl; |
42 | | using std::invalid_argument; |
43 | | using std::mutex; |
44 | | using std::string; |
45 | | using std::thread; |
46 | | using std::unique_lock; |
47 | | using std::vector; |
48 | | using std::chrono::steady_clock; |
49 | | using std::chrono::duration_cast; |
50 | | using std::chrono::milliseconds; |
51 | | |
52 | | using yb::Subprocess; |
53 | | using yb::Status; |
54 | | using yb::Result; |
55 | | using yb::GetHostname; |
56 | | |
57 | 0 | bool SendSignalAndWait(Subprocess* subprocess, int signal, const string& signal_str, int wait_sec) { |
58 | 0 | LOG(ERROR) << "Timeout reached, trying to stop the child process with " << signal_str; |
59 | 0 | const Status signal_status = subprocess->KillNoCheckIfRunning(signal); |
60 | 0 | if (!signal_status.ok()) { |
61 | 0 | LOG(ERROR) << "Failed to send " << signal_str << " to child process: " << signal_status; |
62 | 0 | } |
63 | |
|
64 | 0 | if (wait_sec > 0) { |
65 | 0 | int sleep_ms = 5; |
66 | 0 | auto second_wait_until = steady_clock::now() + std::chrono::seconds(wait_sec); |
67 | 0 | while (subprocess->IsRunning() && steady_clock::now() < second_wait_until) { |
68 | 0 | std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); |
69 | | // Sleep for 5 ms, 10 ms, 15 ms, etc. capped at 100 ms. |
70 | 0 | sleep_ms = std::min(sleep_ms + 5, 100); |
71 | 0 | } |
72 | 0 | } |
73 | |
|
74 | 0 | bool is_running = subprocess->IsRunning(); |
75 | 0 | if (!is_running) { |
76 | 0 | LOG(INFO) << "Child process terminated after we sent " << signal_str; |
77 | 0 | } |
78 | 0 | return !is_running; |
79 | 0 | } |
80 | | |
81 | 3.94k | int main(int argc, char** argv) { |
82 | 3.94k | yb::InitGoogleLoggingSafeBasic(argv[0]); |
83 | | |
84 | 3.94k | vector<string> args; |
85 | 3.94k | args.assign(argv + 1, argv + argc); |
86 | 3.94k | if (args.size() < 2) { |
87 | 0 | cerr << "Usage: run-with-timeout <timeout_sec> <program> [<arguments>]" << endl; |
88 | 0 | cerr << endl; |
89 | 0 | cerr << "Runs the given program with the given timeout. If the program is still" << endl; |
90 | 0 | cerr << "running after the given amount of time, it gets sent the SIGQUIT signal" << endl; |
91 | 0 | cerr << "to make it create a core dump and terminate. If it is still running after" << endl; |
92 | 0 | cerr << "that, we send SIGSEGV, and finally a SIGKILL." << endl; |
93 | 0 | cerr << endl; |
94 | 0 | cerr << "A timeout value of 0 means no timeout." << endl; |
95 | 0 | return EXIT_FAILURE; |
96 | 0 | } |
97 | | |
98 | 3.94k | const string& timeout_arg = args[0]; |
99 | 3.94k | int timeout_sec; |
100 | 3.94k | try { |
101 | 3.94k | timeout_sec = std::stoi(timeout_arg); |
102 | 3.94k | } catch (const invalid_argument& ex) { |
103 | 0 | LOG(ERROR) << "Invalid timeout: " << timeout_arg; |
104 | 0 | return EXIT_FAILURE; |
105 | 0 | } |
106 | | |
107 | 3.94k | if (timeout_sec < 0) { |
108 | 0 | LOG(ERROR) << "Invalid timeout: " << timeout_sec; |
109 | 0 | return EXIT_FAILURE; |
110 | 0 | } |
111 | 3.94k | const string program = args[1]; |
112 | | |
113 | | #ifdef __linux__ |
114 | | struct rlimit64 core_limits; |
115 | | if (getrlimit64(RLIMIT_CORE, &core_limits) == 0) { |
116 | | if (core_limits.rlim_cur != RLIM_INFINITY || core_limits.rlim_max != RLIM_INFINITY) { |
117 | | // Only print the debug message if core file limits are not infinite already. |
118 | | LOG(INFO) << "Core file size limits set by parent process: " |
119 | | << "current=" << core_limits.rlim_cur |
120 | | << ", max=" << core_limits.rlim_max << ". Trying to set both to infinity."; |
121 | | } |
122 | | } else { |
123 | | perror("getrlimit64 failed"); |
124 | | } |
125 | | core_limits.rlim_cur = RLIM_INFINITY; |
126 | | core_limits.rlim_max = RLIM_INFINITY; |
127 | | if (setrlimit64(RLIMIT_CORE, &core_limits) != 0) { |
128 | | perror("setrlimit64 failed"); |
129 | | } |
130 | | #endif |
131 | | |
132 | 3.94k | { |
133 | 3.94k | auto host_name = GetHostname(); |
134 | 3.94k | if (host_name.ok()) { |
135 | 3.94k | LOG(INFO) << "Running on host " << *host_name; |
136 | 3.94k | } else { |
137 | 0 | LOG(WARNING) << "Failed to get current host name: " << host_name.status(); |
138 | 0 | } |
139 | 3.94k | } |
140 | | |
141 | 3.94k | const char* path_env_var = getenv("PATH"); |
142 | 3.94k | LOG(INFO) << "PATH environment variable: " << (path_env_var != NULL ? path_env_var : "N/A"0 ); |
143 | | |
144 | | // Arguments include the program name, so we only erase the first argument from our own args: the |
145 | | // timeout value. |
146 | 3.94k | args.erase(args.begin()); |
147 | 3.94k | Subprocess subprocess(program, args); |
148 | 3.94k | auto start_time = steady_clock::now(); |
149 | 3.94k | if (timeout_sec > 0) { |
150 | 3.94k | LOG(INFO) << "Starting child process with a timeout of " << timeout_sec << " seconds"; |
151 | 3.94k | } else { |
152 | 0 | LOG(INFO) << "Starting child process with no timeout"; |
153 | 0 | } |
154 | 3.94k | const Status start_status = subprocess.Start(); |
155 | 3.94k | if (!start_status.ok()) { |
156 | 0 | LOG(ERROR) << "Failed to start child process: " << start_status.ToString(); |
157 | 0 | return EXIT_FAILURE; |
158 | 0 | } |
159 | 3.94k | LOG(INFO) << "Child process pid: " << subprocess.pid(); |
160 | 3.94k | mutex lock; |
161 | 3.94k | condition_variable finished_cond; |
162 | 3.94k | bool finished = false; |
163 | 3.94k | bool timed_out = false; |
164 | | |
165 | 3.94k | thread reaper_thread([timeout_sec, &subprocess, &lock, &finished_cond, &finished, &timed_out] { |
166 | 3.94k | if (timeout_sec == 0) { |
167 | 0 | return; // no timeout |
168 | 0 | } |
169 | | |
170 | | // Wait until the main thread notifies us that the process has finished running, or the |
171 | | // timeout is reached. |
172 | 3.94k | auto wait_until_when = steady_clock::now() + std::chrono::seconds(timeout_sec); |
173 | | |
174 | 3.94k | bool finished_local = false; |
175 | 3.94k | { |
176 | 3.94k | unique_lock<mutex> l(lock); |
177 | 7.89k | finished_cond.wait_until(l, wait_until_when, [&finished]{ return finished; }); |
178 | 3.94k | finished_local = finished; |
179 | 3.94k | } |
180 | | |
181 | 3.94k | const bool subprocess_is_running = subprocess.IsRunning(); |
182 | 3.94k | LOG(INFO) << "Reaper thread: finished=" << finished_local |
183 | 3.94k | << ", subprocess_is_running=" << subprocess_is_running |
184 | 3.94k | << ", timeout_sec=" << timeout_sec; |
185 | | |
186 | 3.94k | if (!finished_local && subprocess_is_running0 ) { |
187 | 0 | timed_out = true; |
188 | 0 | if (!SendSignalAndWait(&subprocess, SIGQUIT, "SIGQUIT", kWaitSecAfterSigQuit) && |
189 | 0 | !SendSignalAndWait(&subprocess, SIGSEGV, "SIGSEGV", kWaitSecAfterSigSegv) && |
190 | 0 | !SendSignalAndWait(&subprocess, SIGKILL, "SIGKILL", /* wait_sec */ 0)) { |
191 | 0 | LOG(ERROR) << "Failed to kill the process with SIGKILL (should not happen)."; |
192 | 0 | } |
193 | 0 | } |
194 | 3.94k | }); |
195 | | |
196 | 3.94k | int waitpid_ret_val = 0; |
197 | 3.94k | CHECK_OK(subprocess.Wait(&waitpid_ret_val)); |
198 | 3.94k | LOG(INFO) << "subprocess.Wait finished, waitpid() returned " << waitpid_ret_val; |
199 | 3.94k | { |
200 | 3.94k | unique_lock<mutex> l(lock); |
201 | 3.94k | finished = true; |
202 | 3.94k | finished_cond.notify_one(); |
203 | 3.94k | } |
204 | | |
205 | 3.94k | LOG(INFO) << "Waiting for reaper thread to join"; |
206 | 3.94k | reaper_thread.join(); |
207 | 3.94k | int exit_code = WIFEXITED(waitpid_ret_val) ? WEXITSTATUS(waitpid_ret_val) : 01.07k ; |
208 | 3.94k | int term_sig = WIFSIGNALED(waitpid_ret_val) ? WTERMSIG(waitpid_ret_val) : 02.87k ; |
209 | 3.94k | LOG(INFO) << "Child process returned exit code " << exit_code; |
210 | 3.94k | if (timed_out) { |
211 | 0 | if (exit_code == 0) { |
212 | 0 | LOG(ERROR) << "Child process timed out and we had to kill it"; |
213 | 0 | } |
214 | 0 | exit_code = EXIT_FAILURE; |
215 | 0 | } |
216 | | |
217 | 3.94k | if (term_sig != 0) { |
218 | 1.07k | if (exit_code == 0) { |
219 | 1.07k | exit_code = EXIT_FAILURE; |
220 | 1.07k | } |
221 | 1.07k | LOG(ERROR) << "Child process terminated due to signal " << term_sig; |
222 | 1.07k | } |
223 | | |
224 | 3.94k | LOG(INFO) << "Returning exit code " << exit_code; |
225 | 3.94k | auto duration_ms = duration_cast<milliseconds>(steady_clock::now() - start_time).count(); |
226 | 3.94k | LOG(INFO) << "Total time taken: " << StringPrintf("%.3f", duration_ms / 1000.0) << " sec"; |
227 | 3.94k | return exit_code; |
228 | 3.94k | } |