blob: bac54ddaa75236693bab9170cb629da2cd726530 [file] [log] [blame]
Sergio Slobodriancab0a392017-07-13 08:42:10 -04001#!/usr/bin/env python
2
3import os
4import signal
5import sys
6import time
7
# Restart epoch handed to the next child through the RESTART_EPOCH environment variable; it is
# incremented every time a new child is forked.
restart_epoch = 0
# PIDs of the forked children that have not been reaped (or force killed) yet.
pid_list = []
10
def force_kill_all_children():
    """ Iterate through all known child processes and force kill them, then forget about them.

    Every PID in pid_list is sent SIGKILL immediately. In the future we might consider possibly
    giving the child processes time to exit but this is fine for now. If someone force kills us
    and does not clean the process tree this will leave child processes around unless they choose
    to end themselves if their parent process dies. """

    global pid_list

    # First uninstall the SIGCHLD handler so that we don't get called again.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    for pid in pid_list:
        print("force killing PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            # Best effort: the child may already be gone. Only OS-level failures are ignored so
            # that SystemExit and KeyboardInterrupt are never swallowed here.
            print("error force killing PID={} continuing".format(pid))

    pid_list = []
29
30
def sigterm_handler(signum, frame):
    """ SIGTERM handler: force kill every child and exit with status 0. See
        force_kill_all_children() for further discussion. """

    print("got SIGTERM")
    force_kill_all_children()
    raise SystemExit(0)
37
38
def sighup_handler(signum, frame):
    """ SIGHUP handler. Receiving this signal makes the restarter fork and exec a new child. """

    print("got SIGHUP")
    fork_and_exec()
45
def sigusr1_handler(signum, frame):
    """ Handler for SIGUSR1. Propagate SIGUSR1 to all of the child processes.

    Delivery is best effort: a failure to signal one child is logged and the remaining children
    are still signalled. The list of known children is left untouched. """

    for pid in pid_list:
        print("sending SIGUSR1 to PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGUSR1)
        except OSError:
            # Only OS-level failures (e.g. the child is already gone) are ignored, so that
            # SystemExit and KeyboardInterrupt are never swallowed here.
            print("error in SIGUSR1 to PID={} continuing".format(pid))
56
57
def sigchld_handler(signum, frame):
    """ SIGCHLD handler: reap finished children and decide whether the restarter should go on.

    Python gives the handler no information about which child changed state, so every known PID
    is polled with a non-blocking waitpid(). A child that exited with code 0 is simply forgotten.
    Any other outcome (non-zero exit code, death by signal, or an unrecognised status) causes all
    remaining children to be force killed. Once no children are left the restarter itself exits:
    with status 1 after an abnormal child exit, otherwise with status 0. """

    print("got SIGCHLD")

    global pid_list
    abnormal_exit = False

    # Walk a snapshot because finished children are removed from pid_list as we go.
    for child in pid_list[:]:
        reaped_pid, status = os.waitpid(child, os.WNOHANG)
        if (reaped_pid, status) == (0, 0):
            # This child is still running.
            continue

        pid_list.remove(child)

        # Now we see how the child exited.
        if os.WIFEXITED(status):
            code = os.WEXITSTATUS(status)
            print("PID={} exited with code={}".format(reaped_pid, code))
            # A zero exit code is assumed to be on purpose. Anything else means something bad
            # happened: tear everything down so that whoever started the restarter can know about
            # this situation and restart the whole thing.
            if code != 0:
                abnormal_exit = True
        elif os.WIFSIGNALED(status):
            print("PID={} was killed with signal={}".format(reaped_pid, os.WTERMSIG(status)))
            abnormal_exit = True
        else:
            abnormal_exit = True

    if abnormal_exit:
        print("Due to abnormal exit, force killing all child processes and exiting")
        force_kill_all_children()

    # Our last child died, so we have no purpose. Exit.
    if not pid_list:
        print("exiting due to lack of child processes")
        sys.exit(1 if abnormal_exit else 0)
102
103
def fork_and_exec():
    """ Fork and exec a new child process and keep track of its PID.

    Before we fork, the current restart epoch is exported through the RESTART_EPOCH env variable
    so that processes can read it if they care, and the epoch is then incremented. The child
    execs the program named by sys.argv[1]. If that exec fails, the child reports the error on
    stderr and terminates immediately with status 127 instead of unwinding back into a second
    copy of the restarter; the parent records the child's PID in pid_list either way. """

    global restart_epoch
    os.environ['RESTART_EPOCH'] = str(restart_epoch)
    print("forking and execing new child process at epoch {}".format(restart_epoch))
    restart_epoch += 1

    child_pid = os.fork()
    if child_pid == 0:
        # Child process. execl() never returns on success, so reaching the handlers below means
        # the exec itself failed.
        try:
            os.execl(sys.argv[1], sys.argv[1])
        except Exception as exc:
            sys.stderr.write("failed to exec {}: {}\n".format(sys.argv[1:2], exc))
            sys.stderr.flush()
        finally:
            # The child must never fall back into the restarter's code. os._exit() also skips
            # flushing the stdio buffers inherited from the parent, so nothing is printed twice.
            os._exit(127)
    else:
        # Parent process
        print("forked new child process with PID={}".format(child_pid))
        pid_list.append(child_pid)
121
122
def main():
    """ Script main. This script is designed so that a process watcher like runit or monit can
    watch this process and take corrective action if it ever goes away.

    The target executable is taken from sys.argv[1]. When it is missing, a usage message is
    written to stderr and the script exits with status 1 before any signal handler is installed.
    Otherwise the SIGTERM, SIGHUP, SIGCHLD and SIGUSR1 handlers are installed, the first child is
    started and the function never returns. """

    if len(sys.argv) < 2:
        sys.stderr.write("usage: hot-restarter <path to target executable>\n")
        sys.exit(1)

    print("starting hot-restarter with target: {}".format(sys.argv[1]))

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGHUP, sighup_handler)
    signal.signal(signal.SIGCHLD, sigchld_handler)
    signal.signal(signal.SIGUSR1, sigusr1_handler)

    # Start the first child process and then go into an endless loop since everything else
    # happens via signals.
    fork_and_exec()
    while True:
        time.sleep(60)
139
# Only start the restarter when this file is executed as a script.
if __name__ == "__main__":
    main()