blob: bac54ddaa75236693bab9170cb629da2cd726530 [file] [log] [blame]
#!/usr/bin/env python
import os
import signal
import sys
import time
# Counter of child launches. fork_and_exec() exports the current value to the child through the
# RESTART_EPOCH environment variable and then increments it.
restart_epoch = 0
# PIDs of the children forked by fork_and_exec(). Entries are removed by sigchld_handler() once a
# child has been reaped, and the list is reset by force_kill_all_children().
pid_list = []
def force_kill_all_children():
    """Force kill every known child process and forget about them.

    Each PID in the module-level ``pid_list`` is sent SIGKILL and the list is then reset to empty.
    Children are not given any time to exit on their own; that might be worth adding in the
    future but is fine for now. If someone force kills the restarter itself and does not clean up
    the process tree, the children are left running unless they choose to end themselves when
    their parent process dies.
    """
    global pid_list

    # Uninstall the SIGCHLD handler first so that the deaths caused below do not call back into
    # the handler while we are still tearing things down.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    for pid in pid_list:
        print("force killing PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            # Best effort: a failed kill must not stop us from killing the remaining children.
            # Only OSError is caught so that SystemExit and KeyboardInterrupt are not swallowed.
            print("error force killing PID={} continuing".format(pid))

    pid_list = []
def sigterm_handler(signum, frame):
    """SIGTERM handler: tear down every child and then exit successfully.

    See force_kill_all_children() for how the children are terminated. Both arguments are the
    standard signal-handler parameters and are not used.
    """
    print("got SIGTERM")
    force_kill_all_children()
    # Equivalent to sys.exit(0): unwinds the interpreter with a zero exit status.
    raise SystemExit(0)
def sighup_handler(signum, frame):
    """SIGHUP handler: start one more child process.

    SIGHUP is the signal used to make the restarter fork and exec a new child via
    fork_and_exec(). Both arguments are the standard signal-handler parameters and are not used.
    """
    print("got SIGHUP")
    fork_and_exec()
def sigusr1_handler(signum, frame):
    """Handler for SIGUSR1. Propagates SIGUSR1 to all of the known child processes.

    Delivery is best effort: a failure to signal one child is reported and the remaining children
    are still signalled. Both arguments are the standard signal-handler parameters and are not
    used.
    """
    # pid_list is only read here, so no ``global`` declaration is needed.
    for pid in pid_list:
        print("sending SIGUSR1 to PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGUSR1)
        except OSError:
            # Only OSError is caught so that SystemExit and KeyboardInterrupt are not swallowed.
            print("error in SIGUSR1 to PID={} continuing".format(pid))
def sigchld_handler(signum, frame):
    """SIGCHLD handler: reap finished children and decide whether to keep running.

    A Python signal handler is not told which child changed state, so every known child is polled
    with a non-blocking waitpid(). A child that exited with code 0 is assumed to have stopped on
    purpose. Any other outcome (non-zero exit code, death by signal, or any other status) is
    treated as abnormal: all remaining children are force killed. Once no children are left the
    restarter exits, with status 1 after an abnormal exit and 0 otherwise.

    Both arguments are the standard signal-handler parameters and are not used.
    """
    print("got SIGCHLD")

    global pid_list
    abnormal_exit = False

    # Walk a snapshot, because reaped children are removed from pid_list inside the loop.
    for child in tuple(pid_list):
        reaped_pid, status = os.waitpid(child, os.WNOHANG)
        if (reaped_pid, status) == (0, 0):
            # This child is still running.
            continue

        pid_list.remove(child)

        # Work out how the child went away.
        if os.WIFEXITED(status):
            code = os.WEXITSTATUS(status)
            print("PID={} exited with code={}".format(reaped_pid, code))
            # A zero exit code is a normal exit that we assume was on purpose. Anything else
            # means something bad happened, and everything must be torn down so that whoever
            # started the restarter can notice and restart the whole thing.
            if code != 0:
                abnormal_exit = True
        elif os.WIFSIGNALED(status):
            print("PID={} was killed with signal={}".format(reaped_pid, os.WTERMSIG(status)))
            abnormal_exit = True
        else:
            abnormal_exit = True

    if abnormal_exit:
        print("Due to abnormal exit, force killing all child processes and exiting")
        force_kill_all_children()

    # With the last child gone the restarter has no purpose left, so exit.
    if not pid_list:
        print("exiting due to lack of child processes")
        if abnormal_exit:
            sys.exit(1)
        sys.exit(0)
def fork_and_exec():
    """Fork and exec a new child process and keep track of its PID.

    Before forking, the current restart epoch is stored in the RESTART_EPOCH environment variable
    so that the new process can read it if it cares; the epoch is then incremented for the next
    launch. The child execs sys.argv[1] with no additional arguments. The parent appends the
    child's PID to the module-level ``pid_list``.

    If the exec fails, the child reports the error on stderr and exits immediately with status 1,
    which the parent's SIGCHLD handler treats as an abnormal exit.
    """
    global restart_epoch

    os.environ['RESTART_EPOCH'] = str(restart_epoch)
    print("forking and execing new child process at epoch {}".format(restart_epoch))
    restart_epoch += 1

    child_pid = os.fork()
    if child_pid == 0:
        # Child process
        try:
            os.execl(sys.argv[1], sys.argv[1])
        except OSError as err:
            # exec failed, so this process is still a copy of the restarter. Do not let the
            # exception unwind through the restarter's own code; leave at once with os._exit(),
            # which skips cleanup handlers and does not flush the stdio buffers inherited from
            # the parent.
            sys.stderr.write("error execing {}: {}\n".format(sys.argv[1], err))
            sys.stderr.flush()
            os._exit(1)
    else:
        # Parent process
        print("forked new child process with PID={}".format(child_pid))
        pid_list.append(child_pid)
def main():
    """Script main.

    This script is designed so that a process watcher like runit or monit can watch this process
    and take corrective action if it ever goes away. The program to run is taken from
    sys.argv[1]; if it is missing, a usage message is written to stderr and the script exits with
    status 1.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.stderr.write("usage: {} <path to executable>\n".format(sys.argv[0]))
        sys.exit(1)

    print("starting hot-restarter with target: {}".format(sys.argv[1]))

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGHUP, sighup_handler)
    signal.signal(signal.SIGCHLD, sigchld_handler)
    signal.signal(signal.SIGUSR1, sigusr1_handler)

    # Start the first child process and then go into an endless loop since everything else happens
    # via signals.
    fork_and_exec()
    while True:
        time.sleep(60)
# Run the restarter only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()