Sergio Slobodrian | cab0a39 | 2017-07-13 08:42:10 -0400 | [diff] [blame] | 1 | #!/usr/bin/env python |
Zack Williams | 41513bf | 2018-07-07 20:08:35 -0700 | [diff] [blame] | 2 | # Copyright 2017-present Open Networking Foundation |
| 3 | # |
| 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | # you may not use this file except in compliance with the License. |
| 6 | # You may obtain a copy of the License at |
| 7 | # |
| 8 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | # |
| 10 | # Unless required by applicable law or agreed to in writing, software |
| 11 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | # See the License for the specific language governing permissions and |
| 14 | # limitations under the License. |
Sergio Slobodrian | cab0a39 | 2017-07-13 08:42:10 -0400 | [diff] [blame] | 15 | |
| 16 | import os |
| 17 | import signal |
| 18 | import sys |
| 19 | import time |
| 20 | |
# Counter exported to each new child as the RESTART_EPOCH environment variable; fork_and_exec()
# increments it once per fork.
restart_epoch = 0
# PIDs of the children started by fork_and_exec() that have not yet been reaped by
# sigchld_handler() or discarded by force_kill_all_children().
pid_list = []
| 23 | |
def force_kill_all_children():
    """ Iterate through all known child processes and force kill them with SIGKILL.

    The SIGCHLD handler is reset to the default disposition first so that the deaths caused here do
    not re-enter sigchld_handler(). Killing is best effort: a PID that cannot be signalled is
    reported and skipped. On return pid_list is empty.

    In the future we might consider giving the child processes time to exit but this is fine for
    now. If someone force kills us and does not clean the process tree this will leave child
    processes around unless they choose to end themselves if their parent process dies. """

    global pid_list

    # First uninstall the SIGCHLD handler so that we don't get called again.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    for pid in pid_list:
        print("force killing PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGKILL)
        except OSError:
            # Only swallow failures of the kill itself; anything else (SystemExit,
            # KeyboardInterrupt, programming errors) must still propagate.
            print("error force killing PID={} continuing".format(pid))

    pid_list = []
| 42 | |
| 43 | |
def sigterm_handler(signum, frame):
    """ Handler for SIGTERM. Force kills every tracked child and then exits with status 0.

    See force_kill_all_children() for further discussion.

    signum and frame are the standard signal handler arguments and are not used. """

    print("got SIGTERM")
    force_kill_all_children()
    sys.exit(0)
| 50 | |
| 51 | |
def sighup_handler(signum, frame):
    """ Handler for SIGHUP. This signal is used to cause the restarter to fork and exec a new
    child.

    signum and frame are the standard signal handler arguments and are not used. """

    print("got SIGHUP")
    fork_and_exec()
| 58 | |
def sigusr1_handler(signum, frame):
    """ Handler for SIGUSR1. Propagate SIGUSR1 to all of the child processes.

    Delivery is best effort: a PID that cannot be signalled is reported and skipped, and pid_list
    is left unchanged.

    signum and frame are the standard signal handler arguments and are not used. """

    global pid_list
    for pid in pid_list:
        print("sending SIGUSR1 to PID={}".format(pid))
        try:
            os.kill(pid, signal.SIGUSR1)
        except OSError:
            # Only swallow failures of the kill itself so that unrelated exceptions still surface.
            print("error in SIGUSR1 to PID={} continuing".format(pid))
| 69 | |
| 70 | |
def sigchld_handler(signum, frame):
    """ Handler for SIGCHLD. Iterates through all of our known child processes and figures out whether
    the signal/exit was expected or not. Python doesn't have any of the native signal handlers
    ability to get the child process info directly from the signal handler so we need to iterate
    through all child processes and see what happened.

    A child that exited with code 0 is simply dropped from pid_list. Any other outcome (non-zero
    exit code, death by signal, or an unrecognised status) force kills the remaining children. Once
    pid_list is empty the restarter exits: status 1 after an abnormal child exit, 0 otherwise.

    signum and frame are the standard signal handler arguments and are not used. """

    global pid_list

    print("got SIGCHLD")

    kill_all_and_exit = False
    # Iterate over a snapshot because entries are removed from pid_list while looping.
    for pid in list(pid_list):
        try:
            ret_pid, exit_status = os.waitpid(pid, os.WNOHANG)
        except OSError as err:
            # The PID is no longer a waitable child of ours (for example it was already reaped by
            # another invocation of this handler). Its status is unavailable, so stop tracking it
            # instead of letting the exception escape from the signal handler.
            print("unable to wait for PID={} ({}); no longer tracking it".format(pid, err))
            if pid in pid_list:
                pid_list.remove(pid)
            continue

        if ret_pid == 0 and exit_status == 0:
            # This child is still running.
            continue

        # Guard the removal: the entry may already be gone if the list changed under us.
        if pid in pid_list:
            pid_list.remove(pid)

        # Now we see how the child exited.
        if os.WIFEXITED(exit_status):
            exit_code = os.WEXITSTATUS(exit_status)
            print("PID={} exited with code={}".format(ret_pid, exit_code))
            if exit_code != 0:
                # Something bad happened. We need to tear everything down so that whoever started
                # the restarter can know about this situation and restart the whole thing.
                # (An exit code of 0 is a normal exit; we assume it was on purpose.)
                kill_all_and_exit = True
        elif os.WIFSIGNALED(exit_status):
            print("PID={} was killed with signal={}".format(ret_pid, os.WTERMSIG(exit_status)))
            kill_all_and_exit = True
        else:
            # Neither a normal exit nor a signal death: treat it as abnormal.
            kill_all_and_exit = True

    if kill_all_and_exit:
        print("Due to abnormal exit, force killing all child processes and exiting")
        force_kill_all_children()

    # Our last child died, so we have no purpose. Exit.
    if not pid_list:
        print("exiting due to lack of child processes")
        sys.exit(1 if kill_all_and_exit else 0)
| 115 | |
| 116 | |
def fork_and_exec():
    """ This routine forks and execs a new child process and keeps track of its PID. Before we fork,
    set the current restart epoch in an env variable that processes can read if they care.

    The program to run is sys.argv[1]; it is executed with no additional arguments. The parent
    appends the new child's PID to pid_list and increments restart_epoch. If the exec fails, the
    child reports the error on stderr and terminates immediately with status 127. """

    global restart_epoch

    os.environ['RESTART_EPOCH'] = str(restart_epoch)
    print("forking and execing new child process at epoch {}".format(restart_epoch))
    restart_epoch += 1

    # Resolve the target before forking so that a missing argument fails in the parent.
    target = sys.argv[1]

    child_pid = os.fork()
    if child_pid == 0:
        # Child process. execl() only returns by raising. Whatever happens, the child must never
        # fall back into the parent's code path, where it would act as a second restarter, so it
        # always ends with os._exit() (which also skips the parent's inherited cleanup handlers
        # and buffered output).
        try:
            os.execl(target, target)
        except OSError as err:
            sys.stderr.write("failed to exec {}: {}\n".format(target, err))
            sys.stderr.flush()
        finally:
            os._exit(127)
    else:
        # Parent process
        print("forked new child process with PID={}".format(child_pid))
        pid_list.append(child_pid)
| 134 | |
| 135 | |
def main():
    """ Script main. This script is designed so that a process watcher like runit or monit can watch
    this process and take corrective action if it ever goes away.

    Expects the path of the program to supervise as the first command line argument. Installs the
    SIGTERM, SIGHUP, SIGCHLD and SIGUSR1 handlers, starts the first child and then idles forever;
    the process only terminates from within a signal handler. Exits with status 1 if the target
    argument is missing. """

    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        program = sys.argv[0] if sys.argv else "hot-restarter"
        sys.stderr.write("usage: {} <target-executable>\n".format(program))
        sys.exit(1)

    print("starting hot-restarter with target: {}".format(sys.argv[1]))

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGHUP, sighup_handler)
    signal.signal(signal.SIGCHLD, sigchld_handler)
    signal.signal(signal.SIGUSR1, sigusr1_handler)

    # Start the first child process and then go into an endless loop since everything else happens via
    # signals.
    fork_and_exec()
    while True:
        time.sleep(60)
| 152 | |
# Start the restarter only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()