blob: c34816c2103f08fdee0691ffef3c883d3a0c7d8a [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
ajs8b886ca2004-12-22 02:56:38 +00002 Monitor status of quagga daemons and restart if necessary.
3
4 Copyright (C) 2004 Andrew J. Schorr
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
ajsa3655342004-12-29 17:39:10 +000021#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000022#include <thread.h>
23#include <log.h>
ajs52e66292005-02-16 20:40:25 +000024#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000025#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000026#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000027#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000028#include <sys/un.h>
29#include <sys/wait.h>
Balaji.G837d16c2012-09-26 14:09:10 +053030#include <memory.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of X and Y.  Only defined here if no earlier header supplied MIN.
   Each argument may be evaluated twice, so avoid side effects in them. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif
35
/* Macros to help randomize timers.
   JITTER(X) is random() modulo (X)+1, shifted down by (X)/2, i.e. a
   pseudo-random offset between -(X)/2 and (X)-(X)/2.
   FUZZY(X) is X perturbed by JITTER((X)/20), i.e. by a few percent. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))
39
/* Built-in defaults.  usage() prints each of these as the default of the
   matching command-line option; the time values are described there as
   seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
/* Pid file: a build-supplied PATH_WATCHQUAGGA_PID takes precedence over
   the STATEDIR-relative fallback. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif
/* Directory searched for the daemons' vty sockets: DAEMON_VTY_DIR when the
   build defines it, otherwise STATEDIR. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif
ajs8b886ca2004-12-22 02:56:38 +000056
/* Word sent with the "echo" command (see wakeup_send_echo); handle_read
   expects the reply to begin with this token followed by a newline. */
#define PING_TOKEN "PING"
58
/* Needs to be global, referenced somewhere inside libzebra.
   Every timer, read and write callback in this file is registered on it. */
struct thread_master *master;
61
/* Restart strategy selected for this run; try_restart() dispatches on it.
   The numeric values double as indexes into mode_str[] and match the mode
   numbers printed by usage(). */
typedef enum
{
  MODE_MONITOR = 0,
  MODE_GLOBAL_RESTART,
  MODE_SEPARATE_RESTART,
  MODE_PHASED_ZEBRA_RESTART,
  MODE_PHASED_ALL_RESTART
} watch_mode_t;

/* Human-readable name of each watch_mode_t value, indexed by the value.
   Keep in the same order as the enum above. */
static const char *mode_str[] =
{
  "monitor",
  "global restart",
  "individual daemon restart",
  "phased zebra restart",
  "phased global restart for any failure",
};
79
/* Steps of a phased restart, in the order phase_check() walks through
   them.  PHASE_NONE means no phased restart is in progress. */
typedef enum
{
  PHASE_NONE = 0,
  PHASE_STOPS_PENDING,
  PHASE_WAITING_DOWN,
  PHASE_ZEBRA_RESTART_PENDING,
  PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Description of each restart_phase_t value, indexed by the value.
   NOTE(review): the table has one more entry ("Start jobs running") than
   the enum has values; no enumerator visible here indexes it. */
static const char *phase_str[] =
{
  "None",
  "Stop jobs running",
  "Waiting for other daemons to come down",
  "Zebra restart job running",
  "Waiting for zebra to come up",
  "Start jobs running",
};
98
/* How long a single restart phase may last before phase_hanging() aborts
   the phased restart: three times the job kill timeout.  Evaluated at each
   use, so it tracks the current gs.restart_timeout. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
100
/* Bookkeeping for one background job slot (a start, stop or restart
   command).  There is one per daemon plus one in the global state. */
struct restart_info
{
  const char *name;       /* substituted into the command; used in log messages */
  const char *what;       /* type of the job last launched ("restart", "start", "stop") */
  pid_t pid;              /* pid of the running job, or 0 when none is running */
  struct timeval time;    /* when the job was launched, later when it completed */
  long interval;          /* minimum seconds between jobs unless forced */
  struct thread *t_kill;  /* timer that runs restart_kill() on a lingering job */
  int kills;              /* signals sent to the current job so far */
};
111
/* Process-wide configuration and run-time state.  The initializer supplies
   the built-in defaults; members it does not name start out zero/NULL. */
static struct global_state
{
  watch_mode_t mode;               /* restart strategy, see try_restart() */
  restart_phase_t phase;           /* current step of a phased restart */
  struct thread *t_phase_hanging;  /* timer that aborts a stuck phase */
  const char *vtydir;              /* directory holding the <name>.vty sockets */
  long period;                     /* polling interval in milliseconds */
  long timeout;                    /* seconds allowed for a connect or echo reply */
  long restart_timeout;            /* seconds before a running job is signalled */
  long min_restart_interval;       /* lower bound for restart_info.interval */
  long max_restart_interval;       /* upper bound for restart_info.interval */
  int do_ping;                     /* nonzero: send periodic echo commands */
  struct daemon *daemons;          /* head of the list of monitored daemons */
  const char *restart_command;     /* format for restart jobs */
  const char *start_command;       /* format for start jobs (phased restart) */
  const char *stop_command;        /* format for stop jobs (phased restart) */
  struct restart_info restart;     /* job slot used in MODE_GLOBAL_RESTART */
  int unresponsive_restart;        /* nonzero: restart daemons that stop answering */
  int loglevel;                    /* values above LOG_DEBUG enable extra debug logs */
  struct daemon *special; /* points to zebra when doing phased restart */
  int numdaemons;                  /* presumably the number of monitored daemons; set outside this excerpt */
  int numpids;                     /* background jobs currently running */
  int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
  .mode = MODE_MONITOR,
  .phase = PHASE_NONE,
  .vtydir = VTYDIR,
  .period = 1000*DEFAULT_PERIOD,
  .timeout = DEFAULT_TIMEOUT,
  .restart_timeout = DEFAULT_RESTART_TIMEOUT,
  .loglevel = DEFAULT_LOGLEVEL,
  .min_restart_interval = DEFAULT_MIN_RESTART,
  .max_restart_interval = DEFAULT_MAX_RESTART,
  .do_ping = 1,
};
147
/* Connection state of one monitored daemon.  The values index
   state_str[]. */
typedef enum
{
  DAEMON_INIT,
  DAEMON_DOWN,
  DAEMON_CONNECTING,
  DAEMON_UP,
  DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is up or unresponsive (the states not counted in
   gs.numdown).  DMN is evaluated twice. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable name of each daemon_state_t value, indexed by the value.
   Keep in the same order as the enum above. */
static const char *state_str[] =
{
  "Init",
  "Down",
  "Connecting",
  "Up",
  "Unresponsive",
};
168
/* One monitored daemon; instances are chained through `next' starting at
   gs.daemons. */
struct daemon {
  const char *name;            /* daemon name; socket is <vtydir>/<name>.vty */
  daemon_state_t state;        /* current connection state */
  int fd;                      /* vty socket, or -1 once daemon_down() has closed it */
  struct timeval echo_sent;    /* when the pending echo was sent; tv_sec 0 if none pending */
  u_int connect_tries;         /* try_connect() calls since the daemon was last up */
  struct thread *t_wakeup;     /* pending timer (retry, echo, or timeout), if any */
  struct thread *t_read;       /* pending read handler on fd, if any */
  struct thread *t_write;      /* pending connect-completion handler, if any */
  struct daemon *next;         /* next daemon in the list */
  struct restart_info restart; /* job slot for this daemon's commands */
};
181
/* Long command-line options for getopt_long-style parsing.  Each entry maps
   to the short option letter given in its last field; usage() describes
   what each option does.  The all-zero entry terminates the table. */
static const struct option longopts[] =
{
  { "daemon", no_argument, NULL, 'd'},
  { "statedir", required_argument, NULL, 'S'},
  { "no-echo", no_argument, NULL, 'e'},
  { "loglevel", required_argument, NULL, 'l'},
  { "interval", required_argument, NULL, 'i'},
  { "timeout", required_argument, NULL, 't'},
  { "restart-timeout", required_argument, NULL, 'T'},
  { "restart", required_argument, NULL, 'r'},
  { "start-command", required_argument, NULL, 's'},
  { "kill-command", required_argument, NULL, 'k'},
  { "restart-all", required_argument, NULL, 'R'},
  { "all-restart", no_argument, NULL, 'a'},
  { "always-all-restart", no_argument, NULL, 'A'},
  { "unresponsive-restart", no_argument, NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  { "pid-file", required_argument, NULL, 'p'},
  { "blank-string", required_argument, NULL, 'b'},
  { "help", no_argument, NULL, 'h'},
  { "version", no_argument, NULL, 'v'},
  { NULL, 0, NULL, 0 }
};
206
/* Forward declarations for functions that are referenced (by the SET_*
   macros and by the callbacks) before their definitions appear. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
211
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
218 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
219Watchdog program to monitor status of quagga daemons and try to restart\n\
220them if they are down or unresponsive. It determines whether a daemon is\n\
221up based on whether it can connect to the daemon's vty unix stream socket.\n\
222It then repeatedly sends echo commands over that socket to determine whether\n\
223the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
224on the socket connection and know immediately that the daemon is down.\n\n\
225The daemons to be monitored should be listed on the command line.\n\n\
226This program can run in one of 5 modes:\n\n\
2270. Mode: %s.\n\
228 Just monitor and report on status changes. Example:\n\
229 %s -d zebra ospfd bgpd\n\n\
2301. Mode: %s.\n\
231 Whenever any daemon hangs or crashes, use the given command to restart\n\
232 them all. Example:\n\
233 %s -dz \\\n\
234 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
235 zebra ospfd\n\n\
2362. Mode: %s.\n\
237 When any single daemon hangs or crashes, restart only the daemon that's\n\
238 in trouble using the supplied restart command. Example:\n\
239 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2403. Mode: %s.\n\
241 The same as the previous mode, except that there is special treatment when\n\
242 the zebra daemon is in trouble. In that case, a phased restart approach\n\
243 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
244 daemons. Example:\n\
245 %s -adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2484. Mode: %s.\n\
249 This is the same as the previous mode, except that the phased restart\n\
250 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
251 %s -Adz -r '/sbin/service %%s restart' \\\n\
252 -s '/sbin/service %%s start' \\\n\
253 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
254As of this writing, it is believed that mode 2 [%s]\n\
255is not safe, and mode 3 [%s] may not be safe with some of the\n\
256routing daemons.\n\n\
257In order to avoid attempting to restart the daemons in a fast loop,\n\
258the -m and -M options allow you to control the minimum delay between\n\
259restart commands. The minimum restart delay is recalculated each time\n\
260a restart is attempted: if the time since the last restart attempt exceeds\n\
261twice the -M value, then the restart delay is set to the -m value.\n\
262Otherwise, the interval is doubled (but capped at the -M value).\n\n\
263Options:\n\
264-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
265 to syslog instead of stdout.\n\
266-S, --statedir Set the vty socket directory (default is %s)\n\
267-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
268 option is necessary if the daemons do not support the\n\
269 echo command)\n\
270-l, --loglevel Set the logging level (default is %d).\n\
271 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
272 but it can be set higher than %d if extra-verbose debugging\n\
273 messages are desired.\n\
274-m, --min-restart-interval\n\
275 Set the minimum seconds to wait between invocations of daemon\n\
276 restart commands (default is %d).\n\
277-M, --max-restart-interval\n\
278 Set the maximum seconds to wait between invocations of daemon\n\
279 restart commands (default is %d).\n\
280-i, --interval Set the status polling interval in seconds (default is %d)\n\
281-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
282-T, --restart-timeout\n\
283 Set the restart (kill) timeout in seconds (default is %d).\n\
284 If any background jobs are still running after this much\n\
285 time has elapsed, they will be killed.\n\
286-r, --restart Supply a Bourne shell command to use to restart a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 Note that -r and -R are incompatible.\n\
290-s, --start-command\n\
291 Supply a Bourne shell to command to use to start a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294-k, --kill-command\n\
295 Supply a Bourne shell to command to use to stop a single\n\
296 daemon. The command string should include '%%s' where the\n\
297 name of the daemon should be substituted.\n\
298-R, --restart-all\n\
299 When one or more daemons is down, try to restart everything\n\
300 using the Bourne shell command supplied as the argument.\n\
301 Note that -r and -R are incompatible.\n\
302-z, --unresponsive-restart\n\
303 When a daemon is unresponsive, treat it as being down for\n\
304 restart purposes.\n\
305-a, --all-restart\n\
306 When zebra hangs or crashes, restart all daemons using\n\
307 this phased approach: 1. stop all other daemons; 2. restart\n\
308 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
309-A, --always-all-restart\n\
310 When any daemon (not just zebra) hangs or crashes, use the\n\
311 same phased restart mechanism described above for -a.\n\
312 Requires -r, -s, and -k.\n\
313-p, --pid-file Set process identifier file name\n\
314 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000315-b, --blank-string\n\
316 When the supplied argument string is found in any of the\n\
317 various shell command arguments (-r, -s, -k, or -R), replace\n\
318 it with a space. This is an ugly hack to circumvent problems\n\
319 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000320-v, --version Print program version\n\
321-h, --help Display this help and exit\n\
322", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
323progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000324VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000325DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
326DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
327
328 return status;
329}
330
/* Launch SHELL_CMD asynchronously via "/bin/sh -c".

   Returns the child's pid in the parent, or -1 if fork() failed.  The
   child never returns from this function: it either becomes the shell or
   exits with status 127.  The parent does not wait here; the child is
   reaped later. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Own process group, so the job and everything it spawns can be
         signalled together. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      execv("/bin/sh",(char *const *)argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
360
/* Store in *RESULT the wall-clock time elapsed since *START_TIME and
   return RESULT.  The microsecond field of the result is normalized to be
   non-negative by borrowing from the seconds field. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;
  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;
  return result;
}
374
375static int
376restart_kill(struct thread *t_kill)
377{
378 struct restart_info *restart = THREAD_ARG(t_kill);
379 struct timeval delay;
380
381 time_elapsed(&delay,&restart->time);
382 zlog_warn("Warning: %s %s child process %d still running after "
383 "%ld seconds, sending signal %d",
David Lamparteref008d22015-03-03 08:48:11 +0100384 restart->what,restart->name,(int)restart->pid, (long)delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000385 (restart->kills ? SIGKILL : SIGTERM));
386 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
387 restart->kills++;
388 restart->t_kill = thread_add_timer(master,restart_kill,restart,
389 gs.restart_timeout);
390 return 0;
391}
392
393static struct restart_info *
394find_child(pid_t child)
395{
396 if (gs.mode == MODE_GLOBAL_RESTART)
397 {
398 if (gs.restart.pid == child)
399 return &gs.restart;
400 }
401 else
402 {
403 struct daemon *dmn;
404 for (dmn = gs.daemons; dmn; dmn = dmn->next)
405 {
406 if (dmn->restart.pid == child)
407 return &dmn->restart;
408 }
409 }
410 return NULL;
411}
412
413static void
414sigchild(void)
415{
416 pid_t child;
417 int status;
418 const char *name;
419 const char *what;
420 struct restart_info *restart;
421
422 switch (child = waitpid(-1,&status,WNOHANG))
423 {
424 case -1:
425 zlog_err("waitpid failed: %s",safe_strerror(errno));
426 return;
427 case 0:
428 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
429 return;
430 }
431
432 if ((restart = find_child(child)) != NULL)
433 {
434 name = restart->name;
435 what = restart->what;
436 restart->pid = 0;
437 gs.numpids--;
438 thread_cancel(restart->t_kill);
439 restart->t_kill = NULL;
440 /* Update restart time to reflect the time the command completed. */
441 gettimeofday(&restart->time,NULL);
442 }
443 else
444 {
445 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000446 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000447 name = "(unknown)";
448 what = "background";
449 }
450 if (WIFSTOPPED(status))
451 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000452 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000453 else if (WIFSIGNALED(status))
454 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000455 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000456 else if (WIFEXITED(status))
457 {
458 if (WEXITSTATUS(status) != 0)
459 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000460 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000461 else
ajsf2d82572004-12-29 17:45:08 +0000462 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000463 }
464 else
465 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000466 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000467 phase_check();
468}
469
/* Launch a background job in the slot RESTART.

   CMDTYPE is a label ("restart", "start", "stop") used in log messages and
   recorded in the slot.  COMMAND is used as a printf-style format with the
   slot's name as its only argument.  FORCE, when non-zero, skips the
   minimum-interval check.  UPDATE_INTERVAL, when non-zero, recomputes the
   slot's retry interval afterwards.

   Returns the pid of the new job; 0 if it could not be launched; -1 if
   nothing was attempted because a previous job is still running or the
   retry interval has not yet elapsed. */
static int
run_job(struct restart_info *restart, const char *cmdtype, const char *command,
        int force, int update_interval)
{
  struct timeval delay;

  if (gs.loglevel > LOG_DEBUG+1)
    zlog_debug("attempting to %s %s",cmdtype,restart->name);

  /* Only one job per slot at a time. */
  if (restart->pid)
    {
      if (gs.loglevel > LOG_DEBUG+1)
        zlog_debug("cannot %s %s, previous pid %d still running",
                   cmdtype,restart->name,(int)restart->pid);
      return -1;
    }

  /* Note: time_elapsed test must come before the force test, since we need
     to make sure that delay is initialized for use below in updating the
     restart interval. */
  if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
      !force)
    {
      if (gs.loglevel > LOG_DEBUG+1)
        zlog_debug("postponing %s %s: "
                   "elapsed time %ld < retry interval %ld",
                   cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
      return -1;
    }

  gettimeofday(&restart->time,NULL);
  restart->kills = 0;
  {
    /* Buffer sized as command length plus name length plus terminator. */
    char cmd[strlen(command)+strlen(restart->name)+1];
    snprintf(cmd,sizeof(cmd),command,restart->name);
    if ((restart->pid = run_background(cmd)) > 0)
      {
        /* Arm the watchdog that signals the job if it runs too long. */
        restart->t_kill = thread_add_timer(master,restart_kill,restart,
                                           gs.restart_timeout);
        restart->what = cmdtype;
        gs.numpids++;
      }
    else
      restart->pid = 0;
  }

  /* Calculate the new restart interval. */
  if (update_interval)
    {
      /* A long quiet spell resets the interval to the minimum; otherwise
         it doubles, capped at the maximum. */
      if (delay.tv_sec > 2*gs.max_restart_interval)
        restart->interval = gs.min_restart_interval;
      else if ((restart->interval *= 2) > gs.max_restart_interval)
        restart->interval = gs.max_restart_interval;
      if (gs.loglevel > LOG_DEBUG+1)
        zlog_debug("restart %s interval is now %ld",
                   restart->name,restart->interval);
    }
  return restart->pid;
}
529
/* Scheduling helpers.  Each one registers a callback for DMN on the global
   thread master and stores the returned handle in the matching member of
   DMN.  None of them cancels a handle already stored there. */

/* Call handle_read when DMN's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Run wakeup_down after roughly gs.period milliseconds. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

/* Run wakeup_unresponsive after roughly gs.period milliseconds. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Run wakeup_send_echo after roughly gs.period milliseconds. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
544
545static int
546wakeup_down(struct thread *t_wakeup)
547{
548 struct daemon *dmn = THREAD_ARG(t_wakeup);
549
550 dmn->t_wakeup = NULL;
551 if (try_connect(dmn) < 0)
552 SET_WAKEUP_DOWN(dmn);
553 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
554 try_restart(dmn);
555 return 0;
556}
557
558static int
559wakeup_init(struct thread *t_wakeup)
560{
561 struct daemon *dmn = THREAD_ARG(t_wakeup);
562
563 dmn->t_wakeup = NULL;
564 if (try_connect(dmn) < 0)
565 {
566 SET_WAKEUP_DOWN(dmn);
567 zlog_err("%s state -> down : initial connection attempt failed",
568 dmn->name);
569 dmn->state = DAEMON_DOWN;
570 }
571 return 0;
572}
573
574static void
575daemon_down(struct daemon *dmn, const char *why)
576{
577 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
578 zlog_err("%s state -> down : %s",dmn->name,why);
579 else if (gs.loglevel > LOG_DEBUG)
580 zlog_debug("%s still down : %s",dmn->name,why);
581 if (IS_UP(dmn))
582 gs.numdown++;
583 dmn->state = DAEMON_DOWN;
584 if (dmn->fd >= 0)
585 {
586 close(dmn->fd);
587 dmn->fd = -1;
588 }
589 THREAD_OFF(dmn->t_read);
590 THREAD_OFF(dmn->t_write);
591 THREAD_OFF(dmn->t_wakeup);
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594 phase_check();
595}
596
597static int
598handle_read(struct thread *t_read)
599{
600 struct daemon *dmn = THREAD_ARG(t_read);
601 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
602 char buf[sizeof(resp)+100];
603 ssize_t rc;
604 struct timeval delay;
605
606 dmn->t_read = NULL;
607 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
608 {
609 char why[100];
610
ajs518cde82005-02-17 20:11:58 +0000611 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000612 {
613 /* Pretend it never happened. */
614 SET_READ_HANDLER(dmn);
615 return 0;
616 }
617 snprintf(why,sizeof(why),"unexpected read error: %s",
618 safe_strerror(errno));
619 daemon_down(dmn,why);
620 return 0;
621 }
622 if (rc == 0)
623 {
624 daemon_down(dmn,"read returned EOF");
625 return 0;
626 }
627 if (!dmn->echo_sent.tv_sec)
628 {
629 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000630 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
631 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000632 daemon_down(dmn,why);
633 return 0;
634 }
635
636 /* We are expecting an echo response: is there any chance that the
637 response would not be returned entirely in the first read? That
638 seems inconceivable... */
639 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
640 {
641 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000642 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
643 "(expecting %u): %.*s",
644 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000645 daemon_down(dmn,why);
646 return 0;
647 }
648
649 time_elapsed(&delay,&dmn->echo_sent);
650 dmn->echo_sent.tv_sec = 0;
651 if (dmn->state == DAEMON_UNRESPONSIVE)
652 {
653 if (delay.tv_sec < gs.timeout)
654 {
655 dmn->state = DAEMON_UP;
656 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100657 "seconds", dmn->name,
658 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000659 }
660 else
661 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100662 "seconds", dmn->name,
663 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000664 }
665 else if (gs.loglevel > LOG_DEBUG+1)
666 zlog_debug("%s: echo response received after %ld.%06ld seconds",
David Lamparteref008d22015-03-03 08:48:11 +0100667 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000668
669 SET_READ_HANDLER(dmn);
670 if (dmn->t_wakeup)
671 thread_cancel(dmn->t_wakeup);
672 SET_WAKEUP_ECHO(dmn);
673
674 return 0;
675}
676
677static void
678daemon_up(struct daemon *dmn, const char *why)
679{
680 dmn->state = DAEMON_UP;
681 gs.numdown--;
682 dmn->connect_tries = 0;
683 zlog_notice("%s state -> up : %s",dmn->name,why);
684 if (gs.do_ping)
685 SET_WAKEUP_ECHO(dmn);
686 phase_check();
687}
688
689static int
690check_connect(struct thread *t_write)
691{
692 struct daemon *dmn = THREAD_ARG(t_write);
693 int sockerr;
694 socklen_t reslen = sizeof(sockerr);
695
696 dmn->t_write = NULL;
697 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
698 {
699 zlog_warn("%s: check_connect: getsockopt failed: %s",
700 dmn->name,safe_strerror(errno));
701 daemon_down(dmn,"getsockopt failed checking connection success");
702 return 0;
703 }
704 if ((reslen == sizeof(sockerr)) && sockerr)
705 {
706 char why[100];
707 snprintf(why,sizeof(why),
708 "getsockopt reports that connection attempt failed: %s",
709 safe_strerror(sockerr));
710 daemon_down(dmn,why);
711 return 0;
712 }
713
714 daemon_up(dmn,"delayed connect succeeded");
715 return 0;
716}
717
718static int
719wakeup_connect_hanging(struct thread *t_wakeup)
720{
721 struct daemon *dmn = THREAD_ARG(t_wakeup);
722 char why[100];
723
724 dmn->t_wakeup = NULL;
725 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
726 gs.timeout);
727 daemon_down(dmn,why);
728 return 0;
729}
730
731/* Making connection to protocol daemon. */
732static int
733try_connect(struct daemon *dmn)
734{
735 int sock;
736 struct sockaddr_un addr;
737 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000738
739 if (gs.loglevel > LOG_DEBUG+1)
740 zlog_debug("%s: attempting to connect",dmn->name);
741 dmn->connect_tries++;
742
743 memset (&addr, 0, sizeof (struct sockaddr_un));
744 addr.sun_family = AF_UNIX;
745 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
746 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000747#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000748 len = addr.sun_len = SUN_LEN(&addr);
749#else
750 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000751#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000752
753 /* Quick check to see if we might succeed before we go to the trouble
754 of creating a socket. */
755 if (access(addr.sun_path, W_OK) < 0)
756 {
757 if (errno != ENOENT)
758 zlog_err("%s: access to socket %s denied: %s",
759 dmn->name,addr.sun_path,safe_strerror(errno));
760 return -1;
761 }
762
763 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
764 {
765 zlog_err("%s(%s): cannot make socket: %s",
766 __func__,addr.sun_path, safe_strerror(errno));
767 return -1;
768 }
769
ajs52e66292005-02-16 20:40:25 +0000770 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000771 {
ajs52e66292005-02-16 20:40:25 +0000772 zlog_err("%s(%s): set_nonblocking(%d) failed",
773 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000774 close(sock);
775 return -1;
776 }
777
778 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
779 {
780 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
781 {
782 if (gs.loglevel > LOG_DEBUG)
783 zlog_debug("%s(%s): connect failed: %s",
784 __func__,addr.sun_path, safe_strerror(errno));
785 close (sock);
786 return -1;
787 }
788 if (gs.loglevel > LOG_DEBUG)
789 zlog_debug("%s: connection in progress",dmn->name);
790 dmn->state = DAEMON_CONNECTING;
791 dmn->fd = sock;
792 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
793 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
794 gs.timeout);
795 SET_READ_HANDLER(dmn);
796 return 0;
797 }
798
799 dmn->fd = sock;
800 SET_READ_HANDLER(dmn);
801 daemon_up(dmn,"connect succeeded");
802 return 1;
803}
804
805static int
806phase_hanging(struct thread *t_hanging)
807{
808 gs.t_phase_hanging = NULL;
809 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
810 phase_str[gs.phase],PHASE_TIMEOUT);
811 gs.phase = PHASE_NONE;
812 return 0;
813}
814
815static void
816set_phase(restart_phase_t new_phase)
817{
818 gs.phase = new_phase;
819 if (gs.t_phase_hanging)
820 thread_cancel(gs.t_phase_hanging);
821 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
822 PHASE_TIMEOUT);
823}
824
/* Advance the phased-restart state machine as far as current conditions
   allow.  It is called after a child process is reaped and after a daemon
   goes up or down.

   Each case either breaks because its exit condition is not met yet, or
   moves to the next phase and deliberately falls through, so that several
   phases can complete within a single call. */
static void
phase_check(void)
{
  switch (gs.phase)
    {
    case PHASE_NONE:
      break;
    case PHASE_STOPS_PENDING:
      /* Wait until no background job is running any more. */
      if (gs.numpids)
        break;
      zlog_info("Phased restart: all routing daemon stop jobs have completed.");
      set_phase(PHASE_WAITING_DOWN);
      /*FALLTHRU*/
    case PHASE_WAITING_DOWN:
      /* Wait until every daemon is down, not counting the special daemon
         if it is still up. */
      if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
        break;
      zlog_info("Phased restart: all routing daemons now down.");
      run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
      set_phase(PHASE_ZEBRA_RESTART_PENDING);
      /*FALLTHRU*/
    case PHASE_ZEBRA_RESTART_PENDING:
      /* Wait for the special daemon's restart job to exit. */
      if (gs.special->restart.pid)
        break;
      zlog_info("Phased restart: %s restart job completed.",gs.special->name);
      set_phase(PHASE_WAITING_ZEBRA_UP);
      /*FALLTHRU*/
    case PHASE_WAITING_ZEBRA_UP:
      /* Wait for the special daemon to be reachable again. */
      if (!IS_UP(gs.special))
        break;
      zlog_info("Phased restart: %s is now up.",gs.special->name);
      {
        /* Final step: launch the start command for every other daemon. */
        struct daemon *dmn;
        for (dmn = gs.daemons; dmn; dmn = dmn->next)
          {
            if (dmn != gs.special)
              run_job(&dmn->restart,"start",gs.start_command,1,0);
          }
      }
      gs.phase = PHASE_NONE;
      THREAD_OFF(gs.t_phase_hanging);
      zlog_notice("Phased global restart has completed.");
      break;
    }
}
869
/* React to DMN being down or unresponsive according to the configured
   mode: do nothing, run the global restart command, run DMN's own restart
   command, or begin a phased restart.

   In MODE_PHASED_ZEBRA_RESTART a failure of the special daemon falls
   through into the phased-restart logic; any other daemon is restarted
   individually, and only while the special daemon is up and no phased
   restart is in progress. */
static void
try_restart(struct daemon *dmn)
{
  switch (gs.mode)
    {
    case MODE_MONITOR:
      return;
    case MODE_GLOBAL_RESTART:
      run_job(&gs.restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_SEPARATE_RESTART:
      run_job(&dmn->restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_PHASED_ZEBRA_RESTART:
      if (dmn != gs.special)
        {
          if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
            run_job(&dmn->restart,"restart",gs.restart_command,0,1);
          else
            zlog_debug("%s: postponing restart attempt because master %s daemon "
                       "not up [%s], or phased restart in progress",
                       dmn->name,gs.special->name,state_str[gs.special->state]);
          break;
        }
      /*FALLTHRU*/
    case MODE_PHASED_ALL_RESTART:
      /* Never start a second phased restart, nor one while any job is
         still running. */
      if ((gs.phase != PHASE_NONE) || gs.numpids)
        {
          if (gs.loglevel > LOG_DEBUG+1)
            zlog_debug("postponing phased global restart: restart already in "
                       "progress [%s], or outstanding child processes [%d]",
                       phase_str[gs.phase],gs.numpids);
          break;
        }
      /* Is it too soon for a restart? */
      {
        struct timeval delay;
        if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
            gs.special->restart.interval)
          {
            if (gs.loglevel > LOG_DEBUG+1)
              zlog_debug("postponing phased global restart: "
                         "elapsed time %ld < retry interval %ld",
                         (long)delay.tv_sec,gs.special->restart.interval);
            break;
          }
      }
      zlog_info("Phased restart: stopping all routing daemons.");
      /* First step: stop all other daemons. */
      for (dmn = gs.daemons; dmn; dmn = dmn->next)
        {
          if (dmn != gs.special)
            run_job(&dmn->restart,"stop",gs.stop_command,1,1);
        }
      set_phase(PHASE_STOPS_PENDING);
      break;
    default:
      zlog_err("error: unknown restart mode %d",gs.mode);
      break;
    }
}
931
932static int
933wakeup_unresponsive(struct thread *t_wakeup)
934{
935 struct daemon *dmn = THREAD_ARG(t_wakeup);
936
937 dmn->t_wakeup = NULL;
938 if (dmn->state != DAEMON_UNRESPONSIVE)
939 zlog_err("%s: no longer unresponsive (now %s), "
940 "wakeup should have been cancelled!",
941 dmn->name,state_str[dmn->state]);
942 else
943 {
944 SET_WAKEUP_UNRESPONSIVE(dmn);
945 try_restart(dmn);
946 }
947 return 0;
948}
949
950static int
951wakeup_no_answer(struct thread *t_wakeup)
952{
953 struct daemon *dmn = THREAD_ARG(t_wakeup);
954
955 dmn->t_wakeup = NULL;
956 dmn->state = DAEMON_UNRESPONSIVE;
957 zlog_err("%s state -> unresponsive : no response yet to ping "
958 "sent %ld seconds ago",dmn->name,gs.timeout);
959 if (gs.unresponsive_restart)
960 {
961 SET_WAKEUP_UNRESPONSIVE(dmn);
962 try_restart(dmn);
963 }
964 return 0;
965}
966
967static int
968wakeup_send_echo(struct thread *t_wakeup)
969{
970 static const char echocmd[] = "echo " PING_TOKEN;
971 ssize_t rc;
972 struct daemon *dmn = THREAD_ARG(t_wakeup);
973
974 dmn->t_wakeup = NULL;
975 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
976 ((size_t)rc != sizeof(echocmd)))
977 {
978 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000979 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
980 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000981 daemon_down(dmn,why);
982 }
983 else
984 {
985 gettimeofday(&dmn->echo_sent,NULL);
986 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
987 }
988 return 0;
989}
990
/* Termination handler, registered in main() for both SIGINT and SIGTERM:
   log a notice and exit with a success status. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
997
/* Check that a command template contains exactly one '%' and that it is
   immediately followed by 's' (i.e. a single "%s" placeholder).
   Returns 1 if the template is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *percent = strchr(cmd,'%');

  /* No placeholder at all. */
  if (percent == NULL)
    return 0;

  /* The only conversion allowed is %s. */
  if (percent[1] != 's')
    return 0;

  /* Reject any further '%' after the placeholder. */
  return strchr(percent+1,'%') == NULL;
}
1005
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space character.  The caller
   owns the returned string.  If the copy cannot be allocated, an error
   is printed and the process exits with status 1.

   An empty blankstr yields an unmodified copy.  Text produced by a
   replacement is never rescanned, so the function terminates even when
   blankstr itself contains a space. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() matches an empty needle at every position; treating that as
     "nothing to translate" avoids an endless loop and a buffer overrun. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by the remainder of the token (NUL included). */
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Resume after the blank just written so it cannot match again. */
      p++;
    }
  return res;
}
1028
ajs8b886ca2004-12-22 02:56:38 +00001029int
1030main(int argc, char **argv)
1031{
1032 const char *progname;
1033 int opt;
1034 int daemon_mode = 0;
1035 const char *pidfile = DEFAULT_PIDFILE;
1036 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001037 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001038 static struct quagga_signal_t my_signals[] =
1039 {
1040 {
1041 .signal = SIGINT,
1042 .handler = sigint,
1043 },
1044 {
1045 .signal = SIGTERM,
1046 .handler = sigint,
1047 },
1048 {
1049 .signal = SIGCHLD,
1050 .handler = sigchild,
1051 },
1052 };
1053
1054 if ((progname = strrchr (argv[0], '/')) != NULL)
1055 progname++;
1056 else
1057 progname = argv[0];
1058
ajs098e2402004-12-22 17:00:46 +00001059 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001060 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001061 longopts, 0)) != EOF)
1062 {
1063 switch (opt)
1064 {
1065 case 0:
1066 break;
1067 case 'a':
1068 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1069 {
1070 fputs("Ambiguous operating mode selected.\n",stderr);
1071 return usage(progname,1);
1072 }
1073 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1074 break;
1075 case 'A':
1076 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1077 {
1078 fputs("Ambiguous operating mode selected.\n",stderr);
1079 return usage(progname,1);
1080 }
1081 gs.mode = MODE_PHASED_ALL_RESTART;
1082 break;
ajsc8b40f82004-12-22 16:17:16 +00001083 case 'b':
1084 blankstr = optarg;
1085 break;
ajs8b886ca2004-12-22 02:56:38 +00001086 case 'd':
1087 daemon_mode = 1;
1088 break;
1089 case 'e':
1090 gs.do_ping = 0;
1091 break;
1092 case 'k':
1093 if (!valid_command(optarg))
1094 {
1095 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1096 optarg);
1097 return usage(progname,1);
1098 }
1099 gs.stop_command = optarg;
1100 break;
1101 case 'l':
1102 {
1103 char garbage[3];
1104 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1105 (gs.loglevel < LOG_EMERG))
1106 {
1107 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1108 return usage(progname,1);
1109 }
1110 }
1111 break;
1112 case 'm':
1113 {
1114 char garbage[3];
1115 if ((sscanf(optarg,"%ld%1s",
1116 &gs.min_restart_interval,garbage) != 1) ||
1117 (gs.min_restart_interval < 0))
1118 {
1119 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1120 optarg);
1121 return usage(progname,1);
1122 }
1123 }
1124 break;
1125 case 'M':
1126 {
1127 char garbage[3];
1128 if ((sscanf(optarg,"%ld%1s",
1129 &gs.max_restart_interval,garbage) != 1) ||
1130 (gs.max_restart_interval < 0))
1131 {
1132 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1133 optarg);
1134 return usage(progname,1);
1135 }
1136 }
1137 break;
1138 case 'i':
1139 {
1140 char garbage[3];
1141 int period;
1142 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1143 (gs.period < 1))
1144 {
1145 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1146 return usage(progname,1);
1147 }
1148 gs.period = 1000*period;
1149 }
1150 break;
1151 case 'p':
1152 pidfile = optarg;
1153 break;
1154 case 'r':
1155 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1156 (gs.mode == MODE_SEPARATE_RESTART))
1157 {
1158 fputs("Ambiguous operating mode selected.\n",stderr);
1159 return usage(progname,1);
1160 }
1161 if (!valid_command(optarg))
1162 {
1163 fprintf(stderr,
1164 "Invalid restart command, must contain '%%s': %s\n",
1165 optarg);
1166 return usage(progname,1);
1167 }
1168 gs.restart_command = optarg;
1169 if (gs.mode == MODE_MONITOR)
1170 gs.mode = MODE_SEPARATE_RESTART;
1171 break;
1172 case 'R':
1173 if (gs.mode != MODE_MONITOR)
1174 {
1175 fputs("Ambiguous operating mode selected.\n",stderr);
1176 return usage(progname,1);
1177 }
1178 if (strchr(optarg,'%'))
1179 {
1180 fprintf(stderr,
1181 "Invalid restart-all arg, must not contain '%%s': %s\n",
1182 optarg);
1183 return usage(progname,1);
1184 }
1185 gs.restart_command = optarg;
1186 gs.mode = MODE_GLOBAL_RESTART;
1187 break;
1188 case 's':
1189 if (!valid_command(optarg))
1190 {
1191 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1192 optarg);
1193 return usage(progname,1);
1194 }
1195 gs.start_command = optarg;
1196 break;
1197 case 'S':
1198 gs.vtydir = optarg;
1199 break;
1200 case 't':
1201 {
1202 char garbage[3];
1203 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1204 (gs.timeout < 1))
1205 {
1206 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1207 return usage(progname,1);
1208 }
1209 }
1210 break;
1211 case 'T':
1212 {
1213 char garbage[3];
1214 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1215 (gs.restart_timeout < 1))
1216 {
1217 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1218 return usage(progname,1);
1219 }
1220 }
1221 break;
1222 case 'z':
1223 gs.unresponsive_restart = 1;
1224 break;
1225 case 'v':
1226 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1227 puts("Copyright 2004 Andrew J. Schorr");
1228 return 0;
1229 case 'h':
1230 return usage(progname,0);
1231 default:
1232 fputs("Invalid option.\n",stderr);
1233 return usage(progname,1);
1234 }
1235 }
1236
1237 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1238 {
1239 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1240 return usage(progname,1);
1241 }
1242 switch (gs.mode)
1243 {
1244 case MODE_MONITOR:
1245 if (gs.restart_command || gs.start_command || gs.stop_command)
1246 {
1247 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1248 mode_str[gs.mode]);
1249 return usage(progname,1);
1250 }
1251 break;
1252 case MODE_GLOBAL_RESTART:
1253 case MODE_SEPARATE_RESTART:
1254 if (!gs.restart_command || gs.start_command || gs.stop_command)
1255 {
1256 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1257 mode_str[gs.mode]);
1258 return usage(progname,1);
1259 }
1260 break;
1261 case MODE_PHASED_ZEBRA_RESTART:
1262 case MODE_PHASED_ALL_RESTART:
1263 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1264 {
1265 fprintf(stderr,
1266 "Need start, kill, and restart commands in [%s] mode.\n",
1267 mode_str[gs.mode]);
1268 return usage(progname,1);
1269 }
1270 break;
1271 }
1272
ajsc8b40f82004-12-22 16:17:16 +00001273 if (blankstr)
1274 {
1275 if (gs.restart_command)
1276 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1277 if (gs.start_command)
1278 gs.start_command = translate_blanks(gs.start_command,blankstr);
1279 if (gs.stop_command)
1280 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1281 }
1282
ajs8b886ca2004-12-22 02:56:38 +00001283 gs.restart.interval = gs.min_restart_interval;
1284 master = thread_master_create();
Balaji.G837d16c2012-09-26 14:09:10 +05301285 signal_init (master, array_size(my_signals), my_signals);
ajs8b886ca2004-12-22 02:56:38 +00001286 srandom(time(NULL));
1287
1288 {
1289 int i;
1290 struct daemon *tail = NULL;
1291
1292 for (i = optind; i < argc; i++)
1293 {
1294 struct daemon *dmn;
1295
1296 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1297 {
ajs098e2402004-12-22 17:00:46 +00001298 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1299 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001300 return 1;
1301 }
1302 dmn->name = dmn->restart.name = argv[i];
1303 dmn->state = DAEMON_INIT;
1304 gs.numdaemons++;
1305 gs.numdown++;
1306 dmn->fd = -1;
1307 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1308 100+(random() % 900));
1309 dmn->restart.interval = gs.min_restart_interval;
1310 if (tail)
1311 tail->next = dmn;
1312 else
1313 gs.daemons = dmn;
1314 tail = dmn;
1315
1316 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1317 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1318 !strcmp(dmn->name,special))
1319 gs.special = dmn;
1320 }
1321 }
1322 if (!gs.daemons)
1323 {
1324 fputs("Must specify one or more daemons to monitor.\n",stderr);
1325 return usage(progname,1);
1326 }
1327 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1328 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1329 {
1330 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1331 mode_str[gs.mode],special);
1332 return usage(progname,1);
1333 }
1334 if (gs.special && (gs.numdaemons < 2))
1335 {
1336 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1337 "to watch.\n",mode_str[gs.mode]);
1338 return usage(progname,1);
1339 }
1340
1341 zlog_default = openzlog(progname, ZLOG_NONE,
1342 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1343 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1344 if (daemon_mode)
1345 {
1346 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001347 if (daemon (0, 0) < 0)
1348 {
1349 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1350 exit (1);
1351 }
ajs8b886ca2004-12-22 02:56:38 +00001352 }
1353 else
1354 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1355
1356 /* Make sure we're not already running. */
1357 pid_output (pidfile);
1358
1359 /* Announce which daemons are being monitored. */
1360 {
1361 struct daemon *dmn;
1362 size_t len = 0;
1363
1364 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1365 len += strlen(dmn->name)+1;
1366
1367 {
1368 char buf[len+1];
1369 char *p = buf;
1370
1371 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1372 {
1373 if (p != buf)
1374 *p++ = ' ';
1375 strcpy(p,dmn->name);
1376 p += strlen(p);
1377 }
1378 zlog_notice("%s %s watching [%s], mode [%s]",
1379 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1380 }
1381 }
1382
1383 {
1384 struct thread thread;
1385
1386 while (thread_fetch (master, &thread))
1387 thread_call (&thread);
1388 }
1389
1390 /* Not reached. */
1391 return 0;
1392}