blob: 772b4294cd65fa04a50b0d03bac44dd8825b983f [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
ajsa8a8ddc2005-01-12 16:24:51 +00002 $Id: watchquagga.c,v 1.9 2005/01/12 16:24:51 ajs Exp $
ajs8b886ca2004-12-22 02:56:38 +00003
4 Monitor status of quagga daemons and restart if necessary.
5
6 Copyright (C) 2004 Andrew J. Schorr
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
ajsa3655342004-12-29 17:39:10 +000023#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000024#include <thread.h>
25#include <log.h>
26#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000027#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000028#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000029#include <sys/un.h>
30#include <sys/wait.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of two values.  Both arguments may be evaluated more than once,
   so do not pass expressions with side effects. */
#ifndef MIN
#define MIN(X,Y) (((Y) < (X)) ? (Y) : (X))
#endif

/* Macros to help randomize timers.  JITTER(X) is random() modulo X+1,
   shifted down by X/2; FUZZY(X) is X plus a JITTER of one twentieth of X. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults, all reported by the usage text as seconds (except the
   log level). */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: the build may override it via PATH_WATCHQUAGGA_PID. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif

/* Directory holding the daemons' vty sockets: the build may override it
   via DAEMON_VTY_DIR. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN "PING"
58
/* Needs to be global, referenced somewhere inside libzebra. */
struct thread_master *master;

/* Operating mode.  The numeric values double as indices into mode_str[]. */
typedef enum
{
  MODE_MONITOR = 0,
  MODE_GLOBAL_RESTART,
  MODE_SEPARATE_RESTART,
  MODE_PHASED_ZEBRA_RESTART,
  MODE_PHASED_ALL_RESTART
} watch_mode_t;

/* Human-readable name of each watch_mode_t value. */
static const char *mode_str[] =
{
  [MODE_MONITOR] = "monitor",
  [MODE_GLOBAL_RESTART] = "global restart",
  [MODE_SEPARATE_RESTART] = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART] = "phased global restart for any failure",
};

/* Step of a phased restart currently in progress (PHASE_NONE when idle).
   The numeric values double as indices into phase_str[]. */
typedef enum
{
  PHASE_NONE = 0,
  PHASE_STOPS_PENDING,
  PHASE_WAITING_DOWN,
  PHASE_ZEBRA_RESTART_PENDING,
  PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Human-readable description of each restart_phase_t value.
   NOTE(review): the final entry has no matching enumerator in this file. */
static const char *phase_str[] =
{
  [PHASE_NONE] = "None",
  [PHASE_STOPS_PENDING] = "Stop jobs running",
  [PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
  "Start jobs running",
};

/* How long a single phase may last before the phased restart is aborted. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
100
ajs098e2402004-12-22 17:00:46 +0000101struct restart_info
102{
103 const char *name;
104 const char *what;
105 pid_t pid;
106 struct timeval time;
107 long interval;
108 struct thread *t_kill;
109 int kills;
110};
111
112static struct global_state
113{
ajs8b886ca2004-12-22 02:56:38 +0000114 watch_mode_t mode;
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
117 const char *vtydir;
118 long period;
119 long timeout;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
123 int do_ping;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000128 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000129 int unresponsive_restart;
130 int loglevel;
131 struct daemon *special; /* points to zebra when doing phased restart */
132 int numdaemons;
133 int numpids;
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
135} gs = {
136 .mode = MODE_MONITOR,
137 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000138 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
145 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000146};
147
148typedef enum
149{
150 DAEMON_INIT,
151 DAEMON_DOWN,
152 DAEMON_CONNECTING,
153 DAEMON_UP,
154 DAEMON_UNRESPONSIVE
155} daemon_state_t;
156
157#define IS_UP(DMN) \
158 (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
159
160static const char *state_str[] =
161{
162 "Init",
163 "Down",
164 "Connecting",
165 "Up",
166 "Unresponsive",
167};
168
169struct daemon {
170 const char *name;
171 daemon_state_t state;
172 int fd;
173 struct timeval echo_sent;
174 u_int connect_tries;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
178 struct daemon *next;
179 struct restart_info restart;
180};
181
/* Long command-line options for getopt_long(); each one maps to the short
   option letter given in the last field. */
static const struct option longopts[] =
{
  /* General behaviour. */
  { "daemon",               no_argument,       NULL, 'd'},
  { "statedir",             required_argument, NULL, 'S'},
  { "no-echo",              no_argument,       NULL, 'e'},
  { "loglevel",             required_argument, NULL, 'l'},
  /* Timing. */
  { "interval",             required_argument, NULL, 'i'},
  { "timeout",              required_argument, NULL, 't'},
  { "restart-timeout",      required_argument, NULL, 'T'},
  /* Shell commands and restart policy. */
  { "restart",              required_argument, NULL, 'r'},
  { "start-command",        required_argument, NULL, 's'},
  { "kill-command",         required_argument, NULL, 'k'},
  { "restart-all",          required_argument, NULL, 'R'},
  { "all-restart",          no_argument,       NULL, 'a'},
  { "always-all-restart",   no_argument,       NULL, 'A'},
  { "unresponsive-restart", no_argument,       NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  /* Miscellaneous. */
  { "pid-file",             required_argument, NULL, 'p'},
  { "blank-string",         required_argument, NULL, 'b'},
  { "help",                 no_argument,       NULL, 'h'},
  { "version",              no_argument,       NULL, 'v'},
  /* Terminator. */
  { NULL, 0, NULL, 0 }
};
206
/* Forward declarations for functions that are used before their
   definitions appear below. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
211
/* Print command-line help and hand back STATUS unchanged.

   When STATUS is nonzero only a one-line hint naming PROGNAME is written to
   stderr; otherwise the full help text is written to stdout, with the mode
   names and compiled-in defaults filled in from mode_str[] and the
   DEFAULT_* / VTYDIR macros. */
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
218 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
219Watchdog program to monitor status of quagga daemons and try to restart\n\
220them if they are down or unresponsive. It determines whether a daemon is\n\
221up based on whether it can connect to the daemon's vty unix stream socket.\n\
222It then repeatedly sends echo commands over that socket to determine whether\n\
223the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
224on the socket connection and know immediately that the daemon is down.\n\n\
225The daemons to be monitored should be listed on the command line.\n\n\
226This program can run in one of 5 modes:\n\n\
2270. Mode: %s.\n\
228 Just monitor and report on status changes. Example:\n\
229 %s -d zebra ospfd bgpd\n\n\
2301. Mode: %s.\n\
231 Whenever any daemon hangs or crashes, use the given command to restart\n\
232 them all. Example:\n\
233 %s -dz \\\n\
234 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
235 zebra ospfd\n\n\
2362. Mode: %s.\n\
237 When any single daemon hangs or crashes, restart only the daemon that's\n\
238 in trouble using the supplied restart command. Example:\n\
239 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2403. Mode: %s.\n\
241 The same as the previous mode, except that there is special treatment when\n\
242 the zebra daemon is in trouble. In that case, a phased restart approach\n\
243 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
244 daemons. Example:\n\
245 %s -adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2484. Mode: %s.\n\
249 This is the same as the previous mode, except that the phased restart\n\
250 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
251 %s -Adz -r '/sbin/service %%s restart' \\\n\
252 -s '/sbin/service %%s start' \\\n\
253 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
254As of this writing, it is believed that mode 2 [%s]\n\
255is not safe, and mode 3 [%s] may not be safe with some of the\n\
256routing daemons.\n\n\
257In order to avoid attempting to restart the daemons in a fast loop,\n\
258the -m and -M options allow you to control the minimum delay between\n\
259restart commands. The minimum restart delay is recalculated each time\n\
260a restart is attempted: if the time since the last restart attempt exceeds\n\
261twice the -M value, then the restart delay is set to the -m value.\n\
262Otherwise, the interval is doubled (but capped at the -M value).\n\n\
263Options:\n\
264-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
265 to syslog instead of stdout.\n\
266-S, --statedir Set the vty socket directory (default is %s)\n\
267-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
268 option is necessary if the daemons do not support the\n\
269 echo command)\n\
270-l, --loglevel Set the logging level (default is %d).\n\
271 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
272 but it can be set higher than %d if extra-verbose debugging\n\
273 messages are desired.\n\
274-m, --min-restart-interval\n\
275 Set the minimum seconds to wait between invocations of daemon\n\
276 restart commands (default is %d).\n\
277-M, --max-restart-interval\n\
278 Set the maximum seconds to wait between invocations of daemon\n\
279 restart commands (default is %d).\n\
280-i, --interval Set the status polling interval in seconds (default is %d)\n\
281-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
282-T, --restart-timeout\n\
283 Set the restart (kill) timeout in seconds (default is %d).\n\
284 If any background jobs are still running after this much\n\
285 time has elapsed, they will be killed.\n\
286-r, --restart Supply a Bourne shell command to use to restart a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 Note that -r and -R are incompatible.\n\
290-s, --start-command\n\
291 Supply a Bourne shell to command to use to start a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294-k, --kill-command\n\
295 Supply a Bourne shell to command to use to stop a single\n\
296 daemon. The command string should include '%%s' where the\n\
297 name of the daemon should be substituted.\n\
298-R, --restart-all\n\
299 When one or more daemons is down, try to restart everything\n\
300 using the Bourne shell command supplied as the argument.\n\
301 Note that -r and -R are incompatible.\n\
302-z, --unresponsive-restart\n\
303 When a daemon is unresponsive, treat it as being down for\n\
304 restart purposes.\n\
305-a, --all-restart\n\
306 When zebra hangs or crashes, restart all daemons using\n\
307 this phased approach: 1. stop all other daemons; 2. restart\n\
308 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
309-A, --always-all-restart\n\
310 When any daemon (not just zebra) hangs or crashes, use the\n\
311 same phased restart mechanism described above for -a.\n\
312 Requires -r, -s, and -k.\n\
313-p, --pid-file Set process identifier file name\n\
314 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000315-b, --blank-string\n\
316 When the supplied argument string is found in any of the\n\
317 various shell command arguments (-r, -s, -k, or -R), replace\n\
318 it with a space. This is an ugly hack to circumvent problems\n\
319 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000320-v, --version Print program version\n\
321-h, --help Display this help and exit\n\
322", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
323progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000324VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000325DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
326DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
327
328 return status;
329}
330
/* Run SHELL_CMD through "/bin/sh -c" in a forked child without waiting
   for it to finish.

   The child puts itself in its own process group so that the command and
   anything it spawns can later be signalled together.

   Returns the child's pid, or -1 if fork() failed. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      execv("/bin/sh",(char *const *)argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
360
/* Store in *RESULT the wall-clock time elapsed since *START_TIME, with
   tv_usec normalized to be non-negative, and return RESULT so the call can
   be used inside an expression. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;

  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (result->tv_usec -= start_time->tv_usec;
       result->tv_usec < 0;
       result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
374
375static int
376restart_kill(struct thread *t_kill)
377{
378 struct restart_info *restart = THREAD_ARG(t_kill);
379 struct timeval delay;
380
381 time_elapsed(&delay,&restart->time);
382 zlog_warn("Warning: %s %s child process %d still running after "
383 "%ld seconds, sending signal %d",
ajsf2d82572004-12-29 17:45:08 +0000384 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000385 (restart->kills ? SIGKILL : SIGTERM));
386 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
387 restart->kills++;
388 restart->t_kill = thread_add_timer(master,restart_kill,restart,
389 gs.restart_timeout);
390 return 0;
391}
392
393static struct restart_info *
394find_child(pid_t child)
395{
396 if (gs.mode == MODE_GLOBAL_RESTART)
397 {
398 if (gs.restart.pid == child)
399 return &gs.restart;
400 }
401 else
402 {
403 struct daemon *dmn;
404 for (dmn = gs.daemons; dmn; dmn = dmn->next)
405 {
406 if (dmn->restart.pid == child)
407 return &dmn->restart;
408 }
409 }
410 return NULL;
411}
412
413static void
414sigchild(void)
415{
416 pid_t child;
417 int status;
418 const char *name;
419 const char *what;
420 struct restart_info *restart;
421
422 switch (child = waitpid(-1,&status,WNOHANG))
423 {
424 case -1:
425 zlog_err("waitpid failed: %s",safe_strerror(errno));
426 return;
427 case 0:
428 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
429 return;
430 }
431
432 if ((restart = find_child(child)) != NULL)
433 {
434 name = restart->name;
435 what = restart->what;
436 restart->pid = 0;
437 gs.numpids--;
438 thread_cancel(restart->t_kill);
439 restart->t_kill = NULL;
440 /* Update restart time to reflect the time the command completed. */
441 gettimeofday(&restart->time,NULL);
442 }
443 else
444 {
445 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000446 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000447 name = "(unknown)";
448 what = "background";
449 }
450 if (WIFSTOPPED(status))
451 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000452 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000453 else if (WIFSIGNALED(status))
454 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000455 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000456 else if (WIFEXITED(status))
457 {
458 if (WEXITSTATUS(status) != 0)
459 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000460 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000461 else
ajsf2d82572004-12-29 17:45:08 +0000462 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000463 }
464 else
465 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000466 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000467 phase_check();
468}
469
470static int
471run_job(struct restart_info *restart, const char *cmdtype, const char *command,
472 int force, int update_interval)
473{
474 struct timeval delay;
475
476 if (gs.loglevel > LOG_DEBUG+1)
477 zlog_debug("attempting to %s %s",cmdtype,restart->name);
478
479 if (restart->pid)
480 {
481 if (gs.loglevel > LOG_DEBUG+1)
482 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000483 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000484 return -1;
485 }
486
ajsa8a8ddc2005-01-12 16:24:51 +0000487 /* Note: time_elapsed test must come before the force test, since we need
488 to make sure that delay is initialized for use below in updating the
489 restart interval. */
490 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
491 !force)
ajs8b886ca2004-12-22 02:56:38 +0000492 {
493 if (gs.loglevel > LOG_DEBUG+1)
494 zlog_debug("postponing %s %s: "
495 "elapsed time %ld < retry interval %ld",
496 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
497 return -1;
498 }
499
500 gettimeofday(&restart->time,NULL);
501 restart->kills = 0;
502 {
503 char cmd[strlen(command)+strlen(restart->name)+1];
504 snprintf(cmd,sizeof(cmd),command,restart->name);
505 if ((restart->pid = run_background(cmd)) > 0)
506 {
507 restart->t_kill = thread_add_timer(master,restart_kill,restart,
508 gs.restart_timeout);
509 restart->what = cmdtype;
510 gs.numpids++;
511 }
512 else
513 restart->pid = 0;
514 }
515
516 /* Calculate the new restart interval. */
517 if (update_interval)
518 {
519 if (delay.tv_sec > 2*gs.max_restart_interval)
520 restart->interval = gs.min_restart_interval;
521 else if ((restart->interval *= 2) > gs.max_restart_interval)
522 restart->interval = gs.max_restart_interval;
523 if (gs.loglevel > LOG_DEBUG+1)
524 zlog_debug("restart %s interval is now %ld",
525 restart->name,restart->interval);
526 }
527 return restart->pid;
528}
529
/* Scheduling helpers.  Each one registers an event for daemon DMN with the
   thread master and stores the returned handle in the matching field of
   DMN.  The three wakeup helpers use a randomized delay around gs.period
   milliseconds. */

/* Call handle_read when DMN's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Call wakeup_down after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

/* Call wakeup_unresponsive after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Call wakeup_send_echo after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
544
545static int
546wakeup_down(struct thread *t_wakeup)
547{
548 struct daemon *dmn = THREAD_ARG(t_wakeup);
549
550 dmn->t_wakeup = NULL;
551 if (try_connect(dmn) < 0)
552 SET_WAKEUP_DOWN(dmn);
553 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
554 try_restart(dmn);
555 return 0;
556}
557
558static int
559wakeup_init(struct thread *t_wakeup)
560{
561 struct daemon *dmn = THREAD_ARG(t_wakeup);
562
563 dmn->t_wakeup = NULL;
564 if (try_connect(dmn) < 0)
565 {
566 SET_WAKEUP_DOWN(dmn);
567 zlog_err("%s state -> down : initial connection attempt failed",
568 dmn->name);
569 dmn->state = DAEMON_DOWN;
570 }
571 return 0;
572}
573
574static void
575daemon_down(struct daemon *dmn, const char *why)
576{
577 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
578 zlog_err("%s state -> down : %s",dmn->name,why);
579 else if (gs.loglevel > LOG_DEBUG)
580 zlog_debug("%s still down : %s",dmn->name,why);
581 if (IS_UP(dmn))
582 gs.numdown++;
583 dmn->state = DAEMON_DOWN;
584 if (dmn->fd >= 0)
585 {
586 close(dmn->fd);
587 dmn->fd = -1;
588 }
589 THREAD_OFF(dmn->t_read);
590 THREAD_OFF(dmn->t_write);
591 THREAD_OFF(dmn->t_wakeup);
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594 phase_check();
595}
596
597static int
598handle_read(struct thread *t_read)
599{
600 struct daemon *dmn = THREAD_ARG(t_read);
601 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
602 char buf[sizeof(resp)+100];
603 ssize_t rc;
604 struct timeval delay;
605
606 dmn->t_read = NULL;
607 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
608 {
609 char why[100];
610
611 if ((errno == EINTR) || (errno == EAGAIN))
612 {
613 /* Pretend it never happened. */
614 SET_READ_HANDLER(dmn);
615 return 0;
616 }
617 snprintf(why,sizeof(why),"unexpected read error: %s",
618 safe_strerror(errno));
619 daemon_down(dmn,why);
620 return 0;
621 }
622 if (rc == 0)
623 {
624 daemon_down(dmn,"read returned EOF");
625 return 0;
626 }
627 if (!dmn->echo_sent.tv_sec)
628 {
629 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000630 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
631 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000632 daemon_down(dmn,why);
633 return 0;
634 }
635
636 /* We are expecting an echo response: is there any chance that the
637 response would not be returned entirely in the first read? That
638 seems inconceivable... */
639 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
640 {
641 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000642 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
643 "(expecting %u): %.*s",
644 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000645 daemon_down(dmn,why);
646 return 0;
647 }
648
649 time_elapsed(&delay,&dmn->echo_sent);
650 dmn->echo_sent.tv_sec = 0;
651 if (dmn->state == DAEMON_UNRESPONSIVE)
652 {
653 if (delay.tv_sec < gs.timeout)
654 {
655 dmn->state = DAEMON_UP;
656 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
657 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
658 }
659 else
660 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
661 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
662 }
663 else if (gs.loglevel > LOG_DEBUG+1)
664 zlog_debug("%s: echo response received after %ld.%06ld seconds",
665 dmn->name,delay.tv_sec,delay.tv_usec);
666
667 SET_READ_HANDLER(dmn);
668 if (dmn->t_wakeup)
669 thread_cancel(dmn->t_wakeup);
670 SET_WAKEUP_ECHO(dmn);
671
672 return 0;
673}
674
675static void
676daemon_up(struct daemon *dmn, const char *why)
677{
678 dmn->state = DAEMON_UP;
679 gs.numdown--;
680 dmn->connect_tries = 0;
681 zlog_notice("%s state -> up : %s",dmn->name,why);
682 if (gs.do_ping)
683 SET_WAKEUP_ECHO(dmn);
684 phase_check();
685}
686
687static int
688check_connect(struct thread *t_write)
689{
690 struct daemon *dmn = THREAD_ARG(t_write);
691 int sockerr;
692 socklen_t reslen = sizeof(sockerr);
693
694 dmn->t_write = NULL;
695 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
696 {
697 zlog_warn("%s: check_connect: getsockopt failed: %s",
698 dmn->name,safe_strerror(errno));
699 daemon_down(dmn,"getsockopt failed checking connection success");
700 return 0;
701 }
702 if ((reslen == sizeof(sockerr)) && sockerr)
703 {
704 char why[100];
705 snprintf(why,sizeof(why),
706 "getsockopt reports that connection attempt failed: %s",
707 safe_strerror(sockerr));
708 daemon_down(dmn,why);
709 return 0;
710 }
711
712 daemon_up(dmn,"delayed connect succeeded");
713 return 0;
714}
715
716static int
717wakeup_connect_hanging(struct thread *t_wakeup)
718{
719 struct daemon *dmn = THREAD_ARG(t_wakeup);
720 char why[100];
721
722 dmn->t_wakeup = NULL;
723 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
724 gs.timeout);
725 daemon_down(dmn,why);
726 return 0;
727}
728
729/* Making connection to protocol daemon. */
730static int
731try_connect(struct daemon *dmn)
732{
733 int sock;
734 struct sockaddr_un addr;
735 socklen_t len;
736 int flags;
737
738 if (gs.loglevel > LOG_DEBUG+1)
739 zlog_debug("%s: attempting to connect",dmn->name);
740 dmn->connect_tries++;
741
742 memset (&addr, 0, sizeof (struct sockaddr_un));
743 addr.sun_family = AF_UNIX;
744 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
745 gs.vtydir,dmn->name);
746#ifdef HAVE_SUN_LEN
747 len = addr.sun_len = SUN_LEN(&addr);
748#else
749 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
750#endif /* HAVE_SUN_LEN */
751
752 /* Quick check to see if we might succeed before we go to the trouble
753 of creating a socket. */
754 if (access(addr.sun_path, W_OK) < 0)
755 {
756 if (errno != ENOENT)
757 zlog_err("%s: access to socket %s denied: %s",
758 dmn->name,addr.sun_path,safe_strerror(errno));
759 return -1;
760 }
761
762 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
763 {
764 zlog_err("%s(%s): cannot make socket: %s",
765 __func__,addr.sun_path, safe_strerror(errno));
766 return -1;
767 }
768
769 /* Set non-blocking. */
770 if ((flags = fcntl(sock, F_GETFL, 0)) < 0)
771 {
772 zlog_err("%s(%s): fcntl(F_GETFL) failed: %s",
773 __func__,addr.sun_path, safe_strerror(errno));
774 close(sock);
775 return -1;
776 }
777 if (fcntl(sock, F_SETFL, (flags|O_NONBLOCK)) < 0)
778 {
779 zlog_err("%s(%s): fcntl(F_SETFL,O_NONBLOCK) failed: %s",
780 __func__,addr.sun_path, safe_strerror(errno));
781 close(sock);
782 return -1;
783 }
784
785 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
786 {
787 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
788 {
789 if (gs.loglevel > LOG_DEBUG)
790 zlog_debug("%s(%s): connect failed: %s",
791 __func__,addr.sun_path, safe_strerror(errno));
792 close (sock);
793 return -1;
794 }
795 if (gs.loglevel > LOG_DEBUG)
796 zlog_debug("%s: connection in progress",dmn->name);
797 dmn->state = DAEMON_CONNECTING;
798 dmn->fd = sock;
799 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
800 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
801 gs.timeout);
802 SET_READ_HANDLER(dmn);
803 return 0;
804 }
805
806 dmn->fd = sock;
807 SET_READ_HANDLER(dmn);
808 daemon_up(dmn,"connect succeeded");
809 return 1;
810}
811
812static int
813phase_hanging(struct thread *t_hanging)
814{
815 gs.t_phase_hanging = NULL;
816 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
817 phase_str[gs.phase],PHASE_TIMEOUT);
818 gs.phase = PHASE_NONE;
819 return 0;
820}
821
822static void
823set_phase(restart_phase_t new_phase)
824{
825 gs.phase = new_phase;
826 if (gs.t_phase_hanging)
827 thread_cancel(gs.t_phase_hanging);
828 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
829 PHASE_TIMEOUT);
830}
831
832static void
833phase_check(void)
834{
835 switch (gs.phase)
836 {
837 case PHASE_NONE:
838 break;
839 case PHASE_STOPS_PENDING:
840 if (gs.numpids)
841 break;
842 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
843 set_phase(PHASE_WAITING_DOWN);
844 /*FALLTHRU*/
845 case PHASE_WAITING_DOWN:
846 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
847 break;
848 zlog_info("Phased restart: all routing daemons now down.");
849 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
850 set_phase(PHASE_ZEBRA_RESTART_PENDING);
851 /*FALLTHRU*/
852 case PHASE_ZEBRA_RESTART_PENDING:
853 if (gs.special->restart.pid)
854 break;
855 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
856 set_phase(PHASE_WAITING_ZEBRA_UP);
857 /*FALLTHRU*/
858 case PHASE_WAITING_ZEBRA_UP:
859 if (!IS_UP(gs.special))
860 break;
861 zlog_info("Phased restart: %s is now up.",gs.special->name);
862 {
863 struct daemon *dmn;
864 for (dmn = gs.daemons; dmn; dmn = dmn->next)
865 {
866 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000867 run_job(&dmn->restart,"start",gs.start_command,1,0);
ajs8b886ca2004-12-22 02:56:38 +0000868 }
869 }
870 gs.phase = PHASE_NONE;
871 THREAD_OFF(gs.t_phase_hanging);
872 zlog_notice("Phased global restart has completed.");
873 break;
874 }
875}
876
877static void
878try_restart(struct daemon *dmn)
879{
880 switch (gs.mode)
881 {
882 case MODE_MONITOR:
883 return;
884 case MODE_GLOBAL_RESTART:
885 run_job(&gs.restart,"restart",gs.restart_command,0,1);
886 break;
887 case MODE_SEPARATE_RESTART:
888 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
889 break;
890 case MODE_PHASED_ZEBRA_RESTART:
891 if (dmn != gs.special)
892 {
893 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
894 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
895 else
896 zlog_debug("%s: postponing restart attempt because master %s daemon "
897 "not up [%s], or phased restart in progress",
898 dmn->name,gs.special->name,state_str[gs.special->state]);
899 break;
900 }
901 /*FALLTHRU*/
902 case MODE_PHASED_ALL_RESTART:
903 if ((gs.phase != PHASE_NONE) || gs.numpids)
904 {
905 if (gs.loglevel > LOG_DEBUG+1)
906 zlog_debug("postponing phased global restart: restart already in "
907 "progress [%s], or outstanding child processes [%d]",
908 phase_str[gs.phase],gs.numpids);
909 break;
910 }
911 /* Is it too soon for a restart? */
912 {
913 struct timeval delay;
914 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
915 gs.special->restart.interval)
916 {
917 if (gs.loglevel > LOG_DEBUG+1)
918 zlog_debug("postponing phased global restart: "
919 "elapsed time %ld < retry interval %ld",
920 (long)delay.tv_sec,gs.special->restart.interval);
921 break;
922 }
923 }
924 zlog_info("Phased restart: stopping all routing daemons.");
925 /* First step: stop all other daemons. */
926 for (dmn = gs.daemons; dmn; dmn = dmn->next)
927 {
928 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000929 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
ajs8b886ca2004-12-22 02:56:38 +0000930 }
931 set_phase(PHASE_STOPS_PENDING);
932 break;
933 default:
934 zlog_err("error: unknown restart mode %d",gs.mode);
935 break;
936 }
937}
938
939static int
940wakeup_unresponsive(struct thread *t_wakeup)
941{
942 struct daemon *dmn = THREAD_ARG(t_wakeup);
943
944 dmn->t_wakeup = NULL;
945 if (dmn->state != DAEMON_UNRESPONSIVE)
946 zlog_err("%s: no longer unresponsive (now %s), "
947 "wakeup should have been cancelled!",
948 dmn->name,state_str[dmn->state]);
949 else
950 {
951 SET_WAKEUP_UNRESPONSIVE(dmn);
952 try_restart(dmn);
953 }
954 return 0;
955}
956
957static int
958wakeup_no_answer(struct thread *t_wakeup)
959{
960 struct daemon *dmn = THREAD_ARG(t_wakeup);
961
962 dmn->t_wakeup = NULL;
963 dmn->state = DAEMON_UNRESPONSIVE;
964 zlog_err("%s state -> unresponsive : no response yet to ping "
965 "sent %ld seconds ago",dmn->name,gs.timeout);
966 if (gs.unresponsive_restart)
967 {
968 SET_WAKEUP_UNRESPONSIVE(dmn);
969 try_restart(dmn);
970 }
971 return 0;
972}
973
974static int
975wakeup_send_echo(struct thread *t_wakeup)
976{
977 static const char echocmd[] = "echo " PING_TOKEN;
978 ssize_t rc;
979 struct daemon *dmn = THREAD_ARG(t_wakeup);
980
981 dmn->t_wakeup = NULL;
982 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
983 ((size_t)rc != sizeof(echocmd)))
984 {
985 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000986 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
987 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000988 daemon_down(dmn,why);
989 }
990 else
991 {
992 gettimeofday(&dmn->echo_sent,NULL);
993 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
994 }
995 return 0;
996}
997
/* Termination handler (registered in main for both SIGINT and SIGTERM):
   log a notice and exit with status 0. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(0);
}
1004
/* Check that a command template contains exactly one '%' character and
   that it is immediately followed by 's' (i.e. a single "%s" placeholder).
   Returns 1 if the template is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  /* There must be a '%' at all ... */
  if (pct == NULL)
    return 0;
  /* ... it must introduce "%s" ... */
  if (pct[1] != 's')
    return 0;
  /* ... and no further '%' may follow. */
  return strchr(pct+1,'%') == NULL;
}
1012
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space.  Occurrences are matched
   left to right in one pass and the replacement text is never rescanned,
   so a blankstr that is itself " " cannot cause an endless loop.  An empty
   blankstr matches nothing and yields a plain copy of cmd.  The caller owns
   the returned buffer.  On allocation failure an error is printed and the
   program exits with status 1. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t bslen = strlen(blankstr);
  char *res;
  char *out;

  /* Each match replaces bslen >= 1 characters with one, so the result is
     never longer than the input. */
  if (!(res = malloc(strlen(cmd)+1)))
    {
      perror("malloc");
      exit(1);
    }

  out = res;
  while (*cmd != '\0')
    {
      if ((bslen > 0) && !strncmp(cmd,blankstr,bslen))
        {
          *out++ = ' ';
          cmd += bslen;
        }
      else
        *out++ = *cmd++;
    }
  *out = '\0';
  return res;
}
1035
ajs8b886ca2004-12-22 02:56:38 +00001036int
1037main(int argc, char **argv)
1038{
1039 const char *progname;
1040 int opt;
1041 int daemon_mode = 0;
1042 const char *pidfile = DEFAULT_PIDFILE;
1043 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001044 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001045 static struct quagga_signal_t my_signals[] =
1046 {
1047 {
1048 .signal = SIGINT,
1049 .handler = sigint,
1050 },
1051 {
1052 .signal = SIGTERM,
1053 .handler = sigint,
1054 },
1055 {
1056 .signal = SIGCHLD,
1057 .handler = sigchild,
1058 },
1059 };
1060
1061 if ((progname = strrchr (argv[0], '/')) != NULL)
1062 progname++;
1063 else
1064 progname = argv[0];
1065
ajs098e2402004-12-22 17:00:46 +00001066 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001067 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001068 longopts, 0)) != EOF)
1069 {
1070 switch (opt)
1071 {
1072 case 0:
1073 break;
1074 case 'a':
1075 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1076 {
1077 fputs("Ambiguous operating mode selected.\n",stderr);
1078 return usage(progname,1);
1079 }
1080 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1081 break;
1082 case 'A':
1083 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1084 {
1085 fputs("Ambiguous operating mode selected.\n",stderr);
1086 return usage(progname,1);
1087 }
1088 gs.mode = MODE_PHASED_ALL_RESTART;
1089 break;
ajsc8b40f82004-12-22 16:17:16 +00001090 case 'b':
1091 blankstr = optarg;
1092 break;
ajs8b886ca2004-12-22 02:56:38 +00001093 case 'd':
1094 daemon_mode = 1;
1095 break;
1096 case 'e':
1097 gs.do_ping = 0;
1098 break;
1099 case 'k':
1100 if (!valid_command(optarg))
1101 {
1102 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1103 optarg);
1104 return usage(progname,1);
1105 }
1106 gs.stop_command = optarg;
1107 break;
1108 case 'l':
1109 {
1110 char garbage[3];
1111 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1112 (gs.loglevel < LOG_EMERG))
1113 {
1114 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1115 return usage(progname,1);
1116 }
1117 }
1118 break;
1119 case 'm':
1120 {
1121 char garbage[3];
1122 if ((sscanf(optarg,"%ld%1s",
1123 &gs.min_restart_interval,garbage) != 1) ||
1124 (gs.min_restart_interval < 0))
1125 {
1126 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1127 optarg);
1128 return usage(progname,1);
1129 }
1130 }
1131 break;
1132 case 'M':
1133 {
1134 char garbage[3];
1135 if ((sscanf(optarg,"%ld%1s",
1136 &gs.max_restart_interval,garbage) != 1) ||
1137 (gs.max_restart_interval < 0))
1138 {
1139 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1140 optarg);
1141 return usage(progname,1);
1142 }
1143 }
1144 break;
1145 case 'i':
1146 {
1147 char garbage[3];
1148 int period;
1149 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1150 (gs.period < 1))
1151 {
1152 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1153 return usage(progname,1);
1154 }
1155 gs.period = 1000*period;
1156 }
1157 break;
1158 case 'p':
1159 pidfile = optarg;
1160 break;
1161 case 'r':
1162 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1163 (gs.mode == MODE_SEPARATE_RESTART))
1164 {
1165 fputs("Ambiguous operating mode selected.\n",stderr);
1166 return usage(progname,1);
1167 }
1168 if (!valid_command(optarg))
1169 {
1170 fprintf(stderr,
1171 "Invalid restart command, must contain '%%s': %s\n",
1172 optarg);
1173 return usage(progname,1);
1174 }
1175 gs.restart_command = optarg;
1176 if (gs.mode == MODE_MONITOR)
1177 gs.mode = MODE_SEPARATE_RESTART;
1178 break;
1179 case 'R':
1180 if (gs.mode != MODE_MONITOR)
1181 {
1182 fputs("Ambiguous operating mode selected.\n",stderr);
1183 return usage(progname,1);
1184 }
1185 if (strchr(optarg,'%'))
1186 {
1187 fprintf(stderr,
1188 "Invalid restart-all arg, must not contain '%%s': %s\n",
1189 optarg);
1190 return usage(progname,1);
1191 }
1192 gs.restart_command = optarg;
1193 gs.mode = MODE_GLOBAL_RESTART;
1194 break;
1195 case 's':
1196 if (!valid_command(optarg))
1197 {
1198 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1199 optarg);
1200 return usage(progname,1);
1201 }
1202 gs.start_command = optarg;
1203 break;
1204 case 'S':
1205 gs.vtydir = optarg;
1206 break;
1207 case 't':
1208 {
1209 char garbage[3];
1210 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1211 (gs.timeout < 1))
1212 {
1213 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1214 return usage(progname,1);
1215 }
1216 }
1217 break;
1218 case 'T':
1219 {
1220 char garbage[3];
1221 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1222 (gs.restart_timeout < 1))
1223 {
1224 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1225 return usage(progname,1);
1226 }
1227 }
1228 break;
1229 case 'z':
1230 gs.unresponsive_restart = 1;
1231 break;
1232 case 'v':
1233 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1234 puts("Copyright 2004 Andrew J. Schorr");
1235 return 0;
1236 case 'h':
1237 return usage(progname,0);
1238 default:
1239 fputs("Invalid option.\n",stderr);
1240 return usage(progname,1);
1241 }
1242 }
1243
1244 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1245 {
1246 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1247 return usage(progname,1);
1248 }
1249 switch (gs.mode)
1250 {
1251 case MODE_MONITOR:
1252 if (gs.restart_command || gs.start_command || gs.stop_command)
1253 {
1254 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1255 mode_str[gs.mode]);
1256 return usage(progname,1);
1257 }
1258 break;
1259 case MODE_GLOBAL_RESTART:
1260 case MODE_SEPARATE_RESTART:
1261 if (!gs.restart_command || gs.start_command || gs.stop_command)
1262 {
1263 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1264 mode_str[gs.mode]);
1265 return usage(progname,1);
1266 }
1267 break;
1268 case MODE_PHASED_ZEBRA_RESTART:
1269 case MODE_PHASED_ALL_RESTART:
1270 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1271 {
1272 fprintf(stderr,
1273 "Need start, kill, and restart commands in [%s] mode.\n",
1274 mode_str[gs.mode]);
1275 return usage(progname,1);
1276 }
1277 break;
1278 }
1279
ajsc8b40f82004-12-22 16:17:16 +00001280 if (blankstr)
1281 {
1282 if (gs.restart_command)
1283 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1284 if (gs.start_command)
1285 gs.start_command = translate_blanks(gs.start_command,blankstr);
1286 if (gs.stop_command)
1287 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1288 }
1289
ajs8b886ca2004-12-22 02:56:38 +00001290 gs.restart.interval = gs.min_restart_interval;
1291 master = thread_master_create();
1292 signal_init (master, Q_SIGC(my_signals), my_signals);
1293 srandom(time(NULL));
1294
1295 {
1296 int i;
1297 struct daemon *tail = NULL;
1298
1299 for (i = optind; i < argc; i++)
1300 {
1301 struct daemon *dmn;
1302
1303 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1304 {
ajs098e2402004-12-22 17:00:46 +00001305 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1306 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001307 return 1;
1308 }
1309 dmn->name = dmn->restart.name = argv[i];
1310 dmn->state = DAEMON_INIT;
1311 gs.numdaemons++;
1312 gs.numdown++;
1313 dmn->fd = -1;
1314 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1315 100+(random() % 900));
1316 dmn->restart.interval = gs.min_restart_interval;
1317 if (tail)
1318 tail->next = dmn;
1319 else
1320 gs.daemons = dmn;
1321 tail = dmn;
1322
1323 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1324 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1325 !strcmp(dmn->name,special))
1326 gs.special = dmn;
1327 }
1328 }
1329 if (!gs.daemons)
1330 {
1331 fputs("Must specify one or more daemons to monitor.\n",stderr);
1332 return usage(progname,1);
1333 }
1334 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1335 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1336 {
1337 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1338 mode_str[gs.mode],special);
1339 return usage(progname,1);
1340 }
1341 if (gs.special && (gs.numdaemons < 2))
1342 {
1343 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1344 "to watch.\n",mode_str[gs.mode]);
1345 return usage(progname,1);
1346 }
1347
1348 zlog_default = openzlog(progname, ZLOG_NONE,
1349 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1350 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1351 if (daemon_mode)
1352 {
1353 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1354 daemon(0, 0);
1355 }
1356 else
1357 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1358
1359 /* Make sure we're not already running. */
1360 pid_output (pidfile);
1361
1362 /* Announce which daemons are being monitored. */
1363 {
1364 struct daemon *dmn;
1365 size_t len = 0;
1366
1367 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1368 len += strlen(dmn->name)+1;
1369
1370 {
1371 char buf[len+1];
1372 char *p = buf;
1373
1374 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1375 {
1376 if (p != buf)
1377 *p++ = ' ';
1378 strcpy(p,dmn->name);
1379 p += strlen(p);
1380 }
1381 zlog_notice("%s %s watching [%s], mode [%s]",
1382 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1383 }
1384 }
1385
1386 {
1387 struct thread thread;
1388
1389 while (thread_fetch (master, &thread))
1390 thread_call (&thread);
1391 }
1392
1393 /* Not reached. */
1394 return 0;
1395}