blob: efc29070c6f9993c0d55667a0d936e71fb02adde [file] [log] [blame]
/*
    $Id: watchquagga.c,v 1.5 2004/12/22 17:00:46 ajs Exp $

    Monitor status of quagga daemons and restart if necessary.

    Copyright (C) 2004  Andrew J. Schorr

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
22
23/* System headers: */
24#include <sys/types.h>
25#include <sys/socket.h>
26#include <sys/un.h>
27#include <unistd.h>
28#include <stdlib.h>
29#include <stdio.h>
30#include <signal.h>
31#include <fcntl.h>
32#include <errno.h>
33#include <time.h>
ajs098e2402004-12-22 17:00:46 +000034#include <string.h>
ajs8b886ca2004-12-22 02:56:38 +000035#include <sys/time.h>
36#include <sys/wait.h>
37
38/* Quagga headers: */
ajs6028df52004-12-22 14:08:13 +000039#ifdef HAVE_CONFIG_H
40#include "config.h"
41#endif
42
ajs8b886ca2004-12-22 02:56:38 +000043#include <thread.h>
44#include <log.h>
45#include <sigevent.h>
46#include <version.h>
47
/* Provide MIN only if no earlier header defined it.  Note that both
   arguments may be evaluated twice. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): random() % (X+1) shifted down by X/2; for non-negative X this
   yields a value in the range [-(X/2), X-(X/2)]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X perturbed by JITTER(X/20). */
#define FUZZY(X) ((X)+JITTER((X)/20))
55
/* Compiled-in defaults.  The time values are in seconds, matching the
   option descriptions printed by usage(). */
#define DEFAULT_PERIOD		5	/* status polling interval (-i) */
#define DEFAULT_TIMEOUT		10	/* unresponsiveness timeout (-t) */
#define DEFAULT_RESTART_TIMEOUT	20	/* restart (kill) timeout (-T) */
#define DEFAULT_LOGLEVEL	LOG_INFO
#define DEFAULT_MIN_RESTART	60	/* minimum restart interval (-m) */
#define DEFAULT_MAX_RESTART	600	/* maximum restart interval (-M) */
/* Pid file: use the build-supplied path when available, otherwise a file
   under STATEDIR. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif
/* Directory searched for the daemons' "<name>.vty" sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the "echo" command and expected back in the reply. */
#define PING_TOKEN	"PING"
74
/* Needs to be global, referenced somewhere inside libzebra.
   Passed as the first argument to every thread_add_*() call in this file;
   it is not assigned in the code visible here (presumably in main()). */
struct thread_master *master;
77
/* Operating mode.  The numeric values index mode_str[] and match the mode
   numbers shown in the usage() text. */
typedef enum
{
  MODE_MONITOR = 0,		/* only monitor and report status changes */
  MODE_GLOBAL_RESTART,		/* one command restarts everything */
  MODE_SEPARATE_RESTART,	/* restart only the daemon in trouble */
  MODE_PHASED_ZEBRA_RESTART,	/* phased restart when zebra is in trouble */
  MODE_PHASED_ALL_RESTART	/* phased restart on any daemon failure */
} watch_mode_t;
86
/* Human-readable mode names, indexed by watch_mode_t; printed by usage(). */
static const char *mode_str[] =
{
  "monitor",
  "global restart",
  "individual daemon restart",
  "phased zebra restart",
  "phased global restart for any failure",
};
95
/* Steps of a phased restart, in the order phase_check() walks through
   them.  The values index phase_str[]. */
typedef enum
{
  PHASE_NONE = 0,		/* no phased restart in progress */
  PHASE_STOPS_PENDING,		/* stop jobs have been launched */
  PHASE_WAITING_DOWN,		/* waiting for the other daemons to go down */
  PHASE_ZEBRA_RESTART_PENDING,	/* restart job for gs.special is running */
  PHASE_WAITING_ZEBRA_UP	/* waiting for gs.special to come up */
} restart_phase_t;
104
/* Human-readable phase descriptions, indexed by restart_phase_t.
   NOTE(review): the last entry has no corresponding enumerator in
   restart_phase_t, so nothing visible here can select it -- confirm
   whether it is a leftover. */
static const char *phase_str[] =
{
  "None",
  "Stop jobs running",
  "Waiting for other daemons to come down",
  "Zebra restart job running",
  "Waiting for zebra to come up",
  "Start jobs running",
};
114
/* Time allowed for a single phase before phase_hanging() aborts the phased
   restart: three times the restart timeout (a long, hence "%ld"). */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
116
/* Book-keeping for one background (re)start/stop job. */
struct restart_info
{
  const char *name;	/* substituted into the command; used in log messages */
  const char *what;	/* job type string recorded by run_job() */
  pid_t pid;		/* running child's pid, or 0 when no job is running */
  struct timeval time;	/* when the job was started; reset when it completes */
  long interval;	/* minimum seconds between non-forced jobs */
  struct thread *t_kill; /* timer that fires restart_kill() */
  int kills;		/* signals sent so far: 0 => SIGTERM next, else SIGKILL */
};
127
/* Program-wide configuration and run-time state. */
static struct global_state
{
  watch_mode_t mode;		/* operating mode */
  restart_phase_t phase;	/* current step of a phased restart */
  struct thread *t_phase_hanging; /* timer armed by set_phase() */
  const char *vtydir;		/* directory holding the "<name>.vty" sockets */
  long period;			/* polling period in milliseconds
				   (passed to thread_add_timer_msec) */
  long timeout;			/* echo/connect timeout, seconds */
  long restart_timeout;		/* seconds before restart_kill() signals a job */
  long min_restart_interval;	/* seconds */
  long max_restart_interval;	/* seconds */
  int do_ping;			/* nonzero: send echo pings to daemons */
  struct daemon *daemons;	/* head of the singly linked daemon list */
  const char *restart_command;	/* format strings handed to run_job() */
  const char *start_command;
  const char *stop_command;
  struct restart_info restart;	/* job state used in MODE_GLOBAL_RESTART */
  int unresponsive_restart;	/* nonzero: restart unresponsive daemons */
  int loglevel;
  struct daemon *special;	/* points to zebra when doing phased restart */
  int numdaemons;
  int numpids;			/* background jobs currently running */
  int numdown;			/* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
  .mode = MODE_MONITOR,
  .phase = PHASE_NONE,
  .vtydir = VTYDIR,
  .period = 1000*DEFAULT_PERIOD,	/* seconds -> milliseconds */
  .timeout = DEFAULT_TIMEOUT,
  .restart_timeout = DEFAULT_RESTART_TIMEOUT,
  .loglevel = DEFAULT_LOGLEVEL,
  .min_restart_interval = DEFAULT_MIN_RESTART,
  .max_restart_interval = DEFAULT_MAX_RESTART,
  .do_ping = 1,
};
163
/* Connection state of a monitored daemon.  The values index state_str[]. */
typedef enum
{
  DAEMON_INIT,		/* presumably the state before the first connection
			   attempt -- the assignment is not visible here */
  DAEMON_DOWN,		/* no connection */
  DAEMON_CONNECTING,	/* non-blocking connect() in progress */
  DAEMON_UP,		/* connected */
  DAEMON_UNRESPONSIVE	/* connected, but an echo went unanswered in time */
} daemon_state_t;
172
/* True when the daemon is connected, whether or not it is answering echo
   requests.  Evaluates DMN twice. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
175
/* Human-readable state names, indexed by daemon_state_t. */
static const char *state_str[] =
{
  "Init",
  "Down",
  "Connecting",
  "Up",
  "Unresponsive",
};
184
/* One monitored daemon; instances are chained through `next'. */
struct daemon {
  const char *name;		/* used for the socket path and in log messages */
  daemon_state_t state;
  int fd;			/* vty socket; set to -1 once closed */
  struct timeval echo_sent;	/* when the last echo was sent;
				   tv_sec == 0 means no echo is outstanding */
  u_int connect_tries;		/* bumped by try_connect(), reset by daemon_up() */
  struct thread *t_wakeup;	/* pending timer, if any */
  struct thread *t_read;	/* pending read handler, if any */
  struct thread *t_write;	/* pending connect-completion handler, if any */
  struct daemon *next;
  struct restart_info restart;	/* per-daemon background job state */
};
197
/* Long option table for getopt_long(): each entry maps a long name to the
   short option letter it is equivalent to.  The all-zero entry terminates
   the table. */
static const struct option longopts[] =
{
  { "daemon", no_argument, NULL, 'd'},
  { "statedir", required_argument, NULL, 'S'},
  { "no-echo", no_argument, NULL, 'e'},
  { "loglevel", required_argument, NULL, 'l'},
  { "interval", required_argument, NULL, 'i'},
  { "timeout", required_argument, NULL, 't'},
  { "restart-timeout", required_argument, NULL, 'T'},
  { "restart", required_argument, NULL, 'r'},
  { "start-command", required_argument, NULL, 's'},
  { "kill-command", required_argument, NULL, 'k'},
  { "restart-all", required_argument, NULL, 'R'},
  { "all-restart", no_argument, NULL, 'a'},
  { "always-all-restart", no_argument, NULL, 'A'},
  { "unresponsive-restart", no_argument, NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  { "pid-file", required_argument, NULL, 'p'},
  { "blank-string", required_argument, NULL, 'b'},
  { "help", no_argument, NULL, 'h'},
  { "version", no_argument, NULL, 'v'},
  { NULL, 0, NULL, 0 }
};
222
/* Forward declarations for functions that are used before their
   definitions appear below. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
227
/* Print usage information and return STATUS unchanged, so that callers can
   write "return usage(progname,1);".
   A non-zero STATUS prints only a one-line hint on stderr; a zero STATUS
   prints the full help text on stdout.  The conversion specifications in
   the help text are consumed strictly in order by the argument list that
   follows the format string, so the two must be edited together. */
static int
usage(const char *progname, int status)
{
  if (status != 0)
    fprintf(stderr, "Try `%s --help' for more information.\n", progname);
  else
    printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
Watchdog program to monitor status of quagga daemons and try to restart\n\
them if they are down or unresponsive. It determines whether a daemon is\n\
up based on whether it can connect to the daemon's vty unix stream socket.\n\
It then repeatedly sends echo commands over that socket to determine whether\n\
the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
on the socket connection and know immediately that the daemon is down.\n\n\
The daemons to be monitored should be listed on the command line.\n\n\
This program can run in one of 5 modes:\n\n\
0. Mode: %s.\n\
 Just monitor and report on status changes. Example:\n\
 %s -d zebra ospfd bgpd\n\n\
1. Mode: %s.\n\
 Whenever any daemon hangs or crashes, use the given command to restart\n\
 them all. Example:\n\
 %s -dz \\\n\
 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
 zebra ospfd\n\n\
2. Mode: %s.\n\
 When any single daemon hangs or crashes, restart only the daemon that's\n\
 in trouble using the supplied restart command. Example:\n\
 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
3. Mode: %s.\n\
 The same as the previous mode, except that there is special treatment when\n\
 the zebra daemon is in trouble. In that case, a phased restart approach\n\
 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
 daemons. Example:\n\
 %s -adz -r '/sbin/service %%s restart' \\\n\
 -s '/sbin/service %%s start' \\\n\
 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
4. Mode: %s.\n\
 This is the same as the previous mode, except that the phased restart\n\
 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
 %s -Adz -r '/sbin/service %%s restart' \\\n\
 -s '/sbin/service %%s start' \\\n\
 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
As of this writing, it is believed that mode 2 [%s]\n\
is not safe, and mode 3 [%s] may not be safe with some of the\n\
routing daemons.\n\n\
In order to avoid attempting to restart the daemons in a fast loop,\n\
the -m and -M options allow you to control the minimum delay between\n\
restart commands. The minimum restart delay is recalculated each time\n\
a restart is attempted: if the time since the last restart attempt exceeds\n\
twice the -M value, then the restart delay is set to the -m value.\n\
Otherwise, the interval is doubled (but capped at the -M value).\n\n\
Options:\n\
-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
 to syslog instead of stdout.\n\
-S, --statedir Set the vty socket directory (default is %s)\n\
-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
 option is necessary if the daemons do not support the\n\
 echo command)\n\
-l, --loglevel Set the logging level (default is %d).\n\
 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
 but it can be set higher than %d if extra-verbose debugging\n\
 messages are desired.\n\
-m, --min-restart-interval\n\
 Set the minimum seconds to wait between invocations of daemon\n\
 restart commands (default is %d).\n\
-M, --max-restart-interval\n\
 Set the maximum seconds to wait between invocations of daemon\n\
 restart commands (default is %d).\n\
-i, --interval Set the status polling interval in seconds (default is %d)\n\
-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
-T, --restart-timeout\n\
 Set the restart (kill) timeout in seconds (default is %d).\n\
 If any background jobs are still running after this much\n\
 time has elapsed, they will be killed.\n\
-r, --restart Supply a Bourne shell command to use to restart a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
 Note that -r and -R are incompatible.\n\
-s, --start-command\n\
 Supply a Bourne shell to command to use to start a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
-k, --kill-command\n\
 Supply a Bourne shell to command to use to stop a single\n\
 daemon. The command string should include '%%s' where the\n\
 name of the daemon should be substituted.\n\
-R, --restart-all\n\
 When one or more daemons is down, try to restart everything\n\
 using the Bourne shell command supplied as the argument.\n\
 Note that -r and -R are incompatible.\n\
-z, --unresponsive-restart\n\
 When a daemon is unresponsive, treat it as being down for\n\
 restart purposes.\n\
-a, --all-restart\n\
 When zebra hangs or crashes, restart all daemons using\n\
 this phased approach: 1. stop all other daemons; 2. restart\n\
 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
-A, --always-all-restart\n\
 When any daemon (not just zebra) hangs or crashes, use the\n\
 same phased restart mechanism described above for -a.\n\
 Requires -r, -s, and -k.\n\
-p, --pid-file Set process identifier file name\n\
 (default is %s).\n\
-b, --blank-string\n\
 When the supplied argument string is found in any of the\n\
 various shell command arguments (-r, -s, -k, or -R), replace\n\
 it with a space. This is an ugly hack to circumvent problems\n\
 passing command-line arguments with embedded spaces.\n\
-v, --version Print program version\n\
-h, --help Display this help and exit\n\
", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);

  return status;
}
346
/* Launch SHELL_CMD in the background via "/bin/sh -c".
   Returns the child's pid in the parent, or -1 if fork() failed.  The
   child is not waited for here; it is reaped later from the SIGCHLD
   handler. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
	       shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child != 0)
    {
      /* Parent: just report the launch and hand back the pid. */
      zlog_err("Forked background command [pid %d]: %s",child,shell_cmd);
      return child;
    }

  /* Child from here on.  Become the leader of a new process group so that
     the whole job can be signalled through the negated pid. */
  if (setpgid(0,0) < 0)
    zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
  {
    const char *sh_argv[4] = { "sh", "-c", shell_cmd, NULL};

    execv("/bin/sh",(char *const *)sh_argv);
    /* execv() only returns on failure. */
    zlog_err("execv(/bin/sh -c '%s') failed: %s",
	     shell_cmd,safe_strerror(errno));
    _exit(127);
  }
}
376
377static struct timeval *
378time_elapsed(struct timeval *result, const struct timeval *start_time)
379{
380 gettimeofday(result,NULL);
381 result->tv_sec -= start_time->tv_sec;
382 result->tv_usec -= start_time->tv_usec;
383 while (result->tv_usec < 0)
384 {
385 result->tv_usec += 1000000L;
386 result->tv_sec--;
387 }
388 return result;
389}
390
391static int
392restart_kill(struct thread *t_kill)
393{
394 struct restart_info *restart = THREAD_ARG(t_kill);
395 struct timeval delay;
396
397 time_elapsed(&delay,&restart->time);
398 zlog_warn("Warning: %s %s child process %d still running after "
399 "%ld seconds, sending signal %d",
400 restart->what,restart->name,restart->pid,delay.tv_sec,
401 (restart->kills ? SIGKILL : SIGTERM));
402 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
403 restart->kills++;
404 restart->t_kill = thread_add_timer(master,restart_kill,restart,
405 gs.restart_timeout);
406 return 0;
407}
408
409static struct restart_info *
410find_child(pid_t child)
411{
412 if (gs.mode == MODE_GLOBAL_RESTART)
413 {
414 if (gs.restart.pid == child)
415 return &gs.restart;
416 }
417 else
418 {
419 struct daemon *dmn;
420 for (dmn = gs.daemons; dmn; dmn = dmn->next)
421 {
422 if (dmn->restart.pid == child)
423 return &dmn->restart;
424 }
425 }
426 return NULL;
427}
428
429static void
430sigchild(void)
431{
432 pid_t child;
433 int status;
434 const char *name;
435 const char *what;
436 struct restart_info *restart;
437
438 switch (child = waitpid(-1,&status,WNOHANG))
439 {
440 case -1:
441 zlog_err("waitpid failed: %s",safe_strerror(errno));
442 return;
443 case 0:
444 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
445 return;
446 }
447
448 if ((restart = find_child(child)) != NULL)
449 {
450 name = restart->name;
451 what = restart->what;
452 restart->pid = 0;
453 gs.numpids--;
454 thread_cancel(restart->t_kill);
455 restart->t_kill = NULL;
456 /* Update restart time to reflect the time the command completed. */
457 gettimeofday(&restart->time,NULL);
458 }
459 else
460 {
461 zlog_err("waitpid returned status for an unknown child process %d",
462 child);
463 name = "(unknown)";
464 what = "background";
465 }
466 if (WIFSTOPPED(status))
467 zlog_warn("warning: %s %s process %d is stopped",
468 what,name,child);
469 else if (WIFSIGNALED(status))
470 zlog_warn("%s %s process %d terminated due to signal %d",
471 what,name,child,WTERMSIG(status));
472 else if (WIFEXITED(status))
473 {
474 if (WEXITSTATUS(status) != 0)
475 zlog_warn("%s %s process %d exited with non-zero status %d",
476 what,name,child,WEXITSTATUS(status));
477 else
478 zlog_debug("%s %s process %d exited normally",what,name,child);
479 }
480 else
481 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
482 what,name,child,status);
483 phase_check();
484}
485
486static int
487run_job(struct restart_info *restart, const char *cmdtype, const char *command,
488 int force, int update_interval)
489{
490 struct timeval delay;
491
492 if (gs.loglevel > LOG_DEBUG+1)
493 zlog_debug("attempting to %s %s",cmdtype,restart->name);
494
495 if (restart->pid)
496 {
497 if (gs.loglevel > LOG_DEBUG+1)
498 zlog_debug("cannot %s %s, previous pid %d still running",
499 cmdtype,restart->name,restart->pid);
500 return -1;
501 }
502
503 if (!force &&
504 (time_elapsed(&delay,&restart->time)->tv_sec < restart->interval))
505 {
506 if (gs.loglevel > LOG_DEBUG+1)
507 zlog_debug("postponing %s %s: "
508 "elapsed time %ld < retry interval %ld",
509 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
510 return -1;
511 }
512
513 gettimeofday(&restart->time,NULL);
514 restart->kills = 0;
515 {
516 char cmd[strlen(command)+strlen(restart->name)+1];
517 snprintf(cmd,sizeof(cmd),command,restart->name);
518 if ((restart->pid = run_background(cmd)) > 0)
519 {
520 restart->t_kill = thread_add_timer(master,restart_kill,restart,
521 gs.restart_timeout);
522 restart->what = cmdtype;
523 gs.numpids++;
524 }
525 else
526 restart->pid = 0;
527 }
528
529 /* Calculate the new restart interval. */
530 if (update_interval)
531 {
532 if (delay.tv_sec > 2*gs.max_restart_interval)
533 restart->interval = gs.min_restart_interval;
534 else if ((restart->interval *= 2) > gs.max_restart_interval)
535 restart->interval = gs.max_restart_interval;
536 if (gs.loglevel > LOG_DEBUG+1)
537 zlog_debug("restart %s interval is now %ld",
538 restart->name,restart->interval);
539 }
540 return restart->pid;
541}
542
/* Helpers that (re)arm a daemon's event handlers.  Each expands to a single
   assignment expression without a trailing semicolon and evaluates DMN more
   than once. */

/* Arrange for handle_read() to run when the daemon's socket is readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* The three timers below fire after roughly gs.period milliseconds,
   randomized by FUZZY(). */

/* Retry the connection later via wakeup_down(). */
#define SET_WAKEUP_DOWN(DMN)	\
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN),	\
  					  FUZZY(gs.period))

/* Re-check an unresponsive daemon later via wakeup_unresponsive(). */
#define SET_WAKEUP_UNRESPONSIVE(DMN)	\
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
  					  FUZZY(gs.period))

/* Send the next echo request later via wakeup_send_echo(). */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
					  FUZZY(gs.period))
557
558static int
559wakeup_down(struct thread *t_wakeup)
560{
561 struct daemon *dmn = THREAD_ARG(t_wakeup);
562
563 dmn->t_wakeup = NULL;
564 if (try_connect(dmn) < 0)
565 SET_WAKEUP_DOWN(dmn);
566 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
567 try_restart(dmn);
568 return 0;
569}
570
571static int
572wakeup_init(struct thread *t_wakeup)
573{
574 struct daemon *dmn = THREAD_ARG(t_wakeup);
575
576 dmn->t_wakeup = NULL;
577 if (try_connect(dmn) < 0)
578 {
579 SET_WAKEUP_DOWN(dmn);
580 zlog_err("%s state -> down : initial connection attempt failed",
581 dmn->name);
582 dmn->state = DAEMON_DOWN;
583 }
584 return 0;
585}
586
587static void
588daemon_down(struct daemon *dmn, const char *why)
589{
590 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
591 zlog_err("%s state -> down : %s",dmn->name,why);
592 else if (gs.loglevel > LOG_DEBUG)
593 zlog_debug("%s still down : %s",dmn->name,why);
594 if (IS_UP(dmn))
595 gs.numdown++;
596 dmn->state = DAEMON_DOWN;
597 if (dmn->fd >= 0)
598 {
599 close(dmn->fd);
600 dmn->fd = -1;
601 }
602 THREAD_OFF(dmn->t_read);
603 THREAD_OFF(dmn->t_write);
604 THREAD_OFF(dmn->t_wakeup);
605 if (try_connect(dmn) < 0)
606 SET_WAKEUP_DOWN(dmn);
607 phase_check();
608}
609
610static int
611handle_read(struct thread *t_read)
612{
613 struct daemon *dmn = THREAD_ARG(t_read);
614 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
615 char buf[sizeof(resp)+100];
616 ssize_t rc;
617 struct timeval delay;
618
619 dmn->t_read = NULL;
620 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
621 {
622 char why[100];
623
624 if ((errno == EINTR) || (errno == EAGAIN))
625 {
626 /* Pretend it never happened. */
627 SET_READ_HANDLER(dmn);
628 return 0;
629 }
630 snprintf(why,sizeof(why),"unexpected read error: %s",
631 safe_strerror(errno));
632 daemon_down(dmn,why);
633 return 0;
634 }
635 if (rc == 0)
636 {
637 daemon_down(dmn,"read returned EOF");
638 return 0;
639 }
640 if (!dmn->echo_sent.tv_sec)
641 {
642 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000643 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
644 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000645 daemon_down(dmn,why);
646 return 0;
647 }
648
649 /* We are expecting an echo response: is there any chance that the
650 response would not be returned entirely in the first read? That
651 seems inconceivable... */
652 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
653 {
654 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000655 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
656 "(expecting %u): %.*s",
657 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000658 daemon_down(dmn,why);
659 return 0;
660 }
661
662 time_elapsed(&delay,&dmn->echo_sent);
663 dmn->echo_sent.tv_sec = 0;
664 if (dmn->state == DAEMON_UNRESPONSIVE)
665 {
666 if (delay.tv_sec < gs.timeout)
667 {
668 dmn->state = DAEMON_UP;
669 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
670 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
671 }
672 else
673 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
674 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
675 }
676 else if (gs.loglevel > LOG_DEBUG+1)
677 zlog_debug("%s: echo response received after %ld.%06ld seconds",
678 dmn->name,delay.tv_sec,delay.tv_usec);
679
680 SET_READ_HANDLER(dmn);
681 if (dmn->t_wakeup)
682 thread_cancel(dmn->t_wakeup);
683 SET_WAKEUP_ECHO(dmn);
684
685 return 0;
686}
687
688static void
689daemon_up(struct daemon *dmn, const char *why)
690{
691 dmn->state = DAEMON_UP;
692 gs.numdown--;
693 dmn->connect_tries = 0;
694 zlog_notice("%s state -> up : %s",dmn->name,why);
695 if (gs.do_ping)
696 SET_WAKEUP_ECHO(dmn);
697 phase_check();
698}
699
700static int
701check_connect(struct thread *t_write)
702{
703 struct daemon *dmn = THREAD_ARG(t_write);
704 int sockerr;
705 socklen_t reslen = sizeof(sockerr);
706
707 dmn->t_write = NULL;
708 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
709 {
710 zlog_warn("%s: check_connect: getsockopt failed: %s",
711 dmn->name,safe_strerror(errno));
712 daemon_down(dmn,"getsockopt failed checking connection success");
713 return 0;
714 }
715 if ((reslen == sizeof(sockerr)) && sockerr)
716 {
717 char why[100];
718 snprintf(why,sizeof(why),
719 "getsockopt reports that connection attempt failed: %s",
720 safe_strerror(sockerr));
721 daemon_down(dmn,why);
722 return 0;
723 }
724
725 daemon_up(dmn,"delayed connect succeeded");
726 return 0;
727}
728
729static int
730wakeup_connect_hanging(struct thread *t_wakeup)
731{
732 struct daemon *dmn = THREAD_ARG(t_wakeup);
733 char why[100];
734
735 dmn->t_wakeup = NULL;
736 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
737 gs.timeout);
738 daemon_down(dmn,why);
739 return 0;
740}
741
742/* Making connection to protocol daemon. */
743static int
744try_connect(struct daemon *dmn)
745{
746 int sock;
747 struct sockaddr_un addr;
748 socklen_t len;
749 int flags;
750
751 if (gs.loglevel > LOG_DEBUG+1)
752 zlog_debug("%s: attempting to connect",dmn->name);
753 dmn->connect_tries++;
754
755 memset (&addr, 0, sizeof (struct sockaddr_un));
756 addr.sun_family = AF_UNIX;
757 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
758 gs.vtydir,dmn->name);
759#ifdef HAVE_SUN_LEN
760 len = addr.sun_len = SUN_LEN(&addr);
761#else
762 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
763#endif /* HAVE_SUN_LEN */
764
765 /* Quick check to see if we might succeed before we go to the trouble
766 of creating a socket. */
767 if (access(addr.sun_path, W_OK) < 0)
768 {
769 if (errno != ENOENT)
770 zlog_err("%s: access to socket %s denied: %s",
771 dmn->name,addr.sun_path,safe_strerror(errno));
772 return -1;
773 }
774
775 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
776 {
777 zlog_err("%s(%s): cannot make socket: %s",
778 __func__,addr.sun_path, safe_strerror(errno));
779 return -1;
780 }
781
782 /* Set non-blocking. */
783 if ((flags = fcntl(sock, F_GETFL, 0)) < 0)
784 {
785 zlog_err("%s(%s): fcntl(F_GETFL) failed: %s",
786 __func__,addr.sun_path, safe_strerror(errno));
787 close(sock);
788 return -1;
789 }
790 if (fcntl(sock, F_SETFL, (flags|O_NONBLOCK)) < 0)
791 {
792 zlog_err("%s(%s): fcntl(F_SETFL,O_NONBLOCK) failed: %s",
793 __func__,addr.sun_path, safe_strerror(errno));
794 close(sock);
795 return -1;
796 }
797
798 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
799 {
800 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
801 {
802 if (gs.loglevel > LOG_DEBUG)
803 zlog_debug("%s(%s): connect failed: %s",
804 __func__,addr.sun_path, safe_strerror(errno));
805 close (sock);
806 return -1;
807 }
808 if (gs.loglevel > LOG_DEBUG)
809 zlog_debug("%s: connection in progress",dmn->name);
810 dmn->state = DAEMON_CONNECTING;
811 dmn->fd = sock;
812 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
813 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
814 gs.timeout);
815 SET_READ_HANDLER(dmn);
816 return 0;
817 }
818
819 dmn->fd = sock;
820 SET_READ_HANDLER(dmn);
821 daemon_up(dmn,"connect succeeded");
822 return 1;
823}
824
825static int
826phase_hanging(struct thread *t_hanging)
827{
828 gs.t_phase_hanging = NULL;
829 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
830 phase_str[gs.phase],PHASE_TIMEOUT);
831 gs.phase = PHASE_NONE;
832 return 0;
833}
834
835static void
836set_phase(restart_phase_t new_phase)
837{
838 gs.phase = new_phase;
839 if (gs.t_phase_hanging)
840 thread_cancel(gs.t_phase_hanging);
841 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
842 PHASE_TIMEOUT);
843}
844
/* Advance the phased-restart state machine as far as current conditions
   allow.  Called whenever something that a phase may be waiting for has
   changed (a child was reaped, a daemon went up or down).  Each case
   either breaks out because its condition is not met yet, or moves to the
   next phase and deliberately falls through to test that phase's
   condition right away. */
static void
phase_check(void)
{
  switch (gs.phase)
    {
    case PHASE_NONE:
      break;
    case PHASE_STOPS_PENDING:
      /* Wait until every background job has been reaped. */
      if (gs.numpids)
	break;
      zlog_info("Phased restart: all routing daemon stop jobs have completed.");
      set_phase(PHASE_WAITING_DOWN);
      /*FALLTHRU*/
    case PHASE_WAITING_DOWN:
      /* Proceed once every daemon other than gs.special is down; the
	 special daemon itself is allowed to still be up. */
      if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
	break;
      zlog_info("Phased restart: all routing daemons now down.");
      /* Forced job (no interval check) that also updates the interval. */
      run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
      set_phase(PHASE_ZEBRA_RESTART_PENDING);
      /*FALLTHRU*/
    case PHASE_ZEBRA_RESTART_PENDING:
      /* Wait until the special daemon's restart job has been reaped. */
      if (gs.special->restart.pid)
	break;
      zlog_info("Phased restart: %s restart job completed.",gs.special->name);
      set_phase(PHASE_WAITING_ZEBRA_UP);
      /*FALLTHRU*/
    case PHASE_WAITING_ZEBRA_UP:
      if (!IS_UP(gs.special))
	break;
      zlog_info("Phased restart: %s is now up.",gs.special->name);
      {
	/* Launch forced start jobs for all the other daemons. */
	struct daemon *dmn;
	for (dmn = gs.daemons; dmn; dmn = dmn->next)
	  {
	    if (dmn != gs.special)
	      run_job(&dmn->restart,"start",gs.start_command,1,1);
	  }
      }
      /* Done: leave phased mode and disarm the phase watchdog. */
      gs.phase = PHASE_NONE;
      THREAD_OFF(gs.t_phase_hanging);
      zlog_notice("Phased global restart has completed.");
      break;
    }
}
889
/* DMN is down or unresponsive: take whatever restart action the current
   operating mode calls for.  Nothing is done in monitor mode.  In the
   phased modes this only launches the stop jobs and enters
   PHASE_STOPS_PENDING; the remaining steps are driven by phase_check(). */
static void
try_restart(struct daemon *dmn)
{
  switch (gs.mode)
    {
    case MODE_MONITOR:
      return;
    case MODE_GLOBAL_RESTART:
      run_job(&gs.restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_SEPARATE_RESTART:
      run_job(&dmn->restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_PHASED_ZEBRA_RESTART:
      if (dmn != gs.special)
	{
	  /* An ordinary daemon is restarted individually, but only while
	     the special daemon is up and no phased restart is running. */
	  if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
	    run_job(&dmn->restart,"restart",gs.restart_command,0,1);
	  else
	    zlog_debug("%s: postponing restart attempt because master %s daemon "
		       "not up [%s], or phased restart in progress",
		       dmn->name,gs.special->name,state_str[gs.special->state]);
	  break;
	}
      /* The special daemon itself is in trouble: do a phased restart. */
      /*FALLTHRU*/
    case MODE_PHASED_ALL_RESTART:
      if ((gs.phase != PHASE_NONE) || gs.numpids)
	{
	  if (gs.loglevel > LOG_DEBUG+1)
	    zlog_debug("postponing phased global restart: restart already in "
		       "progress [%s], or outstanding child processes [%d]",
		       phase_str[gs.phase],gs.numpids);
	  break;
	}
      /* Is it too soon for a restart? */
      {
	struct timeval delay;
	if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
	    gs.special->restart.interval)
	  {
	    if (gs.loglevel > LOG_DEBUG+1)
	      zlog_debug("postponing phased global restart: "
			 "elapsed time %ld < retry interval %ld",
			 (long)delay.tv_sec,gs.special->restart.interval);
	    break;
	  }
      }
      zlog_info("Phased restart: stopping all routing daemons.");
      /* First step: stop all other daemons. */
      /* Note: the parameter `dmn' is reused as the loop variable here. */
      for (dmn = gs.daemons; dmn; dmn = dmn->next)
	{
	  if (dmn != gs.special)
	    run_job(&dmn->restart,"stop",gs.stop_command,1,0);
	}
      set_phase(PHASE_STOPS_PENDING);
      break;
    default:
      zlog_err("error: unknown restart mode %d",gs.mode);
      break;
    }
}
951
952static int
953wakeup_unresponsive(struct thread *t_wakeup)
954{
955 struct daemon *dmn = THREAD_ARG(t_wakeup);
956
957 dmn->t_wakeup = NULL;
958 if (dmn->state != DAEMON_UNRESPONSIVE)
959 zlog_err("%s: no longer unresponsive (now %s), "
960 "wakeup should have been cancelled!",
961 dmn->name,state_str[dmn->state]);
962 else
963 {
964 SET_WAKEUP_UNRESPONSIVE(dmn);
965 try_restart(dmn);
966 }
967 return 0;
968}
969
970static int
971wakeup_no_answer(struct thread *t_wakeup)
972{
973 struct daemon *dmn = THREAD_ARG(t_wakeup);
974
975 dmn->t_wakeup = NULL;
976 dmn->state = DAEMON_UNRESPONSIVE;
977 zlog_err("%s state -> unresponsive : no response yet to ping "
978 "sent %ld seconds ago",dmn->name,gs.timeout);
979 if (gs.unresponsive_restart)
980 {
981 SET_WAKEUP_UNRESPONSIVE(dmn);
982 try_restart(dmn);
983 }
984 return 0;
985}
986
987static int
988wakeup_send_echo(struct thread *t_wakeup)
989{
990 static const char echocmd[] = "echo " PING_TOKEN;
991 ssize_t rc;
992 struct daemon *dmn = THREAD_ARG(t_wakeup);
993
994 dmn->t_wakeup = NULL;
995 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
996 ((size_t)rc != sizeof(echocmd)))
997 {
998 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000999 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
1000 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +00001001 daemon_down(dmn,why);
1002 }
1003 else
1004 {
1005 gettimeofday(&dmn->echo_sent,NULL);
1006 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
1007 }
1008 return 0;
1009}
1010
/* Handler for the termination signals it is registered for: log the event
   and exit with a success status. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
1017
/* Check that CMD is usable as a command template: it must contain exactly
   one '%' character, and that character must be immediately followed by
   's'.  Returns 1 if so, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  if (pct == NULL)
    return 0;
  if (pct[1] != 's')
    return 0;
  /* No further '%' may follow the "%s". */
  return strchr(pct+1,'%') == NULL;
}
1025
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Return a newly allocated copy of CMD in which every occurrence of
   BLANKSTR has been replaced by a single space.  The copy is made in one
   left-to-right pass: text produced by a replacement is never scanned
   again, so the function terminates even when BLANKSTR is itself " ".
   An empty BLANKSTR matches nothing and yields an unmodified copy.
   The caller owns the returned string.  Exits the program if memory
   cannot be allocated. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t bslen = strlen(blankstr);
  size_t cmdlen = strlen(cmd);
  char *res;
  char *p;

  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdlen+1);

  /* strstr() finds an empty needle at every position; replacing such
     "matches" would grow the string without bound. */
  if (bslen == 0)
    return res;

  for (p = res; (p = strstr(p,blankstr)) != NULL; p++)
    {
      *p = ' ';
      /* Close the gap left by the rest of the blank string (moving the
	 terminating NUL along with the tail). */
      if (bslen != 1)
	memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1048
ajs8b886ca2004-12-22 02:56:38 +00001049int
1050main(int argc, char **argv)
1051{
1052 const char *progname;
1053 int opt;
1054 int daemon_mode = 0;
1055 const char *pidfile = DEFAULT_PIDFILE;
1056 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001057 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001058 static struct quagga_signal_t my_signals[] =
1059 {
1060 {
1061 .signal = SIGINT,
1062 .handler = sigint,
1063 },
1064 {
1065 .signal = SIGTERM,
1066 .handler = sigint,
1067 },
1068 {
1069 .signal = SIGCHLD,
1070 .handler = sigchild,
1071 },
1072 };
1073
1074 if ((progname = strrchr (argv[0], '/')) != NULL)
1075 progname++;
1076 else
1077 progname = argv[0];
1078
ajs098e2402004-12-22 17:00:46 +00001079 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001080 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001081 longopts, 0)) != EOF)
1082 {
1083 switch (opt)
1084 {
1085 case 0:
1086 break;
1087 case 'a':
1088 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1089 {
1090 fputs("Ambiguous operating mode selected.\n",stderr);
1091 return usage(progname,1);
1092 }
1093 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1094 break;
1095 case 'A':
1096 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1097 {
1098 fputs("Ambiguous operating mode selected.\n",stderr);
1099 return usage(progname,1);
1100 }
1101 gs.mode = MODE_PHASED_ALL_RESTART;
1102 break;
ajsc8b40f82004-12-22 16:17:16 +00001103 case 'b':
1104 blankstr = optarg;
1105 break;
ajs8b886ca2004-12-22 02:56:38 +00001106 case 'd':
1107 daemon_mode = 1;
1108 break;
1109 case 'e':
1110 gs.do_ping = 0;
1111 break;
1112 case 'k':
1113 if (!valid_command(optarg))
1114 {
1115 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1116 optarg);
1117 return usage(progname,1);
1118 }
1119 gs.stop_command = optarg;
1120 break;
1121 case 'l':
1122 {
1123 char garbage[3];
1124 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1125 (gs.loglevel < LOG_EMERG))
1126 {
1127 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1128 return usage(progname,1);
1129 }
1130 }
1131 break;
1132 case 'm':
1133 {
1134 char garbage[3];
1135 if ((sscanf(optarg,"%ld%1s",
1136 &gs.min_restart_interval,garbage) != 1) ||
1137 (gs.min_restart_interval < 0))
1138 {
1139 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1140 optarg);
1141 return usage(progname,1);
1142 }
1143 }
1144 break;
1145 case 'M':
1146 {
1147 char garbage[3];
1148 if ((sscanf(optarg,"%ld%1s",
1149 &gs.max_restart_interval,garbage) != 1) ||
1150 (gs.max_restart_interval < 0))
1151 {
1152 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1153 optarg);
1154 return usage(progname,1);
1155 }
1156 }
1157 break;
1158 case 'i':
1159 {
1160 char garbage[3];
1161 int period;
1162 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1163 (gs.period < 1))
1164 {
1165 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1166 return usage(progname,1);
1167 }
1168 gs.period = 1000*period;
1169 }
1170 break;
1171 case 'p':
1172 pidfile = optarg;
1173 break;
1174 case 'r':
1175 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1176 (gs.mode == MODE_SEPARATE_RESTART))
1177 {
1178 fputs("Ambiguous operating mode selected.\n",stderr);
1179 return usage(progname,1);
1180 }
1181 if (!valid_command(optarg))
1182 {
1183 fprintf(stderr,
1184 "Invalid restart command, must contain '%%s': %s\n",
1185 optarg);
1186 return usage(progname,1);
1187 }
1188 gs.restart_command = optarg;
1189 if (gs.mode == MODE_MONITOR)
1190 gs.mode = MODE_SEPARATE_RESTART;
1191 break;
1192 case 'R':
1193 if (gs.mode != MODE_MONITOR)
1194 {
1195 fputs("Ambiguous operating mode selected.\n",stderr);
1196 return usage(progname,1);
1197 }
1198 if (strchr(optarg,'%'))
1199 {
1200 fprintf(stderr,
1201 "Invalid restart-all arg, must not contain '%%s': %s\n",
1202 optarg);
1203 return usage(progname,1);
1204 }
1205 gs.restart_command = optarg;
1206 gs.mode = MODE_GLOBAL_RESTART;
1207 break;
1208 case 's':
1209 if (!valid_command(optarg))
1210 {
1211 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1212 optarg);
1213 return usage(progname,1);
1214 }
1215 gs.start_command = optarg;
1216 break;
1217 case 'S':
1218 gs.vtydir = optarg;
1219 break;
1220 case 't':
1221 {
1222 char garbage[3];
1223 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1224 (gs.timeout < 1))
1225 {
1226 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1227 return usage(progname,1);
1228 }
1229 }
1230 break;
1231 case 'T':
1232 {
1233 char garbage[3];
1234 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1235 (gs.restart_timeout < 1))
1236 {
1237 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1238 return usage(progname,1);
1239 }
1240 }
1241 break;
1242 case 'z':
1243 gs.unresponsive_restart = 1;
1244 break;
1245 case 'v':
1246 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1247 puts("Copyright 2004 Andrew J. Schorr");
1248 return 0;
1249 case 'h':
1250 return usage(progname,0);
1251 default:
1252 fputs("Invalid option.\n",stderr);
1253 return usage(progname,1);
1254 }
1255 }
1256
1257 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1258 {
1259 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1260 return usage(progname,1);
1261 }
1262 switch (gs.mode)
1263 {
1264 case MODE_MONITOR:
1265 if (gs.restart_command || gs.start_command || gs.stop_command)
1266 {
1267 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1268 mode_str[gs.mode]);
1269 return usage(progname,1);
1270 }
1271 break;
1272 case MODE_GLOBAL_RESTART:
1273 case MODE_SEPARATE_RESTART:
1274 if (!gs.restart_command || gs.start_command || gs.stop_command)
1275 {
1276 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1277 mode_str[gs.mode]);
1278 return usage(progname,1);
1279 }
1280 break;
1281 case MODE_PHASED_ZEBRA_RESTART:
1282 case MODE_PHASED_ALL_RESTART:
1283 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1284 {
1285 fprintf(stderr,
1286 "Need start, kill, and restart commands in [%s] mode.\n",
1287 mode_str[gs.mode]);
1288 return usage(progname,1);
1289 }
1290 break;
1291 }
1292
ajsc8b40f82004-12-22 16:17:16 +00001293 if (blankstr)
1294 {
1295 if (gs.restart_command)
1296 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1297 if (gs.start_command)
1298 gs.start_command = translate_blanks(gs.start_command,blankstr);
1299 if (gs.stop_command)
1300 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1301 }
1302
ajs8b886ca2004-12-22 02:56:38 +00001303 gs.restart.interval = gs.min_restart_interval;
1304 master = thread_master_create();
1305 signal_init (master, Q_SIGC(my_signals), my_signals);
1306 srandom(time(NULL));
1307
1308 {
1309 int i;
1310 struct daemon *tail = NULL;
1311
1312 for (i = optind; i < argc; i++)
1313 {
1314 struct daemon *dmn;
1315
1316 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1317 {
ajs098e2402004-12-22 17:00:46 +00001318 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1319 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001320 return 1;
1321 }
1322 dmn->name = dmn->restart.name = argv[i];
1323 dmn->state = DAEMON_INIT;
1324 gs.numdaemons++;
1325 gs.numdown++;
1326 dmn->fd = -1;
1327 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1328 100+(random() % 900));
1329 dmn->restart.interval = gs.min_restart_interval;
1330 if (tail)
1331 tail->next = dmn;
1332 else
1333 gs.daemons = dmn;
1334 tail = dmn;
1335
1336 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1337 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1338 !strcmp(dmn->name,special))
1339 gs.special = dmn;
1340 }
1341 }
1342 if (!gs.daemons)
1343 {
1344 fputs("Must specify one or more daemons to monitor.\n",stderr);
1345 return usage(progname,1);
1346 }
1347 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1348 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1349 {
1350 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1351 mode_str[gs.mode],special);
1352 return usage(progname,1);
1353 }
1354 if (gs.special && (gs.numdaemons < 2))
1355 {
1356 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1357 "to watch.\n",mode_str[gs.mode]);
1358 return usage(progname,1);
1359 }
1360
1361 zlog_default = openzlog(progname, ZLOG_NONE,
1362 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1363 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1364 if (daemon_mode)
1365 {
1366 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1367 daemon(0, 0);
1368 }
1369 else
1370 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1371
1372 /* Make sure we're not already running. */
1373 pid_output (pidfile);
1374
1375 /* Announce which daemons are being monitored. */
1376 {
1377 struct daemon *dmn;
1378 size_t len = 0;
1379
1380 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1381 len += strlen(dmn->name)+1;
1382
1383 {
1384 char buf[len+1];
1385 char *p = buf;
1386
1387 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1388 {
1389 if (p != buf)
1390 *p++ = ' ';
1391 strcpy(p,dmn->name);
1392 p += strlen(p);
1393 }
1394 zlog_notice("%s %s watching [%s], mode [%s]",
1395 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1396 }
1397 }
1398
1399 {
1400 struct thread thread;
1401
1402 while (thread_fetch (master, &thread))
1403 thread_call (&thread);
1404 }
1405
1406 /* Not reached. */
1407 return 0;
1408}