blob: fb1dbdd8bad5136d869ea47fd402dff33dd859fe [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
paul6f594022004-12-23 19:35:56 +00002 $Id: watchquagga.c,v 1.6 2004/12/23 19:35:56 paul Exp $
ajs8b886ca2004-12-22 02:56:38 +00003
4 Monitor status of quagga daemons and restart if necessary.
5
6 Copyright (C) 2004 Andrew J. Schorr
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23/* System headers: */
24#include <sys/types.h>
25#include <sys/socket.h>
26#include <sys/un.h>
27#include <unistd.h>
28#include <stdlib.h>
29#include <stdio.h>
30#include <signal.h>
31#include <fcntl.h>
32#include <errno.h>
33#include <time.h>
ajs098e2402004-12-22 17:00:46 +000034#include <string.h>
ajs8b886ca2004-12-22 02:56:38 +000035#include <sys/time.h>
36#include <sys/wait.h>
37
38/* Quagga headers: */
ajs6028df52004-12-22 14:08:13 +000039#ifdef HAVE_CONFIG_H
40#include "config.h"
41#endif
42
ajs8b886ca2004-12-22 02:56:38 +000043#include <thread.h>
44#include <log.h>
45#include <sigevent.h>
46#include <version.h>
paul6f594022004-12-23 19:35:56 +000047#include <getopt.h>
ajs8b886ca2004-12-22 02:56:38 +000048
/* Smaller of two values.  Both arguments may be evaluated more than once,
   so do not pass expressions with side effects. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers.
   JITTER(X) yields an offset in the range [-(X/2), X-(X/2)].
   FUZZY(X) is X perturbed by a jitter computed from one twentieth of X. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))
56
/* Built-in defaults for the command-line tunables.  The timing values are
   expressed in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: use the configured path when there is one. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif

/* Directory holding the daemons' vty sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Word that is echoed through each daemon to test its responsiveness. */
#define PING_TOKEN "PING"
75
/* Thread master handed to every thread_add_* call in this file.  It has to
   be a global with this exact name because it is also referenced somewhere
   inside libzebra. */
struct thread_master *master;
78
/* Operating mode.  The numeric values double as indexes into mode_str[]. */
typedef enum
{
  MODE_MONITOR = 0,              /* only watch and report */
  MODE_GLOBAL_RESTART = 1,       /* one command restarts everything */
  MODE_SEPARATE_RESTART = 2,     /* restart only the failing daemon */
  MODE_PHASED_ZEBRA_RESTART = 3, /* phased restart when zebra fails */
  MODE_PHASED_ALL_RESTART = 4    /* phased restart on any failure */
} watch_mode_t;

/* Human-readable mode names, indexed by watch_mode_t. */
static const char *mode_str[] =
{
  [MODE_MONITOR] = "monitor",
  [MODE_GLOBAL_RESTART] = "global restart",
  [MODE_SEPARATE_RESTART] = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART] = "phased global restart for any failure",
};
96
/* Steps of a phased restart.  The numeric values double as indexes into
   phase_str[]. */
typedef enum
{
  PHASE_NONE = 0,                  /* no phased restart in progress */
  PHASE_STOPS_PENDING = 1,         /* stop jobs still running */
  PHASE_WAITING_DOWN = 2,          /* waiting for the other daemons to go down */
  PHASE_ZEBRA_RESTART_PENDING = 3, /* restart job of the special daemon running */
  PHASE_WAITING_ZEBRA_UP = 4       /* waiting for the special daemon to come up */
} restart_phase_t;

/* Human-readable phase names, indexed by restart_phase_t. */
static const char *phase_str[] =
{
  [PHASE_NONE] = "None",
  [PHASE_STOPS_PENDING] = "Stop jobs running",
  [PHASE_WAITING_DOWN] = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP] = "Waiting for zebra to come up",
  /* NOTE(review): this last entry has no matching restart_phase_t value. */
  "Start jobs running",
};

/* Seconds a single phase may last before the phased restart is aborted. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
117
/* Bookkeeping for one background (re)start/stop job. */
struct restart_info
{
  const char *name;       /* label substituted into the command and logs */
  const char *what;       /* kind of job last launched ("restart", ...) */
  pid_t pid;              /* pid of the running job, 0 when none is running */
  struct timeval time;    /* when the job was launched or last completed */
  long interval;          /* minimum seconds between non-forced jobs */
  struct thread *t_kill;  /* timer that signals a job running too long */
  int kills;              /* signals sent so far to the current job */
};
128
129static struct global_state
130{
ajs8b886ca2004-12-22 02:56:38 +0000131 watch_mode_t mode;
132 restart_phase_t phase;
133 struct thread *t_phase_hanging;
134 const char *vtydir;
135 long period;
136 long timeout;
137 long restart_timeout;
138 long min_restart_interval;
139 long max_restart_interval;
140 int do_ping;
141 struct daemon *daemons;
142 const char *restart_command;
143 const char *start_command;
144 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000145 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000146 int unresponsive_restart;
147 int loglevel;
148 struct daemon *special; /* points to zebra when doing phased restart */
149 int numdaemons;
150 int numpids;
151 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
152} gs = {
153 .mode = MODE_MONITOR,
154 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000155 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000156 .period = 1000*DEFAULT_PERIOD,
157 .timeout = DEFAULT_TIMEOUT,
158 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
159 .loglevel = DEFAULT_LOGLEVEL,
160 .min_restart_interval = DEFAULT_MIN_RESTART,
161 .max_restart_interval = DEFAULT_MAX_RESTART,
162 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000163};
164
/* Connection state of a watched daemon.  The numeric values double as
   indexes into state_str[]. */
typedef enum
{
  DAEMON_INIT = 0,         /* no connection attempt made yet */
  DAEMON_DOWN = 1,         /* not connected */
  DAEMON_CONNECTING = 2,   /* non-blocking connect in progress */
  DAEMON_UP = 3,           /* connected */
  DAEMON_UNRESPONSIVE = 4  /* connected, but an echo went unanswered */
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers echoes. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable state names, indexed by daemon_state_t. */
static const char *state_str[] =
{
  [DAEMON_INIT] = "Init",
  [DAEMON_DOWN] = "Down",
  [DAEMON_CONNECTING] = "Connecting",
  [DAEMON_UP] = "Up",
  [DAEMON_UNRESPONSIVE] = "Unresponsive",
};
185
186struct daemon {
187 const char *name;
188 daemon_state_t state;
189 int fd;
190 struct timeval echo_sent;
191 u_int connect_tries;
192 struct thread *t_wakeup;
193 struct thread *t_read;
194 struct thread *t_write;
195 struct daemon *next;
196 struct restart_info restart;
197};
198
/* Long option table for getopt_long(); each entry maps to the short option
   letter given in its last column.  The all-zero entry ends the table. */
static const struct option longopts[] =
{
  { "daemon",               no_argument,       NULL, 'd' },
  { "statedir",             required_argument, NULL, 'S' },
  { "no-echo",              no_argument,       NULL, 'e' },
  { "loglevel",             required_argument, NULL, 'l' },
  { "interval",             required_argument, NULL, 'i' },
  { "timeout",              required_argument, NULL, 't' },
  { "restart-timeout",      required_argument, NULL, 'T' },
  { "restart",              required_argument, NULL, 'r' },
  { "start-command",        required_argument, NULL, 's' },
  { "kill-command",         required_argument, NULL, 'k' },
  { "restart-all",          required_argument, NULL, 'R' },
  { "all-restart",          no_argument,       NULL, 'a' },
  { "always-all-restart",   no_argument,       NULL, 'A' },
  { "unresponsive-restart", no_argument,       NULL, 'z' },
  { "min-restart-interval", required_argument, NULL, 'm' },
  { "max-restart-interval", required_argument, NULL, 'M' },
  { "pid-file",             required_argument, NULL, 'p' },
  { "blank-string",         required_argument, NULL, 'b' },
  { "help",                 no_argument,       NULL, 'h' },
  { "version",              no_argument,       NULL, 'v' },
  { NULL,                   0,                 NULL, 0   }
};
223
/* Forward declarations for functions that are referenced before they are
   defined further down. */
static int  try_connect(struct daemon *dmn);
static int  wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
228
/* Print usage information and return `status' unchanged, so that the caller
   can hand it on as the exit code.  A non-zero status prints only a one-line
   hint to stderr; a zero status prints the complete help text, filled in
   with the mode names and the built-in defaults, to stdout. */
229static int
230usage(const char *progname, int status)
231{
232 if (status != 0)
233 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
234 else
235 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
236Watchdog program to monitor status of quagga daemons and try to restart\n\
237them if they are down or unresponsive. It determines whether a daemon is\n\
238up based on whether it can connect to the daemon's vty unix stream socket.\n\
239It then repeatedly sends echo commands over that socket to determine whether\n\
240the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
241on the socket connection and know immediately that the daemon is down.\n\n\
242The daemons to be monitored should be listed on the command line.\n\n\
243This program can run in one of 5 modes:\n\n\
2440. Mode: %s.\n\
245 Just monitor and report on status changes. Example:\n\
246 %s -d zebra ospfd bgpd\n\n\
2471. Mode: %s.\n\
248 Whenever any daemon hangs or crashes, use the given command to restart\n\
249 them all. Example:\n\
250 %s -dz \\\n\
251 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
252 zebra ospfd\n\n\
2532. Mode: %s.\n\
254 When any single daemon hangs or crashes, restart only the daemon that's\n\
255 in trouble using the supplied restart command. Example:\n\
256 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2573. Mode: %s.\n\
258 The same as the previous mode, except that there is special treatment when\n\
259 the zebra daemon is in trouble. In that case, a phased restart approach\n\
260 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
261 daemons. Example:\n\
262 %s -adz -r '/sbin/service %%s restart' \\\n\
263 -s '/sbin/service %%s start' \\\n\
264 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2654. Mode: %s.\n\
266 This is the same as the previous mode, except that the phased restart\n\
267 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
268 %s -Adz -r '/sbin/service %%s restart' \\\n\
269 -s '/sbin/service %%s start' \\\n\
270 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
271As of this writing, it is believed that mode 2 [%s]\n\
272is not safe, and mode 3 [%s] may not be safe with some of the\n\
273routing daemons.\n\n\
274In order to avoid attempting to restart the daemons in a fast loop,\n\
275the -m and -M options allow you to control the minimum delay between\n\
276restart commands. The minimum restart delay is recalculated each time\n\
277a restart is attempted: if the time since the last restart attempt exceeds\n\
278twice the -M value, then the restart delay is set to the -m value.\n\
279Otherwise, the interval is doubled (but capped at the -M value).\n\n\
280Options:\n\
281-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
282 to syslog instead of stdout.\n\
283-S, --statedir Set the vty socket directory (default is %s)\n\
284-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
285 option is necessary if the daemons do not support the\n\
286 echo command)\n\
287-l, --loglevel Set the logging level (default is %d).\n\
288 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
289 but it can be set higher than %d if extra-verbose debugging\n\
290 messages are desired.\n\
291-m, --min-restart-interval\n\
292 Set the minimum seconds to wait between invocations of daemon\n\
293 restart commands (default is %d).\n\
294-M, --max-restart-interval\n\
295 Set the maximum seconds to wait between invocations of daemon\n\
296 restart commands (default is %d).\n\
297-i, --interval Set the status polling interval in seconds (default is %d)\n\
298-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
299-T, --restart-timeout\n\
300 Set the restart (kill) timeout in seconds (default is %d).\n\
301 If any background jobs are still running after this much\n\
302 time has elapsed, they will be killed.\n\
303-r, --restart Supply a Bourne shell command to use to restart a single\n\
304 daemon. The command string should include '%%s' where the\n\
305 name of the daemon should be substituted.\n\
306 Note that -r and -R are incompatible.\n\
307-s, --start-command\n\
308 Supply a Bourne shell to command to use to start a single\n\
309 daemon. The command string should include '%%s' where the\n\
310 name of the daemon should be substituted.\n\
311-k, --kill-command\n\
312 Supply a Bourne shell to command to use to stop a single\n\
313 daemon. The command string should include '%%s' where the\n\
314 name of the daemon should be substituted.\n\
315-R, --restart-all\n\
316 When one or more daemons is down, try to restart everything\n\
317 using the Bourne shell command supplied as the argument.\n\
318 Note that -r and -R are incompatible.\n\
319-z, --unresponsive-restart\n\
320 When a daemon is unresponsive, treat it as being down for\n\
321 restart purposes.\n\
322-a, --all-restart\n\
323 When zebra hangs or crashes, restart all daemons using\n\
324 this phased approach: 1. stop all other daemons; 2. restart\n\
325 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
326-A, --always-all-restart\n\
327 When any daemon (not just zebra) hangs or crashes, use the\n\
328 same phased restart mechanism described above for -a.\n\
329 Requires -r, -s, and -k.\n\
330-p, --pid-file Set process identifier file name\n\
331 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000332-b, --blank-string\n\
333 When the supplied argument string is found in any of the\n\
334 various shell command arguments (-r, -s, -k, or -R), replace\n\
335 it with a space. This is an ugly hack to circumvent problems\n\
336 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000337-v, --version Print program version\n\
338-h, --help Display this help and exit\n\
339", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
340progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000341VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000342DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
343DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
344
345 return status;
346}
347
/* Fork a child that runs `shell_cmd' through "/bin/sh -c".
   Returns the child's pid in the parent, or -1 if fork() failed.  The child
   is not waited for here; it is reaped later. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *sh_argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      execv("/bin/sh",(char *const *)sh_argv);
      /* Only reached when the exec itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",child,shell_cmd);
  return child;
}
377
/* Store in `*result' the time that has passed since `*start_time' and
   return `result'.  The microsecond field is normalised so that it is never
   negative. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_usec -= start_time->tv_usec;
  result->tv_sec -= start_time->tv_sec;
  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;
  return result;
}
391
392static int
393restart_kill(struct thread *t_kill)
394{
395 struct restart_info *restart = THREAD_ARG(t_kill);
396 struct timeval delay;
397
398 time_elapsed(&delay,&restart->time);
399 zlog_warn("Warning: %s %s child process %d still running after "
400 "%ld seconds, sending signal %d",
401 restart->what,restart->name,restart->pid,delay.tv_sec,
402 (restart->kills ? SIGKILL : SIGTERM));
403 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
404 restart->kills++;
405 restart->t_kill = thread_add_timer(master,restart_kill,restart,
406 gs.restart_timeout);
407 return 0;
408}
409
410static struct restart_info *
411find_child(pid_t child)
412{
413 if (gs.mode == MODE_GLOBAL_RESTART)
414 {
415 if (gs.restart.pid == child)
416 return &gs.restart;
417 }
418 else
419 {
420 struct daemon *dmn;
421 for (dmn = gs.daemons; dmn; dmn = dmn->next)
422 {
423 if (dmn->restart.pid == child)
424 return &dmn->restart;
425 }
426 }
427 return NULL;
428}
429
430static void
431sigchild(void)
432{
433 pid_t child;
434 int status;
435 const char *name;
436 const char *what;
437 struct restart_info *restart;
438
439 switch (child = waitpid(-1,&status,WNOHANG))
440 {
441 case -1:
442 zlog_err("waitpid failed: %s",safe_strerror(errno));
443 return;
444 case 0:
445 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
446 return;
447 }
448
449 if ((restart = find_child(child)) != NULL)
450 {
451 name = restart->name;
452 what = restart->what;
453 restart->pid = 0;
454 gs.numpids--;
455 thread_cancel(restart->t_kill);
456 restart->t_kill = NULL;
457 /* Update restart time to reflect the time the command completed. */
458 gettimeofday(&restart->time,NULL);
459 }
460 else
461 {
462 zlog_err("waitpid returned status for an unknown child process %d",
463 child);
464 name = "(unknown)";
465 what = "background";
466 }
467 if (WIFSTOPPED(status))
468 zlog_warn("warning: %s %s process %d is stopped",
469 what,name,child);
470 else if (WIFSIGNALED(status))
471 zlog_warn("%s %s process %d terminated due to signal %d",
472 what,name,child,WTERMSIG(status));
473 else if (WIFEXITED(status))
474 {
475 if (WEXITSTATUS(status) != 0)
476 zlog_warn("%s %s process %d exited with non-zero status %d",
477 what,name,child,WEXITSTATUS(status));
478 else
479 zlog_debug("%s %s process %d exited normally",what,name,child);
480 }
481 else
482 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
483 what,name,child,status);
484 phase_check();
485}
486
487static int
488run_job(struct restart_info *restart, const char *cmdtype, const char *command,
489 int force, int update_interval)
490{
491 struct timeval delay;
492
493 if (gs.loglevel > LOG_DEBUG+1)
494 zlog_debug("attempting to %s %s",cmdtype,restart->name);
495
496 if (restart->pid)
497 {
498 if (gs.loglevel > LOG_DEBUG+1)
499 zlog_debug("cannot %s %s, previous pid %d still running",
500 cmdtype,restart->name,restart->pid);
501 return -1;
502 }
503
504 if (!force &&
505 (time_elapsed(&delay,&restart->time)->tv_sec < restart->interval))
506 {
507 if (gs.loglevel > LOG_DEBUG+1)
508 zlog_debug("postponing %s %s: "
509 "elapsed time %ld < retry interval %ld",
510 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
511 return -1;
512 }
513
514 gettimeofday(&restart->time,NULL);
515 restart->kills = 0;
516 {
517 char cmd[strlen(command)+strlen(restart->name)+1];
518 snprintf(cmd,sizeof(cmd),command,restart->name);
519 if ((restart->pid = run_background(cmd)) > 0)
520 {
521 restart->t_kill = thread_add_timer(master,restart_kill,restart,
522 gs.restart_timeout);
523 restart->what = cmdtype;
524 gs.numpids++;
525 }
526 else
527 restart->pid = 0;
528 }
529
530 /* Calculate the new restart interval. */
531 if (update_interval)
532 {
533 if (delay.tv_sec > 2*gs.max_restart_interval)
534 restart->interval = gs.min_restart_interval;
535 else if ((restart->interval *= 2) > gs.max_restart_interval)
536 restart->interval = gs.max_restart_interval;
537 if (gs.loglevel > LOG_DEBUG+1)
538 zlog_debug("restart %s interval is now %ld",
539 restart->name,restart->interval);
540 }
541 return restart->pid;
542}
543
/* Scheduling shorthands.  Each one registers a callback for daemon DMN and
   remembers the resulting thread in the daemon record. */

/* Call handle_read when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Call wakeup_down after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

/* Call wakeup_unresponsive after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Call wakeup_send_echo after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
558
559static int
560wakeup_down(struct thread *t_wakeup)
561{
562 struct daemon *dmn = THREAD_ARG(t_wakeup);
563
564 dmn->t_wakeup = NULL;
565 if (try_connect(dmn) < 0)
566 SET_WAKEUP_DOWN(dmn);
567 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
568 try_restart(dmn);
569 return 0;
570}
571
572static int
573wakeup_init(struct thread *t_wakeup)
574{
575 struct daemon *dmn = THREAD_ARG(t_wakeup);
576
577 dmn->t_wakeup = NULL;
578 if (try_connect(dmn) < 0)
579 {
580 SET_WAKEUP_DOWN(dmn);
581 zlog_err("%s state -> down : initial connection attempt failed",
582 dmn->name);
583 dmn->state = DAEMON_DOWN;
584 }
585 return 0;
586}
587
588static void
589daemon_down(struct daemon *dmn, const char *why)
590{
591 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
592 zlog_err("%s state -> down : %s",dmn->name,why);
593 else if (gs.loglevel > LOG_DEBUG)
594 zlog_debug("%s still down : %s",dmn->name,why);
595 if (IS_UP(dmn))
596 gs.numdown++;
597 dmn->state = DAEMON_DOWN;
598 if (dmn->fd >= 0)
599 {
600 close(dmn->fd);
601 dmn->fd = -1;
602 }
603 THREAD_OFF(dmn->t_read);
604 THREAD_OFF(dmn->t_write);
605 THREAD_OFF(dmn->t_wakeup);
606 if (try_connect(dmn) < 0)
607 SET_WAKEUP_DOWN(dmn);
608 phase_check();
609}
610
611static int
612handle_read(struct thread *t_read)
613{
614 struct daemon *dmn = THREAD_ARG(t_read);
615 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
616 char buf[sizeof(resp)+100];
617 ssize_t rc;
618 struct timeval delay;
619
620 dmn->t_read = NULL;
621 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
622 {
623 char why[100];
624
625 if ((errno == EINTR) || (errno == EAGAIN))
626 {
627 /* Pretend it never happened. */
628 SET_READ_HANDLER(dmn);
629 return 0;
630 }
631 snprintf(why,sizeof(why),"unexpected read error: %s",
632 safe_strerror(errno));
633 daemon_down(dmn,why);
634 return 0;
635 }
636 if (rc == 0)
637 {
638 daemon_down(dmn,"read returned EOF");
639 return 0;
640 }
641 if (!dmn->echo_sent.tv_sec)
642 {
643 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000644 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
645 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000646 daemon_down(dmn,why);
647 return 0;
648 }
649
650 /* We are expecting an echo response: is there any chance that the
651 response would not be returned entirely in the first read? That
652 seems inconceivable... */
653 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
654 {
655 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000656 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
657 "(expecting %u): %.*s",
658 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000659 daemon_down(dmn,why);
660 return 0;
661 }
662
663 time_elapsed(&delay,&dmn->echo_sent);
664 dmn->echo_sent.tv_sec = 0;
665 if (dmn->state == DAEMON_UNRESPONSIVE)
666 {
667 if (delay.tv_sec < gs.timeout)
668 {
669 dmn->state = DAEMON_UP;
670 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
671 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
672 }
673 else
674 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
675 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
676 }
677 else if (gs.loglevel > LOG_DEBUG+1)
678 zlog_debug("%s: echo response received after %ld.%06ld seconds",
679 dmn->name,delay.tv_sec,delay.tv_usec);
680
681 SET_READ_HANDLER(dmn);
682 if (dmn->t_wakeup)
683 thread_cancel(dmn->t_wakeup);
684 SET_WAKEUP_ECHO(dmn);
685
686 return 0;
687}
688
689static void
690daemon_up(struct daemon *dmn, const char *why)
691{
692 dmn->state = DAEMON_UP;
693 gs.numdown--;
694 dmn->connect_tries = 0;
695 zlog_notice("%s state -> up : %s",dmn->name,why);
696 if (gs.do_ping)
697 SET_WAKEUP_ECHO(dmn);
698 phase_check();
699}
700
701static int
702check_connect(struct thread *t_write)
703{
704 struct daemon *dmn = THREAD_ARG(t_write);
705 int sockerr;
706 socklen_t reslen = sizeof(sockerr);
707
708 dmn->t_write = NULL;
709 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
710 {
711 zlog_warn("%s: check_connect: getsockopt failed: %s",
712 dmn->name,safe_strerror(errno));
713 daemon_down(dmn,"getsockopt failed checking connection success");
714 return 0;
715 }
716 if ((reslen == sizeof(sockerr)) && sockerr)
717 {
718 char why[100];
719 snprintf(why,sizeof(why),
720 "getsockopt reports that connection attempt failed: %s",
721 safe_strerror(sockerr));
722 daemon_down(dmn,why);
723 return 0;
724 }
725
726 daemon_up(dmn,"delayed connect succeeded");
727 return 0;
728}
729
730static int
731wakeup_connect_hanging(struct thread *t_wakeup)
732{
733 struct daemon *dmn = THREAD_ARG(t_wakeup);
734 char why[100];
735
736 dmn->t_wakeup = NULL;
737 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
738 gs.timeout);
739 daemon_down(dmn,why);
740 return 0;
741}
742
743/* Making connection to protocol daemon. */
744static int
745try_connect(struct daemon *dmn)
746{
747 int sock;
748 struct sockaddr_un addr;
749 socklen_t len;
750 int flags;
751
752 if (gs.loglevel > LOG_DEBUG+1)
753 zlog_debug("%s: attempting to connect",dmn->name);
754 dmn->connect_tries++;
755
756 memset (&addr, 0, sizeof (struct sockaddr_un));
757 addr.sun_family = AF_UNIX;
758 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
759 gs.vtydir,dmn->name);
760#ifdef HAVE_SUN_LEN
761 len = addr.sun_len = SUN_LEN(&addr);
762#else
763 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
764#endif /* HAVE_SUN_LEN */
765
766 /* Quick check to see if we might succeed before we go to the trouble
767 of creating a socket. */
768 if (access(addr.sun_path, W_OK) < 0)
769 {
770 if (errno != ENOENT)
771 zlog_err("%s: access to socket %s denied: %s",
772 dmn->name,addr.sun_path,safe_strerror(errno));
773 return -1;
774 }
775
776 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
777 {
778 zlog_err("%s(%s): cannot make socket: %s",
779 __func__,addr.sun_path, safe_strerror(errno));
780 return -1;
781 }
782
783 /* Set non-blocking. */
784 if ((flags = fcntl(sock, F_GETFL, 0)) < 0)
785 {
786 zlog_err("%s(%s): fcntl(F_GETFL) failed: %s",
787 __func__,addr.sun_path, safe_strerror(errno));
788 close(sock);
789 return -1;
790 }
791 if (fcntl(sock, F_SETFL, (flags|O_NONBLOCK)) < 0)
792 {
793 zlog_err("%s(%s): fcntl(F_SETFL,O_NONBLOCK) failed: %s",
794 __func__,addr.sun_path, safe_strerror(errno));
795 close(sock);
796 return -1;
797 }
798
799 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
800 {
801 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
802 {
803 if (gs.loglevel > LOG_DEBUG)
804 zlog_debug("%s(%s): connect failed: %s",
805 __func__,addr.sun_path, safe_strerror(errno));
806 close (sock);
807 return -1;
808 }
809 if (gs.loglevel > LOG_DEBUG)
810 zlog_debug("%s: connection in progress",dmn->name);
811 dmn->state = DAEMON_CONNECTING;
812 dmn->fd = sock;
813 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
814 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
815 gs.timeout);
816 SET_READ_HANDLER(dmn);
817 return 0;
818 }
819
820 dmn->fd = sock;
821 SET_READ_HANDLER(dmn);
822 daemon_up(dmn,"connect succeeded");
823 return 1;
824}
825
826static int
827phase_hanging(struct thread *t_hanging)
828{
829 gs.t_phase_hanging = NULL;
830 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
831 phase_str[gs.phase],PHASE_TIMEOUT);
832 gs.phase = PHASE_NONE;
833 return 0;
834}
835
836static void
837set_phase(restart_phase_t new_phase)
838{
839 gs.phase = new_phase;
840 if (gs.t_phase_hanging)
841 thread_cancel(gs.t_phase_hanging);
842 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
843 PHASE_TIMEOUT);
844}
845
846static void
847phase_check(void)
848{
849 switch (gs.phase)
850 {
851 case PHASE_NONE:
852 break;
853 case PHASE_STOPS_PENDING:
854 if (gs.numpids)
855 break;
856 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
857 set_phase(PHASE_WAITING_DOWN);
858 /*FALLTHRU*/
859 case PHASE_WAITING_DOWN:
860 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
861 break;
862 zlog_info("Phased restart: all routing daemons now down.");
863 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
864 set_phase(PHASE_ZEBRA_RESTART_PENDING);
865 /*FALLTHRU*/
866 case PHASE_ZEBRA_RESTART_PENDING:
867 if (gs.special->restart.pid)
868 break;
869 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
870 set_phase(PHASE_WAITING_ZEBRA_UP);
871 /*FALLTHRU*/
872 case PHASE_WAITING_ZEBRA_UP:
873 if (!IS_UP(gs.special))
874 break;
875 zlog_info("Phased restart: %s is now up.",gs.special->name);
876 {
877 struct daemon *dmn;
878 for (dmn = gs.daemons; dmn; dmn = dmn->next)
879 {
880 if (dmn != gs.special)
881 run_job(&dmn->restart,"start",gs.start_command,1,1);
882 }
883 }
884 gs.phase = PHASE_NONE;
885 THREAD_OFF(gs.t_phase_hanging);
886 zlog_notice("Phased global restart has completed.");
887 break;
888 }
889}
890
891static void
892try_restart(struct daemon *dmn)
893{
894 switch (gs.mode)
895 {
896 case MODE_MONITOR:
897 return;
898 case MODE_GLOBAL_RESTART:
899 run_job(&gs.restart,"restart",gs.restart_command,0,1);
900 break;
901 case MODE_SEPARATE_RESTART:
902 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
903 break;
904 case MODE_PHASED_ZEBRA_RESTART:
905 if (dmn != gs.special)
906 {
907 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
908 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
909 else
910 zlog_debug("%s: postponing restart attempt because master %s daemon "
911 "not up [%s], or phased restart in progress",
912 dmn->name,gs.special->name,state_str[gs.special->state]);
913 break;
914 }
915 /*FALLTHRU*/
916 case MODE_PHASED_ALL_RESTART:
917 if ((gs.phase != PHASE_NONE) || gs.numpids)
918 {
919 if (gs.loglevel > LOG_DEBUG+1)
920 zlog_debug("postponing phased global restart: restart already in "
921 "progress [%s], or outstanding child processes [%d]",
922 phase_str[gs.phase],gs.numpids);
923 break;
924 }
925 /* Is it too soon for a restart? */
926 {
927 struct timeval delay;
928 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
929 gs.special->restart.interval)
930 {
931 if (gs.loglevel > LOG_DEBUG+1)
932 zlog_debug("postponing phased global restart: "
933 "elapsed time %ld < retry interval %ld",
934 (long)delay.tv_sec,gs.special->restart.interval);
935 break;
936 }
937 }
938 zlog_info("Phased restart: stopping all routing daemons.");
939 /* First step: stop all other daemons. */
940 for (dmn = gs.daemons; dmn; dmn = dmn->next)
941 {
942 if (dmn != gs.special)
943 run_job(&dmn->restart,"stop",gs.stop_command,1,0);
944 }
945 set_phase(PHASE_STOPS_PENDING);
946 break;
947 default:
948 zlog_err("error: unknown restart mode %d",gs.mode);
949 break;
950 }
951}
952
953static int
954wakeup_unresponsive(struct thread *t_wakeup)
955{
956 struct daemon *dmn = THREAD_ARG(t_wakeup);
957
958 dmn->t_wakeup = NULL;
959 if (dmn->state != DAEMON_UNRESPONSIVE)
960 zlog_err("%s: no longer unresponsive (now %s), "
961 "wakeup should have been cancelled!",
962 dmn->name,state_str[dmn->state]);
963 else
964 {
965 SET_WAKEUP_UNRESPONSIVE(dmn);
966 try_restart(dmn);
967 }
968 return 0;
969}
970
971static int
972wakeup_no_answer(struct thread *t_wakeup)
973{
974 struct daemon *dmn = THREAD_ARG(t_wakeup);
975
976 dmn->t_wakeup = NULL;
977 dmn->state = DAEMON_UNRESPONSIVE;
978 zlog_err("%s state -> unresponsive : no response yet to ping "
979 "sent %ld seconds ago",dmn->name,gs.timeout);
980 if (gs.unresponsive_restart)
981 {
982 SET_WAKEUP_UNRESPONSIVE(dmn);
983 try_restart(dmn);
984 }
985 return 0;
986}
987
988static int
989wakeup_send_echo(struct thread *t_wakeup)
990{
991 static const char echocmd[] = "echo " PING_TOKEN;
992 ssize_t rc;
993 struct daemon *dmn = THREAD_ARG(t_wakeup);
994
995 dmn->t_wakeup = NULL;
996 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
997 ((size_t)rc != sizeof(echocmd)))
998 {
999 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +00001000 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
1001 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +00001002 daemon_down(dmn,why);
1003 }
1004 else
1005 {
1006 gettimeofday(&dmn->echo_sent,NULL);
1007 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
1008 }
1009 return 0;
1010}
1011
/* Handler for the termination signals: log the fact and exit with
   status 0. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(0);
}
1018
/* Check that `cmd' can be used as a command template: it must contain
   exactly one '%' character and that character must be directly followed
   by 's'.  Returns 1 if so and 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  if (pct == NULL)
    return 0;
  if (pct[1] != 's')
    return 0;
  /* No further '%' may follow the "%s". */
  return strchr(pct+1,'%') == NULL;
}
1026
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of `cmd' in which every occurrence of
   `blankstr' has been replaced by a single space.  The caller owns the
   result.  The program exits if the copy cannot be allocated. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* An empty marker matches at every position; replacing it would grow the
     string beyond its allocation and never terminate, so there is nothing
     to translate. */
  if (bslen == 0)
    return res;

  /* Resume the search just past each inserted space rather than at the
     start of the string: a marker that itself contains a space (such as a
     single " ") would otherwise be found again and again. */
  for (p = res; (p = strstr(p,blankstr)) != NULL; p++)
    {
      *p = ' ';
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1049
ajs8b886ca2004-12-22 02:56:38 +00001050int
1051main(int argc, char **argv)
1052{
1053 const char *progname;
1054 int opt;
1055 int daemon_mode = 0;
1056 const char *pidfile = DEFAULT_PIDFILE;
1057 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001058 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001059 static struct quagga_signal_t my_signals[] =
1060 {
1061 {
1062 .signal = SIGINT,
1063 .handler = sigint,
1064 },
1065 {
1066 .signal = SIGTERM,
1067 .handler = sigint,
1068 },
1069 {
1070 .signal = SIGCHLD,
1071 .handler = sigchild,
1072 },
1073 };
1074
1075 if ((progname = strrchr (argv[0], '/')) != NULL)
1076 progname++;
1077 else
1078 progname = argv[0];
1079
ajs098e2402004-12-22 17:00:46 +00001080 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001081 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001082 longopts, 0)) != EOF)
1083 {
1084 switch (opt)
1085 {
1086 case 0:
1087 break;
1088 case 'a':
1089 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1090 {
1091 fputs("Ambiguous operating mode selected.\n",stderr);
1092 return usage(progname,1);
1093 }
1094 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1095 break;
1096 case 'A':
1097 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1098 {
1099 fputs("Ambiguous operating mode selected.\n",stderr);
1100 return usage(progname,1);
1101 }
1102 gs.mode = MODE_PHASED_ALL_RESTART;
1103 break;
ajsc8b40f82004-12-22 16:17:16 +00001104 case 'b':
1105 blankstr = optarg;
1106 break;
ajs8b886ca2004-12-22 02:56:38 +00001107 case 'd':
1108 daemon_mode = 1;
1109 break;
1110 case 'e':
1111 gs.do_ping = 0;
1112 break;
1113 case 'k':
1114 if (!valid_command(optarg))
1115 {
1116 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1117 optarg);
1118 return usage(progname,1);
1119 }
1120 gs.stop_command = optarg;
1121 break;
1122 case 'l':
1123 {
1124 char garbage[3];
1125 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1126 (gs.loglevel < LOG_EMERG))
1127 {
1128 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1129 return usage(progname,1);
1130 }
1131 }
1132 break;
1133 case 'm':
1134 {
1135 char garbage[3];
1136 if ((sscanf(optarg,"%ld%1s",
1137 &gs.min_restart_interval,garbage) != 1) ||
1138 (gs.min_restart_interval < 0))
1139 {
1140 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1141 optarg);
1142 return usage(progname,1);
1143 }
1144 }
1145 break;
1146 case 'M':
1147 {
1148 char garbage[3];
1149 if ((sscanf(optarg,"%ld%1s",
1150 &gs.max_restart_interval,garbage) != 1) ||
1151 (gs.max_restart_interval < 0))
1152 {
1153 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1154 optarg);
1155 return usage(progname,1);
1156 }
1157 }
1158 break;
1159 case 'i':
1160 {
1161 char garbage[3];
1162 int period;
1163 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1164 (gs.period < 1))
1165 {
1166 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1167 return usage(progname,1);
1168 }
1169 gs.period = 1000*period;
1170 }
1171 break;
1172 case 'p':
1173 pidfile = optarg;
1174 break;
1175 case 'r':
1176 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1177 (gs.mode == MODE_SEPARATE_RESTART))
1178 {
1179 fputs("Ambiguous operating mode selected.\n",stderr);
1180 return usage(progname,1);
1181 }
1182 if (!valid_command(optarg))
1183 {
1184 fprintf(stderr,
1185 "Invalid restart command, must contain '%%s': %s\n",
1186 optarg);
1187 return usage(progname,1);
1188 }
1189 gs.restart_command = optarg;
1190 if (gs.mode == MODE_MONITOR)
1191 gs.mode = MODE_SEPARATE_RESTART;
1192 break;
1193 case 'R':
1194 if (gs.mode != MODE_MONITOR)
1195 {
1196 fputs("Ambiguous operating mode selected.\n",stderr);
1197 return usage(progname,1);
1198 }
1199 if (strchr(optarg,'%'))
1200 {
1201 fprintf(stderr,
1202 "Invalid restart-all arg, must not contain '%%s': %s\n",
1203 optarg);
1204 return usage(progname,1);
1205 }
1206 gs.restart_command = optarg;
1207 gs.mode = MODE_GLOBAL_RESTART;
1208 break;
1209 case 's':
1210 if (!valid_command(optarg))
1211 {
1212 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1213 optarg);
1214 return usage(progname,1);
1215 }
1216 gs.start_command = optarg;
1217 break;
1218 case 'S':
1219 gs.vtydir = optarg;
1220 break;
1221 case 't':
1222 {
1223 char garbage[3];
1224 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1225 (gs.timeout < 1))
1226 {
1227 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1228 return usage(progname,1);
1229 }
1230 }
1231 break;
1232 case 'T':
1233 {
1234 char garbage[3];
1235 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1236 (gs.restart_timeout < 1))
1237 {
1238 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1239 return usage(progname,1);
1240 }
1241 }
1242 break;
1243 case 'z':
1244 gs.unresponsive_restart = 1;
1245 break;
1246 case 'v':
1247 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1248 puts("Copyright 2004 Andrew J. Schorr");
1249 return 0;
1250 case 'h':
1251 return usage(progname,0);
1252 default:
1253 fputs("Invalid option.\n",stderr);
1254 return usage(progname,1);
1255 }
1256 }
1257
1258 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1259 {
1260 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1261 return usage(progname,1);
1262 }
1263 switch (gs.mode)
1264 {
1265 case MODE_MONITOR:
1266 if (gs.restart_command || gs.start_command || gs.stop_command)
1267 {
1268 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1269 mode_str[gs.mode]);
1270 return usage(progname,1);
1271 }
1272 break;
1273 case MODE_GLOBAL_RESTART:
1274 case MODE_SEPARATE_RESTART:
1275 if (!gs.restart_command || gs.start_command || gs.stop_command)
1276 {
1277 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1278 mode_str[gs.mode]);
1279 return usage(progname,1);
1280 }
1281 break;
1282 case MODE_PHASED_ZEBRA_RESTART:
1283 case MODE_PHASED_ALL_RESTART:
1284 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1285 {
1286 fprintf(stderr,
1287 "Need start, kill, and restart commands in [%s] mode.\n",
1288 mode_str[gs.mode]);
1289 return usage(progname,1);
1290 }
1291 break;
1292 }
1293
ajsc8b40f82004-12-22 16:17:16 +00001294 if (blankstr)
1295 {
1296 if (gs.restart_command)
1297 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1298 if (gs.start_command)
1299 gs.start_command = translate_blanks(gs.start_command,blankstr);
1300 if (gs.stop_command)
1301 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1302 }
1303
ajs8b886ca2004-12-22 02:56:38 +00001304 gs.restart.interval = gs.min_restart_interval;
1305 master = thread_master_create();
1306 signal_init (master, Q_SIGC(my_signals), my_signals);
1307 srandom(time(NULL));
1308
1309 {
1310 int i;
1311 struct daemon *tail = NULL;
1312
1313 for (i = optind; i < argc; i++)
1314 {
1315 struct daemon *dmn;
1316
1317 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1318 {
ajs098e2402004-12-22 17:00:46 +00001319 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1320 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001321 return 1;
1322 }
1323 dmn->name = dmn->restart.name = argv[i];
1324 dmn->state = DAEMON_INIT;
1325 gs.numdaemons++;
1326 gs.numdown++;
1327 dmn->fd = -1;
1328 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1329 100+(random() % 900));
1330 dmn->restart.interval = gs.min_restart_interval;
1331 if (tail)
1332 tail->next = dmn;
1333 else
1334 gs.daemons = dmn;
1335 tail = dmn;
1336
1337 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1338 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1339 !strcmp(dmn->name,special))
1340 gs.special = dmn;
1341 }
1342 }
1343 if (!gs.daemons)
1344 {
1345 fputs("Must specify one or more daemons to monitor.\n",stderr);
1346 return usage(progname,1);
1347 }
1348 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1349 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1350 {
1351 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1352 mode_str[gs.mode],special);
1353 return usage(progname,1);
1354 }
1355 if (gs.special && (gs.numdaemons < 2))
1356 {
1357 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1358 "to watch.\n",mode_str[gs.mode]);
1359 return usage(progname,1);
1360 }
1361
1362 zlog_default = openzlog(progname, ZLOG_NONE,
1363 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1364 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1365 if (daemon_mode)
1366 {
1367 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1368 daemon(0, 0);
1369 }
1370 else
1371 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1372
1373 /* Make sure we're not already running. */
1374 pid_output (pidfile);
1375
1376 /* Announce which daemons are being monitored. */
1377 {
1378 struct daemon *dmn;
1379 size_t len = 0;
1380
1381 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1382 len += strlen(dmn->name)+1;
1383
1384 {
1385 char buf[len+1];
1386 char *p = buf;
1387
1388 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1389 {
1390 if (p != buf)
1391 *p++ = ' ';
1392 strcpy(p,dmn->name);
1393 p += strlen(p);
1394 }
1395 zlog_notice("%s %s watching [%s], mode [%s]",
1396 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1397 }
1398 }
1399
1400 {
1401 struct thread thread;
1402
1403 while (thread_fetch (master, &thread))
1404 thread_call (&thread);
1405 }
1406
1407 /* Not reached. */
1408 return 0;
1409}