blob: fb24d75c576377a65786a17e7d18ce035e28794f [file] [log] [blame]
/*
    $Id: watchquagga.c,v 1.8 2004/12/29 17:45:08 ajs Exp $

    Monitor status of quagga daemons and restart if necessary.

    Copyright (C) 2004 Andrew J. Schorr

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
22
ajsa3655342004-12-29 17:39:10 +000023#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000024#include <thread.h>
25#include <log.h>
26#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000027#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000028#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000029#include <sys/un.h>
30#include <sys/wait.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of two values.  Each argument may be evaluated more than once. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers.  For non-negative X, JITTER(X) yields a
   pseudo-random offset between -(X)/2 and (X)-(X)/2, and FUZZY(X) perturbs
   X by JITTER((X)/20). */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the usage text reports the time values in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: honour the build-time setting when there is one. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif

/* Directory holding the daemons' vty sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token that is echoed back by a responsive daemon. */
#define PING_TOKEN "PING"
58
/* Thread master for the timers and I/O handlers in this file.  It must have
   external linkage because it is referenced somewhere inside libzebra. */
struct thread_master *master;
61
/* Operating mode of the watchdog (stored in gs.mode). */
typedef enum
{
  MODE_MONITOR = 0,
  MODE_GLOBAL_RESTART,
  MODE_SEPARATE_RESTART,
  MODE_PHASED_ZEBRA_RESTART,
  MODE_PHASED_ALL_RESTART
} watch_mode_t;

/* Printable description of each operating mode, indexed by watch_mode_t. */
static const char *mode_str[] =
{
  [MODE_MONITOR]              = "monitor",
  [MODE_GLOBAL_RESTART]       = "global restart",
  [MODE_SEPARATE_RESTART]     = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART]   = "phased global restart for any failure",
};
79
/* Steps of a phased restart (stored in gs.phase). */
typedef enum
{
  PHASE_NONE = 0,
  PHASE_STOPS_PENDING,
  PHASE_WAITING_DOWN,
  PHASE_ZEBRA_RESTART_PENDING,
  PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Printable description of each phase, indexed by restart_phase_t. */
static const char *phase_str[] =
{
  [PHASE_NONE]                  = "None",
  [PHASE_STOPS_PENDING]         = "Stop jobs running",
  [PHASE_WAITING_DOWN]          = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP]      = "Waiting for zebra to come up",
  /* No enumerator above corresponds to this trailing entry. */
  "Start jobs running",
};

/* How long a single phase may last before the phased restart is aborted. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
100
/* Bookkeeping for one background command (start, stop or restart job). */
struct restart_info
{
  const char *name;       /* substituted for %s in the command string */
  const char *what;       /* kind of job last launched, e.g. "restart" */
  pid_t pid;              /* pid of the running job; 0 when none is running */
  struct timeval time;    /* when the job was launched or last completed */
  long interval;          /* minimum seconds between unforced jobs */
  struct thread *t_kill;  /* timer that signals a job that runs too long */
  int kills;              /* number of signals sent to the current job */
};
111
112static struct global_state
113{
ajs8b886ca2004-12-22 02:56:38 +0000114 watch_mode_t mode;
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
117 const char *vtydir;
118 long period;
119 long timeout;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
123 int do_ping;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000128 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000129 int unresponsive_restart;
130 int loglevel;
131 struct daemon *special; /* points to zebra when doing phased restart */
132 int numdaemons;
133 int numpids;
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
135} gs = {
136 .mode = MODE_MONITOR,
137 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000138 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
145 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000146};
147
/* Connection state of a monitored daemon. */
typedef enum
{
  DAEMON_INIT,
  DAEMON_DOWN,
  DAEMON_CONNECTING,
  DAEMON_UP,
  DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Printable name of each state, indexed by daemon_state_t. */
static const char *state_str[] =
{
  [DAEMON_INIT]         = "Init",
  [DAEMON_DOWN]         = "Down",
  [DAEMON_CONNECTING]   = "Connecting",
  [DAEMON_UP]           = "Up",
  [DAEMON_UNRESPONSIVE] = "Unresponsive",
};
168
169struct daemon {
170 const char *name;
171 daemon_state_t state;
172 int fd;
173 struct timeval echo_sent;
174 u_int connect_tries;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
178 struct daemon *next;
179 struct restart_info restart;
180};
181
/* Long command-line options; each maps onto the short option given as the
   second macro argument. */
#define OPT_FLAG(NAME,CH) { NAME, no_argument, NULL, CH }
#define OPT_ARG(NAME,CH)  { NAME, required_argument, NULL, CH }
static const struct option longopts[] =
{
  OPT_FLAG("daemon", 'd'),
  OPT_ARG("statedir", 'S'),
  OPT_FLAG("no-echo", 'e'),
  OPT_ARG("loglevel", 'l'),
  OPT_ARG("interval", 'i'),
  OPT_ARG("timeout", 't'),
  OPT_ARG("restart-timeout", 'T'),
  OPT_ARG("restart", 'r'),
  OPT_ARG("start-command", 's'),
  OPT_ARG("kill-command", 'k'),
  OPT_ARG("restart-all", 'R'),
  OPT_FLAG("all-restart", 'a'),
  OPT_FLAG("always-all-restart", 'A'),
  OPT_FLAG("unresponsive-restart", 'z'),
  OPT_ARG("min-restart-interval", 'm'),
  OPT_ARG("max-restart-interval", 'M'),
  OPT_ARG("pid-file", 'p'),
  OPT_ARG("blank-string", 'b'),
  OPT_FLAG("help", 'h'),
  OPT_FLAG("version", 'v'),
  { NULL, 0, NULL, 0 }
};
#undef OPT_FLAG
#undef OPT_ARG
206
/* Forward declarations for functions that are used before their definitions. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
211
/* Print usage information and return STATUS unchanged.
   When STATUS is non-zero, only a one-line hint naming PROGNAME is written
   to stderr; when STATUS is zero, the full help text is printed to stdout,
   with the mode names and built-in defaults filled in. */
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
218 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
219Watchdog program to monitor status of quagga daemons and try to restart\n\
220them if they are down or unresponsive. It determines whether a daemon is\n\
221up based on whether it can connect to the daemon's vty unix stream socket.\n\
222It then repeatedly sends echo commands over that socket to determine whether\n\
223the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
224on the socket connection and know immediately that the daemon is down.\n\n\
225The daemons to be monitored should be listed on the command line.\n\n\
226This program can run in one of 5 modes:\n\n\
2270. Mode: %s.\n\
228 Just monitor and report on status changes. Example:\n\
229 %s -d zebra ospfd bgpd\n\n\
2301. Mode: %s.\n\
231 Whenever any daemon hangs or crashes, use the given command to restart\n\
232 them all. Example:\n\
233 %s -dz \\\n\
234 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
235 zebra ospfd\n\n\
2362. Mode: %s.\n\
237 When any single daemon hangs or crashes, restart only the daemon that's\n\
238 in trouble using the supplied restart command. Example:\n\
239 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2403. Mode: %s.\n\
241 The same as the previous mode, except that there is special treatment when\n\
242 the zebra daemon is in trouble. In that case, a phased restart approach\n\
243 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
244 daemons. Example:\n\
245 %s -adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2484. Mode: %s.\n\
249 This is the same as the previous mode, except that the phased restart\n\
250 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
251 %s -Adz -r '/sbin/service %%s restart' \\\n\
252 -s '/sbin/service %%s start' \\\n\
253 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
254As of this writing, it is believed that mode 2 [%s]\n\
255is not safe, and mode 3 [%s] may not be safe with some of the\n\
256routing daemons.\n\n\
257In order to avoid attempting to restart the daemons in a fast loop,\n\
258the -m and -M options allow you to control the minimum delay between\n\
259restart commands. The minimum restart delay is recalculated each time\n\
260a restart is attempted: if the time since the last restart attempt exceeds\n\
261twice the -M value, then the restart delay is set to the -m value.\n\
262Otherwise, the interval is doubled (but capped at the -M value).\n\n\
263Options:\n\
264-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
265 to syslog instead of stdout.\n\
266-S, --statedir Set the vty socket directory (default is %s)\n\
267-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
268 option is necessary if the daemons do not support the\n\
269 echo command)\n\
270-l, --loglevel Set the logging level (default is %d).\n\
271 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
272 but it can be set higher than %d if extra-verbose debugging\n\
273 messages are desired.\n\
274-m, --min-restart-interval\n\
275 Set the minimum seconds to wait between invocations of daemon\n\
276 restart commands (default is %d).\n\
277-M, --max-restart-interval\n\
278 Set the maximum seconds to wait between invocations of daemon\n\
279 restart commands (default is %d).\n\
280-i, --interval Set the status polling interval in seconds (default is %d)\n\
281-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
282-T, --restart-timeout\n\
283 Set the restart (kill) timeout in seconds (default is %d).\n\
284 If any background jobs are still running after this much\n\
285 time has elapsed, they will be killed.\n\
286-r, --restart Supply a Bourne shell command to use to restart a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 Note that -r and -R are incompatible.\n\
290-s, --start-command\n\
291 Supply a Bourne shell to command to use to start a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294-k, --kill-command\n\
295 Supply a Bourne shell to command to use to stop a single\n\
296 daemon. The command string should include '%%s' where the\n\
297 name of the daemon should be substituted.\n\
298-R, --restart-all\n\
299 When one or more daemons is down, try to restart everything\n\
300 using the Bourne shell command supplied as the argument.\n\
301 Note that -r and -R are incompatible.\n\
302-z, --unresponsive-restart\n\
303 When a daemon is unresponsive, treat it as being down for\n\
304 restart purposes.\n\
305-a, --all-restart\n\
306 When zebra hangs or crashes, restart all daemons using\n\
307 this phased approach: 1. stop all other daemons; 2. restart\n\
308 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
309-A, --always-all-restart\n\
310 When any daemon (not just zebra) hangs or crashes, use the\n\
311 same phased restart mechanism described above for -a.\n\
312 Requires -r, -s, and -k.\n\
313-p, --pid-file Set process identifier file name\n\
314 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000315-b, --blank-string\n\
316 When the supplied argument string is found in any of the\n\
317 various shell command arguments (-r, -s, -k, or -R), replace\n\
318 it with a space. This is an ugly hack to circumvent problems\n\
319 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000320-v, --version Print program version\n\
321-h, --help Display this help and exit\n\
322", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
323progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000324VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000325DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
326DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
327
328 return status;
329}
330
/* Run SHELL_CMD in the background via "/bin/sh -c".

   Returns the pid of the forked child, or -1 if fork() failed.  The child
   is not waited for here; the caller is expected to reap it later. */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child;

  switch (child = fork())
    {
    case -1:
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    case 0:
      /* Child process. */
      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      {
        const char *argv[4] = { "sh", "-c", shell_cmd, NULL};
        execv("/bin/sh",(char *const *)argv);
        /* Only reached when execv itself failed. */
        zlog_err("execv(/bin/sh -c '%s') failed: %s",
                 shell_cmd,safe_strerror(errno));
        _exit(127);
      }
    default:
      /* Parent process: we will reap the child later.  A successful fork is
         routine, so it is reported as a notice rather than as an error. */
      zlog_notice("Forked background command [pid %d]: %s",
                  (int)child,shell_cmd);
      return child;
    }
}
360
361static struct timeval *
362time_elapsed(struct timeval *result, const struct timeval *start_time)
363{
364 gettimeofday(result,NULL);
365 result->tv_sec -= start_time->tv_sec;
366 result->tv_usec -= start_time->tv_usec;
367 while (result->tv_usec < 0)
368 {
369 result->tv_usec += 1000000L;
370 result->tv_sec--;
371 }
372 return result;
373}
374
375static int
376restart_kill(struct thread *t_kill)
377{
378 struct restart_info *restart = THREAD_ARG(t_kill);
379 struct timeval delay;
380
381 time_elapsed(&delay,&restart->time);
382 zlog_warn("Warning: %s %s child process %d still running after "
383 "%ld seconds, sending signal %d",
ajsf2d82572004-12-29 17:45:08 +0000384 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000385 (restart->kills ? SIGKILL : SIGTERM));
386 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
387 restart->kills++;
388 restart->t_kill = thread_add_timer(master,restart_kill,restart,
389 gs.restart_timeout);
390 return 0;
391}
392
393static struct restart_info *
394find_child(pid_t child)
395{
396 if (gs.mode == MODE_GLOBAL_RESTART)
397 {
398 if (gs.restart.pid == child)
399 return &gs.restart;
400 }
401 else
402 {
403 struct daemon *dmn;
404 for (dmn = gs.daemons; dmn; dmn = dmn->next)
405 {
406 if (dmn->restart.pid == child)
407 return &dmn->restart;
408 }
409 }
410 return NULL;
411}
412
413static void
414sigchild(void)
415{
416 pid_t child;
417 int status;
418 const char *name;
419 const char *what;
420 struct restart_info *restart;
421
422 switch (child = waitpid(-1,&status,WNOHANG))
423 {
424 case -1:
425 zlog_err("waitpid failed: %s",safe_strerror(errno));
426 return;
427 case 0:
428 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
429 return;
430 }
431
432 if ((restart = find_child(child)) != NULL)
433 {
434 name = restart->name;
435 what = restart->what;
436 restart->pid = 0;
437 gs.numpids--;
438 thread_cancel(restart->t_kill);
439 restart->t_kill = NULL;
440 /* Update restart time to reflect the time the command completed. */
441 gettimeofday(&restart->time,NULL);
442 }
443 else
444 {
445 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000446 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000447 name = "(unknown)";
448 what = "background";
449 }
450 if (WIFSTOPPED(status))
451 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000452 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000453 else if (WIFSIGNALED(status))
454 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000455 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000456 else if (WIFEXITED(status))
457 {
458 if (WEXITSTATUS(status) != 0)
459 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000460 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000461 else
ajsf2d82572004-12-29 17:45:08 +0000462 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000463 }
464 else
465 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000466 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000467 phase_check();
468}
469
470static int
471run_job(struct restart_info *restart, const char *cmdtype, const char *command,
472 int force, int update_interval)
473{
474 struct timeval delay;
475
476 if (gs.loglevel > LOG_DEBUG+1)
477 zlog_debug("attempting to %s %s",cmdtype,restart->name);
478
479 if (restart->pid)
480 {
481 if (gs.loglevel > LOG_DEBUG+1)
482 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000483 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000484 return -1;
485 }
486
487 if (!force &&
488 (time_elapsed(&delay,&restart->time)->tv_sec < restart->interval))
489 {
490 if (gs.loglevel > LOG_DEBUG+1)
491 zlog_debug("postponing %s %s: "
492 "elapsed time %ld < retry interval %ld",
493 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
494 return -1;
495 }
496
497 gettimeofday(&restart->time,NULL);
498 restart->kills = 0;
499 {
500 char cmd[strlen(command)+strlen(restart->name)+1];
501 snprintf(cmd,sizeof(cmd),command,restart->name);
502 if ((restart->pid = run_background(cmd)) > 0)
503 {
504 restart->t_kill = thread_add_timer(master,restart_kill,restart,
505 gs.restart_timeout);
506 restart->what = cmdtype;
507 gs.numpids++;
508 }
509 else
510 restart->pid = 0;
511 }
512
513 /* Calculate the new restart interval. */
514 if (update_interval)
515 {
516 if (delay.tv_sec > 2*gs.max_restart_interval)
517 restart->interval = gs.min_restart_interval;
518 else if ((restart->interval *= 2) > gs.max_restart_interval)
519 restart->interval = gs.max_restart_interval;
520 if (gs.loglevel > LOG_DEBUG+1)
521 zlog_debug("restart %s interval is now %ld",
522 restart->name,restart->interval);
523 }
524 return restart->pid;
525}
526
/* Schedule handle_read to run when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Each of the following arms the daemon's wakeup timer with a randomized
   delay of about gs.period milliseconds; they differ only in the callback. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
541
542static int
543wakeup_down(struct thread *t_wakeup)
544{
545 struct daemon *dmn = THREAD_ARG(t_wakeup);
546
547 dmn->t_wakeup = NULL;
548 if (try_connect(dmn) < 0)
549 SET_WAKEUP_DOWN(dmn);
550 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
551 try_restart(dmn);
552 return 0;
553}
554
555static int
556wakeup_init(struct thread *t_wakeup)
557{
558 struct daemon *dmn = THREAD_ARG(t_wakeup);
559
560 dmn->t_wakeup = NULL;
561 if (try_connect(dmn) < 0)
562 {
563 SET_WAKEUP_DOWN(dmn);
564 zlog_err("%s state -> down : initial connection attempt failed",
565 dmn->name);
566 dmn->state = DAEMON_DOWN;
567 }
568 return 0;
569}
570
571static void
572daemon_down(struct daemon *dmn, const char *why)
573{
574 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
575 zlog_err("%s state -> down : %s",dmn->name,why);
576 else if (gs.loglevel > LOG_DEBUG)
577 zlog_debug("%s still down : %s",dmn->name,why);
578 if (IS_UP(dmn))
579 gs.numdown++;
580 dmn->state = DAEMON_DOWN;
581 if (dmn->fd >= 0)
582 {
583 close(dmn->fd);
584 dmn->fd = -1;
585 }
586 THREAD_OFF(dmn->t_read);
587 THREAD_OFF(dmn->t_write);
588 THREAD_OFF(dmn->t_wakeup);
589 if (try_connect(dmn) < 0)
590 SET_WAKEUP_DOWN(dmn);
591 phase_check();
592}
593
594static int
595handle_read(struct thread *t_read)
596{
597 struct daemon *dmn = THREAD_ARG(t_read);
598 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
599 char buf[sizeof(resp)+100];
600 ssize_t rc;
601 struct timeval delay;
602
603 dmn->t_read = NULL;
604 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
605 {
606 char why[100];
607
608 if ((errno == EINTR) || (errno == EAGAIN))
609 {
610 /* Pretend it never happened. */
611 SET_READ_HANDLER(dmn);
612 return 0;
613 }
614 snprintf(why,sizeof(why),"unexpected read error: %s",
615 safe_strerror(errno));
616 daemon_down(dmn,why);
617 return 0;
618 }
619 if (rc == 0)
620 {
621 daemon_down(dmn,"read returned EOF");
622 return 0;
623 }
624 if (!dmn->echo_sent.tv_sec)
625 {
626 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000627 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
628 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000629 daemon_down(dmn,why);
630 return 0;
631 }
632
633 /* We are expecting an echo response: is there any chance that the
634 response would not be returned entirely in the first read? That
635 seems inconceivable... */
636 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
637 {
638 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000639 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
640 "(expecting %u): %.*s",
641 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000642 daemon_down(dmn,why);
643 return 0;
644 }
645
646 time_elapsed(&delay,&dmn->echo_sent);
647 dmn->echo_sent.tv_sec = 0;
648 if (dmn->state == DAEMON_UNRESPONSIVE)
649 {
650 if (delay.tv_sec < gs.timeout)
651 {
652 dmn->state = DAEMON_UP;
653 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
654 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
655 }
656 else
657 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
658 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
659 }
660 else if (gs.loglevel > LOG_DEBUG+1)
661 zlog_debug("%s: echo response received after %ld.%06ld seconds",
662 dmn->name,delay.tv_sec,delay.tv_usec);
663
664 SET_READ_HANDLER(dmn);
665 if (dmn->t_wakeup)
666 thread_cancel(dmn->t_wakeup);
667 SET_WAKEUP_ECHO(dmn);
668
669 return 0;
670}
671
672static void
673daemon_up(struct daemon *dmn, const char *why)
674{
675 dmn->state = DAEMON_UP;
676 gs.numdown--;
677 dmn->connect_tries = 0;
678 zlog_notice("%s state -> up : %s",dmn->name,why);
679 if (gs.do_ping)
680 SET_WAKEUP_ECHO(dmn);
681 phase_check();
682}
683
684static int
685check_connect(struct thread *t_write)
686{
687 struct daemon *dmn = THREAD_ARG(t_write);
688 int sockerr;
689 socklen_t reslen = sizeof(sockerr);
690
691 dmn->t_write = NULL;
692 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
693 {
694 zlog_warn("%s: check_connect: getsockopt failed: %s",
695 dmn->name,safe_strerror(errno));
696 daemon_down(dmn,"getsockopt failed checking connection success");
697 return 0;
698 }
699 if ((reslen == sizeof(sockerr)) && sockerr)
700 {
701 char why[100];
702 snprintf(why,sizeof(why),
703 "getsockopt reports that connection attempt failed: %s",
704 safe_strerror(sockerr));
705 daemon_down(dmn,why);
706 return 0;
707 }
708
709 daemon_up(dmn,"delayed connect succeeded");
710 return 0;
711}
712
713static int
714wakeup_connect_hanging(struct thread *t_wakeup)
715{
716 struct daemon *dmn = THREAD_ARG(t_wakeup);
717 char why[100];
718
719 dmn->t_wakeup = NULL;
720 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
721 gs.timeout);
722 daemon_down(dmn,why);
723 return 0;
724}
725
726/* Making connection to protocol daemon. */
727static int
728try_connect(struct daemon *dmn)
729{
730 int sock;
731 struct sockaddr_un addr;
732 socklen_t len;
733 int flags;
734
735 if (gs.loglevel > LOG_DEBUG+1)
736 zlog_debug("%s: attempting to connect",dmn->name);
737 dmn->connect_tries++;
738
739 memset (&addr, 0, sizeof (struct sockaddr_un));
740 addr.sun_family = AF_UNIX;
741 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
742 gs.vtydir,dmn->name);
743#ifdef HAVE_SUN_LEN
744 len = addr.sun_len = SUN_LEN(&addr);
745#else
746 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
747#endif /* HAVE_SUN_LEN */
748
749 /* Quick check to see if we might succeed before we go to the trouble
750 of creating a socket. */
751 if (access(addr.sun_path, W_OK) < 0)
752 {
753 if (errno != ENOENT)
754 zlog_err("%s: access to socket %s denied: %s",
755 dmn->name,addr.sun_path,safe_strerror(errno));
756 return -1;
757 }
758
759 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
760 {
761 zlog_err("%s(%s): cannot make socket: %s",
762 __func__,addr.sun_path, safe_strerror(errno));
763 return -1;
764 }
765
766 /* Set non-blocking. */
767 if ((flags = fcntl(sock, F_GETFL, 0)) < 0)
768 {
769 zlog_err("%s(%s): fcntl(F_GETFL) failed: %s",
770 __func__,addr.sun_path, safe_strerror(errno));
771 close(sock);
772 return -1;
773 }
774 if (fcntl(sock, F_SETFL, (flags|O_NONBLOCK)) < 0)
775 {
776 zlog_err("%s(%s): fcntl(F_SETFL,O_NONBLOCK) failed: %s",
777 __func__,addr.sun_path, safe_strerror(errno));
778 close(sock);
779 return -1;
780 }
781
782 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
783 {
784 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
785 {
786 if (gs.loglevel > LOG_DEBUG)
787 zlog_debug("%s(%s): connect failed: %s",
788 __func__,addr.sun_path, safe_strerror(errno));
789 close (sock);
790 return -1;
791 }
792 if (gs.loglevel > LOG_DEBUG)
793 zlog_debug("%s: connection in progress",dmn->name);
794 dmn->state = DAEMON_CONNECTING;
795 dmn->fd = sock;
796 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
797 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
798 gs.timeout);
799 SET_READ_HANDLER(dmn);
800 return 0;
801 }
802
803 dmn->fd = sock;
804 SET_READ_HANDLER(dmn);
805 daemon_up(dmn,"connect succeeded");
806 return 1;
807}
808
809static int
810phase_hanging(struct thread *t_hanging)
811{
812 gs.t_phase_hanging = NULL;
813 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
814 phase_str[gs.phase],PHASE_TIMEOUT);
815 gs.phase = PHASE_NONE;
816 return 0;
817}
818
819static void
820set_phase(restart_phase_t new_phase)
821{
822 gs.phase = new_phase;
823 if (gs.t_phase_hanging)
824 thread_cancel(gs.t_phase_hanging);
825 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
826 PHASE_TIMEOUT);
827}
828
829static void
830phase_check(void)
831{
832 switch (gs.phase)
833 {
834 case PHASE_NONE:
835 break;
836 case PHASE_STOPS_PENDING:
837 if (gs.numpids)
838 break;
839 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
840 set_phase(PHASE_WAITING_DOWN);
841 /*FALLTHRU*/
842 case PHASE_WAITING_DOWN:
843 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
844 break;
845 zlog_info("Phased restart: all routing daemons now down.");
846 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
847 set_phase(PHASE_ZEBRA_RESTART_PENDING);
848 /*FALLTHRU*/
849 case PHASE_ZEBRA_RESTART_PENDING:
850 if (gs.special->restart.pid)
851 break;
852 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
853 set_phase(PHASE_WAITING_ZEBRA_UP);
854 /*FALLTHRU*/
855 case PHASE_WAITING_ZEBRA_UP:
856 if (!IS_UP(gs.special))
857 break;
858 zlog_info("Phased restart: %s is now up.",gs.special->name);
859 {
860 struct daemon *dmn;
861 for (dmn = gs.daemons; dmn; dmn = dmn->next)
862 {
863 if (dmn != gs.special)
864 run_job(&dmn->restart,"start",gs.start_command,1,1);
865 }
866 }
867 gs.phase = PHASE_NONE;
868 THREAD_OFF(gs.t_phase_hanging);
869 zlog_notice("Phased global restart has completed.");
870 break;
871 }
872}
873
874static void
875try_restart(struct daemon *dmn)
876{
877 switch (gs.mode)
878 {
879 case MODE_MONITOR:
880 return;
881 case MODE_GLOBAL_RESTART:
882 run_job(&gs.restart,"restart",gs.restart_command,0,1);
883 break;
884 case MODE_SEPARATE_RESTART:
885 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
886 break;
887 case MODE_PHASED_ZEBRA_RESTART:
888 if (dmn != gs.special)
889 {
890 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
891 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
892 else
893 zlog_debug("%s: postponing restart attempt because master %s daemon "
894 "not up [%s], or phased restart in progress",
895 dmn->name,gs.special->name,state_str[gs.special->state]);
896 break;
897 }
898 /*FALLTHRU*/
899 case MODE_PHASED_ALL_RESTART:
900 if ((gs.phase != PHASE_NONE) || gs.numpids)
901 {
902 if (gs.loglevel > LOG_DEBUG+1)
903 zlog_debug("postponing phased global restart: restart already in "
904 "progress [%s], or outstanding child processes [%d]",
905 phase_str[gs.phase],gs.numpids);
906 break;
907 }
908 /* Is it too soon for a restart? */
909 {
910 struct timeval delay;
911 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
912 gs.special->restart.interval)
913 {
914 if (gs.loglevel > LOG_DEBUG+1)
915 zlog_debug("postponing phased global restart: "
916 "elapsed time %ld < retry interval %ld",
917 (long)delay.tv_sec,gs.special->restart.interval);
918 break;
919 }
920 }
921 zlog_info("Phased restart: stopping all routing daemons.");
922 /* First step: stop all other daemons. */
923 for (dmn = gs.daemons; dmn; dmn = dmn->next)
924 {
925 if (dmn != gs.special)
926 run_job(&dmn->restart,"stop",gs.stop_command,1,0);
927 }
928 set_phase(PHASE_STOPS_PENDING);
929 break;
930 default:
931 zlog_err("error: unknown restart mode %d",gs.mode);
932 break;
933 }
934}
935
936static int
937wakeup_unresponsive(struct thread *t_wakeup)
938{
939 struct daemon *dmn = THREAD_ARG(t_wakeup);
940
941 dmn->t_wakeup = NULL;
942 if (dmn->state != DAEMON_UNRESPONSIVE)
943 zlog_err("%s: no longer unresponsive (now %s), "
944 "wakeup should have been cancelled!",
945 dmn->name,state_str[dmn->state]);
946 else
947 {
948 SET_WAKEUP_UNRESPONSIVE(dmn);
949 try_restart(dmn);
950 }
951 return 0;
952}
953
954static int
955wakeup_no_answer(struct thread *t_wakeup)
956{
957 struct daemon *dmn = THREAD_ARG(t_wakeup);
958
959 dmn->t_wakeup = NULL;
960 dmn->state = DAEMON_UNRESPONSIVE;
961 zlog_err("%s state -> unresponsive : no response yet to ping "
962 "sent %ld seconds ago",dmn->name,gs.timeout);
963 if (gs.unresponsive_restart)
964 {
965 SET_WAKEUP_UNRESPONSIVE(dmn);
966 try_restart(dmn);
967 }
968 return 0;
969}
970
971static int
972wakeup_send_echo(struct thread *t_wakeup)
973{
974 static const char echocmd[] = "echo " PING_TOKEN;
975 ssize_t rc;
976 struct daemon *dmn = THREAD_ARG(t_wakeup);
977
978 dmn->t_wakeup = NULL;
979 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
980 ((size_t)rc != sizeof(echocmd)))
981 {
982 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000983 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
984 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000985 daemon_down(dmn,why);
986 }
987 else
988 {
989 gettimeofday(&dmn->echo_sent,NULL);
990 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
991 }
992 return 0;
993}
994
/* Termination signal handler: log the event and exit with status 0. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
1001
/* Check that CMD is usable as a command template: it must contain exactly
   one '%' character, and that character must introduce "%s".  Returns
   non-zero when CMD is acceptable and 0 otherwise. */
static int
valid_command(const char *cmd)
{
  /* CMD is read-only, so the search result is kept const as well. */
  const char *p = strchr(cmd,'%');

  return (p != NULL) && (p[1] == 's') && (strchr(p+1,'%') == NULL);
}
1009
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single blank.  Occurrences are found
   in one left-to-right pass and replaced text is never rescanned, so the
   function terminates even when blankstr itself contains a blank.  An
   empty blankstr matches nothing and yields an unmodified copy.  On
   allocation failure an error is printed and the program exits with
   status 1; the caller owns the returned string. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *dst;
  const char *src;
  const char *hit;
  size_t bslen = strlen(blankstr);
  size_t cmdlen = strlen(cmd);

  /* Each match of bslen >= 1 characters becomes one blank, so the result
     can never be longer than the input. */
  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }

  /* strstr() with an empty needle matches at every position; treat an
     empty marker as "nothing to translate". */
  if (bslen == 0)
    {
      memcpy(res,cmd,cmdlen+1);
      return res;
    }

  src = cmd;
  dst = res;
  while ((hit = strstr(src,blankstr)) != NULL)
    {
      size_t keep = (size_t)(hit-src);

      /* Copy the text before the marker, emit one blank in its place,
	 and resume scanning just past the marker. */
      memcpy(dst,src,keep);
      dst += keep;
      *dst++ = ' ';
      src = hit+bslen;
    }
  /* Copy the remaining tail, including the terminating NUL. */
  memcpy(dst,src,strlen(src)+1);
  return res;
}
1032
ajs8b886ca2004-12-22 02:56:38 +00001033int
1034main(int argc, char **argv)
1035{
1036 const char *progname;
1037 int opt;
1038 int daemon_mode = 0;
1039 const char *pidfile = DEFAULT_PIDFILE;
1040 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001041 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001042 static struct quagga_signal_t my_signals[] =
1043 {
1044 {
1045 .signal = SIGINT,
1046 .handler = sigint,
1047 },
1048 {
1049 .signal = SIGTERM,
1050 .handler = sigint,
1051 },
1052 {
1053 .signal = SIGCHLD,
1054 .handler = sigchild,
1055 },
1056 };
1057
1058 if ((progname = strrchr (argv[0], '/')) != NULL)
1059 progname++;
1060 else
1061 progname = argv[0];
1062
ajs098e2402004-12-22 17:00:46 +00001063 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001064 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001065 longopts, 0)) != EOF)
1066 {
1067 switch (opt)
1068 {
1069 case 0:
1070 break;
1071 case 'a':
1072 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1073 {
1074 fputs("Ambiguous operating mode selected.\n",stderr);
1075 return usage(progname,1);
1076 }
1077 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1078 break;
1079 case 'A':
1080 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1081 {
1082 fputs("Ambiguous operating mode selected.\n",stderr);
1083 return usage(progname,1);
1084 }
1085 gs.mode = MODE_PHASED_ALL_RESTART;
1086 break;
ajsc8b40f82004-12-22 16:17:16 +00001087 case 'b':
1088 blankstr = optarg;
1089 break;
ajs8b886ca2004-12-22 02:56:38 +00001090 case 'd':
1091 daemon_mode = 1;
1092 break;
1093 case 'e':
1094 gs.do_ping = 0;
1095 break;
1096 case 'k':
1097 if (!valid_command(optarg))
1098 {
1099 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1100 optarg);
1101 return usage(progname,1);
1102 }
1103 gs.stop_command = optarg;
1104 break;
1105 case 'l':
1106 {
1107 char garbage[3];
1108 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1109 (gs.loglevel < LOG_EMERG))
1110 {
1111 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1112 return usage(progname,1);
1113 }
1114 }
1115 break;
1116 case 'm':
1117 {
1118 char garbage[3];
1119 if ((sscanf(optarg,"%ld%1s",
1120 &gs.min_restart_interval,garbage) != 1) ||
1121 (gs.min_restart_interval < 0))
1122 {
1123 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1124 optarg);
1125 return usage(progname,1);
1126 }
1127 }
1128 break;
1129 case 'M':
1130 {
1131 char garbage[3];
1132 if ((sscanf(optarg,"%ld%1s",
1133 &gs.max_restart_interval,garbage) != 1) ||
1134 (gs.max_restart_interval < 0))
1135 {
1136 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1137 optarg);
1138 return usage(progname,1);
1139 }
1140 }
1141 break;
1142 case 'i':
1143 {
1144 char garbage[3];
1145 int period;
1146 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1147 (gs.period < 1))
1148 {
1149 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1150 return usage(progname,1);
1151 }
1152 gs.period = 1000*period;
1153 }
1154 break;
1155 case 'p':
1156 pidfile = optarg;
1157 break;
1158 case 'r':
1159 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1160 (gs.mode == MODE_SEPARATE_RESTART))
1161 {
1162 fputs("Ambiguous operating mode selected.\n",stderr);
1163 return usage(progname,1);
1164 }
1165 if (!valid_command(optarg))
1166 {
1167 fprintf(stderr,
1168 "Invalid restart command, must contain '%%s': %s\n",
1169 optarg);
1170 return usage(progname,1);
1171 }
1172 gs.restart_command = optarg;
1173 if (gs.mode == MODE_MONITOR)
1174 gs.mode = MODE_SEPARATE_RESTART;
1175 break;
1176 case 'R':
1177 if (gs.mode != MODE_MONITOR)
1178 {
1179 fputs("Ambiguous operating mode selected.\n",stderr);
1180 return usage(progname,1);
1181 }
1182 if (strchr(optarg,'%'))
1183 {
1184 fprintf(stderr,
1185 "Invalid restart-all arg, must not contain '%%s': %s\n",
1186 optarg);
1187 return usage(progname,1);
1188 }
1189 gs.restart_command = optarg;
1190 gs.mode = MODE_GLOBAL_RESTART;
1191 break;
1192 case 's':
1193 if (!valid_command(optarg))
1194 {
1195 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1196 optarg);
1197 return usage(progname,1);
1198 }
1199 gs.start_command = optarg;
1200 break;
1201 case 'S':
1202 gs.vtydir = optarg;
1203 break;
1204 case 't':
1205 {
1206 char garbage[3];
1207 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1208 (gs.timeout < 1))
1209 {
1210 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1211 return usage(progname,1);
1212 }
1213 }
1214 break;
1215 case 'T':
1216 {
1217 char garbage[3];
1218 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1219 (gs.restart_timeout < 1))
1220 {
1221 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1222 return usage(progname,1);
1223 }
1224 }
1225 break;
1226 case 'z':
1227 gs.unresponsive_restart = 1;
1228 break;
1229 case 'v':
1230 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1231 puts("Copyright 2004 Andrew J. Schorr");
1232 return 0;
1233 case 'h':
1234 return usage(progname,0);
1235 default:
1236 fputs("Invalid option.\n",stderr);
1237 return usage(progname,1);
1238 }
1239 }
1240
1241 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1242 {
1243 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1244 return usage(progname,1);
1245 }
1246 switch (gs.mode)
1247 {
1248 case MODE_MONITOR:
1249 if (gs.restart_command || gs.start_command || gs.stop_command)
1250 {
1251 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1252 mode_str[gs.mode]);
1253 return usage(progname,1);
1254 }
1255 break;
1256 case MODE_GLOBAL_RESTART:
1257 case MODE_SEPARATE_RESTART:
1258 if (!gs.restart_command || gs.start_command || gs.stop_command)
1259 {
1260 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1261 mode_str[gs.mode]);
1262 return usage(progname,1);
1263 }
1264 break;
1265 case MODE_PHASED_ZEBRA_RESTART:
1266 case MODE_PHASED_ALL_RESTART:
1267 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1268 {
1269 fprintf(stderr,
1270 "Need start, kill, and restart commands in [%s] mode.\n",
1271 mode_str[gs.mode]);
1272 return usage(progname,1);
1273 }
1274 break;
1275 }
1276
ajsc8b40f82004-12-22 16:17:16 +00001277 if (blankstr)
1278 {
1279 if (gs.restart_command)
1280 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1281 if (gs.start_command)
1282 gs.start_command = translate_blanks(gs.start_command,blankstr);
1283 if (gs.stop_command)
1284 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1285 }
1286
ajs8b886ca2004-12-22 02:56:38 +00001287 gs.restart.interval = gs.min_restart_interval;
1288 master = thread_master_create();
1289 signal_init (master, Q_SIGC(my_signals), my_signals);
1290 srandom(time(NULL));
1291
1292 {
1293 int i;
1294 struct daemon *tail = NULL;
1295
1296 for (i = optind; i < argc; i++)
1297 {
1298 struct daemon *dmn;
1299
1300 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1301 {
ajs098e2402004-12-22 17:00:46 +00001302 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1303 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001304 return 1;
1305 }
1306 dmn->name = dmn->restart.name = argv[i];
1307 dmn->state = DAEMON_INIT;
1308 gs.numdaemons++;
1309 gs.numdown++;
1310 dmn->fd = -1;
1311 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1312 100+(random() % 900));
1313 dmn->restart.interval = gs.min_restart_interval;
1314 if (tail)
1315 tail->next = dmn;
1316 else
1317 gs.daemons = dmn;
1318 tail = dmn;
1319
1320 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1321 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1322 !strcmp(dmn->name,special))
1323 gs.special = dmn;
1324 }
1325 }
1326 if (!gs.daemons)
1327 {
1328 fputs("Must specify one or more daemons to monitor.\n",stderr);
1329 return usage(progname,1);
1330 }
1331 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1332 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1333 {
1334 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1335 mode_str[gs.mode],special);
1336 return usage(progname,1);
1337 }
1338 if (gs.special && (gs.numdaemons < 2))
1339 {
1340 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1341 "to watch.\n",mode_str[gs.mode]);
1342 return usage(progname,1);
1343 }
1344
1345 zlog_default = openzlog(progname, ZLOG_NONE,
1346 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1347 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1348 if (daemon_mode)
1349 {
1350 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
1351 daemon(0, 0);
1352 }
1353 else
1354 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1355
1356 /* Make sure we're not already running. */
1357 pid_output (pidfile);
1358
1359 /* Announce which daemons are being monitored. */
1360 {
1361 struct daemon *dmn;
1362 size_t len = 0;
1363
1364 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1365 len += strlen(dmn->name)+1;
1366
1367 {
1368 char buf[len+1];
1369 char *p = buf;
1370
1371 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1372 {
1373 if (p != buf)
1374 *p++ = ' ';
1375 strcpy(p,dmn->name);
1376 p += strlen(p);
1377 }
1378 zlog_notice("%s %s watching [%s], mode [%s]",
1379 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1380 }
1381 }
1382
1383 {
1384 struct thread thread;
1385
1386 while (thread_fetch (master, &thread))
1387 thread_call (&thread);
1388 }
1389
1390 /* Not reached. */
1391 return 0;
1392}