blob: b57fd3ae04844bd4b0b94dbc9d3eafaea064a2dc [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
ajs8b886ca2004-12-22 02:56:38 +00002 Monitor status of quagga daemons and restart if necessary.
3
4 Copyright (C) 2004 Andrew J. Schorr
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
ajsa3655342004-12-29 17:39:10 +000021#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000022#include <thread.h>
23#include <log.h>
ajs52e66292005-02-16 20:40:25 +000024#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000025#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000026#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000027#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000028#include <sys/un.h>
29#include <sys/wait.h>
ajs8b886ca2004-12-22 02:56:38 +000030
#ifndef MIN
/* Smaller of two values.  Each argument may be evaluated twice, so do not
   pass expressions with side effects. */
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers. */
/* JITTER(X): random offset in the range [-(X)/2, (X)-(X)/2]. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
/* FUZZY(X): X perturbed by a JITTER of one twentieth of X (about +/-2.5%). */
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; usage() reports these as the option defaults.
   The time values are in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600
/* Pid file location: a build-time PATH_WATCHQUAGGA_PID wins when defined. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif
/* Directory holding the daemons' "<name>.vty" sockets (see try_connect). */
#ifdef DAEMON_VTY_DIR
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Word sent in the echo command; handle_read expects it back in the reply. */
#define PING_TOKEN "PING"

/* Needs to be global, referenced somewhere inside libzebra. */
struct thread_master *master;
60
/* Operating modes as described in usage().  The numeric values are also
   used as indices into mode_str[]. */
typedef enum
{
  MODE_MONITOR = 0,
  MODE_GLOBAL_RESTART,
  MODE_SEPARATE_RESTART,
  MODE_PHASED_ZEBRA_RESTART,
  MODE_PHASED_ALL_RESTART
} watch_mode_t;

/* Human-readable name of each mode, indexed by watch_mode_t. */
static const char *mode_str[] =
{
  [MODE_MONITOR]              = "monitor",
  [MODE_GLOBAL_RESTART]       = "global restart",
  [MODE_SEPARATE_RESTART]     = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART]   = "phased global restart for any failure",
};
78
/* Steps of a phased restart, in the order phase_check() walks them.
   PHASE_NONE means no phased restart is in progress. */
typedef enum
{
  PHASE_NONE = 0,
  PHASE_STOPS_PENDING,
  PHASE_WAITING_DOWN,
  PHASE_ZEBRA_RESTART_PENDING,
  PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Human-readable description of each phase, indexed by restart_phase_t.
   The last entry has no matching enumerator. */
static const char *phase_str[] =
{
  [PHASE_NONE]                  = "None",
  [PHASE_STOPS_PENDING]         = "Stop jobs running",
  [PHASE_WAITING_DOWN]          = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP]      = "Waiting for zebra to come up",
  "Start jobs running",
};

/* Seconds a single phase may last before phase_hanging() aborts it. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
99
/* Bookkeeping for one background job slot (see run_job and sigchild). */
struct restart_info
{
  const char *name;       /* substituted for %s in the shell command */
  const char *what;       /* type of the job last launched (run_job's cmdtype) */
  pid_t pid;              /* pid of the running job, or 0 when none is running */
  struct timeval time;    /* when the job was launched; reset on completion */
  long interval;          /* minimum seconds between non-forced jobs */
  struct thread *t_kill;  /* timer that signals a job that runs too long */
  int kills;              /* number of signals sent to the current job so far */
};
110
/* Program-wide configuration and run-time state. */
static struct global_state
{
  watch_mode_t mode;
  restart_phase_t phase;           /* current step of a phased restart */
  struct thread *t_phase_hanging;  /* timer that aborts a phase that hangs */
  const char *vtydir;              /* directory holding the <name>.vty sockets */
  long period;                     /* wakeup period, in milliseconds */
  long timeout;                    /* echo/connect timeout, in seconds */
  long restart_timeout;            /* seconds before a running job is signalled */
  long min_restart_interval;
  long max_restart_interval;
  int do_ping;                     /* non-zero: send echo pings to daemons that are up */
  struct daemon *daemons;          /* head of the list linked through ->next */
  const char *restart_command;
  const char *start_command;
  const char *stop_command;
  struct restart_info restart;     /* job slot used in MODE_GLOBAL_RESTART */
  int unresponsive_restart;        /* non-zero: try to restart unresponsive daemons */
  int loglevel;
  struct daemon *special; /* points to zebra when doing phased restart */
  int numdaemons;                  /* presumably the number of monitored daemons;
                                      set outside this section */
  int numpids;                     /* number of background jobs still running */
  int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
} gs = {
  .mode = MODE_MONITOR,
  .phase = PHASE_NONE,
  .vtydir = VTYDIR,
  .period = 1000*DEFAULT_PERIOD,   /* seconds -> milliseconds */
  .timeout = DEFAULT_TIMEOUT,
  .restart_timeout = DEFAULT_RESTART_TIMEOUT,
  .loglevel = DEFAULT_LOGLEVEL,
  .min_restart_interval = DEFAULT_MIN_RESTART,
  .max_restart_interval = DEFAULT_MAX_RESTART,
  .do_ping = 1,
};
146
/* Connection state of a monitored daemon; also indexes state_str[]. */
typedef enum
{
  DAEMON_INIT,
  DAEMON_DOWN,
  DAEMON_CONNECTING,
  DAEMON_UP,
  DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings.
   DMN is evaluated twice. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable name of each state, indexed by daemon_state_t. */
static const char *state_str[] =
{
  [DAEMON_INIT]         = "Init",
  [DAEMON_DOWN]         = "Down",
  [DAEMON_CONNECTING]   = "Connecting",
  [DAEMON_UP]           = "Up",
  [DAEMON_UNRESPONSIVE] = "Unresponsive",
};
167
/* Per-daemon monitoring state; the daemons form a singly linked list. */
struct daemon {
  const char *name;            /* used to build the "<name>.vty" socket path */
  daemon_state_t state;
  int fd;                      /* vty socket; daemon_down() resets it to -1 */
  struct timeval echo_sent;    /* when the pending ping was written;
                                  tv_sec == 0 means no ping is pending */
  u_int connect_tries;         /* connection attempts since the daemon was last up */
  struct thread *t_wakeup;     /* pending timer event, if any */
  struct thread *t_read;       /* pending read event on fd, if any */
  struct thread *t_write;      /* pending write event used to finish a connect */
  struct daemon *next;
  struct restart_info restart; /* job slot for this daemon's start/stop/restart */
};
180
/* Long command-line options for getopt_long().  The last column is the
   value returned for the option; usage() documents each option under the
   same letter. */
static const struct option longopts[] =
{
  { "daemon", no_argument, NULL, 'd'},
  { "statedir", required_argument, NULL, 'S'},
  { "no-echo", no_argument, NULL, 'e'},
  { "loglevel", required_argument, NULL, 'l'},
  { "interval", required_argument, NULL, 'i'},
  { "timeout", required_argument, NULL, 't'},
  { "restart-timeout", required_argument, NULL, 'T'},
  { "restart", required_argument, NULL, 'r'},
  { "start-command", required_argument, NULL, 's'},
  { "kill-command", required_argument, NULL, 'k'},
  { "restart-all", required_argument, NULL, 'R'},
  { "all-restart", no_argument, NULL, 'a'},
  { "always-all-restart", no_argument, NULL, 'A'},
  { "unresponsive-restart", no_argument, NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  { "pid-file", required_argument, NULL, 'p'},
  { "blank-string", required_argument, NULL, 'b'},
  { "help", no_argument, NULL, 'h'},
  { "version", no_argument, NULL, 'v'},
  { NULL, 0, NULL, 0 }   /* terminator */
};
205
/* Forward declarations for functions that are used before they are defined. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
210
211static int
212usage(const char *progname, int status)
213{
214 if (status != 0)
215 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
216 else
217 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
218Watchdog program to monitor status of quagga daemons and try to restart\n\
219them if they are down or unresponsive. It determines whether a daemon is\n\
220up based on whether it can connect to the daemon's vty unix stream socket.\n\
221It then repeatedly sends echo commands over that socket to determine whether\n\
222the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
223on the socket connection and know immediately that the daemon is down.\n\n\
224The daemons to be monitored should be listed on the command line.\n\n\
225This program can run in one of 5 modes:\n\n\
2260. Mode: %s.\n\
227 Just monitor and report on status changes. Example:\n\
228 %s -d zebra ospfd bgpd\n\n\
2291. Mode: %s.\n\
230 Whenever any daemon hangs or crashes, use the given command to restart\n\
231 them all. Example:\n\
232 %s -dz \\\n\
233 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
234 zebra ospfd\n\n\
2352. Mode: %s.\n\
236 When any single daemon hangs or crashes, restart only the daemon that's\n\
237 in trouble using the supplied restart command. Example:\n\
238 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2393. Mode: %s.\n\
240 The same as the previous mode, except that there is special treatment when\n\
241 the zebra daemon is in trouble. In that case, a phased restart approach\n\
242 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
243 daemons. Example:\n\
244 %s -adz -r '/sbin/service %%s restart' \\\n\
245 -s '/sbin/service %%s start' \\\n\
246 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2474. Mode: %s.\n\
248 This is the same as the previous mode, except that the phased restart\n\
249 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
250 %s -Adz -r '/sbin/service %%s restart' \\\n\
251 -s '/sbin/service %%s start' \\\n\
252 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
253As of this writing, it is believed that mode 2 [%s]\n\
254is not safe, and mode 3 [%s] may not be safe with some of the\n\
255routing daemons.\n\n\
256In order to avoid attempting to restart the daemons in a fast loop,\n\
257the -m and -M options allow you to control the minimum delay between\n\
258restart commands. The minimum restart delay is recalculated each time\n\
259a restart is attempted: if the time since the last restart attempt exceeds\n\
260twice the -M value, then the restart delay is set to the -m value.\n\
261Otherwise, the interval is doubled (but capped at the -M value).\n\n\
262Options:\n\
263-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
264 to syslog instead of stdout.\n\
265-S, --statedir Set the vty socket directory (default is %s)\n\
266-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
267 option is necessary if the daemons do not support the\n\
268 echo command)\n\
269-l, --loglevel Set the logging level (default is %d).\n\
270 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
271 but it can be set higher than %d if extra-verbose debugging\n\
272 messages are desired.\n\
273-m, --min-restart-interval\n\
274 Set the minimum seconds to wait between invocations of daemon\n\
275 restart commands (default is %d).\n\
276-M, --max-restart-interval\n\
277 Set the maximum seconds to wait between invocations of daemon\n\
278 restart commands (default is %d).\n\
279-i, --interval Set the status polling interval in seconds (default is %d)\n\
280-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
281-T, --restart-timeout\n\
282 Set the restart (kill) timeout in seconds (default is %d).\n\
283 If any background jobs are still running after this much\n\
284 time has elapsed, they will be killed.\n\
285-r, --restart Supply a Bourne shell command to use to restart a single\n\
286 daemon. The command string should include '%%s' where the\n\
287 name of the daemon should be substituted.\n\
288 Note that -r and -R are incompatible.\n\
289-s, --start-command\n\
290 Supply a Bourne shell to command to use to start a single\n\
291 daemon. The command string should include '%%s' where the\n\
292 name of the daemon should be substituted.\n\
293-k, --kill-command\n\
294 Supply a Bourne shell to command to use to stop a single\n\
295 daemon. The command string should include '%%s' where the\n\
296 name of the daemon should be substituted.\n\
297-R, --restart-all\n\
298 When one or more daemons is down, try to restart everything\n\
299 using the Bourne shell command supplied as the argument.\n\
300 Note that -r and -R are incompatible.\n\
301-z, --unresponsive-restart\n\
302 When a daemon is unresponsive, treat it as being down for\n\
303 restart purposes.\n\
304-a, --all-restart\n\
305 When zebra hangs or crashes, restart all daemons using\n\
306 this phased approach: 1. stop all other daemons; 2. restart\n\
307 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
308-A, --always-all-restart\n\
309 When any daemon (not just zebra) hangs or crashes, use the\n\
310 same phased restart mechanism described above for -a.\n\
311 Requires -r, -s, and -k.\n\
312-p, --pid-file Set process identifier file name\n\
313 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000314-b, --blank-string\n\
315 When the supplied argument string is found in any of the\n\
316 various shell command arguments (-r, -s, -k, or -R), replace\n\
317 it with a space. This is an ugly hack to circumvent problems\n\
318 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000319-v, --version Print program version\n\
320-h, --help Display this help and exit\n\
321", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
322progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000323VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000324DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
325DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
326
327 return status;
328}
329
/* Run SHELL_CMD through "/bin/sh -c" in a forked child.
   Returns the child's pid, or -1 when fork() fails.  The child is not
   waited for here; sigchild() reaps it later. */
static pid_t
run_background(const char *shell_cmd)
{
  const pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      execv("/bin/sh",(char *const *)argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
359
/* Store in RESULT the time elapsed between START_TIME and now, with the
   microsecond part normalized to be non-negative.  Returns RESULT so the
   call can be used inside an expression. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  /* Sample "now" straight into the output, then turn it into a difference. */
  gettimeofday(result,NULL);
  result->tv_sec = result->tv_sec - start_time->tv_sec;
  result->tv_usec = result->tv_usec - start_time->tv_usec;

  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
373
374static int
375restart_kill(struct thread *t_kill)
376{
377 struct restart_info *restart = THREAD_ARG(t_kill);
378 struct timeval delay;
379
380 time_elapsed(&delay,&restart->time);
381 zlog_warn("Warning: %s %s child process %d still running after "
382 "%ld seconds, sending signal %d",
ajsf2d82572004-12-29 17:45:08 +0000383 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000384 (restart->kills ? SIGKILL : SIGTERM));
385 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
386 restart->kills++;
387 restart->t_kill = thread_add_timer(master,restart_kill,restart,
388 gs.restart_timeout);
389 return 0;
390}
391
392static struct restart_info *
393find_child(pid_t child)
394{
395 if (gs.mode == MODE_GLOBAL_RESTART)
396 {
397 if (gs.restart.pid == child)
398 return &gs.restart;
399 }
400 else
401 {
402 struct daemon *dmn;
403 for (dmn = gs.daemons; dmn; dmn = dmn->next)
404 {
405 if (dmn->restart.pid == child)
406 return &dmn->restart;
407 }
408 }
409 return NULL;
410}
411
412static void
413sigchild(void)
414{
415 pid_t child;
416 int status;
417 const char *name;
418 const char *what;
419 struct restart_info *restart;
420
421 switch (child = waitpid(-1,&status,WNOHANG))
422 {
423 case -1:
424 zlog_err("waitpid failed: %s",safe_strerror(errno));
425 return;
426 case 0:
427 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
428 return;
429 }
430
431 if ((restart = find_child(child)) != NULL)
432 {
433 name = restart->name;
434 what = restart->what;
435 restart->pid = 0;
436 gs.numpids--;
437 thread_cancel(restart->t_kill);
438 restart->t_kill = NULL;
439 /* Update restart time to reflect the time the command completed. */
440 gettimeofday(&restart->time,NULL);
441 }
442 else
443 {
444 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000445 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000446 name = "(unknown)";
447 what = "background";
448 }
449 if (WIFSTOPPED(status))
450 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000451 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000452 else if (WIFSIGNALED(status))
453 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000454 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000455 else if (WIFEXITED(status))
456 {
457 if (WEXITSTATUS(status) != 0)
458 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000459 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000460 else
ajsf2d82572004-12-29 17:45:08 +0000461 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000462 }
463 else
464 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000465 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000466 phase_check();
467}
468
469static int
470run_job(struct restart_info *restart, const char *cmdtype, const char *command,
471 int force, int update_interval)
472{
473 struct timeval delay;
474
475 if (gs.loglevel > LOG_DEBUG+1)
476 zlog_debug("attempting to %s %s",cmdtype,restart->name);
477
478 if (restart->pid)
479 {
480 if (gs.loglevel > LOG_DEBUG+1)
481 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000482 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000483 return -1;
484 }
485
ajsa8a8ddc2005-01-12 16:24:51 +0000486 /* Note: time_elapsed test must come before the force test, since we need
487 to make sure that delay is initialized for use below in updating the
488 restart interval. */
489 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
490 !force)
ajs8b886ca2004-12-22 02:56:38 +0000491 {
492 if (gs.loglevel > LOG_DEBUG+1)
493 zlog_debug("postponing %s %s: "
494 "elapsed time %ld < retry interval %ld",
495 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
496 return -1;
497 }
498
499 gettimeofday(&restart->time,NULL);
500 restart->kills = 0;
501 {
502 char cmd[strlen(command)+strlen(restart->name)+1];
503 snprintf(cmd,sizeof(cmd),command,restart->name);
504 if ((restart->pid = run_background(cmd)) > 0)
505 {
506 restart->t_kill = thread_add_timer(master,restart_kill,restart,
507 gs.restart_timeout);
508 restart->what = cmdtype;
509 gs.numpids++;
510 }
511 else
512 restart->pid = 0;
513 }
514
515 /* Calculate the new restart interval. */
516 if (update_interval)
517 {
518 if (delay.tv_sec > 2*gs.max_restart_interval)
519 restart->interval = gs.min_restart_interval;
520 else if ((restart->interval *= 2) > gs.max_restart_interval)
521 restart->interval = gs.max_restart_interval;
522 if (gs.loglevel > LOG_DEBUG+1)
523 zlog_debug("restart %s interval is now %ld",
524 restart->name,restart->interval);
525 }
526 return restart->pid;
527}
528
/* Helpers that (re)arm a daemon's event handlers.  Each one stores the new
   thread in the matching struct daemon field; the wakeup timers fire after
   roughly gs.period milliseconds, randomized by FUZZY. */

/* Call handle_read when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Run wakeup_down (retry the connection) after one period. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

/* Run wakeup_unresponsive (retry the restart) after one period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Run wakeup_send_echo (send the next ping) after one period. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
543
544static int
545wakeup_down(struct thread *t_wakeup)
546{
547 struct daemon *dmn = THREAD_ARG(t_wakeup);
548
549 dmn->t_wakeup = NULL;
550 if (try_connect(dmn) < 0)
551 SET_WAKEUP_DOWN(dmn);
552 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
553 try_restart(dmn);
554 return 0;
555}
556
557static int
558wakeup_init(struct thread *t_wakeup)
559{
560 struct daemon *dmn = THREAD_ARG(t_wakeup);
561
562 dmn->t_wakeup = NULL;
563 if (try_connect(dmn) < 0)
564 {
565 SET_WAKEUP_DOWN(dmn);
566 zlog_err("%s state -> down : initial connection attempt failed",
567 dmn->name);
568 dmn->state = DAEMON_DOWN;
569 }
570 return 0;
571}
572
573static void
574daemon_down(struct daemon *dmn, const char *why)
575{
576 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
577 zlog_err("%s state -> down : %s",dmn->name,why);
578 else if (gs.loglevel > LOG_DEBUG)
579 zlog_debug("%s still down : %s",dmn->name,why);
580 if (IS_UP(dmn))
581 gs.numdown++;
582 dmn->state = DAEMON_DOWN;
583 if (dmn->fd >= 0)
584 {
585 close(dmn->fd);
586 dmn->fd = -1;
587 }
588 THREAD_OFF(dmn->t_read);
589 THREAD_OFF(dmn->t_write);
590 THREAD_OFF(dmn->t_wakeup);
591 if (try_connect(dmn) < 0)
592 SET_WAKEUP_DOWN(dmn);
593 phase_check();
594}
595
596static int
597handle_read(struct thread *t_read)
598{
599 struct daemon *dmn = THREAD_ARG(t_read);
600 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
601 char buf[sizeof(resp)+100];
602 ssize_t rc;
603 struct timeval delay;
604
605 dmn->t_read = NULL;
606 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
607 {
608 char why[100];
609
ajs518cde82005-02-17 20:11:58 +0000610 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000611 {
612 /* Pretend it never happened. */
613 SET_READ_HANDLER(dmn);
614 return 0;
615 }
616 snprintf(why,sizeof(why),"unexpected read error: %s",
617 safe_strerror(errno));
618 daemon_down(dmn,why);
619 return 0;
620 }
621 if (rc == 0)
622 {
623 daemon_down(dmn,"read returned EOF");
624 return 0;
625 }
626 if (!dmn->echo_sent.tv_sec)
627 {
628 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000629 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
630 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000631 daemon_down(dmn,why);
632 return 0;
633 }
634
635 /* We are expecting an echo response: is there any chance that the
636 response would not be returned entirely in the first read? That
637 seems inconceivable... */
638 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
639 {
640 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000641 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
642 "(expecting %u): %.*s",
643 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000644 daemon_down(dmn,why);
645 return 0;
646 }
647
648 time_elapsed(&delay,&dmn->echo_sent);
649 dmn->echo_sent.tv_sec = 0;
650 if (dmn->state == DAEMON_UNRESPONSIVE)
651 {
652 if (delay.tv_sec < gs.timeout)
653 {
654 dmn->state = DAEMON_UP;
655 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
656 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
657 }
658 else
659 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
660 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
661 }
662 else if (gs.loglevel > LOG_DEBUG+1)
663 zlog_debug("%s: echo response received after %ld.%06ld seconds",
664 dmn->name,delay.tv_sec,delay.tv_usec);
665
666 SET_READ_HANDLER(dmn);
667 if (dmn->t_wakeup)
668 thread_cancel(dmn->t_wakeup);
669 SET_WAKEUP_ECHO(dmn);
670
671 return 0;
672}
673
674static void
675daemon_up(struct daemon *dmn, const char *why)
676{
677 dmn->state = DAEMON_UP;
678 gs.numdown--;
679 dmn->connect_tries = 0;
680 zlog_notice("%s state -> up : %s",dmn->name,why);
681 if (gs.do_ping)
682 SET_WAKEUP_ECHO(dmn);
683 phase_check();
684}
685
686static int
687check_connect(struct thread *t_write)
688{
689 struct daemon *dmn = THREAD_ARG(t_write);
690 int sockerr;
691 socklen_t reslen = sizeof(sockerr);
692
693 dmn->t_write = NULL;
694 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
695 {
696 zlog_warn("%s: check_connect: getsockopt failed: %s",
697 dmn->name,safe_strerror(errno));
698 daemon_down(dmn,"getsockopt failed checking connection success");
699 return 0;
700 }
701 if ((reslen == sizeof(sockerr)) && sockerr)
702 {
703 char why[100];
704 snprintf(why,sizeof(why),
705 "getsockopt reports that connection attempt failed: %s",
706 safe_strerror(sockerr));
707 daemon_down(dmn,why);
708 return 0;
709 }
710
711 daemon_up(dmn,"delayed connect succeeded");
712 return 0;
713}
714
715static int
716wakeup_connect_hanging(struct thread *t_wakeup)
717{
718 struct daemon *dmn = THREAD_ARG(t_wakeup);
719 char why[100];
720
721 dmn->t_wakeup = NULL;
722 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
723 gs.timeout);
724 daemon_down(dmn,why);
725 return 0;
726}
727
728/* Making connection to protocol daemon. */
729static int
730try_connect(struct daemon *dmn)
731{
732 int sock;
733 struct sockaddr_un addr;
734 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000735
736 if (gs.loglevel > LOG_DEBUG+1)
737 zlog_debug("%s: attempting to connect",dmn->name);
738 dmn->connect_tries++;
739
740 memset (&addr, 0, sizeof (struct sockaddr_un));
741 addr.sun_family = AF_UNIX;
742 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
743 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000744#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000745 len = addr.sun_len = SUN_LEN(&addr);
746#else
747 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000748#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000749
750 /* Quick check to see if we might succeed before we go to the trouble
751 of creating a socket. */
752 if (access(addr.sun_path, W_OK) < 0)
753 {
754 if (errno != ENOENT)
755 zlog_err("%s: access to socket %s denied: %s",
756 dmn->name,addr.sun_path,safe_strerror(errno));
757 return -1;
758 }
759
760 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
761 {
762 zlog_err("%s(%s): cannot make socket: %s",
763 __func__,addr.sun_path, safe_strerror(errno));
764 return -1;
765 }
766
ajs52e66292005-02-16 20:40:25 +0000767 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000768 {
ajs52e66292005-02-16 20:40:25 +0000769 zlog_err("%s(%s): set_nonblocking(%d) failed",
770 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000771 close(sock);
772 return -1;
773 }
774
775 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
776 {
777 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
778 {
779 if (gs.loglevel > LOG_DEBUG)
780 zlog_debug("%s(%s): connect failed: %s",
781 __func__,addr.sun_path, safe_strerror(errno));
782 close (sock);
783 return -1;
784 }
785 if (gs.loglevel > LOG_DEBUG)
786 zlog_debug("%s: connection in progress",dmn->name);
787 dmn->state = DAEMON_CONNECTING;
788 dmn->fd = sock;
789 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
790 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
791 gs.timeout);
792 SET_READ_HANDLER(dmn);
793 return 0;
794 }
795
796 dmn->fd = sock;
797 SET_READ_HANDLER(dmn);
798 daemon_up(dmn,"connect succeeded");
799 return 1;
800}
801
802static int
803phase_hanging(struct thread *t_hanging)
804{
805 gs.t_phase_hanging = NULL;
806 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
807 phase_str[gs.phase],PHASE_TIMEOUT);
808 gs.phase = PHASE_NONE;
809 return 0;
810}
811
812static void
813set_phase(restart_phase_t new_phase)
814{
815 gs.phase = new_phase;
816 if (gs.t_phase_hanging)
817 thread_cancel(gs.t_phase_hanging);
818 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
819 PHASE_TIMEOUT);
820}
821
/* Advance the phased restart as far as current conditions allow.
   Each case checks whether its phase has completed; if so it moves to the
   next phase and falls through to check that one too, otherwise it stops.
   Called whenever a child process is reaped or a daemon changes state. */
static void
phase_check(void)
{
  switch (gs.phase)
    {
    case PHASE_NONE:
      break;
    case PHASE_STOPS_PENDING:
      /* Wait until every background job has been reaped. */
      if (gs.numpids)
        break;
      zlog_info("Phased restart: all routing daemon stop jobs have completed.");
      set_phase(PHASE_WAITING_DOWN);
      /*FALLTHRU*/
    case PHASE_WAITING_DOWN:
      /* The special daemon is allowed to still be up; all others must be
         down. */
      if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
        break;
      zlog_info("Phased restart: all routing daemons now down.");
      run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
      set_phase(PHASE_ZEBRA_RESTART_PENDING);
      /*FALLTHRU*/
    case PHASE_ZEBRA_RESTART_PENDING:
      /* Wait for the restart job itself to finish. */
      if (gs.special->restart.pid)
        break;
      zlog_info("Phased restart: %s restart job completed.",gs.special->name);
      set_phase(PHASE_WAITING_ZEBRA_UP);
      /*FALLTHRU*/
    case PHASE_WAITING_ZEBRA_UP:
      if (!IS_UP(gs.special))
        break;
      zlog_info("Phased restart: %s is now up.",gs.special->name);
      {
        /* Final step: start every other daemon again. */
        struct daemon *dmn;
        for (dmn = gs.daemons; dmn; dmn = dmn->next)
          {
            if (dmn != gs.special)
              run_job(&dmn->restart,"start",gs.start_command,1,0);
          }
      }
      gs.phase = PHASE_NONE;
      THREAD_OFF(gs.t_phase_hanging);
      zlog_notice("Phased global restart has completed.");
      break;
    }
}
866
/* React to DMN being down or unresponsive, according to gs.mode:
   do nothing, run the global restart command, restart only DMN, or begin
   a phased restart by stopping every daemon other than gs.special. */
static void
try_restart(struct daemon *dmn)
{
  switch (gs.mode)
    {
    case MODE_MONITOR:
      return;
    case MODE_GLOBAL_RESTART:
      run_job(&gs.restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_SEPARATE_RESTART:
      run_job(&dmn->restart,"restart",gs.restart_command,0,1);
      break;
    case MODE_PHASED_ZEBRA_RESTART:
      /* An ordinary daemon is restarted on its own, but only while the
         special daemon is up and no phased restart is running.  Trouble
         with the special daemon itself falls through to the phased
         restart below. */
      if (dmn != gs.special)
        {
          if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
            run_job(&dmn->restart,"restart",gs.restart_command,0,1);
          else
            zlog_debug("%s: postponing restart attempt because master %s daemon "
                       "not up [%s], or phased restart in progress",
                       dmn->name,gs.special->name,state_str[gs.special->state]);
          break;
        }
      /*FALLTHRU*/
    case MODE_PHASED_ALL_RESTART:
      if ((gs.phase != PHASE_NONE) || gs.numpids)
        {
          if (gs.loglevel > LOG_DEBUG+1)
            zlog_debug("postponing phased global restart: restart already in "
                       "progress [%s], or outstanding child processes [%d]",
                       phase_str[gs.phase],gs.numpids);
          break;
        }
      /* Is it too soon for a restart? */
      {
        struct timeval delay;
        if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
            gs.special->restart.interval)
          {
            if (gs.loglevel > LOG_DEBUG+1)
              zlog_debug("postponing phased global restart: "
                         "elapsed time %ld < retry interval %ld",
                         (long)delay.tv_sec,gs.special->restart.interval);
            break;
          }
      }
      zlog_info("Phased restart: stopping all routing daemons.");
      /* First step: stop all other daemons. */
      for (dmn = gs.daemons; dmn; dmn = dmn->next)
        {
          if (dmn != gs.special)
            run_job(&dmn->restart,"stop",gs.stop_command,1,1);
        }
      set_phase(PHASE_STOPS_PENDING);
      break;
    default:
      zlog_err("error: unknown restart mode %d",gs.mode);
      break;
    }
}
928
929static int
930wakeup_unresponsive(struct thread *t_wakeup)
931{
932 struct daemon *dmn = THREAD_ARG(t_wakeup);
933
934 dmn->t_wakeup = NULL;
935 if (dmn->state != DAEMON_UNRESPONSIVE)
936 zlog_err("%s: no longer unresponsive (now %s), "
937 "wakeup should have been cancelled!",
938 dmn->name,state_str[dmn->state]);
939 else
940 {
941 SET_WAKEUP_UNRESPONSIVE(dmn);
942 try_restart(dmn);
943 }
944 return 0;
945}
946
947static int
948wakeup_no_answer(struct thread *t_wakeup)
949{
950 struct daemon *dmn = THREAD_ARG(t_wakeup);
951
952 dmn->t_wakeup = NULL;
953 dmn->state = DAEMON_UNRESPONSIVE;
954 zlog_err("%s state -> unresponsive : no response yet to ping "
955 "sent %ld seconds ago",dmn->name,gs.timeout);
956 if (gs.unresponsive_restart)
957 {
958 SET_WAKEUP_UNRESPONSIVE(dmn);
959 try_restart(dmn);
960 }
961 return 0;
962}
963
964static int
965wakeup_send_echo(struct thread *t_wakeup)
966{
967 static const char echocmd[] = "echo " PING_TOKEN;
968 ssize_t rc;
969 struct daemon *dmn = THREAD_ARG(t_wakeup);
970
971 dmn->t_wakeup = NULL;
972 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
973 ((size_t)rc != sizeof(echocmd)))
974 {
975 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000976 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
977 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000978 daemon_down(dmn,why);
979 }
980 else
981 {
982 gettimeofday(&dmn->echo_sent,NULL);
983 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
984 }
985 return 0;
986}
987
/* Termination handler, registered for both SIGINT and SIGTERM in main's
   my_signals table: logs a notice and exits with status 0. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
994
/* Check that a command template contains exactly one '%' character and
   that it is immediately followed by 's' (i.e. a single "%s" placeholder).
   Returns 1 if the template is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  /* No conversion specifier at all. */
  if (pct == NULL)
    return 0;

  /* The first '%' must introduce "%s". */
  if (pct[1] != 's')
    return 0;

  /* Any further '%' makes the template invalid. */
  return strchr(pct+1,'%') == NULL;
}
1002
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space.  Occurrences are matched
   left to right and the inserted space is never rescanned, so the loop
   terminates even if blankstr itself contains a space.  An empty blankstr
   matches nothing and yields an unmodified copy.  The caller owns the
   returned string.  On allocation failure an error is printed and the
   process exits with status 1. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t cmdlen = strlen(cmd);
  size_t bslen = strlen(blankstr);
  char *res;
  char *scan;
  char *p;

  if (!(res = malloc(cmdlen+1)))
    {
      perror("malloc");
      exit(1);
    }
  memcpy(res,cmd,cmdlen+1);

  /* strstr() matches an empty needle at every position; treating it as a
     token would insert blanks forever and run off the end of the buffer. */
  if (bslen == 0)
    return res;

  scan = res;
  while ((p = strstr(scan,blankstr)) != NULL)
    {
      *p = ' ';
      /* Close the gap left by the remainder of the token (NUL included). */
      if (bslen != 1)
        memmove(p+1,p+bslen,strlen(p+bslen)+1);
      /* Resume after the blank just written so it is never matched again. */
      scan = p+1;
    }
  return res;
}
1025
ajs8b886ca2004-12-22 02:56:38 +00001026int
1027main(int argc, char **argv)
1028{
1029 const char *progname;
1030 int opt;
1031 int daemon_mode = 0;
1032 const char *pidfile = DEFAULT_PIDFILE;
1033 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001034 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001035 static struct quagga_signal_t my_signals[] =
1036 {
1037 {
1038 .signal = SIGINT,
1039 .handler = sigint,
1040 },
1041 {
1042 .signal = SIGTERM,
1043 .handler = sigint,
1044 },
1045 {
1046 .signal = SIGCHLD,
1047 .handler = sigchild,
1048 },
1049 };
1050
1051 if ((progname = strrchr (argv[0], '/')) != NULL)
1052 progname++;
1053 else
1054 progname = argv[0];
1055
ajs098e2402004-12-22 17:00:46 +00001056 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001057 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001058 longopts, 0)) != EOF)
1059 {
1060 switch (opt)
1061 {
1062 case 0:
1063 break;
1064 case 'a':
1065 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1066 {
1067 fputs("Ambiguous operating mode selected.\n",stderr);
1068 return usage(progname,1);
1069 }
1070 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1071 break;
1072 case 'A':
1073 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1074 {
1075 fputs("Ambiguous operating mode selected.\n",stderr);
1076 return usage(progname,1);
1077 }
1078 gs.mode = MODE_PHASED_ALL_RESTART;
1079 break;
ajsc8b40f82004-12-22 16:17:16 +00001080 case 'b':
1081 blankstr = optarg;
1082 break;
ajs8b886ca2004-12-22 02:56:38 +00001083 case 'd':
1084 daemon_mode = 1;
1085 break;
1086 case 'e':
1087 gs.do_ping = 0;
1088 break;
1089 case 'k':
1090 if (!valid_command(optarg))
1091 {
1092 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1093 optarg);
1094 return usage(progname,1);
1095 }
1096 gs.stop_command = optarg;
1097 break;
1098 case 'l':
1099 {
1100 char garbage[3];
1101 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1102 (gs.loglevel < LOG_EMERG))
1103 {
1104 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1105 return usage(progname,1);
1106 }
1107 }
1108 break;
1109 case 'm':
1110 {
1111 char garbage[3];
1112 if ((sscanf(optarg,"%ld%1s",
1113 &gs.min_restart_interval,garbage) != 1) ||
1114 (gs.min_restart_interval < 0))
1115 {
1116 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1117 optarg);
1118 return usage(progname,1);
1119 }
1120 }
1121 break;
1122 case 'M':
1123 {
1124 char garbage[3];
1125 if ((sscanf(optarg,"%ld%1s",
1126 &gs.max_restart_interval,garbage) != 1) ||
1127 (gs.max_restart_interval < 0))
1128 {
1129 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1130 optarg);
1131 return usage(progname,1);
1132 }
1133 }
1134 break;
1135 case 'i':
1136 {
1137 char garbage[3];
1138 int period;
1139 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1140 (gs.period < 1))
1141 {
1142 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1143 return usage(progname,1);
1144 }
1145 gs.period = 1000*period;
1146 }
1147 break;
1148 case 'p':
1149 pidfile = optarg;
1150 break;
1151 case 'r':
1152 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1153 (gs.mode == MODE_SEPARATE_RESTART))
1154 {
1155 fputs("Ambiguous operating mode selected.\n",stderr);
1156 return usage(progname,1);
1157 }
1158 if (!valid_command(optarg))
1159 {
1160 fprintf(stderr,
1161 "Invalid restart command, must contain '%%s': %s\n",
1162 optarg);
1163 return usage(progname,1);
1164 }
1165 gs.restart_command = optarg;
1166 if (gs.mode == MODE_MONITOR)
1167 gs.mode = MODE_SEPARATE_RESTART;
1168 break;
1169 case 'R':
1170 if (gs.mode != MODE_MONITOR)
1171 {
1172 fputs("Ambiguous operating mode selected.\n",stderr);
1173 return usage(progname,1);
1174 }
1175 if (strchr(optarg,'%'))
1176 {
1177 fprintf(stderr,
1178 "Invalid restart-all arg, must not contain '%%s': %s\n",
1179 optarg);
1180 return usage(progname,1);
1181 }
1182 gs.restart_command = optarg;
1183 gs.mode = MODE_GLOBAL_RESTART;
1184 break;
1185 case 's':
1186 if (!valid_command(optarg))
1187 {
1188 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1189 optarg);
1190 return usage(progname,1);
1191 }
1192 gs.start_command = optarg;
1193 break;
1194 case 'S':
1195 gs.vtydir = optarg;
1196 break;
1197 case 't':
1198 {
1199 char garbage[3];
1200 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1201 (gs.timeout < 1))
1202 {
1203 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1204 return usage(progname,1);
1205 }
1206 }
1207 break;
1208 case 'T':
1209 {
1210 char garbage[3];
1211 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1212 (gs.restart_timeout < 1))
1213 {
1214 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1215 return usage(progname,1);
1216 }
1217 }
1218 break;
1219 case 'z':
1220 gs.unresponsive_restart = 1;
1221 break;
1222 case 'v':
1223 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1224 puts("Copyright 2004 Andrew J. Schorr");
1225 return 0;
1226 case 'h':
1227 return usage(progname,0);
1228 default:
1229 fputs("Invalid option.\n",stderr);
1230 return usage(progname,1);
1231 }
1232 }
1233
1234 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1235 {
1236 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1237 return usage(progname,1);
1238 }
1239 switch (gs.mode)
1240 {
1241 case MODE_MONITOR:
1242 if (gs.restart_command || gs.start_command || gs.stop_command)
1243 {
1244 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1245 mode_str[gs.mode]);
1246 return usage(progname,1);
1247 }
1248 break;
1249 case MODE_GLOBAL_RESTART:
1250 case MODE_SEPARATE_RESTART:
1251 if (!gs.restart_command || gs.start_command || gs.stop_command)
1252 {
1253 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1254 mode_str[gs.mode]);
1255 return usage(progname,1);
1256 }
1257 break;
1258 case MODE_PHASED_ZEBRA_RESTART:
1259 case MODE_PHASED_ALL_RESTART:
1260 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1261 {
1262 fprintf(stderr,
1263 "Need start, kill, and restart commands in [%s] mode.\n",
1264 mode_str[gs.mode]);
1265 return usage(progname,1);
1266 }
1267 break;
1268 }
1269
ajsc8b40f82004-12-22 16:17:16 +00001270 if (blankstr)
1271 {
1272 if (gs.restart_command)
1273 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1274 if (gs.start_command)
1275 gs.start_command = translate_blanks(gs.start_command,blankstr);
1276 if (gs.stop_command)
1277 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1278 }
1279
ajs8b886ca2004-12-22 02:56:38 +00001280 gs.restart.interval = gs.min_restart_interval;
1281 master = thread_master_create();
1282 signal_init (master, Q_SIGC(my_signals), my_signals);
1283 srandom(time(NULL));
1284
1285 {
1286 int i;
1287 struct daemon *tail = NULL;
1288
1289 for (i = optind; i < argc; i++)
1290 {
1291 struct daemon *dmn;
1292
1293 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1294 {
ajs098e2402004-12-22 17:00:46 +00001295 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1296 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001297 return 1;
1298 }
1299 dmn->name = dmn->restart.name = argv[i];
1300 dmn->state = DAEMON_INIT;
1301 gs.numdaemons++;
1302 gs.numdown++;
1303 dmn->fd = -1;
1304 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1305 100+(random() % 900));
1306 dmn->restart.interval = gs.min_restart_interval;
1307 if (tail)
1308 tail->next = dmn;
1309 else
1310 gs.daemons = dmn;
1311 tail = dmn;
1312
1313 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1314 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1315 !strcmp(dmn->name,special))
1316 gs.special = dmn;
1317 }
1318 }
1319 if (!gs.daemons)
1320 {
1321 fputs("Must specify one or more daemons to monitor.\n",stderr);
1322 return usage(progname,1);
1323 }
1324 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1325 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1326 {
1327 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1328 mode_str[gs.mode],special);
1329 return usage(progname,1);
1330 }
1331 if (gs.special && (gs.numdaemons < 2))
1332 {
1333 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1334 "to watch.\n",mode_str[gs.mode]);
1335 return usage(progname,1);
1336 }
1337
1338 zlog_default = openzlog(progname, ZLOG_NONE,
1339 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1340 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1341 if (daemon_mode)
1342 {
1343 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001344 if (daemon (0, 0) < 0)
1345 {
1346 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1347 exit (1);
1348 }
ajs8b886ca2004-12-22 02:56:38 +00001349 }
1350 else
1351 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1352
1353 /* Make sure we're not already running. */
1354 pid_output (pidfile);
1355
1356 /* Announce which daemons are being monitored. */
1357 {
1358 struct daemon *dmn;
1359 size_t len = 0;
1360
1361 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1362 len += strlen(dmn->name)+1;
1363
1364 {
1365 char buf[len+1];
1366 char *p = buf;
1367
1368 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1369 {
1370 if (p != buf)
1371 *p++ = ' ';
1372 strcpy(p,dmn->name);
1373 p += strlen(p);
1374 }
1375 zlog_notice("%s %s watching [%s], mode [%s]",
1376 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1377 }
1378 }
1379
1380 {
1381 struct thread thread;
1382
1383 while (thread_fetch (master, &thread))
1384 thread_call (&thread);
1385 }
1386
1387 /* Not reached. */
1388 return 0;
1389}