blob: fb628acca31640a1df139fe8d8aff9e6bd2778f4 [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
Paul Jakma6f0e3f62007-05-10 02:38:51 +00002 $Id$
ajs8b886ca2004-12-22 02:56:38 +00003
4 Monitor status of quagga daemons and restart if necessary.
5
6 Copyright (C) 2004 Andrew J. Schorr
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
ajsa3655342004-12-29 17:39:10 +000023#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000024#include <thread.h>
25#include <log.h>
ajs52e66292005-02-16 20:40:25 +000026#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000027#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000028#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000029#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000030#include <sys/un.h>
31#include <sys/wait.h>
ajs8b886ca2004-12-22 02:56:38 +000032
33#ifndef MIN
34#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
35#endif
36
37/* Macros to help randomize timers. */
38#define JITTER(X) ((random() % ((X)+1))-((X)/2))
39#define FUZZY(X) ((X)+JITTER((X)/20))
40
41#define DEFAULT_PERIOD 5
42#define DEFAULT_TIMEOUT 10
43#define DEFAULT_RESTART_TIMEOUT 20
44#define DEFAULT_LOGLEVEL LOG_INFO
45#define DEFAULT_MIN_RESTART 60
46#define DEFAULT_MAX_RESTART 600
ajs6028df52004-12-22 14:08:13 +000047#ifdef PATH_WATCHQUAGGA_PID
48#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
49#else
ajs8b886ca2004-12-22 02:56:38 +000050#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
ajs6028df52004-12-22 14:08:13 +000051#endif
ajs16f65112004-12-22 15:37:44 +000052#ifdef DAEMON_VTY_DIR
53#define VTYDIR DAEMON_VTY_DIR
54#else
55#define VTYDIR STATEDIR
56#endif
ajs8b886ca2004-12-22 02:56:38 +000057
58#define PING_TOKEN "PING"
59
60/* Needs to be global, referenced somewhere inside libzebra. */
61struct thread_master *master;
62
/* Operating mode of the watchdog.  The numeric values match the mode
   numbers printed by usage() and the order of the mode_str[] table. */
typedef enum
{
  MODE_MONITOR = 0,             /* only monitor and report status changes */
  MODE_GLOBAL_RESTART,          /* one command restarts everything */
  MODE_SEPARATE_RESTART,        /* restart just the daemon in trouble */
  MODE_PHASED_ZEBRA_RESTART,    /* phased restart when the special daemon fails */
  MODE_PHASED_ALL_RESTART       /* phased restart when any daemon fails */
} watch_mode_t;
71
/* Human-readable mode names, listed in the same order as the
   watch_mode_t enumerators; usage() prints them as modes 0 through 4. */
static const char *mode_str[] =
{
  "monitor",                                    /* 0 */
  "global restart",                             /* 1 */
  "individual daemon restart",                  /* 2 */
  "phased zebra restart",                       /* 3 */
  "phased global restart for any failure",      /* 4 */
};
80
/* Steps of a phased restart, in the order phase_check() walks them. */
typedef enum
{
  PHASE_NONE = 0,               /* no phased restart in progress */
  PHASE_STOPS_PENDING,          /* stop jobs have been launched */
  PHASE_WAITING_DOWN,           /* waiting for the other daemons to go down */
  PHASE_ZEBRA_RESTART_PENDING,  /* restart job for the special daemon running */
  PHASE_WAITING_ZEBRA_UP        /* waiting for the special daemon to come up */
} restart_phase_t;
89
/* Descriptions of the restart phases, indexed by restart_phase_t in log
   messages.  The table holds one more entry than the enum has values;
   the last string has no matching enumerator. */
static const char *phase_str[] =
{
  "None",                                       /* PHASE_NONE */
  "Stop jobs running",                          /* PHASE_STOPS_PENDING */
  "Waiting for other daemons to come down",     /* PHASE_WAITING_DOWN */
  "Zebra restart job running",                  /* PHASE_ZEBRA_RESTART_PENDING */
  "Waiting for zebra to come up",               /* PHASE_WAITING_ZEBRA_UP */
  "Start jobs running",
};
99
100#define PHASE_TIMEOUT (3*gs.restart_timeout)
101
/* Bookkeeping for one background shell job (restart, start or stop). */
struct restart_info
{
  const char *name;             /* name substituted into the command string */
  const char *what;             /* job type of the last command started */
  pid_t pid;                    /* pid of the running job, 0 when none */
  struct timeval time;          /* when the job was started, later when it ended */
  long interval;                /* minimum seconds between job launches */
  struct thread *t_kill;        /* timer that signals a job that runs too long */
  int kills;                    /* signals sent so far to the current job */
};
112
113static struct global_state
114{
ajs8b886ca2004-12-22 02:56:38 +0000115 watch_mode_t mode;
116 restart_phase_t phase;
117 struct thread *t_phase_hanging;
118 const char *vtydir;
119 long period;
120 long timeout;
121 long restart_timeout;
122 long min_restart_interval;
123 long max_restart_interval;
124 int do_ping;
125 struct daemon *daemons;
126 const char *restart_command;
127 const char *start_command;
128 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000129 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000130 int unresponsive_restart;
131 int loglevel;
132 struct daemon *special; /* points to zebra when doing phased restart */
133 int numdaemons;
134 int numpids;
135 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
136} gs = {
137 .mode = MODE_MONITOR,
138 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000139 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000140 .period = 1000*DEFAULT_PERIOD,
141 .timeout = DEFAULT_TIMEOUT,
142 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
143 .loglevel = DEFAULT_LOGLEVEL,
144 .min_restart_interval = DEFAULT_MIN_RESTART,
145 .max_restart_interval = DEFAULT_MAX_RESTART,
146 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000147};
148
/* Connection state of a monitored daemon; state_str[] lists the names
   in the same order. */
typedef enum
{
  DAEMON_INIT,          /* initial state, before the first connect result */
  DAEMON_DOWN,          /* no usable connection */
  DAEMON_CONNECTING,    /* non-blocking connect still in progress */
  DAEMON_UP,            /* connected */
  DAEMON_UNRESPONSIVE   /* connected, but an echo went unanswered in time */
} daemon_state_t;
157
/* True when the daemon holds a connection: either fully up, or up but
   not answering echo requests. */
#define IS_UP(DMN)                                                          \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
160
/* Printable names for daemon_state_t values, in enumerator order. */
static const char *state_str[] =
{
  "Init",               /* DAEMON_INIT */
  "Down",               /* DAEMON_DOWN */
  "Connecting",         /* DAEMON_CONNECTING */
  "Up",                 /* DAEMON_UP */
  "Unresponsive",       /* DAEMON_UNRESPONSIVE */
};
169
170struct daemon {
171 const char *name;
172 daemon_state_t state;
173 int fd;
174 struct timeval echo_sent;
175 u_int connect_tries;
176 struct thread *t_wakeup;
177 struct thread *t_read;
178 struct thread *t_write;
179 struct daemon *next;
180 struct restart_info restart;
181};
182
/* Long command-line options for getopt_long(); each entry maps to the
   short option letter given in its last field. */
static const struct option longopts[] =
{
  { "daemon",               no_argument,       NULL, 'd'},
  { "statedir",             required_argument, NULL, 'S'},
  { "no-echo",              no_argument,       NULL, 'e'},
  { "loglevel",             required_argument, NULL, 'l'},
  { "interval",             required_argument, NULL, 'i'},
  { "timeout",              required_argument, NULL, 't'},
  { "restart-timeout",      required_argument, NULL, 'T'},
  { "restart",              required_argument, NULL, 'r'},
  { "start-command",        required_argument, NULL, 's'},
  { "kill-command",         required_argument, NULL, 'k'},
  { "restart-all",          required_argument, NULL, 'R'},
  { "all-restart",          no_argument,       NULL, 'a'},
  { "always-all-restart",   no_argument,       NULL, 'A'},
  { "unresponsive-restart", no_argument,       NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  { "pid-file",             required_argument, NULL, 'p'},
  { "blank-string",         required_argument, NULL, 'b'},
  { "help",                 no_argument,       NULL, 'h'},
  { "version",              no_argument,       NULL, 'v'},
  { NULL, 0, NULL, 0 }          /* end-of-table marker */
};
207
208static int try_connect(struct daemon *dmn);
209static int wakeup_send_echo(struct thread *t_wakeup);
210static void try_restart(struct daemon *dmn);
211static void phase_check(void);
212
213static int
214usage(const char *progname, int status)
215{
216 if (status != 0)
217 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
218 else
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
220Watchdog program to monitor status of quagga daemons and try to restart\n\
221them if they are down or unresponsive. It determines whether a daemon is\n\
222up based on whether it can connect to the daemon's vty unix stream socket.\n\
223It then repeatedly sends echo commands over that socket to determine whether\n\
224the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225on the socket connection and know immediately that the daemon is down.\n\n\
226The daemons to be monitored should be listed on the command line.\n\n\
227This program can run in one of 5 modes:\n\n\
2280. Mode: %s.\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
2311. Mode: %s.\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
234 %s -dz \\\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
236 zebra ospfd\n\n\
2372. Mode: %s.\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2413. Mode: %s.\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
245 daemons. Example:\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2494. Mode: %s.\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255As of this writing, it is believed that mode 2 [%s]\n\
256is not safe, and mode 3 [%s] may not be safe with some of the\n\
257routing daemons.\n\n\
258In order to avoid attempting to restart the daemons in a fast loop,\n\
259the -m and -M options allow you to control the minimum delay between\n\
260restart commands. The minimum restart delay is recalculated each time\n\
261a restart is attempted: if the time since the last restart attempt exceeds\n\
262twice the -M value, then the restart delay is set to the -m value.\n\
263Otherwise, the interval is doubled (but capped at the -M value).\n\n\
264Options:\n\
265-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
266 to syslog instead of stdout.\n\
267-S, --statedir Set the vty socket directory (default is %s)\n\
268-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
269 option is necessary if the daemons do not support the\n\
270 echo command)\n\
271-l, --loglevel Set the logging level (default is %d).\n\
272 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
273 but it can be set higher than %d if extra-verbose debugging\n\
274 messages are desired.\n\
275-m, --min-restart-interval\n\
276 Set the minimum seconds to wait between invocations of daemon\n\
277 restart commands (default is %d).\n\
278-M, --max-restart-interval\n\
279 Set the maximum seconds to wait between invocations of daemon\n\
280 restart commands (default is %d).\n\
281-i, --interval Set the status polling interval in seconds (default is %d)\n\
282-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
283-T, --restart-timeout\n\
284 Set the restart (kill) timeout in seconds (default is %d).\n\
285 If any background jobs are still running after this much\n\
286 time has elapsed, they will be killed.\n\
287-r, --restart Supply a Bourne shell command to use to restart a single\n\
288 daemon. The command string should include '%%s' where the\n\
289 name of the daemon should be substituted.\n\
290 Note that -r and -R are incompatible.\n\
291-s, --start-command\n\
292 Supply a Bourne shell to command to use to start a single\n\
293 daemon. The command string should include '%%s' where the\n\
294 name of the daemon should be substituted.\n\
295-k, --kill-command\n\
296 Supply a Bourne shell to command to use to stop a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
299-R, --restart-all\n\
300 When one or more daemons is down, try to restart everything\n\
301 using the Bourne shell command supplied as the argument.\n\
302 Note that -r and -R are incompatible.\n\
303-z, --unresponsive-restart\n\
304 When a daemon is unresponsive, treat it as being down for\n\
305 restart purposes.\n\
306-a, --all-restart\n\
307 When zebra hangs or crashes, restart all daemons using\n\
308 this phased approach: 1. stop all other daemons; 2. restart\n\
309 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
310-A, --always-all-restart\n\
311 When any daemon (not just zebra) hangs or crashes, use the\n\
312 same phased restart mechanism described above for -a.\n\
313 Requires -r, -s, and -k.\n\
314-p, --pid-file Set process identifier file name\n\
315 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000316-b, --blank-string\n\
317 When the supplied argument string is found in any of the\n\
318 various shell command arguments (-r, -s, -k, or -R), replace\n\
319 it with a space. This is an ugly hack to circumvent problems\n\
320 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000321-v, --version Print program version\n\
322-h, --help Display this help and exit\n\
323", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
324progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000325VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000326DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
327DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
328
329 return status;
330}
331
/* Fork a child that runs shell_cmd through "/bin/sh -c".

   Returns the child's pid in the parent, or -1 when fork fails.  The
   child is not waited for here.  */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *sh_argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));

      execv("/bin/sh",(char *const *)sh_argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
361
/* Store in *result the wall-clock time that has passed since *start_time
   and return result.  The microsecond field of the result is never
   negative.  */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;

  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
375
376static int
377restart_kill(struct thread *t_kill)
378{
379 struct restart_info *restart = THREAD_ARG(t_kill);
380 struct timeval delay;
381
382 time_elapsed(&delay,&restart->time);
383 zlog_warn("Warning: %s %s child process %d still running after "
384 "%ld seconds, sending signal %d",
ajsf2d82572004-12-29 17:45:08 +0000385 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000386 (restart->kills ? SIGKILL : SIGTERM));
387 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
388 restart->kills++;
389 restart->t_kill = thread_add_timer(master,restart_kill,restart,
390 gs.restart_timeout);
391 return 0;
392}
393
394static struct restart_info *
395find_child(pid_t child)
396{
397 if (gs.mode == MODE_GLOBAL_RESTART)
398 {
399 if (gs.restart.pid == child)
400 return &gs.restart;
401 }
402 else
403 {
404 struct daemon *dmn;
405 for (dmn = gs.daemons; dmn; dmn = dmn->next)
406 {
407 if (dmn->restart.pid == child)
408 return &dmn->restart;
409 }
410 }
411 return NULL;
412}
413
414static void
415sigchild(void)
416{
417 pid_t child;
418 int status;
419 const char *name;
420 const char *what;
421 struct restart_info *restart;
422
423 switch (child = waitpid(-1,&status,WNOHANG))
424 {
425 case -1:
426 zlog_err("waitpid failed: %s",safe_strerror(errno));
427 return;
428 case 0:
429 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
430 return;
431 }
432
433 if ((restart = find_child(child)) != NULL)
434 {
435 name = restart->name;
436 what = restart->what;
437 restart->pid = 0;
438 gs.numpids--;
439 thread_cancel(restart->t_kill);
440 restart->t_kill = NULL;
441 /* Update restart time to reflect the time the command completed. */
442 gettimeofday(&restart->time,NULL);
443 }
444 else
445 {
446 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000447 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000448 name = "(unknown)";
449 what = "background";
450 }
451 if (WIFSTOPPED(status))
452 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000453 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000454 else if (WIFSIGNALED(status))
455 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000456 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000457 else if (WIFEXITED(status))
458 {
459 if (WEXITSTATUS(status) != 0)
460 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000461 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000462 else
ajsf2d82572004-12-29 17:45:08 +0000463 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000464 }
465 else
466 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000467 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000468 phase_check();
469}
470
471static int
472run_job(struct restart_info *restart, const char *cmdtype, const char *command,
473 int force, int update_interval)
474{
475 struct timeval delay;
476
477 if (gs.loglevel > LOG_DEBUG+1)
478 zlog_debug("attempting to %s %s",cmdtype,restart->name);
479
480 if (restart->pid)
481 {
482 if (gs.loglevel > LOG_DEBUG+1)
483 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000484 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000485 return -1;
486 }
487
ajsa8a8ddc2005-01-12 16:24:51 +0000488 /* Note: time_elapsed test must come before the force test, since we need
489 to make sure that delay is initialized for use below in updating the
490 restart interval. */
491 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
492 !force)
ajs8b886ca2004-12-22 02:56:38 +0000493 {
494 if (gs.loglevel > LOG_DEBUG+1)
495 zlog_debug("postponing %s %s: "
496 "elapsed time %ld < retry interval %ld",
497 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
498 return -1;
499 }
500
501 gettimeofday(&restart->time,NULL);
502 restart->kills = 0;
503 {
504 char cmd[strlen(command)+strlen(restart->name)+1];
505 snprintf(cmd,sizeof(cmd),command,restart->name);
506 if ((restart->pid = run_background(cmd)) > 0)
507 {
508 restart->t_kill = thread_add_timer(master,restart_kill,restart,
509 gs.restart_timeout);
510 restart->what = cmdtype;
511 gs.numpids++;
512 }
513 else
514 restart->pid = 0;
515 }
516
517 /* Calculate the new restart interval. */
518 if (update_interval)
519 {
520 if (delay.tv_sec > 2*gs.max_restart_interval)
521 restart->interval = gs.min_restart_interval;
522 else if ((restart->interval *= 2) > gs.max_restart_interval)
523 restart->interval = gs.max_restart_interval;
524 if (gs.loglevel > LOG_DEBUG+1)
525 zlog_debug("restart %s interval is now %ld",
526 restart->name,restart->interval);
527 }
528 return restart->pid;
529}
530
/* Event-scheduling helpers.  Each one stores the newly created thread in
   the daemon structure; none of them cancels an event that is already
   recorded in the field it overwrites. */

/* Call handle_read when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN)                                               \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Retry the connection after one (randomized) polling period. */
#define SET_WAKEUP_DOWN(DMN)                                                \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN),         \
                                          FUZZY(gs.period))

/* Re-check an unresponsive daemon after one (randomized) polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)                                        \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Send the next echo request after one (randomized) polling period. */
#define SET_WAKEUP_ECHO(DMN)                                                \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN),    \
                                          FUZZY(gs.period))
545
546static int
547wakeup_down(struct thread *t_wakeup)
548{
549 struct daemon *dmn = THREAD_ARG(t_wakeup);
550
551 dmn->t_wakeup = NULL;
552 if (try_connect(dmn) < 0)
553 SET_WAKEUP_DOWN(dmn);
554 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
555 try_restart(dmn);
556 return 0;
557}
558
559static int
560wakeup_init(struct thread *t_wakeup)
561{
562 struct daemon *dmn = THREAD_ARG(t_wakeup);
563
564 dmn->t_wakeup = NULL;
565 if (try_connect(dmn) < 0)
566 {
567 SET_WAKEUP_DOWN(dmn);
568 zlog_err("%s state -> down : initial connection attempt failed",
569 dmn->name);
570 dmn->state = DAEMON_DOWN;
571 }
572 return 0;
573}
574
575static void
576daemon_down(struct daemon *dmn, const char *why)
577{
578 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
579 zlog_err("%s state -> down : %s",dmn->name,why);
580 else if (gs.loglevel > LOG_DEBUG)
581 zlog_debug("%s still down : %s",dmn->name,why);
582 if (IS_UP(dmn))
583 gs.numdown++;
584 dmn->state = DAEMON_DOWN;
585 if (dmn->fd >= 0)
586 {
587 close(dmn->fd);
588 dmn->fd = -1;
589 }
590 THREAD_OFF(dmn->t_read);
591 THREAD_OFF(dmn->t_write);
592 THREAD_OFF(dmn->t_wakeup);
593 if (try_connect(dmn) < 0)
594 SET_WAKEUP_DOWN(dmn);
595 phase_check();
596}
597
598static int
599handle_read(struct thread *t_read)
600{
601 struct daemon *dmn = THREAD_ARG(t_read);
602 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
603 char buf[sizeof(resp)+100];
604 ssize_t rc;
605 struct timeval delay;
606
607 dmn->t_read = NULL;
608 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
609 {
610 char why[100];
611
ajs518cde82005-02-17 20:11:58 +0000612 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000613 {
614 /* Pretend it never happened. */
615 SET_READ_HANDLER(dmn);
616 return 0;
617 }
618 snprintf(why,sizeof(why),"unexpected read error: %s",
619 safe_strerror(errno));
620 daemon_down(dmn,why);
621 return 0;
622 }
623 if (rc == 0)
624 {
625 daemon_down(dmn,"read returned EOF");
626 return 0;
627 }
628 if (!dmn->echo_sent.tv_sec)
629 {
630 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000631 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
632 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000633 daemon_down(dmn,why);
634 return 0;
635 }
636
637 /* We are expecting an echo response: is there any chance that the
638 response would not be returned entirely in the first read? That
639 seems inconceivable... */
640 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
641 {
642 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000643 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
644 "(expecting %u): %.*s",
645 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000646 daemon_down(dmn,why);
647 return 0;
648 }
649
650 time_elapsed(&delay,&dmn->echo_sent);
651 dmn->echo_sent.tv_sec = 0;
652 if (dmn->state == DAEMON_UNRESPONSIVE)
653 {
654 if (delay.tv_sec < gs.timeout)
655 {
656 dmn->state = DAEMON_UP;
657 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
658 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
659 }
660 else
661 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
662 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
663 }
664 else if (gs.loglevel > LOG_DEBUG+1)
665 zlog_debug("%s: echo response received after %ld.%06ld seconds",
666 dmn->name,delay.tv_sec,delay.tv_usec);
667
668 SET_READ_HANDLER(dmn);
669 if (dmn->t_wakeup)
670 thread_cancel(dmn->t_wakeup);
671 SET_WAKEUP_ECHO(dmn);
672
673 return 0;
674}
675
676static void
677daemon_up(struct daemon *dmn, const char *why)
678{
679 dmn->state = DAEMON_UP;
680 gs.numdown--;
681 dmn->connect_tries = 0;
682 zlog_notice("%s state -> up : %s",dmn->name,why);
683 if (gs.do_ping)
684 SET_WAKEUP_ECHO(dmn);
685 phase_check();
686}
687
688static int
689check_connect(struct thread *t_write)
690{
691 struct daemon *dmn = THREAD_ARG(t_write);
692 int sockerr;
693 socklen_t reslen = sizeof(sockerr);
694
695 dmn->t_write = NULL;
696 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
697 {
698 zlog_warn("%s: check_connect: getsockopt failed: %s",
699 dmn->name,safe_strerror(errno));
700 daemon_down(dmn,"getsockopt failed checking connection success");
701 return 0;
702 }
703 if ((reslen == sizeof(sockerr)) && sockerr)
704 {
705 char why[100];
706 snprintf(why,sizeof(why),
707 "getsockopt reports that connection attempt failed: %s",
708 safe_strerror(sockerr));
709 daemon_down(dmn,why);
710 return 0;
711 }
712
713 daemon_up(dmn,"delayed connect succeeded");
714 return 0;
715}
716
717static int
718wakeup_connect_hanging(struct thread *t_wakeup)
719{
720 struct daemon *dmn = THREAD_ARG(t_wakeup);
721 char why[100];
722
723 dmn->t_wakeup = NULL;
724 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
725 gs.timeout);
726 daemon_down(dmn,why);
727 return 0;
728}
729
730/* Making connection to protocol daemon. */
731static int
732try_connect(struct daemon *dmn)
733{
734 int sock;
735 struct sockaddr_un addr;
736 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000737
738 if (gs.loglevel > LOG_DEBUG+1)
739 zlog_debug("%s: attempting to connect",dmn->name);
740 dmn->connect_tries++;
741
742 memset (&addr, 0, sizeof (struct sockaddr_un));
743 addr.sun_family = AF_UNIX;
744 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
745 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000746#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000747 len = addr.sun_len = SUN_LEN(&addr);
748#else
749 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000750#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000751
752 /* Quick check to see if we might succeed before we go to the trouble
753 of creating a socket. */
754 if (access(addr.sun_path, W_OK) < 0)
755 {
756 if (errno != ENOENT)
757 zlog_err("%s: access to socket %s denied: %s",
758 dmn->name,addr.sun_path,safe_strerror(errno));
759 return -1;
760 }
761
762 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
763 {
764 zlog_err("%s(%s): cannot make socket: %s",
765 __func__,addr.sun_path, safe_strerror(errno));
766 return -1;
767 }
768
ajs52e66292005-02-16 20:40:25 +0000769 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000770 {
ajs52e66292005-02-16 20:40:25 +0000771 zlog_err("%s(%s): set_nonblocking(%d) failed",
772 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000773 close(sock);
774 return -1;
775 }
776
777 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
778 {
779 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
780 {
781 if (gs.loglevel > LOG_DEBUG)
782 zlog_debug("%s(%s): connect failed: %s",
783 __func__,addr.sun_path, safe_strerror(errno));
784 close (sock);
785 return -1;
786 }
787 if (gs.loglevel > LOG_DEBUG)
788 zlog_debug("%s: connection in progress",dmn->name);
789 dmn->state = DAEMON_CONNECTING;
790 dmn->fd = sock;
791 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
792 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
793 gs.timeout);
794 SET_READ_HANDLER(dmn);
795 return 0;
796 }
797
798 dmn->fd = sock;
799 SET_READ_HANDLER(dmn);
800 daemon_up(dmn,"connect succeeded");
801 return 1;
802}
803
804static int
805phase_hanging(struct thread *t_hanging)
806{
807 gs.t_phase_hanging = NULL;
808 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
809 phase_str[gs.phase],PHASE_TIMEOUT);
810 gs.phase = PHASE_NONE;
811 return 0;
812}
813
814static void
815set_phase(restart_phase_t new_phase)
816{
817 gs.phase = new_phase;
818 if (gs.t_phase_hanging)
819 thread_cancel(gs.t_phase_hanging);
820 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
821 PHASE_TIMEOUT);
822}
823
824static void
825phase_check(void)
826{
827 switch (gs.phase)
828 {
829 case PHASE_NONE:
830 break;
831 case PHASE_STOPS_PENDING:
832 if (gs.numpids)
833 break;
834 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
835 set_phase(PHASE_WAITING_DOWN);
836 /*FALLTHRU*/
837 case PHASE_WAITING_DOWN:
838 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
839 break;
840 zlog_info("Phased restart: all routing daemons now down.");
841 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
842 set_phase(PHASE_ZEBRA_RESTART_PENDING);
843 /*FALLTHRU*/
844 case PHASE_ZEBRA_RESTART_PENDING:
845 if (gs.special->restart.pid)
846 break;
847 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
848 set_phase(PHASE_WAITING_ZEBRA_UP);
849 /*FALLTHRU*/
850 case PHASE_WAITING_ZEBRA_UP:
851 if (!IS_UP(gs.special))
852 break;
853 zlog_info("Phased restart: %s is now up.",gs.special->name);
854 {
855 struct daemon *dmn;
856 for (dmn = gs.daemons; dmn; dmn = dmn->next)
857 {
858 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000859 run_job(&dmn->restart,"start",gs.start_command,1,0);
ajs8b886ca2004-12-22 02:56:38 +0000860 }
861 }
862 gs.phase = PHASE_NONE;
863 THREAD_OFF(gs.t_phase_hanging);
864 zlog_notice("Phased global restart has completed.");
865 break;
866 }
867}
868
869static void
870try_restart(struct daemon *dmn)
871{
872 switch (gs.mode)
873 {
874 case MODE_MONITOR:
875 return;
876 case MODE_GLOBAL_RESTART:
877 run_job(&gs.restart,"restart",gs.restart_command,0,1);
878 break;
879 case MODE_SEPARATE_RESTART:
880 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
881 break;
882 case MODE_PHASED_ZEBRA_RESTART:
883 if (dmn != gs.special)
884 {
885 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
886 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
887 else
888 zlog_debug("%s: postponing restart attempt because master %s daemon "
889 "not up [%s], or phased restart in progress",
890 dmn->name,gs.special->name,state_str[gs.special->state]);
891 break;
892 }
893 /*FALLTHRU*/
894 case MODE_PHASED_ALL_RESTART:
895 if ((gs.phase != PHASE_NONE) || gs.numpids)
896 {
897 if (gs.loglevel > LOG_DEBUG+1)
898 zlog_debug("postponing phased global restart: restart already in "
899 "progress [%s], or outstanding child processes [%d]",
900 phase_str[gs.phase],gs.numpids);
901 break;
902 }
903 /* Is it too soon for a restart? */
904 {
905 struct timeval delay;
906 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
907 gs.special->restart.interval)
908 {
909 if (gs.loglevel > LOG_DEBUG+1)
910 zlog_debug("postponing phased global restart: "
911 "elapsed time %ld < retry interval %ld",
912 (long)delay.tv_sec,gs.special->restart.interval);
913 break;
914 }
915 }
916 zlog_info("Phased restart: stopping all routing daemons.");
917 /* First step: stop all other daemons. */
918 for (dmn = gs.daemons; dmn; dmn = dmn->next)
919 {
920 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000921 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
ajs8b886ca2004-12-22 02:56:38 +0000922 }
923 set_phase(PHASE_STOPS_PENDING);
924 break;
925 default:
926 zlog_err("error: unknown restart mode %d",gs.mode);
927 break;
928 }
929}
930
931static int
932wakeup_unresponsive(struct thread *t_wakeup)
933{
934 struct daemon *dmn = THREAD_ARG(t_wakeup);
935
936 dmn->t_wakeup = NULL;
937 if (dmn->state != DAEMON_UNRESPONSIVE)
938 zlog_err("%s: no longer unresponsive (now %s), "
939 "wakeup should have been cancelled!",
940 dmn->name,state_str[dmn->state]);
941 else
942 {
943 SET_WAKEUP_UNRESPONSIVE(dmn);
944 try_restart(dmn);
945 }
946 return 0;
947}
948
949static int
950wakeup_no_answer(struct thread *t_wakeup)
951{
952 struct daemon *dmn = THREAD_ARG(t_wakeup);
953
954 dmn->t_wakeup = NULL;
955 dmn->state = DAEMON_UNRESPONSIVE;
956 zlog_err("%s state -> unresponsive : no response yet to ping "
957 "sent %ld seconds ago",dmn->name,gs.timeout);
958 if (gs.unresponsive_restart)
959 {
960 SET_WAKEUP_UNRESPONSIVE(dmn);
961 try_restart(dmn);
962 }
963 return 0;
964}
965
966static int
967wakeup_send_echo(struct thread *t_wakeup)
968{
969 static const char echocmd[] = "echo " PING_TOKEN;
970 ssize_t rc;
971 struct daemon *dmn = THREAD_ARG(t_wakeup);
972
973 dmn->t_wakeup = NULL;
974 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
975 ((size_t)rc != sizeof(echocmd)))
976 {
977 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000978 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
979 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000980 daemon_down(dmn,why);
981 }
982 else
983 {
984 gettimeofday(&dmn->echo_sent,NULL);
985 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
986 }
987 return 0;
988}
989
/*
 * Termination handler, registered in main() for both SIGINT and SIGTERM:
 * log a notice and exit with a success status.
 */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
996
/*
 * Check that a command template contains exactly one '%' character and that
 * it is immediately followed by 's' (i.e. a single "%s" placeholder and no
 * other conversion).  Returns 1 if the template is acceptable, 0 otherwise.
 */
static int
valid_command(const char *cmd)
{
  const char *pct = strchr(cmd,'%');

  if (pct == NULL)
    return 0;           /* no placeholder at all */
  if (pct[1] != 's')
    return 0;           /* the first '%' does not introduce "%s" */

  /* Reject templates with any further '%' after the placeholder. */
  return strchr(pct+1,'%') == NULL;
}
1004
/*
 * This is an ugly hack to circumvent problems with passing command-line
 * arguments that contain spaces.  The fix is to use a configuration file.
 *
 * Returns a newly allocated copy of cmd in which every occurrence of the
 * token blankstr has been replaced by a single space.  Occurrences are
 * located left to right, and scanning resumes just past each inserted space,
 * so the loop terminates even when blankstr is itself " ".  An empty
 * blankstr matches nothing and yields an unmodified copy.
 *
 * The caller owns the returned string.  If the copy cannot be allocated the
 * error is reported with perror() and the process exits with status 1.
 */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() reports an empty needle as a match at every position, which
     would loop forever and push data past the end of the copy. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      /* Overwrite the first byte of the token and step past it, so that the
         space just written is never rescanned. */
      *p++ = ' ';
      /* Close the gap left by the remaining bslen-1 bytes of the token
         (the move includes the terminating NUL). */
      if (bslen != 1)
        memmove(p,p+bslen-1,strlen(p+bslen-1)+1);
    }
  return res;
}
1027
ajs8b886ca2004-12-22 02:56:38 +00001028int
1029main(int argc, char **argv)
1030{
1031 const char *progname;
1032 int opt;
1033 int daemon_mode = 0;
1034 const char *pidfile = DEFAULT_PIDFILE;
1035 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001036 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001037 static struct quagga_signal_t my_signals[] =
1038 {
1039 {
1040 .signal = SIGINT,
1041 .handler = sigint,
1042 },
1043 {
1044 .signal = SIGTERM,
1045 .handler = sigint,
1046 },
1047 {
1048 .signal = SIGCHLD,
1049 .handler = sigchild,
1050 },
1051 };
1052
1053 if ((progname = strrchr (argv[0], '/')) != NULL)
1054 progname++;
1055 else
1056 progname = argv[0];
1057
ajs098e2402004-12-22 17:00:46 +00001058 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001059 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001060 longopts, 0)) != EOF)
1061 {
1062 switch (opt)
1063 {
1064 case 0:
1065 break;
1066 case 'a':
1067 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1068 {
1069 fputs("Ambiguous operating mode selected.\n",stderr);
1070 return usage(progname,1);
1071 }
1072 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1073 break;
1074 case 'A':
1075 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1076 {
1077 fputs("Ambiguous operating mode selected.\n",stderr);
1078 return usage(progname,1);
1079 }
1080 gs.mode = MODE_PHASED_ALL_RESTART;
1081 break;
ajsc8b40f82004-12-22 16:17:16 +00001082 case 'b':
1083 blankstr = optarg;
1084 break;
ajs8b886ca2004-12-22 02:56:38 +00001085 case 'd':
1086 daemon_mode = 1;
1087 break;
1088 case 'e':
1089 gs.do_ping = 0;
1090 break;
1091 case 'k':
1092 if (!valid_command(optarg))
1093 {
1094 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1095 optarg);
1096 return usage(progname,1);
1097 }
1098 gs.stop_command = optarg;
1099 break;
1100 case 'l':
1101 {
1102 char garbage[3];
1103 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1104 (gs.loglevel < LOG_EMERG))
1105 {
1106 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1107 return usage(progname,1);
1108 }
1109 }
1110 break;
1111 case 'm':
1112 {
1113 char garbage[3];
1114 if ((sscanf(optarg,"%ld%1s",
1115 &gs.min_restart_interval,garbage) != 1) ||
1116 (gs.min_restart_interval < 0))
1117 {
1118 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1119 optarg);
1120 return usage(progname,1);
1121 }
1122 }
1123 break;
1124 case 'M':
1125 {
1126 char garbage[3];
1127 if ((sscanf(optarg,"%ld%1s",
1128 &gs.max_restart_interval,garbage) != 1) ||
1129 (gs.max_restart_interval < 0))
1130 {
1131 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1132 optarg);
1133 return usage(progname,1);
1134 }
1135 }
1136 break;
1137 case 'i':
1138 {
1139 char garbage[3];
1140 int period;
1141 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1142 (gs.period < 1))
1143 {
1144 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1145 return usage(progname,1);
1146 }
1147 gs.period = 1000*period;
1148 }
1149 break;
1150 case 'p':
1151 pidfile = optarg;
1152 break;
1153 case 'r':
1154 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1155 (gs.mode == MODE_SEPARATE_RESTART))
1156 {
1157 fputs("Ambiguous operating mode selected.\n",stderr);
1158 return usage(progname,1);
1159 }
1160 if (!valid_command(optarg))
1161 {
1162 fprintf(stderr,
1163 "Invalid restart command, must contain '%%s': %s\n",
1164 optarg);
1165 return usage(progname,1);
1166 }
1167 gs.restart_command = optarg;
1168 if (gs.mode == MODE_MONITOR)
1169 gs.mode = MODE_SEPARATE_RESTART;
1170 break;
1171 case 'R':
1172 if (gs.mode != MODE_MONITOR)
1173 {
1174 fputs("Ambiguous operating mode selected.\n",stderr);
1175 return usage(progname,1);
1176 }
1177 if (strchr(optarg,'%'))
1178 {
1179 fprintf(stderr,
1180 "Invalid restart-all arg, must not contain '%%s': %s\n",
1181 optarg);
1182 return usage(progname,1);
1183 }
1184 gs.restart_command = optarg;
1185 gs.mode = MODE_GLOBAL_RESTART;
1186 break;
1187 case 's':
1188 if (!valid_command(optarg))
1189 {
1190 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1191 optarg);
1192 return usage(progname,1);
1193 }
1194 gs.start_command = optarg;
1195 break;
1196 case 'S':
1197 gs.vtydir = optarg;
1198 break;
1199 case 't':
1200 {
1201 char garbage[3];
1202 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1203 (gs.timeout < 1))
1204 {
1205 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1206 return usage(progname,1);
1207 }
1208 }
1209 break;
1210 case 'T':
1211 {
1212 char garbage[3];
1213 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1214 (gs.restart_timeout < 1))
1215 {
1216 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1217 return usage(progname,1);
1218 }
1219 }
1220 break;
1221 case 'z':
1222 gs.unresponsive_restart = 1;
1223 break;
1224 case 'v':
1225 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1226 puts("Copyright 2004 Andrew J. Schorr");
1227 return 0;
1228 case 'h':
1229 return usage(progname,0);
1230 default:
1231 fputs("Invalid option.\n",stderr);
1232 return usage(progname,1);
1233 }
1234 }
1235
1236 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1237 {
1238 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1239 return usage(progname,1);
1240 }
1241 switch (gs.mode)
1242 {
1243 case MODE_MONITOR:
1244 if (gs.restart_command || gs.start_command || gs.stop_command)
1245 {
1246 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1247 mode_str[gs.mode]);
1248 return usage(progname,1);
1249 }
1250 break;
1251 case MODE_GLOBAL_RESTART:
1252 case MODE_SEPARATE_RESTART:
1253 if (!gs.restart_command || gs.start_command || gs.stop_command)
1254 {
1255 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1256 mode_str[gs.mode]);
1257 return usage(progname,1);
1258 }
1259 break;
1260 case MODE_PHASED_ZEBRA_RESTART:
1261 case MODE_PHASED_ALL_RESTART:
1262 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1263 {
1264 fprintf(stderr,
1265 "Need start, kill, and restart commands in [%s] mode.\n",
1266 mode_str[gs.mode]);
1267 return usage(progname,1);
1268 }
1269 break;
1270 }
1271
ajsc8b40f82004-12-22 16:17:16 +00001272 if (blankstr)
1273 {
1274 if (gs.restart_command)
1275 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1276 if (gs.start_command)
1277 gs.start_command = translate_blanks(gs.start_command,blankstr);
1278 if (gs.stop_command)
1279 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1280 }
1281
ajs8b886ca2004-12-22 02:56:38 +00001282 gs.restart.interval = gs.min_restart_interval;
1283 master = thread_master_create();
1284 signal_init (master, Q_SIGC(my_signals), my_signals);
1285 srandom(time(NULL));
1286
1287 {
1288 int i;
1289 struct daemon *tail = NULL;
1290
1291 for (i = optind; i < argc; i++)
1292 {
1293 struct daemon *dmn;
1294
1295 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1296 {
ajs098e2402004-12-22 17:00:46 +00001297 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1298 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001299 return 1;
1300 }
1301 dmn->name = dmn->restart.name = argv[i];
1302 dmn->state = DAEMON_INIT;
1303 gs.numdaemons++;
1304 gs.numdown++;
1305 dmn->fd = -1;
1306 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1307 100+(random() % 900));
1308 dmn->restart.interval = gs.min_restart_interval;
1309 if (tail)
1310 tail->next = dmn;
1311 else
1312 gs.daemons = dmn;
1313 tail = dmn;
1314
1315 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1316 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1317 !strcmp(dmn->name,special))
1318 gs.special = dmn;
1319 }
1320 }
1321 if (!gs.daemons)
1322 {
1323 fputs("Must specify one or more daemons to monitor.\n",stderr);
1324 return usage(progname,1);
1325 }
1326 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1327 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1328 {
1329 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1330 mode_str[gs.mode],special);
1331 return usage(progname,1);
1332 }
1333 if (gs.special && (gs.numdaemons < 2))
1334 {
1335 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1336 "to watch.\n",mode_str[gs.mode]);
1337 return usage(progname,1);
1338 }
1339
1340 zlog_default = openzlog(progname, ZLOG_NONE,
1341 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1342 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1343 if (daemon_mode)
1344 {
1345 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001346 if (daemon (0, 0) < 0)
1347 {
1348 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1349 exit (1);
1350 }
ajs8b886ca2004-12-22 02:56:38 +00001351 }
1352 else
1353 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1354
1355 /* Make sure we're not already running. */
1356 pid_output (pidfile);
1357
1358 /* Announce which daemons are being monitored. */
1359 {
1360 struct daemon *dmn;
1361 size_t len = 0;
1362
1363 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1364 len += strlen(dmn->name)+1;
1365
1366 {
1367 char buf[len+1];
1368 char *p = buf;
1369
1370 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1371 {
1372 if (p != buf)
1373 *p++ = ' ';
1374 strcpy(p,dmn->name);
1375 p += strlen(p);
1376 }
1377 zlog_notice("%s %s watching [%s], mode [%s]",
1378 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1379 }
1380 }
1381
1382 {
1383 struct thread thread;
1384
1385 while (thread_fetch (master, &thread))
1386 thread_call (&thread);
1387 }
1388
1389 /* Not reached. */
1390 return 0;
1391}