/*
    Monitor status of quagga daemons and restart if necessary.

    Copyright (C) 2004  Andrew J. Schorr

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
20
ajsa3655342004-12-29 17:39:10 +000021#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000022#include <thread.h>
23#include <log.h>
ajs52e66292005-02-16 20:40:25 +000024#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000025#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000026#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000027#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000028#include <sys/un.h>
29#include <sys/wait.h>
Balaji.G837d16c2012-09-26 14:09:10 +053030#include <memory.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of two values.  Either argument may be evaluated twice, so do
   not pass expressions that have side effects. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Timer randomization helpers.
   JITTER(X) yields a pseudo-random offset in the range [-(X)/2, (X)-(X)/2].
   FUZZY(X) perturbs X by JITTER of one twentieth of X. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))

/* Built-in defaults; the usage text reports the time values in seconds. */
#define DEFAULT_PERIOD		5
#define DEFAULT_TIMEOUT		10
#define DEFAULT_RESTART_TIMEOUT	20
#define DEFAULT_LOGLEVEL	LOG_INFO
#define DEFAULT_MIN_RESTART	60
#define DEFAULT_MAX_RESTART	600

/* Pid file location: the build-supplied path wins when there is one. */
#ifdef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE	PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE	STATEDIR "/watchquagga.pid"
#endif

/* Directory searched for the daemons' "<name>.vty" sockets. */
#ifdef DAEMON_VTY_DIR
#define VTYDIR	DAEMON_VTY_DIR
#else
#define VTYDIR	STATEDIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN	"PING"
58
/* Scheduler handle handed to every thread_add_* call in this file.  It has
   to be a global because it is also referenced from inside libzebra. */
struct thread_master *master;
61
/* Operating mode; the numbering matches the mode list printed by usage(). */
typedef enum
{
  MODE_MONITOR = 0,
  MODE_GLOBAL_RESTART,
  MODE_SEPARATE_RESTART,
  MODE_PHASED_ZEBRA_RESTART,
  MODE_PHASED_ALL_RESTART
} watch_mode_t;

/* Human-readable mode names, indexed by watch_mode_t. */
static const char *mode_str[] =
{
  [MODE_MONITOR]              = "monitor",
  [MODE_GLOBAL_RESTART]       = "global restart",
  [MODE_SEPARATE_RESTART]     = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART]   = "phased global restart for any failure",
};
79
/* Steps of a phased restart, in the order phase_check() walks them. */
typedef enum
{
  PHASE_NONE = 0,
  PHASE_STOPS_PENDING,
  PHASE_WAITING_DOWN,
  PHASE_ZEBRA_RESTART_PENDING,
  PHASE_WAITING_ZEBRA_UP
} restart_phase_t;

/* Human-readable phase names, indexed by restart_phase_t.  The table has
   exactly one entry per enumerator; an entry without a matching phase
   ("Start jobs running") used to trail the list and has been dropped. */
static const char *phase_str[] =
{
  [PHASE_NONE]                  = "None",
  [PHASE_STOPS_PENDING]         = "Stop jobs running",
  [PHASE_WAITING_DOWN]          = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP]      = "Waiting for zebra to come up",
};

/* Seconds a single phase may last before phase_hanging() aborts the
   phased restart. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
100
/* Bookkeeping for one background job (start/stop/restart command). */
struct restart_info
{
  const char *name;	/* name substituted into the shell command */
  const char *what;	/* job type label set by run_job(), used in logs */
  pid_t pid;		/* child running the job, or 0 when none is running */
  struct timeval time;	/* when the job was launched; reset when it is reaped */
  long interval;	/* minimum seconds between non-forced jobs */
  struct thread *t_kill;	/* timer that signals a job that runs too long */
  int kills;		/* signals sent so far (0: SIGTERM next, else SIGKILL) */
};
111
112static struct global_state
113{
ajs8b886ca2004-12-22 02:56:38 +0000114 watch_mode_t mode;
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
117 const char *vtydir;
118 long period;
119 long timeout;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
123 int do_ping;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000128 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000129 int unresponsive_restart;
130 int loglevel;
131 struct daemon *special; /* points to zebra when doing phased restart */
132 int numdaemons;
133 int numpids;
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
135} gs = {
136 .mode = MODE_MONITOR,
137 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000138 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
145 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000146};
147
/* Connection state of a monitored daemon. */
typedef enum
{
  DAEMON_INIT,
  DAEMON_DOWN,
  DAEMON_CONNECTING,
  DAEMON_UP,
  DAEMON_UNRESPONSIVE
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))

/* Human-readable state names, indexed by daemon_state_t. */
static const char *state_str[] =
{
  [DAEMON_INIT]         = "Init",
  [DAEMON_DOWN]         = "Down",
  [DAEMON_CONNECTING]   = "Connecting",
  [DAEMON_UP]           = "Up",
  [DAEMON_UNRESPONSIVE] = "Unresponsive",
};
168
169struct daemon {
170 const char *name;
171 daemon_state_t state;
172 int fd;
173 struct timeval echo_sent;
174 u_int connect_tries;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
178 struct daemon *next;
179 struct restart_info restart;
180};
181
/* Long command-line options; each one maps onto the short option letter
   given in the last column.  The all-zero entry terminates the table. */
static const struct option longopts[] =
{
  { "daemon",               no_argument,       NULL, 'd' },
  { "statedir",             required_argument, NULL, 'S' },
  { "no-echo",              no_argument,       NULL, 'e' },
  { "loglevel",             required_argument, NULL, 'l' },
  { "interval",             required_argument, NULL, 'i' },
  { "timeout",              required_argument, NULL, 't' },
  { "restart-timeout",      required_argument, NULL, 'T' },
  { "restart",              required_argument, NULL, 'r' },
  { "start-command",        required_argument, NULL, 's' },
  { "kill-command",         required_argument, NULL, 'k' },
  { "restart-all",          required_argument, NULL, 'R' },
  { "all-restart",          no_argument,       NULL, 'a' },
  { "always-all-restart",   no_argument,       NULL, 'A' },
  { "unresponsive-restart", no_argument,       NULL, 'z' },
  { "min-restart-interval", required_argument, NULL, 'm' },
  { "max-restart-interval", required_argument, NULL, 'M' },
  { "pid-file",             required_argument, NULL, 'p' },
  { "blank-string",         required_argument, NULL, 'b' },
  { "help",                 no_argument,       NULL, 'h' },
  { "version",              no_argument,       NULL, 'v' },
  { NULL, 0, NULL, 0 }
};
206
/* Forward declarations for functions that are used before their
   definitions further down in this file. */
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
static void try_restart(struct daemon *dmn);
static void phase_check(void);
211
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
218 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
219Watchdog program to monitor status of quagga daemons and try to restart\n\
220them if they are down or unresponsive. It determines whether a daemon is\n\
221up based on whether it can connect to the daemon's vty unix stream socket.\n\
222It then repeatedly sends echo commands over that socket to determine whether\n\
223the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
224on the socket connection and know immediately that the daemon is down.\n\n\
225The daemons to be monitored should be listed on the command line.\n\n\
226This program can run in one of 5 modes:\n\n\
2270. Mode: %s.\n\
228 Just monitor and report on status changes. Example:\n\
229 %s -d zebra ospfd bgpd\n\n\
2301. Mode: %s.\n\
231 Whenever any daemon hangs or crashes, use the given command to restart\n\
232 them all. Example:\n\
233 %s -dz \\\n\
234 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
235 zebra ospfd\n\n\
2362. Mode: %s.\n\
237 When any single daemon hangs or crashes, restart only the daemon that's\n\
238 in trouble using the supplied restart command. Example:\n\
239 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2403. Mode: %s.\n\
241 The same as the previous mode, except that there is special treatment when\n\
242 the zebra daemon is in trouble. In that case, a phased restart approach\n\
243 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
244 daemons. Example:\n\
245 %s -adz -r '/sbin/service %%s restart' \\\n\
246 -s '/sbin/service %%s start' \\\n\
247 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2484. Mode: %s.\n\
249 This is the same as the previous mode, except that the phased restart\n\
250 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
251 %s -Adz -r '/sbin/service %%s restart' \\\n\
252 -s '/sbin/service %%s start' \\\n\
253 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
254As of this writing, it is believed that mode 2 [%s]\n\
255is not safe, and mode 3 [%s] may not be safe with some of the\n\
256routing daemons.\n\n\
257In order to avoid attempting to restart the daemons in a fast loop,\n\
258the -m and -M options allow you to control the minimum delay between\n\
259restart commands. The minimum restart delay is recalculated each time\n\
260a restart is attempted: if the time since the last restart attempt exceeds\n\
261twice the -M value, then the restart delay is set to the -m value.\n\
262Otherwise, the interval is doubled (but capped at the -M value).\n\n\
263Options:\n\
264-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
265 to syslog instead of stdout.\n\
266-S, --statedir Set the vty socket directory (default is %s)\n\
267-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
268 option is necessary if the daemons do not support the\n\
269 echo command)\n\
270-l, --loglevel Set the logging level (default is %d).\n\
271 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
272 but it can be set higher than %d if extra-verbose debugging\n\
273 messages are desired.\n\
274-m, --min-restart-interval\n\
275 Set the minimum seconds to wait between invocations of daemon\n\
276 restart commands (default is %d).\n\
277-M, --max-restart-interval\n\
278 Set the maximum seconds to wait between invocations of daemon\n\
279 restart commands (default is %d).\n\
280-i, --interval Set the status polling interval in seconds (default is %d)\n\
281-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
282-T, --restart-timeout\n\
283 Set the restart (kill) timeout in seconds (default is %d).\n\
284 If any background jobs are still running after this much\n\
285 time has elapsed, they will be killed.\n\
286-r, --restart Supply a Bourne shell command to use to restart a single\n\
287 daemon. The command string should include '%%s' where the\n\
288 name of the daemon should be substituted.\n\
289 Note that -r and -R are incompatible.\n\
290-s, --start-command\n\
291 Supply a Bourne shell to command to use to start a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294-k, --kill-command\n\
295 Supply a Bourne shell to command to use to stop a single\n\
296 daemon. The command string should include '%%s' where the\n\
297 name of the daemon should be substituted.\n\
298-R, --restart-all\n\
299 When one or more daemons is down, try to restart everything\n\
300 using the Bourne shell command supplied as the argument.\n\
301 Note that -r and -R are incompatible.\n\
302-z, --unresponsive-restart\n\
303 When a daemon is unresponsive, treat it as being down for\n\
304 restart purposes.\n\
305-a, --all-restart\n\
306 When zebra hangs or crashes, restart all daemons using\n\
307 this phased approach: 1. stop all other daemons; 2. restart\n\
308 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
309-A, --always-all-restart\n\
310 When any daemon (not just zebra) hangs or crashes, use the\n\
311 same phased restart mechanism described above for -a.\n\
312 Requires -r, -s, and -k.\n\
313-p, --pid-file Set process identifier file name\n\
314 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000315-b, --blank-string\n\
316 When the supplied argument string is found in any of the\n\
317 various shell command arguments (-r, -s, -k, or -R), replace\n\
318 it with a space. This is an ugly hack to circumvent problems\n\
319 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000320-v, --version Print program version\n\
321-h, --help Display this help and exit\n\
322", progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
323progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],mode_str[3],
ajs16f65112004-12-22 15:37:44 +0000324VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
ajs8b886ca2004-12-22 02:56:38 +0000325DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
326DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,DEFAULT_PIDFILE);
327
328 return status;
329}
330
/* Fork a child that runs SHELL_CMD through "/bin/sh -c".
   Returns the child's pid in the parent, or -1 if fork() failed.  The
   child is not waited for here; it is reaped later by sigchild(). */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
	       shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
	zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));

      execv("/bin/sh",(char *const *)argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
	       shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
360
/* Store in *RESULT the time that has passed since *START_TIME, keeping
   tv_usec non-negative, and return RESULT so the call can be used inside
   an expression. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);

  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;

  /* Borrow whole seconds until the microsecond part is non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;

  return result;
}
374
375static int
376restart_kill(struct thread *t_kill)
377{
378 struct restart_info *restart = THREAD_ARG(t_kill);
379 struct timeval delay;
380
381 time_elapsed(&delay,&restart->time);
382 zlog_warn("Warning: %s %s child process %d still running after "
383 "%ld seconds, sending signal %d",
ajsf2d82572004-12-29 17:45:08 +0000384 restart->what,restart->name,(int)restart->pid,delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000385 (restart->kills ? SIGKILL : SIGTERM));
386 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
387 restart->kills++;
388 restart->t_kill = thread_add_timer(master,restart_kill,restart,
389 gs.restart_timeout);
390 return 0;
391}
392
393static struct restart_info *
394find_child(pid_t child)
395{
396 if (gs.mode == MODE_GLOBAL_RESTART)
397 {
398 if (gs.restart.pid == child)
399 return &gs.restart;
400 }
401 else
402 {
403 struct daemon *dmn;
404 for (dmn = gs.daemons; dmn; dmn = dmn->next)
405 {
406 if (dmn->restart.pid == child)
407 return &dmn->restart;
408 }
409 }
410 return NULL;
411}
412
413static void
414sigchild(void)
415{
416 pid_t child;
417 int status;
418 const char *name;
419 const char *what;
420 struct restart_info *restart;
421
422 switch (child = waitpid(-1,&status,WNOHANG))
423 {
424 case -1:
425 zlog_err("waitpid failed: %s",safe_strerror(errno));
426 return;
427 case 0:
428 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
429 return;
430 }
431
432 if ((restart = find_child(child)) != NULL)
433 {
434 name = restart->name;
435 what = restart->what;
436 restart->pid = 0;
437 gs.numpids--;
438 thread_cancel(restart->t_kill);
439 restart->t_kill = NULL;
440 /* Update restart time to reflect the time the command completed. */
441 gettimeofday(&restart->time,NULL);
442 }
443 else
444 {
445 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000446 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000447 name = "(unknown)";
448 what = "background";
449 }
450 if (WIFSTOPPED(status))
451 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000452 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000453 else if (WIFSIGNALED(status))
454 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000455 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000456 else if (WIFEXITED(status))
457 {
458 if (WEXITSTATUS(status) != 0)
459 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000460 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000461 else
ajsf2d82572004-12-29 17:45:08 +0000462 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000463 }
464 else
465 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000466 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000467 phase_check();
468}
469
470static int
471run_job(struct restart_info *restart, const char *cmdtype, const char *command,
472 int force, int update_interval)
473{
474 struct timeval delay;
475
476 if (gs.loglevel > LOG_DEBUG+1)
477 zlog_debug("attempting to %s %s",cmdtype,restart->name);
478
479 if (restart->pid)
480 {
481 if (gs.loglevel > LOG_DEBUG+1)
482 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000483 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000484 return -1;
485 }
486
ajsa8a8ddc2005-01-12 16:24:51 +0000487 /* Note: time_elapsed test must come before the force test, since we need
488 to make sure that delay is initialized for use below in updating the
489 restart interval. */
490 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
491 !force)
ajs8b886ca2004-12-22 02:56:38 +0000492 {
493 if (gs.loglevel > LOG_DEBUG+1)
494 zlog_debug("postponing %s %s: "
495 "elapsed time %ld < retry interval %ld",
496 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
497 return -1;
498 }
499
500 gettimeofday(&restart->time,NULL);
501 restart->kills = 0;
502 {
503 char cmd[strlen(command)+strlen(restart->name)+1];
504 snprintf(cmd,sizeof(cmd),command,restart->name);
505 if ((restart->pid = run_background(cmd)) > 0)
506 {
507 restart->t_kill = thread_add_timer(master,restart_kill,restart,
508 gs.restart_timeout);
509 restart->what = cmdtype;
510 gs.numpids++;
511 }
512 else
513 restart->pid = 0;
514 }
515
516 /* Calculate the new restart interval. */
517 if (update_interval)
518 {
519 if (delay.tv_sec > 2*gs.max_restart_interval)
520 restart->interval = gs.min_restart_interval;
521 else if ((restart->interval *= 2) > gs.max_restart_interval)
522 restart->interval = gs.max_restart_interval;
523 if (gs.loglevel > LOG_DEBUG+1)
524 zlog_debug("restart %s interval is now %ld",
525 restart->name,restart->interval);
526 }
527 return restart->pid;
528}
529
/* Scheduling shorthands.  Each one registers a callback for DMN and stores
   the returned handle in the daemon so it can be cancelled later. */

/* Call handle_read() when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* Retry the connection after roughly one polling period. */
#define SET_WAKEUP_DOWN(DMN)	\
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN),	\
					  FUZZY(gs.period))

/* Re-examine an unresponsive daemon after roughly one polling period. */
#define SET_WAKEUP_UNRESPONSIVE(DMN)	\
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
					  FUZZY(gs.period))

/* Send the next echo request after roughly one polling period. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
					  FUZZY(gs.period))
544
545static int
546wakeup_down(struct thread *t_wakeup)
547{
548 struct daemon *dmn = THREAD_ARG(t_wakeup);
549
550 dmn->t_wakeup = NULL;
551 if (try_connect(dmn) < 0)
552 SET_WAKEUP_DOWN(dmn);
553 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
554 try_restart(dmn);
555 return 0;
556}
557
558static int
559wakeup_init(struct thread *t_wakeup)
560{
561 struct daemon *dmn = THREAD_ARG(t_wakeup);
562
563 dmn->t_wakeup = NULL;
564 if (try_connect(dmn) < 0)
565 {
566 SET_WAKEUP_DOWN(dmn);
567 zlog_err("%s state -> down : initial connection attempt failed",
568 dmn->name);
569 dmn->state = DAEMON_DOWN;
570 }
571 return 0;
572}
573
574static void
575daemon_down(struct daemon *dmn, const char *why)
576{
577 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
578 zlog_err("%s state -> down : %s",dmn->name,why);
579 else if (gs.loglevel > LOG_DEBUG)
580 zlog_debug("%s still down : %s",dmn->name,why);
581 if (IS_UP(dmn))
582 gs.numdown++;
583 dmn->state = DAEMON_DOWN;
584 if (dmn->fd >= 0)
585 {
586 close(dmn->fd);
587 dmn->fd = -1;
588 }
589 THREAD_OFF(dmn->t_read);
590 THREAD_OFF(dmn->t_write);
591 THREAD_OFF(dmn->t_wakeup);
592 if (try_connect(dmn) < 0)
593 SET_WAKEUP_DOWN(dmn);
594 phase_check();
595}
596
597static int
598handle_read(struct thread *t_read)
599{
600 struct daemon *dmn = THREAD_ARG(t_read);
601 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
602 char buf[sizeof(resp)+100];
603 ssize_t rc;
604 struct timeval delay;
605
606 dmn->t_read = NULL;
607 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
608 {
609 char why[100];
610
ajs518cde82005-02-17 20:11:58 +0000611 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000612 {
613 /* Pretend it never happened. */
614 SET_READ_HANDLER(dmn);
615 return 0;
616 }
617 snprintf(why,sizeof(why),"unexpected read error: %s",
618 safe_strerror(errno));
619 daemon_down(dmn,why);
620 return 0;
621 }
622 if (rc == 0)
623 {
624 daemon_down(dmn,"read returned EOF");
625 return 0;
626 }
627 if (!dmn->echo_sent.tv_sec)
628 {
629 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000630 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
631 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000632 daemon_down(dmn,why);
633 return 0;
634 }
635
636 /* We are expecting an echo response: is there any chance that the
637 response would not be returned entirely in the first read? That
638 seems inconceivable... */
639 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
640 {
641 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000642 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
643 "(expecting %u): %.*s",
644 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000645 daemon_down(dmn,why);
646 return 0;
647 }
648
649 time_elapsed(&delay,&dmn->echo_sent);
650 dmn->echo_sent.tv_sec = 0;
651 if (dmn->state == DAEMON_UNRESPONSIVE)
652 {
653 if (delay.tv_sec < gs.timeout)
654 {
655 dmn->state = DAEMON_UP;
656 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
657 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
658 }
659 else
660 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
661 "seconds", dmn->name,delay.tv_sec,delay.tv_usec);
662 }
663 else if (gs.loglevel > LOG_DEBUG+1)
664 zlog_debug("%s: echo response received after %ld.%06ld seconds",
665 dmn->name,delay.tv_sec,delay.tv_usec);
666
667 SET_READ_HANDLER(dmn);
668 if (dmn->t_wakeup)
669 thread_cancel(dmn->t_wakeup);
670 SET_WAKEUP_ECHO(dmn);
671
672 return 0;
673}
674
675static void
676daemon_up(struct daemon *dmn, const char *why)
677{
678 dmn->state = DAEMON_UP;
679 gs.numdown--;
680 dmn->connect_tries = 0;
681 zlog_notice("%s state -> up : %s",dmn->name,why);
682 if (gs.do_ping)
683 SET_WAKEUP_ECHO(dmn);
684 phase_check();
685}
686
687static int
688check_connect(struct thread *t_write)
689{
690 struct daemon *dmn = THREAD_ARG(t_write);
691 int sockerr;
692 socklen_t reslen = sizeof(sockerr);
693
694 dmn->t_write = NULL;
695 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
696 {
697 zlog_warn("%s: check_connect: getsockopt failed: %s",
698 dmn->name,safe_strerror(errno));
699 daemon_down(dmn,"getsockopt failed checking connection success");
700 return 0;
701 }
702 if ((reslen == sizeof(sockerr)) && sockerr)
703 {
704 char why[100];
705 snprintf(why,sizeof(why),
706 "getsockopt reports that connection attempt failed: %s",
707 safe_strerror(sockerr));
708 daemon_down(dmn,why);
709 return 0;
710 }
711
712 daemon_up(dmn,"delayed connect succeeded");
713 return 0;
714}
715
716static int
717wakeup_connect_hanging(struct thread *t_wakeup)
718{
719 struct daemon *dmn = THREAD_ARG(t_wakeup);
720 char why[100];
721
722 dmn->t_wakeup = NULL;
723 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
724 gs.timeout);
725 daemon_down(dmn,why);
726 return 0;
727}
728
729/* Making connection to protocol daemon. */
730static int
731try_connect(struct daemon *dmn)
732{
733 int sock;
734 struct sockaddr_un addr;
735 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000736
737 if (gs.loglevel > LOG_DEBUG+1)
738 zlog_debug("%s: attempting to connect",dmn->name);
739 dmn->connect_tries++;
740
741 memset (&addr, 0, sizeof (struct sockaddr_un));
742 addr.sun_family = AF_UNIX;
743 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
744 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000745#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000746 len = addr.sun_len = SUN_LEN(&addr);
747#else
748 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000749#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000750
751 /* Quick check to see if we might succeed before we go to the trouble
752 of creating a socket. */
753 if (access(addr.sun_path, W_OK) < 0)
754 {
755 if (errno != ENOENT)
756 zlog_err("%s: access to socket %s denied: %s",
757 dmn->name,addr.sun_path,safe_strerror(errno));
758 return -1;
759 }
760
761 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
762 {
763 zlog_err("%s(%s): cannot make socket: %s",
764 __func__,addr.sun_path, safe_strerror(errno));
765 return -1;
766 }
767
ajs52e66292005-02-16 20:40:25 +0000768 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000769 {
ajs52e66292005-02-16 20:40:25 +0000770 zlog_err("%s(%s): set_nonblocking(%d) failed",
771 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000772 close(sock);
773 return -1;
774 }
775
776 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
777 {
778 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
779 {
780 if (gs.loglevel > LOG_DEBUG)
781 zlog_debug("%s(%s): connect failed: %s",
782 __func__,addr.sun_path, safe_strerror(errno));
783 close (sock);
784 return -1;
785 }
786 if (gs.loglevel > LOG_DEBUG)
787 zlog_debug("%s: connection in progress",dmn->name);
788 dmn->state = DAEMON_CONNECTING;
789 dmn->fd = sock;
790 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
791 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
792 gs.timeout);
793 SET_READ_HANDLER(dmn);
794 return 0;
795 }
796
797 dmn->fd = sock;
798 SET_READ_HANDLER(dmn);
799 daemon_up(dmn,"connect succeeded");
800 return 1;
801}
802
803static int
804phase_hanging(struct thread *t_hanging)
805{
806 gs.t_phase_hanging = NULL;
807 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
808 phase_str[gs.phase],PHASE_TIMEOUT);
809 gs.phase = PHASE_NONE;
810 return 0;
811}
812
813static void
814set_phase(restart_phase_t new_phase)
815{
816 gs.phase = new_phase;
817 if (gs.t_phase_hanging)
818 thread_cancel(gs.t_phase_hanging);
819 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
820 PHASE_TIMEOUT);
821}
822
823static void
824phase_check(void)
825{
826 switch (gs.phase)
827 {
828 case PHASE_NONE:
829 break;
830 case PHASE_STOPS_PENDING:
831 if (gs.numpids)
832 break;
833 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
834 set_phase(PHASE_WAITING_DOWN);
835 /*FALLTHRU*/
836 case PHASE_WAITING_DOWN:
837 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
838 break;
839 zlog_info("Phased restart: all routing daemons now down.");
840 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
841 set_phase(PHASE_ZEBRA_RESTART_PENDING);
842 /*FALLTHRU*/
843 case PHASE_ZEBRA_RESTART_PENDING:
844 if (gs.special->restart.pid)
845 break;
846 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
847 set_phase(PHASE_WAITING_ZEBRA_UP);
848 /*FALLTHRU*/
849 case PHASE_WAITING_ZEBRA_UP:
850 if (!IS_UP(gs.special))
851 break;
852 zlog_info("Phased restart: %s is now up.",gs.special->name);
853 {
854 struct daemon *dmn;
855 for (dmn = gs.daemons; dmn; dmn = dmn->next)
856 {
857 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000858 run_job(&dmn->restart,"start",gs.start_command,1,0);
ajs8b886ca2004-12-22 02:56:38 +0000859 }
860 }
861 gs.phase = PHASE_NONE;
862 THREAD_OFF(gs.t_phase_hanging);
863 zlog_notice("Phased global restart has completed.");
864 break;
865 }
866}
867
868static void
869try_restart(struct daemon *dmn)
870{
871 switch (gs.mode)
872 {
873 case MODE_MONITOR:
874 return;
875 case MODE_GLOBAL_RESTART:
876 run_job(&gs.restart,"restart",gs.restart_command,0,1);
877 break;
878 case MODE_SEPARATE_RESTART:
879 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
880 break;
881 case MODE_PHASED_ZEBRA_RESTART:
882 if (dmn != gs.special)
883 {
884 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
885 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
886 else
887 zlog_debug("%s: postponing restart attempt because master %s daemon "
888 "not up [%s], or phased restart in progress",
889 dmn->name,gs.special->name,state_str[gs.special->state]);
890 break;
891 }
892 /*FALLTHRU*/
893 case MODE_PHASED_ALL_RESTART:
894 if ((gs.phase != PHASE_NONE) || gs.numpids)
895 {
896 if (gs.loglevel > LOG_DEBUG+1)
897 zlog_debug("postponing phased global restart: restart already in "
898 "progress [%s], or outstanding child processes [%d]",
899 phase_str[gs.phase],gs.numpids);
900 break;
901 }
902 /* Is it too soon for a restart? */
903 {
904 struct timeval delay;
905 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
906 gs.special->restart.interval)
907 {
908 if (gs.loglevel > LOG_DEBUG+1)
909 zlog_debug("postponing phased global restart: "
910 "elapsed time %ld < retry interval %ld",
911 (long)delay.tv_sec,gs.special->restart.interval);
912 break;
913 }
914 }
915 zlog_info("Phased restart: stopping all routing daemons.");
916 /* First step: stop all other daemons. */
917 for (dmn = gs.daemons; dmn; dmn = dmn->next)
918 {
919 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000920 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
ajs8b886ca2004-12-22 02:56:38 +0000921 }
922 set_phase(PHASE_STOPS_PENDING);
923 break;
924 default:
925 zlog_err("error: unknown restart mode %d",gs.mode);
926 break;
927 }
928}
929
930static int
931wakeup_unresponsive(struct thread *t_wakeup)
932{
933 struct daemon *dmn = THREAD_ARG(t_wakeup);
934
935 dmn->t_wakeup = NULL;
936 if (dmn->state != DAEMON_UNRESPONSIVE)
937 zlog_err("%s: no longer unresponsive (now %s), "
938 "wakeup should have been cancelled!",
939 dmn->name,state_str[dmn->state]);
940 else
941 {
942 SET_WAKEUP_UNRESPONSIVE(dmn);
943 try_restart(dmn);
944 }
945 return 0;
946}
947
948static int
949wakeup_no_answer(struct thread *t_wakeup)
950{
951 struct daemon *dmn = THREAD_ARG(t_wakeup);
952
953 dmn->t_wakeup = NULL;
954 dmn->state = DAEMON_UNRESPONSIVE;
955 zlog_err("%s state -> unresponsive : no response yet to ping "
956 "sent %ld seconds ago",dmn->name,gs.timeout);
957 if (gs.unresponsive_restart)
958 {
959 SET_WAKEUP_UNRESPONSIVE(dmn);
960 try_restart(dmn);
961 }
962 return 0;
963}
964
965static int
966wakeup_send_echo(struct thread *t_wakeup)
967{
968 static const char echocmd[] = "echo " PING_TOKEN;
969 ssize_t rc;
970 struct daemon *dmn = THREAD_ARG(t_wakeup);
971
972 dmn->t_wakeup = NULL;
973 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
974 ((size_t)rc != sizeof(echocmd)))
975 {
976 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000977 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
978 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000979 daemon_down(dmn,why);
980 }
981 else
982 {
983 gettimeofday(&dmn->echo_sent,NULL);
984 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
985 }
986 return 0;
987}
988
/* Handler installed for SIGINT and SIGTERM (see the my_signals table in
   main): log that we are terminating, then exit with a success status. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
995
/* Check that a command template contains exactly one '%' character and
   that it is immediately followed by 's' (i.e. a single "%s"
   placeholder).  Returns 1 if the template is acceptable, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *percent = strchr(cmd,'%');

  /* No placeholder at all. */
  if (percent == NULL)
    return 0;

  /* The first '%' must introduce "%s". */
  if (percent[1] != 's')
    return 0;

  /* Any further '%' disqualifies the template. */
  return strchr(percent+1,'%') == NULL;
}
1003
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of
   blankstr has been replaced by a single space character.  The caller
   owns the returned string.  If the copy cannot be allocated, an error
   is printed with perror() and the program exits with status 1.

   An empty blankstr leaves the command unchanged. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() matches an empty needle at every position; substituting it
     would never terminate and would grow the string past its allocation. */
  if (bslen == 0)
    return res;

  /* Resume each search just past the blank we inserted, so a marker that
     itself begins with a space is not matched again at the same spot. */
  for (p = res; (p = strstr(p,blankstr)) != NULL; p++)
    {
      *p = ' ';
      /* Close the gap left by the rest of the marker (NUL included). */
      if (bslen != 1)
	memmove(p+1,p+bslen,strlen(p+bslen)+1);
    }
  return res;
}
1026
ajs8b886ca2004-12-22 02:56:38 +00001027int
1028main(int argc, char **argv)
1029{
1030 const char *progname;
1031 int opt;
1032 int daemon_mode = 0;
1033 const char *pidfile = DEFAULT_PIDFILE;
1034 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001035 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001036 static struct quagga_signal_t my_signals[] =
1037 {
1038 {
1039 .signal = SIGINT,
1040 .handler = sigint,
1041 },
1042 {
1043 .signal = SIGTERM,
1044 .handler = sigint,
1045 },
1046 {
1047 .signal = SIGCHLD,
1048 .handler = sigchild,
1049 },
1050 };
1051
1052 if ((progname = strrchr (argv[0], '/')) != NULL)
1053 progname++;
1054 else
1055 progname = argv[0];
1056
ajs098e2402004-12-22 17:00:46 +00001057 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001058 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001059 longopts, 0)) != EOF)
1060 {
1061 switch (opt)
1062 {
1063 case 0:
1064 break;
1065 case 'a':
1066 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1067 {
1068 fputs("Ambiguous operating mode selected.\n",stderr);
1069 return usage(progname,1);
1070 }
1071 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1072 break;
1073 case 'A':
1074 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1075 {
1076 fputs("Ambiguous operating mode selected.\n",stderr);
1077 return usage(progname,1);
1078 }
1079 gs.mode = MODE_PHASED_ALL_RESTART;
1080 break;
ajsc8b40f82004-12-22 16:17:16 +00001081 case 'b':
1082 blankstr = optarg;
1083 break;
ajs8b886ca2004-12-22 02:56:38 +00001084 case 'd':
1085 daemon_mode = 1;
1086 break;
1087 case 'e':
1088 gs.do_ping = 0;
1089 break;
1090 case 'k':
1091 if (!valid_command(optarg))
1092 {
1093 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1094 optarg);
1095 return usage(progname,1);
1096 }
1097 gs.stop_command = optarg;
1098 break;
1099 case 'l':
1100 {
1101 char garbage[3];
1102 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1103 (gs.loglevel < LOG_EMERG))
1104 {
1105 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1106 return usage(progname,1);
1107 }
1108 }
1109 break;
1110 case 'm':
1111 {
1112 char garbage[3];
1113 if ((sscanf(optarg,"%ld%1s",
1114 &gs.min_restart_interval,garbage) != 1) ||
1115 (gs.min_restart_interval < 0))
1116 {
1117 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1118 optarg);
1119 return usage(progname,1);
1120 }
1121 }
1122 break;
1123 case 'M':
1124 {
1125 char garbage[3];
1126 if ((sscanf(optarg,"%ld%1s",
1127 &gs.max_restart_interval,garbage) != 1) ||
1128 (gs.max_restart_interval < 0))
1129 {
1130 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1131 optarg);
1132 return usage(progname,1);
1133 }
1134 }
1135 break;
1136 case 'i':
1137 {
1138 char garbage[3];
1139 int period;
1140 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1141 (gs.period < 1))
1142 {
1143 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1144 return usage(progname,1);
1145 }
1146 gs.period = 1000*period;
1147 }
1148 break;
1149 case 'p':
1150 pidfile = optarg;
1151 break;
1152 case 'r':
1153 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1154 (gs.mode == MODE_SEPARATE_RESTART))
1155 {
1156 fputs("Ambiguous operating mode selected.\n",stderr);
1157 return usage(progname,1);
1158 }
1159 if (!valid_command(optarg))
1160 {
1161 fprintf(stderr,
1162 "Invalid restart command, must contain '%%s': %s\n",
1163 optarg);
1164 return usage(progname,1);
1165 }
1166 gs.restart_command = optarg;
1167 if (gs.mode == MODE_MONITOR)
1168 gs.mode = MODE_SEPARATE_RESTART;
1169 break;
1170 case 'R':
1171 if (gs.mode != MODE_MONITOR)
1172 {
1173 fputs("Ambiguous operating mode selected.\n",stderr);
1174 return usage(progname,1);
1175 }
1176 if (strchr(optarg,'%'))
1177 {
1178 fprintf(stderr,
1179 "Invalid restart-all arg, must not contain '%%s': %s\n",
1180 optarg);
1181 return usage(progname,1);
1182 }
1183 gs.restart_command = optarg;
1184 gs.mode = MODE_GLOBAL_RESTART;
1185 break;
1186 case 's':
1187 if (!valid_command(optarg))
1188 {
1189 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1190 optarg);
1191 return usage(progname,1);
1192 }
1193 gs.start_command = optarg;
1194 break;
1195 case 'S':
1196 gs.vtydir = optarg;
1197 break;
1198 case 't':
1199 {
1200 char garbage[3];
1201 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1202 (gs.timeout < 1))
1203 {
1204 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1205 return usage(progname,1);
1206 }
1207 }
1208 break;
1209 case 'T':
1210 {
1211 char garbage[3];
1212 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1213 (gs.restart_timeout < 1))
1214 {
1215 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1216 return usage(progname,1);
1217 }
1218 }
1219 break;
1220 case 'z':
1221 gs.unresponsive_restart = 1;
1222 break;
1223 case 'v':
1224 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1225 puts("Copyright 2004 Andrew J. Schorr");
1226 return 0;
1227 case 'h':
1228 return usage(progname,0);
1229 default:
1230 fputs("Invalid option.\n",stderr);
1231 return usage(progname,1);
1232 }
1233 }
1234
1235 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1236 {
1237 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1238 return usage(progname,1);
1239 }
1240 switch (gs.mode)
1241 {
1242 case MODE_MONITOR:
1243 if (gs.restart_command || gs.start_command || gs.stop_command)
1244 {
1245 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1246 mode_str[gs.mode]);
1247 return usage(progname,1);
1248 }
1249 break;
1250 case MODE_GLOBAL_RESTART:
1251 case MODE_SEPARATE_RESTART:
1252 if (!gs.restart_command || gs.start_command || gs.stop_command)
1253 {
1254 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1255 mode_str[gs.mode]);
1256 return usage(progname,1);
1257 }
1258 break;
1259 case MODE_PHASED_ZEBRA_RESTART:
1260 case MODE_PHASED_ALL_RESTART:
1261 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1262 {
1263 fprintf(stderr,
1264 "Need start, kill, and restart commands in [%s] mode.\n",
1265 mode_str[gs.mode]);
1266 return usage(progname,1);
1267 }
1268 break;
1269 }
1270
ajsc8b40f82004-12-22 16:17:16 +00001271 if (blankstr)
1272 {
1273 if (gs.restart_command)
1274 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1275 if (gs.start_command)
1276 gs.start_command = translate_blanks(gs.start_command,blankstr);
1277 if (gs.stop_command)
1278 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1279 }
1280
ajs8b886ca2004-12-22 02:56:38 +00001281 gs.restart.interval = gs.min_restart_interval;
1282 master = thread_master_create();
Balaji.G837d16c2012-09-26 14:09:10 +05301283 signal_init (master, array_size(my_signals), my_signals);
ajs8b886ca2004-12-22 02:56:38 +00001284 srandom(time(NULL));
1285
1286 {
1287 int i;
1288 struct daemon *tail = NULL;
1289
1290 for (i = optind; i < argc; i++)
1291 {
1292 struct daemon *dmn;
1293
1294 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1295 {
ajs098e2402004-12-22 17:00:46 +00001296 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1297 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001298 return 1;
1299 }
1300 dmn->name = dmn->restart.name = argv[i];
1301 dmn->state = DAEMON_INIT;
1302 gs.numdaemons++;
1303 gs.numdown++;
1304 dmn->fd = -1;
1305 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1306 100+(random() % 900));
1307 dmn->restart.interval = gs.min_restart_interval;
1308 if (tail)
1309 tail->next = dmn;
1310 else
1311 gs.daemons = dmn;
1312 tail = dmn;
1313
1314 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1315 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1316 !strcmp(dmn->name,special))
1317 gs.special = dmn;
1318 }
1319 }
1320 if (!gs.daemons)
1321 {
1322 fputs("Must specify one or more daemons to monitor.\n",stderr);
1323 return usage(progname,1);
1324 }
1325 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1326 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1327 {
1328 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1329 mode_str[gs.mode],special);
1330 return usage(progname,1);
1331 }
1332 if (gs.special && (gs.numdaemons < 2))
1333 {
1334 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1335 "to watch.\n",mode_str[gs.mode]);
1336 return usage(progname,1);
1337 }
1338
1339 zlog_default = openzlog(progname, ZLOG_NONE,
1340 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1341 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1342 if (daemon_mode)
1343 {
1344 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001345 if (daemon (0, 0) < 0)
1346 {
1347 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1348 exit (1);
1349 }
ajs8b886ca2004-12-22 02:56:38 +00001350 }
1351 else
1352 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1353
1354 /* Make sure we're not already running. */
1355 pid_output (pidfile);
1356
1357 /* Announce which daemons are being monitored. */
1358 {
1359 struct daemon *dmn;
1360 size_t len = 0;
1361
1362 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1363 len += strlen(dmn->name)+1;
1364
1365 {
1366 char buf[len+1];
1367 char *p = buf;
1368
1369 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1370 {
1371 if (p != buf)
1372 *p++ = ' ';
1373 strcpy(p,dmn->name);
1374 p += strlen(p);
1375 }
1376 zlog_notice("%s %s watching [%s], mode [%s]",
1377 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1378 }
1379 }
1380
1381 {
1382 struct thread thread;
1383
1384 while (thread_fetch (master, &thread))
1385 thread_call (&thread);
1386 }
1387
1388 /* Not reached. */
1389 return 0;
1390}