blob: 9bd7a5f2538b1df9d18ea10c8cca30d222efe4d8 [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
ajs8b886ca2004-12-22 02:56:38 +00002 Monitor status of quagga daemons and restart if necessary.
3
4 Copyright (C) 2004 Andrew J. Schorr
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
ajsa3655342004-12-29 17:39:10 +000021#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000022#include <thread.h>
23#include <log.h>
ajs52e66292005-02-16 20:40:25 +000024#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000025#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000026#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000027#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000028#include <sys/un.h>
29#include <sys/wait.h>
Balaji.G837d16c2012-09-26 14:09:10 +053030#include <memory.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of two values.  Each argument may be evaluated twice, so pass
   only expressions without side effects.  Defined here only when no header
   has already supplied it. */
#if !defined(MIN)
#define MIN(X, Y) ((X) <= (Y) ? (X) : (Y))
#endif
35
/* Timer randomization helpers.
   JITTER(X) is a pseudo-random offset in the range [-(X)/2, (X)-(X)/2].
   FUZZY(X) is X perturbed by a jitter of one twentieth of its value. */
#define JITTER(X) ((random() % ((X) + 1)) - ((X) / 2))
#define FUZZY(X)  ((X) + JITTER((X) / 20))
39
/* Built-in defaults; each one can be overridden from the command line. */
#define DEFAULT_PERIOD          5         /* status polling interval, seconds */
#define DEFAULT_TIMEOUT         10        /* unresponsiveness timeout, seconds */
#define DEFAULT_RESTART_TIMEOUT 20        /* background job kill timeout, seconds */
#define DEFAULT_LOGLEVEL        LOG_INFO
#define DEFAULT_MIN_RESTART     60        /* minimum seconds between restarts */
#define DEFAULT_MAX_RESTART     600       /* maximum seconds between restarts */

/* Pid file: use the build-time path when one was configured. */
#ifndef PATH_WATCHQUAGGA_PID
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#else
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#endif

/* Directory holding the daemons' vty sockets. */
#ifndef DAEMON_VTY_DIR
#define VTYDIR STATEDIR
#else
#define VTYDIR DAEMON_VTY_DIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN "PING"
58
/* Thread master handed to every thread_add_* call in this file.  It must
   have external linkage because code inside libzebra references it. */
struct thread_master *master;
61
/* Operating mode.  The numeric values index mode_str[] below. */
typedef enum
{
  MODE_MONITOR              = 0,
  MODE_GLOBAL_RESTART       = 1,
  MODE_SEPARATE_RESTART     = 2,
  MODE_PHASED_ZEBRA_RESTART = 3,
  MODE_PHASED_ALL_RESTART   = 4
} watch_mode_t;

/* Human-readable name of each watch_mode_t value. */
static const char *mode_str[] =
{
  [MODE_MONITOR]              = "monitor",
  [MODE_GLOBAL_RESTART]       = "global restart",
  [MODE_SEPARATE_RESTART]     = "individual daemon restart",
  [MODE_PHASED_ZEBRA_RESTART] = "phased zebra restart",
  [MODE_PHASED_ALL_RESTART]   = "phased global restart for any failure",
};
79
/* Steps of a phased restart.  The numeric values index phase_str[] below. */
typedef enum
{
  PHASE_NONE                  = 0,
  PHASE_STOPS_PENDING         = 1,
  PHASE_WAITING_DOWN          = 2,
  PHASE_ZEBRA_RESTART_PENDING = 3,
  PHASE_WAITING_ZEBRA_UP      = 4
} restart_phase_t;

/* Human-readable description of each restart_phase_t value.  The last
   entry has no corresponding enum value. */
static const char *phase_str[] =
{
  [PHASE_NONE]                  = "None",
  [PHASE_STOPS_PENDING]         = "Stop jobs running",
  [PHASE_WAITING_DOWN]          = "Waiting for other daemons to come down",
  [PHASE_ZEBRA_RESTART_PENDING] = "Zebra restart job running",
  [PHASE_WAITING_ZEBRA_UP]      = "Waiting for zebra to come up",
  [5]                           = "Start jobs running",
};
98
/* Seconds a single restart phase may last before the phased restart is
   abandoned: three times the configured restart timeout. */
#define PHASE_TIMEOUT (gs.restart_timeout * 3)
100
/* Bookkeeping for one background (start/stop/restart) job. */
struct restart_info
{
  const char *name;       /* name substituted into the command and logged */
  const char *what;       /* kind of job last launched, e.g. "restart" */
  pid_t pid;              /* pid of the running job, 0 when none */
  struct timeval time;    /* when the job was launched or last completed */
  long interval;          /* minimum seconds between two jobs */
  struct thread *t_kill;  /* timer that signals a job running too long */
  int kills;              /* signals sent to the current job so far */
};
111
112static struct global_state
113{
ajs8b886ca2004-12-22 02:56:38 +0000114 watch_mode_t mode;
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
117 const char *vtydir;
118 long period;
119 long timeout;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
123 int do_ping;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000128 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000129 int unresponsive_restart;
130 int loglevel;
131 struct daemon *special; /* points to zebra when doing phased restart */
132 int numdaemons;
133 int numpids;
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
135} gs = {
136 .mode = MODE_MONITOR,
137 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000138 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
145 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000146};
147
/* Connection state of a monitored daemon.  The numeric values index
   state_str[] below. */
typedef enum
{
  DAEMON_INIT         = 0,
  DAEMON_DOWN         = 1,
  DAEMON_CONNECTING   = 2,
  DAEMON_UP           = 3,
  DAEMON_UNRESPONSIVE = 4
} daemon_state_t;

/* True when the daemon is connected, whether or not it answers pings.
   DMN is evaluated twice. */
#define IS_UP(DMN) \
  ((DMN)->state == DAEMON_UP || (DMN)->state == DAEMON_UNRESPONSIVE)

/* Human-readable name of each daemon_state_t value. */
static const char *state_str[] =
{
  [DAEMON_INIT]         = "Init",
  [DAEMON_DOWN]         = "Down",
  [DAEMON_CONNECTING]   = "Connecting",
  [DAEMON_UP]           = "Up",
  [DAEMON_UNRESPONSIVE] = "Unresponsive",
};
168
169struct daemon {
170 const char *name;
171 daemon_state_t state;
172 int fd;
173 struct timeval echo_sent;
174 u_int connect_tries;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
178 struct daemon *next;
179 struct restart_info restart;
180};
181
/* Long command-line options and the short option letter each maps to.
   The all-zero entry terminates the table. */
static const struct option longopts[] =
{
  { "daemon",               no_argument,       NULL, 'd' },
  { "statedir",             required_argument, NULL, 'S' },
  { "no-echo",              no_argument,       NULL, 'e' },
  { "loglevel",             required_argument, NULL, 'l' },
  { "interval",             required_argument, NULL, 'i' },
  { "timeout",              required_argument, NULL, 't' },
  { "restart-timeout",      required_argument, NULL, 'T' },
  { "restart",              required_argument, NULL, 'r' },
  { "start-command",        required_argument, NULL, 's' },
  { "kill-command",         required_argument, NULL, 'k' },
  { "restart-all",          required_argument, NULL, 'R' },
  { "all-restart",          no_argument,       NULL, 'a' },
  { "always-all-restart",   no_argument,       NULL, 'A' },
  { "unresponsive-restart", no_argument,       NULL, 'z' },
  { "min-restart-interval", required_argument, NULL, 'm' },
  { "max-restart-interval", required_argument, NULL, 'M' },
  { "pid-file",             required_argument, NULL, 'p' },
  { "blank-string",         required_argument, NULL, 'b' },
  { "help",                 no_argument,       NULL, 'h' },
  { "version",              no_argument,       NULL, 'v' },
  { NULL,                   0,                 NULL, 0   }
};
206
/* Forward declarations for functions used before their definitions. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
211
/* Print help text and return `status` unchanged.
   A non-zero status writes only a one-line hint pointing at --help to
   stderr.  A zero status writes the full description of the five operating
   modes, followed by the option list with its default values, to stdout. */
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
David Lamparter33b96632015-03-03 08:57:25 +0100218 {
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
ajs8b886ca2004-12-22 02:56:38 +0000220Watchdog program to monitor status of quagga daemons and try to restart\n\
221them if they are down or unresponsive. It determines whether a daemon is\n\
222up based on whether it can connect to the daemon's vty unix stream socket.\n\
223It then repeatedly sends echo commands over that socket to determine whether\n\
224the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225on the socket connection and know immediately that the daemon is down.\n\n\
226The daemons to be monitored should be listed on the command line.\n\n\
227This program can run in one of 5 modes:\n\n\
2280. Mode: %s.\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
2311. Mode: %s.\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
234 %s -dz \\\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
236 zebra ospfd\n\n\
2372. Mode: %s.\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2413. Mode: %s.\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
245 daemons. Example:\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2494. Mode: %s.\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255As of this writing, it is believed that mode 2 [%s]\n\
256is not safe, and mode 3 [%s] may not be safe with some of the\n\
257routing daemons.\n\n\
258In order to avoid attempting to restart the daemons in a fast loop,\n\
259the -m and -M options allow you to control the minimum delay between\n\
260restart commands. The minimum restart delay is recalculated each time\n\
261a restart is attempted: if the time since the last restart attempt exceeds\n\
262twice the -M value, then the restart delay is set to the -m value.\n\
David Lamparter33b96632015-03-03 08:57:25 +0100263Otherwise, the interval is doubled (but capped at the -M value).\n\n",
264 progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
265 progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],
266 mode_str[3]);
267
268 printf("Options:\n\
ajs8b886ca2004-12-22 02:56:38 +0000269-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
270 to syslog instead of stdout.\n\
271-S, --statedir Set the vty socket directory (default is %s)\n\
272-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
273 option is necessary if the daemons do not support the\n\
274 echo command)\n\
275-l, --loglevel Set the logging level (default is %d).\n\
276 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
277 but it can be set higher than %d if extra-verbose debugging\n\
278 messages are desired.\n\
279-m, --min-restart-interval\n\
280 Set the minimum seconds to wait between invocations of daemon\n\
281 restart commands (default is %d).\n\
282-M, --max-restart-interval\n\
283 Set the maximum seconds to wait between invocations of daemon\n\
284 restart commands (default is %d).\n\
285-i, --interval Set the status polling interval in seconds (default is %d)\n\
286-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
287-T, --restart-timeout\n\
288 Set the restart (kill) timeout in seconds (default is %d).\n\
289 If any background jobs are still running after this much\n\
290 time has elapsed, they will be killed.\n\
291-r, --restart Supply a Bourne shell command to use to restart a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294 Note that -r and -R are incompatible.\n\
295-s, --start-command\n\
296 Supply a Bourne shell to command to use to start a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
299-k, --kill-command\n\
300 Supply a Bourne shell to command to use to stop a single\n\
301 daemon. The command string should include '%%s' where the\n\
302 name of the daemon should be substituted.\n\
303-R, --restart-all\n\
304 When one or more daemons is down, try to restart everything\n\
305 using the Bourne shell command supplied as the argument.\n\
306 Note that -r and -R are incompatible.\n\
307-z, --unresponsive-restart\n\
308 When a daemon is unresponsive, treat it as being down for\n\
309 restart purposes.\n\
310-a, --all-restart\n\
311 When zebra hangs or crashes, restart all daemons using\n\
312 this phased approach: 1. stop all other daemons; 2. restart\n\
313 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
314-A, --always-all-restart\n\
315 When any daemon (not just zebra) hangs or crashes, use the\n\
316 same phased restart mechanism described above for -a.\n\
317 Requires -r, -s, and -k.\n\
318-p, --pid-file Set process identifier file name\n\
319 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000320-b, --blank-string\n\
321 When the supplied argument string is found in any of the\n\
322 various shell command arguments (-r, -s, -k, or -R), replace\n\
323 it with a space. This is an ugly hack to circumvent problems\n\
324 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000325-v, --version Print program version\n\
David Lamparter33b96632015-03-03 08:57:25 +0100326-h, --help Display this help and exit\n",
327 VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
328 DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
329 DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,
330 DEFAULT_PIDFILE);
331 }
ajs8b886ca2004-12-22 02:56:38 +0000332
333 return status;
334}
335
/* Run shell_cmd through "/bin/sh -c" in a forked child and return at once.
   Returns the child's pid, or -1 if fork failed.  The child is not waited
   for here; the caller reaps it later. */
static pid_t
run_background(char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      char shell[] = "sh";
      char dashc[] = "-c";
      char *const argv[4] = { shell, dashc, shell_cmd, NULL };

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));

      execv("/bin/sh", argv);
      /* Only reached when execv itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
367
/* Store in *result the wall-clock time that has passed since *start_time
   and return result.  tv_usec is normalized to be non-negative by borrowing
   from tv_sec. */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  /* Read the current time straight into the result, then subtract. */
  gettimeofday(result, NULL);
  result->tv_usec -= start_time->tv_usec;
  result->tv_sec -= start_time->tv_sec;

  while (result->tv_usec < 0)
    {
      result->tv_sec--;
      result->tv_usec += 1000000L;
    }

  return result;
}
381
382static int
383restart_kill(struct thread *t_kill)
384{
385 struct restart_info *restart = THREAD_ARG(t_kill);
386 struct timeval delay;
387
388 time_elapsed(&delay,&restart->time);
389 zlog_warn("Warning: %s %s child process %d still running after "
390 "%ld seconds, sending signal %d",
David Lamparteref008d22015-03-03 08:48:11 +0100391 restart->what,restart->name,(int)restart->pid, (long)delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000392 (restart->kills ? SIGKILL : SIGTERM));
393 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
394 restart->kills++;
395 restart->t_kill = thread_add_timer(master,restart_kill,restart,
396 gs.restart_timeout);
397 return 0;
398}
399
400static struct restart_info *
401find_child(pid_t child)
402{
403 if (gs.mode == MODE_GLOBAL_RESTART)
404 {
405 if (gs.restart.pid == child)
406 return &gs.restart;
407 }
408 else
409 {
410 struct daemon *dmn;
411 for (dmn = gs.daemons; dmn; dmn = dmn->next)
412 {
413 if (dmn->restart.pid == child)
414 return &dmn->restart;
415 }
416 }
417 return NULL;
418}
419
420static void
421sigchild(void)
422{
423 pid_t child;
424 int status;
425 const char *name;
426 const char *what;
427 struct restart_info *restart;
428
429 switch (child = waitpid(-1,&status,WNOHANG))
430 {
431 case -1:
432 zlog_err("waitpid failed: %s",safe_strerror(errno));
433 return;
434 case 0:
435 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
436 return;
437 }
438
439 if ((restart = find_child(child)) != NULL)
440 {
441 name = restart->name;
442 what = restart->what;
443 restart->pid = 0;
444 gs.numpids--;
445 thread_cancel(restart->t_kill);
446 restart->t_kill = NULL;
447 /* Update restart time to reflect the time the command completed. */
448 gettimeofday(&restart->time,NULL);
449 }
450 else
451 {
452 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000453 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000454 name = "(unknown)";
455 what = "background";
456 }
457 if (WIFSTOPPED(status))
458 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000459 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000460 else if (WIFSIGNALED(status))
461 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000462 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000463 else if (WIFEXITED(status))
464 {
465 if (WEXITSTATUS(status) != 0)
466 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000467 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000468 else
ajsf2d82572004-12-29 17:45:08 +0000469 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000470 }
471 else
472 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000473 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000474 phase_check();
475}
476
477static int
478run_job(struct restart_info *restart, const char *cmdtype, const char *command,
479 int force, int update_interval)
480{
481 struct timeval delay;
482
483 if (gs.loglevel > LOG_DEBUG+1)
484 zlog_debug("attempting to %s %s",cmdtype,restart->name);
485
486 if (restart->pid)
487 {
488 if (gs.loglevel > LOG_DEBUG+1)
489 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000490 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000491 return -1;
492 }
493
ajsa8a8ddc2005-01-12 16:24:51 +0000494 /* Note: time_elapsed test must come before the force test, since we need
495 to make sure that delay is initialized for use below in updating the
496 restart interval. */
497 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
498 !force)
ajs8b886ca2004-12-22 02:56:38 +0000499 {
500 if (gs.loglevel > LOG_DEBUG+1)
501 zlog_debug("postponing %s %s: "
502 "elapsed time %ld < retry interval %ld",
503 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
504 return -1;
505 }
506
507 gettimeofday(&restart->time,NULL);
508 restart->kills = 0;
509 {
510 char cmd[strlen(command)+strlen(restart->name)+1];
511 snprintf(cmd,sizeof(cmd),command,restart->name);
512 if ((restart->pid = run_background(cmd)) > 0)
513 {
514 restart->t_kill = thread_add_timer(master,restart_kill,restart,
515 gs.restart_timeout);
516 restart->what = cmdtype;
517 gs.numpids++;
518 }
519 else
520 restart->pid = 0;
521 }
522
523 /* Calculate the new restart interval. */
524 if (update_interval)
525 {
526 if (delay.tv_sec > 2*gs.max_restart_interval)
527 restart->interval = gs.min_restart_interval;
528 else if ((restart->interval *= 2) > gs.max_restart_interval)
529 restart->interval = gs.max_restart_interval;
530 if (gs.loglevel > LOG_DEBUG+1)
531 zlog_debug("restart %s interval is now %ld",
532 restart->name,restart->interval);
533 }
534 return restart->pid;
535}
536
/* Scheduling shorthands.  Each one registers a callback for DMN with the
   thread master and stores the returned handle in the daemon record. */

/* Call handle_read when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* The three wakeups below fire after one randomized polling period. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
551
552static int
553wakeup_down(struct thread *t_wakeup)
554{
555 struct daemon *dmn = THREAD_ARG(t_wakeup);
556
557 dmn->t_wakeup = NULL;
558 if (try_connect(dmn) < 0)
559 SET_WAKEUP_DOWN(dmn);
560 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
561 try_restart(dmn);
562 return 0;
563}
564
565static int
566wakeup_init(struct thread *t_wakeup)
567{
568 struct daemon *dmn = THREAD_ARG(t_wakeup);
569
570 dmn->t_wakeup = NULL;
571 if (try_connect(dmn) < 0)
572 {
573 SET_WAKEUP_DOWN(dmn);
574 zlog_err("%s state -> down : initial connection attempt failed",
575 dmn->name);
576 dmn->state = DAEMON_DOWN;
577 }
578 return 0;
579}
580
581static void
582daemon_down(struct daemon *dmn, const char *why)
583{
584 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
585 zlog_err("%s state -> down : %s",dmn->name,why);
586 else if (gs.loglevel > LOG_DEBUG)
587 zlog_debug("%s still down : %s",dmn->name,why);
588 if (IS_UP(dmn))
589 gs.numdown++;
590 dmn->state = DAEMON_DOWN;
591 if (dmn->fd >= 0)
592 {
593 close(dmn->fd);
594 dmn->fd = -1;
595 }
596 THREAD_OFF(dmn->t_read);
597 THREAD_OFF(dmn->t_write);
598 THREAD_OFF(dmn->t_wakeup);
599 if (try_connect(dmn) < 0)
600 SET_WAKEUP_DOWN(dmn);
601 phase_check();
602}
603
604static int
605handle_read(struct thread *t_read)
606{
607 struct daemon *dmn = THREAD_ARG(t_read);
608 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
609 char buf[sizeof(resp)+100];
610 ssize_t rc;
611 struct timeval delay;
612
613 dmn->t_read = NULL;
614 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
615 {
616 char why[100];
617
ajs518cde82005-02-17 20:11:58 +0000618 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000619 {
620 /* Pretend it never happened. */
621 SET_READ_HANDLER(dmn);
622 return 0;
623 }
624 snprintf(why,sizeof(why),"unexpected read error: %s",
625 safe_strerror(errno));
626 daemon_down(dmn,why);
627 return 0;
628 }
629 if (rc == 0)
630 {
631 daemon_down(dmn,"read returned EOF");
632 return 0;
633 }
634 if (!dmn->echo_sent.tv_sec)
635 {
636 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000637 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
638 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000639 daemon_down(dmn,why);
640 return 0;
641 }
642
643 /* We are expecting an echo response: is there any chance that the
644 response would not be returned entirely in the first read? That
645 seems inconceivable... */
646 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
647 {
648 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000649 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
650 "(expecting %u): %.*s",
651 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000652 daemon_down(dmn,why);
653 return 0;
654 }
655
656 time_elapsed(&delay,&dmn->echo_sent);
657 dmn->echo_sent.tv_sec = 0;
658 if (dmn->state == DAEMON_UNRESPONSIVE)
659 {
660 if (delay.tv_sec < gs.timeout)
661 {
662 dmn->state = DAEMON_UP;
663 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100664 "seconds", dmn->name,
665 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000666 }
667 else
668 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100669 "seconds", dmn->name,
670 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000671 }
672 else if (gs.loglevel > LOG_DEBUG+1)
673 zlog_debug("%s: echo response received after %ld.%06ld seconds",
David Lamparteref008d22015-03-03 08:48:11 +0100674 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000675
676 SET_READ_HANDLER(dmn);
677 if (dmn->t_wakeup)
678 thread_cancel(dmn->t_wakeup);
679 SET_WAKEUP_ECHO(dmn);
680
681 return 0;
682}
683
684static void
685daemon_up(struct daemon *dmn, const char *why)
686{
687 dmn->state = DAEMON_UP;
688 gs.numdown--;
689 dmn->connect_tries = 0;
690 zlog_notice("%s state -> up : %s",dmn->name,why);
691 if (gs.do_ping)
692 SET_WAKEUP_ECHO(dmn);
693 phase_check();
694}
695
696static int
697check_connect(struct thread *t_write)
698{
699 struct daemon *dmn = THREAD_ARG(t_write);
700 int sockerr;
701 socklen_t reslen = sizeof(sockerr);
702
703 dmn->t_write = NULL;
704 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
705 {
706 zlog_warn("%s: check_connect: getsockopt failed: %s",
707 dmn->name,safe_strerror(errno));
708 daemon_down(dmn,"getsockopt failed checking connection success");
709 return 0;
710 }
711 if ((reslen == sizeof(sockerr)) && sockerr)
712 {
713 char why[100];
714 snprintf(why,sizeof(why),
715 "getsockopt reports that connection attempt failed: %s",
716 safe_strerror(sockerr));
717 daemon_down(dmn,why);
718 return 0;
719 }
720
721 daemon_up(dmn,"delayed connect succeeded");
722 return 0;
723}
724
725static int
726wakeup_connect_hanging(struct thread *t_wakeup)
727{
728 struct daemon *dmn = THREAD_ARG(t_wakeup);
729 char why[100];
730
731 dmn->t_wakeup = NULL;
732 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
733 gs.timeout);
734 daemon_down(dmn,why);
735 return 0;
736}
737
738/* Making connection to protocol daemon. */
739static int
740try_connect(struct daemon *dmn)
741{
742 int sock;
743 struct sockaddr_un addr;
744 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000745
746 if (gs.loglevel > LOG_DEBUG+1)
747 zlog_debug("%s: attempting to connect",dmn->name);
748 dmn->connect_tries++;
749
750 memset (&addr, 0, sizeof (struct sockaddr_un));
751 addr.sun_family = AF_UNIX;
752 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
753 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000754#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000755 len = addr.sun_len = SUN_LEN(&addr);
756#else
757 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000758#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000759
760 /* Quick check to see if we might succeed before we go to the trouble
761 of creating a socket. */
762 if (access(addr.sun_path, W_OK) < 0)
763 {
764 if (errno != ENOENT)
765 zlog_err("%s: access to socket %s denied: %s",
766 dmn->name,addr.sun_path,safe_strerror(errno));
767 return -1;
768 }
769
770 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
771 {
772 zlog_err("%s(%s): cannot make socket: %s",
773 __func__,addr.sun_path, safe_strerror(errno));
774 return -1;
775 }
776
ajs52e66292005-02-16 20:40:25 +0000777 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000778 {
ajs52e66292005-02-16 20:40:25 +0000779 zlog_err("%s(%s): set_nonblocking(%d) failed",
780 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000781 close(sock);
782 return -1;
783 }
784
785 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
786 {
787 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
788 {
789 if (gs.loglevel > LOG_DEBUG)
790 zlog_debug("%s(%s): connect failed: %s",
791 __func__,addr.sun_path, safe_strerror(errno));
792 close (sock);
793 return -1;
794 }
795 if (gs.loglevel > LOG_DEBUG)
796 zlog_debug("%s: connection in progress",dmn->name);
797 dmn->state = DAEMON_CONNECTING;
798 dmn->fd = sock;
799 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
800 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
801 gs.timeout);
802 SET_READ_HANDLER(dmn);
803 return 0;
804 }
805
806 dmn->fd = sock;
807 SET_READ_HANDLER(dmn);
808 daemon_up(dmn,"connect succeeded");
809 return 1;
810}
811
812static int
813phase_hanging(struct thread *t_hanging)
814{
815 gs.t_phase_hanging = NULL;
816 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
817 phase_str[gs.phase],PHASE_TIMEOUT);
818 gs.phase = PHASE_NONE;
819 return 0;
820}
821
822static void
823set_phase(restart_phase_t new_phase)
824{
825 gs.phase = new_phase;
826 if (gs.t_phase_hanging)
827 thread_cancel(gs.t_phase_hanging);
828 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
829 PHASE_TIMEOUT);
830}
831
832static void
833phase_check(void)
834{
835 switch (gs.phase)
836 {
837 case PHASE_NONE:
838 break;
839 case PHASE_STOPS_PENDING:
840 if (gs.numpids)
841 break;
842 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
843 set_phase(PHASE_WAITING_DOWN);
844 /*FALLTHRU*/
845 case PHASE_WAITING_DOWN:
846 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
847 break;
848 zlog_info("Phased restart: all routing daemons now down.");
849 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
850 set_phase(PHASE_ZEBRA_RESTART_PENDING);
851 /*FALLTHRU*/
852 case PHASE_ZEBRA_RESTART_PENDING:
853 if (gs.special->restart.pid)
854 break;
855 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
856 set_phase(PHASE_WAITING_ZEBRA_UP);
857 /*FALLTHRU*/
858 case PHASE_WAITING_ZEBRA_UP:
859 if (!IS_UP(gs.special))
860 break;
861 zlog_info("Phased restart: %s is now up.",gs.special->name);
862 {
863 struct daemon *dmn;
864 for (dmn = gs.daemons; dmn; dmn = dmn->next)
865 {
866 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000867 run_job(&dmn->restart,"start",gs.start_command,1,0);
ajs8b886ca2004-12-22 02:56:38 +0000868 }
869 }
870 gs.phase = PHASE_NONE;
871 THREAD_OFF(gs.t_phase_hanging);
872 zlog_notice("Phased global restart has completed.");
873 break;
874 }
875}
876
877static void
878try_restart(struct daemon *dmn)
879{
880 switch (gs.mode)
881 {
882 case MODE_MONITOR:
883 return;
884 case MODE_GLOBAL_RESTART:
885 run_job(&gs.restart,"restart",gs.restart_command,0,1);
886 break;
887 case MODE_SEPARATE_RESTART:
888 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
889 break;
890 case MODE_PHASED_ZEBRA_RESTART:
891 if (dmn != gs.special)
892 {
893 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
894 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
895 else
896 zlog_debug("%s: postponing restart attempt because master %s daemon "
897 "not up [%s], or phased restart in progress",
898 dmn->name,gs.special->name,state_str[gs.special->state]);
899 break;
900 }
901 /*FALLTHRU*/
902 case MODE_PHASED_ALL_RESTART:
903 if ((gs.phase != PHASE_NONE) || gs.numpids)
904 {
905 if (gs.loglevel > LOG_DEBUG+1)
906 zlog_debug("postponing phased global restart: restart already in "
907 "progress [%s], or outstanding child processes [%d]",
908 phase_str[gs.phase],gs.numpids);
909 break;
910 }
911 /* Is it too soon for a restart? */
912 {
913 struct timeval delay;
914 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
915 gs.special->restart.interval)
916 {
917 if (gs.loglevel > LOG_DEBUG+1)
918 zlog_debug("postponing phased global restart: "
919 "elapsed time %ld < retry interval %ld",
920 (long)delay.tv_sec,gs.special->restart.interval);
921 break;
922 }
923 }
924 zlog_info("Phased restart: stopping all routing daemons.");
925 /* First step: stop all other daemons. */
926 for (dmn = gs.daemons; dmn; dmn = dmn->next)
927 {
928 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000929 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
ajs8b886ca2004-12-22 02:56:38 +0000930 }
931 set_phase(PHASE_STOPS_PENDING);
932 break;
933 default:
934 zlog_err("error: unknown restart mode %d",gs.mode);
935 break;
936 }
937}
938
939static int
940wakeup_unresponsive(struct thread *t_wakeup)
941{
942 struct daemon *dmn = THREAD_ARG(t_wakeup);
943
944 dmn->t_wakeup = NULL;
945 if (dmn->state != DAEMON_UNRESPONSIVE)
946 zlog_err("%s: no longer unresponsive (now %s), "
947 "wakeup should have been cancelled!",
948 dmn->name,state_str[dmn->state]);
949 else
950 {
951 SET_WAKEUP_UNRESPONSIVE(dmn);
952 try_restart(dmn);
953 }
954 return 0;
955}
956
957static int
958wakeup_no_answer(struct thread *t_wakeup)
959{
960 struct daemon *dmn = THREAD_ARG(t_wakeup);
961
962 dmn->t_wakeup = NULL;
963 dmn->state = DAEMON_UNRESPONSIVE;
964 zlog_err("%s state -> unresponsive : no response yet to ping "
965 "sent %ld seconds ago",dmn->name,gs.timeout);
966 if (gs.unresponsive_restart)
967 {
968 SET_WAKEUP_UNRESPONSIVE(dmn);
969 try_restart(dmn);
970 }
971 return 0;
972}
973
974static int
975wakeup_send_echo(struct thread *t_wakeup)
976{
977 static const char echocmd[] = "echo " PING_TOKEN;
978 ssize_t rc;
979 struct daemon *dmn = THREAD_ARG(t_wakeup);
980
981 dmn->t_wakeup = NULL;
982 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
983 ((size_t)rc != sizeof(echocmd)))
984 {
985 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000986 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
987 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000988 daemon_down(dmn,why);
989 }
990 else
991 {
992 gettimeofday(&dmn->echo_sent,NULL);
993 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
994 }
995 return 0;
996}
997
/* Termination handler; main() installs it for both SIGINT and SIGTERM.
   Logs a notice and ends the process with a success status. */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
1004
/* Check that a command template contains exactly one '%' character and
   that it is immediately followed by 's'.
   Returns 1 if cmd satisfies this, 0 otherwise. */
static int
valid_command(const char *cmd)
{
  const char *percent = strchr(cmd,'%');

  /* No placeholder at all. */
  if (percent == NULL)
    return 0;

  /* The first '%' must introduce "%s". */
  if (percent[1] != 's')
    return 0;

  /* Any further '%' makes the template invalid. */
  return strchr(percent+1,'%') == NULL;
}
1012
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every non-overlapping
   occurrence of blankstr, scanning left to right, is replaced by a single
   space character.  The caller owns the returned string.  If the
   allocation fails, an error is printed and the process exits with
   status 1.  An empty blankstr matches nothing, so cmd is copied
   unchanged. */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  size_t bslen = strlen(blankstr);
  const char *src = cmd;
  const char *hit;
  char *res;
  char *dst;

  /* Every replacement swaps bslen (>= 1) bytes for one byte, so the result
     can never be longer than the input. */
  if (!(res = malloc(strlen(cmd)+1)))
    {
      perror("malloc");
      exit(1);
    }
  dst = res;

  /* strstr() matches an empty pattern at every position, which would keep
     the scan from ever advancing; skip substitution in that case. */
  if (bslen != 0)
    {
      while ((hit = strstr(src,blankstr)) != NULL)
        {
          size_t chunk = (size_t)(hit-src);

          memcpy(dst,src,chunk);
          dst += chunk;
          *dst++ = ' ';
          /* Resume after the match in the input, so the space just written
             is never scanned again (matters when blankstr itself contains
             a space). */
          src = hit+bslen;
        }
    }

  /* Copy the remainder, including the terminating NUL. */
  strcpy(dst,src);
  return res;
}
1035
ajs8b886ca2004-12-22 02:56:38 +00001036int
1037main(int argc, char **argv)
1038{
1039 const char *progname;
1040 int opt;
1041 int daemon_mode = 0;
1042 const char *pidfile = DEFAULT_PIDFILE;
1043 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001044 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001045 static struct quagga_signal_t my_signals[] =
1046 {
1047 {
1048 .signal = SIGINT,
1049 .handler = sigint,
1050 },
1051 {
1052 .signal = SIGTERM,
1053 .handler = sigint,
1054 },
1055 {
1056 .signal = SIGCHLD,
1057 .handler = sigchild,
1058 },
1059 };
1060
1061 if ((progname = strrchr (argv[0], '/')) != NULL)
1062 progname++;
1063 else
1064 progname = argv[0];
1065
ajs098e2402004-12-22 17:00:46 +00001066 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001067 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001068 longopts, 0)) != EOF)
1069 {
1070 switch (opt)
1071 {
1072 case 0:
1073 break;
1074 case 'a':
1075 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1076 {
1077 fputs("Ambiguous operating mode selected.\n",stderr);
1078 return usage(progname,1);
1079 }
1080 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1081 break;
1082 case 'A':
1083 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1084 {
1085 fputs("Ambiguous operating mode selected.\n",stderr);
1086 return usage(progname,1);
1087 }
1088 gs.mode = MODE_PHASED_ALL_RESTART;
1089 break;
ajsc8b40f82004-12-22 16:17:16 +00001090 case 'b':
1091 blankstr = optarg;
1092 break;
ajs8b886ca2004-12-22 02:56:38 +00001093 case 'd':
1094 daemon_mode = 1;
1095 break;
1096 case 'e':
1097 gs.do_ping = 0;
1098 break;
1099 case 'k':
1100 if (!valid_command(optarg))
1101 {
1102 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1103 optarg);
1104 return usage(progname,1);
1105 }
1106 gs.stop_command = optarg;
1107 break;
1108 case 'l':
1109 {
1110 char garbage[3];
1111 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1112 (gs.loglevel < LOG_EMERG))
1113 {
1114 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1115 return usage(progname,1);
1116 }
1117 }
1118 break;
1119 case 'm':
1120 {
1121 char garbage[3];
1122 if ((sscanf(optarg,"%ld%1s",
1123 &gs.min_restart_interval,garbage) != 1) ||
1124 (gs.min_restart_interval < 0))
1125 {
1126 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1127 optarg);
1128 return usage(progname,1);
1129 }
1130 }
1131 break;
1132 case 'M':
1133 {
1134 char garbage[3];
1135 if ((sscanf(optarg,"%ld%1s",
1136 &gs.max_restart_interval,garbage) != 1) ||
1137 (gs.max_restart_interval < 0))
1138 {
1139 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1140 optarg);
1141 return usage(progname,1);
1142 }
1143 }
1144 break;
1145 case 'i':
1146 {
1147 char garbage[3];
1148 int period;
1149 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1150 (gs.period < 1))
1151 {
1152 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1153 return usage(progname,1);
1154 }
1155 gs.period = 1000*period;
1156 }
1157 break;
1158 case 'p':
1159 pidfile = optarg;
1160 break;
1161 case 'r':
1162 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1163 (gs.mode == MODE_SEPARATE_RESTART))
1164 {
1165 fputs("Ambiguous operating mode selected.\n",stderr);
1166 return usage(progname,1);
1167 }
1168 if (!valid_command(optarg))
1169 {
1170 fprintf(stderr,
1171 "Invalid restart command, must contain '%%s': %s\n",
1172 optarg);
1173 return usage(progname,1);
1174 }
1175 gs.restart_command = optarg;
1176 if (gs.mode == MODE_MONITOR)
1177 gs.mode = MODE_SEPARATE_RESTART;
1178 break;
1179 case 'R':
1180 if (gs.mode != MODE_MONITOR)
1181 {
1182 fputs("Ambiguous operating mode selected.\n",stderr);
1183 return usage(progname,1);
1184 }
1185 if (strchr(optarg,'%'))
1186 {
1187 fprintf(stderr,
1188 "Invalid restart-all arg, must not contain '%%s': %s\n",
1189 optarg);
1190 return usage(progname,1);
1191 }
1192 gs.restart_command = optarg;
1193 gs.mode = MODE_GLOBAL_RESTART;
1194 break;
1195 case 's':
1196 if (!valid_command(optarg))
1197 {
1198 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1199 optarg);
1200 return usage(progname,1);
1201 }
1202 gs.start_command = optarg;
1203 break;
1204 case 'S':
1205 gs.vtydir = optarg;
1206 break;
1207 case 't':
1208 {
1209 char garbage[3];
1210 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1211 (gs.timeout < 1))
1212 {
1213 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1214 return usage(progname,1);
1215 }
1216 }
1217 break;
1218 case 'T':
1219 {
1220 char garbage[3];
1221 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1222 (gs.restart_timeout < 1))
1223 {
1224 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1225 return usage(progname,1);
1226 }
1227 }
1228 break;
1229 case 'z':
1230 gs.unresponsive_restart = 1;
1231 break;
1232 case 'v':
1233 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1234 puts("Copyright 2004 Andrew J. Schorr");
1235 return 0;
1236 case 'h':
1237 return usage(progname,0);
1238 default:
1239 fputs("Invalid option.\n",stderr);
1240 return usage(progname,1);
1241 }
1242 }
1243
1244 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1245 {
1246 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1247 return usage(progname,1);
1248 }
1249 switch (gs.mode)
1250 {
1251 case MODE_MONITOR:
1252 if (gs.restart_command || gs.start_command || gs.stop_command)
1253 {
1254 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1255 mode_str[gs.mode]);
1256 return usage(progname,1);
1257 }
1258 break;
1259 case MODE_GLOBAL_RESTART:
1260 case MODE_SEPARATE_RESTART:
1261 if (!gs.restart_command || gs.start_command || gs.stop_command)
1262 {
1263 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1264 mode_str[gs.mode]);
1265 return usage(progname,1);
1266 }
1267 break;
1268 case MODE_PHASED_ZEBRA_RESTART:
1269 case MODE_PHASED_ALL_RESTART:
1270 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1271 {
1272 fprintf(stderr,
1273 "Need start, kill, and restart commands in [%s] mode.\n",
1274 mode_str[gs.mode]);
1275 return usage(progname,1);
1276 }
1277 break;
1278 }
1279
ajsc8b40f82004-12-22 16:17:16 +00001280 if (blankstr)
1281 {
1282 if (gs.restart_command)
1283 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1284 if (gs.start_command)
1285 gs.start_command = translate_blanks(gs.start_command,blankstr);
1286 if (gs.stop_command)
1287 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1288 }
1289
ajs8b886ca2004-12-22 02:56:38 +00001290 gs.restart.interval = gs.min_restart_interval;
1291 master = thread_master_create();
Balaji.G837d16c2012-09-26 14:09:10 +05301292 signal_init (master, array_size(my_signals), my_signals);
ajs8b886ca2004-12-22 02:56:38 +00001293 srandom(time(NULL));
1294
1295 {
1296 int i;
1297 struct daemon *tail = NULL;
1298
1299 for (i = optind; i < argc; i++)
1300 {
1301 struct daemon *dmn;
1302
1303 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1304 {
ajs098e2402004-12-22 17:00:46 +00001305 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1306 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001307 return 1;
1308 }
1309 dmn->name = dmn->restart.name = argv[i];
1310 dmn->state = DAEMON_INIT;
1311 gs.numdaemons++;
1312 gs.numdown++;
1313 dmn->fd = -1;
1314 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1315 100+(random() % 900));
1316 dmn->restart.interval = gs.min_restart_interval;
1317 if (tail)
1318 tail->next = dmn;
1319 else
1320 gs.daemons = dmn;
1321 tail = dmn;
1322
1323 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1324 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1325 !strcmp(dmn->name,special))
1326 gs.special = dmn;
1327 }
1328 }
1329 if (!gs.daemons)
1330 {
1331 fputs("Must specify one or more daemons to monitor.\n",stderr);
1332 return usage(progname,1);
1333 }
1334 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1335 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1336 {
1337 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1338 mode_str[gs.mode],special);
1339 return usage(progname,1);
1340 }
1341 if (gs.special && (gs.numdaemons < 2))
1342 {
1343 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1344 "to watch.\n",mode_str[gs.mode]);
1345 return usage(progname,1);
1346 }
1347
1348 zlog_default = openzlog(progname, ZLOG_NONE,
1349 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1350 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1351 if (daemon_mode)
1352 {
1353 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001354 if (daemon (0, 0) < 0)
1355 {
1356 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1357 exit (1);
1358 }
ajs8b886ca2004-12-22 02:56:38 +00001359 }
1360 else
1361 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1362
1363 /* Make sure we're not already running. */
1364 pid_output (pidfile);
1365
1366 /* Announce which daemons are being monitored. */
1367 {
1368 struct daemon *dmn;
1369 size_t len = 0;
1370
1371 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1372 len += strlen(dmn->name)+1;
1373
1374 {
1375 char buf[len+1];
1376 char *p = buf;
1377
1378 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1379 {
1380 if (p != buf)
1381 *p++ = ' ';
1382 strcpy(p,dmn->name);
1383 p += strlen(p);
1384 }
1385 zlog_notice("%s %s watching [%s], mode [%s]",
1386 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1387 }
1388 }
1389
1390 {
1391 struct thread thread;
1392
1393 while (thread_fetch (master, &thread))
1394 thread_call (&thread);
1395 }
1396
1397 /* Not reached. */
1398 return 0;
1399}