blob: d92803d7e09cccc58f8f49781d3c892dbc1d0d54 [file] [log] [blame]
ajs8b886ca2004-12-22 02:56:38 +00001/*
ajs8b886ca2004-12-22 02:56:38 +00002 Monitor status of quagga daemons and restart if necessary.
3
4 Copyright (C) 2004 Andrew J. Schorr
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
ajsa3655342004-12-29 17:39:10 +000021#include <zebra.h>
ajs8b886ca2004-12-22 02:56:38 +000022#include <thread.h>
23#include <log.h>
ajs52e66292005-02-16 20:40:25 +000024#include <network.h>
ajs8b886ca2004-12-22 02:56:38 +000025#include <sigevent.h>
ajsa3655342004-12-29 17:39:10 +000026#include <lib/version.h>
paul6f594022004-12-23 19:35:56 +000027#include <getopt.h>
ajsa3655342004-12-29 17:39:10 +000028#include <sys/un.h>
29#include <sys/wait.h>
Balaji.G837d16c2012-09-26 14:09:10 +053030#include <memory.h>
ajs8b886ca2004-12-22 02:56:38 +000031
/* Smaller of two values.  Each argument may be evaluated more than once,
   so do not pass expressions with side effects. */
#ifndef MIN
#define MIN(X,Y) (((X) <= (Y)) ? (X) : (Y))
#endif

/* Macros to help randomize timers.
   JITTER(X) takes random() modulo (X)+1 and subtracts (X)/2 from it.
   FUZZY(X) adds the jitter of one twentieth of X to X itself. */
#define JITTER(X) ((random() % ((X)+1))-((X)/2))
#define FUZZY(X) ((X)+JITTER((X)/20))
39
/* Built-in defaults for the tunables that the command-line options
   described in usage() can override.  Time values are in seconds. */
#define DEFAULT_PERIOD 5
#define DEFAULT_TIMEOUT 10
#define DEFAULT_RESTART_TIMEOUT 20
#define DEFAULT_LOGLEVEL LOG_INFO
#define DEFAULT_MIN_RESTART 60
#define DEFAULT_MAX_RESTART 600

/* Pid file location: a build-time override wins over the state directory. */
#if defined(PATH_WATCHQUAGGA_PID)
#define DEFAULT_PIDFILE PATH_WATCHQUAGGA_PID
#else
#define DEFAULT_PIDFILE STATEDIR "/watchquagga.pid"
#endif

/* Directory holding the daemons' vty sockets. */
#if defined(DAEMON_VTY_DIR)
#define VTYDIR DAEMON_VTY_DIR
#else
#define VTYDIR STATEDIR
#endif

/* Token sent with the echo command and expected back in the reply. */
#define PING_TOKEN "PING"
58
/* Thread master passed to every thread_add_* call in this file.  It has
   external linkage because it is referenced somewhere inside libzebra. */
struct thread_master *master;
61
/* Operating mode of the watchdog.  The numeric values index mode_str[]. */
typedef enum
{
  MODE_MONITOR              = 0,  /* report status changes only */
  MODE_GLOBAL_RESTART       = 1,  /* one command restarts everything */
  MODE_SEPARATE_RESTART     = 2,  /* restart only the failing daemon */
  MODE_PHASED_ZEBRA_RESTART = 3,  /* phased restart when zebra fails */
  MODE_PHASED_ALL_RESTART   = 4   /* phased restart on any failure */
} watch_mode_t;
70
71static const char *mode_str[] =
72{
73 "monitor",
74 "global restart",
75 "individual daemon restart",
76 "phased zebra restart",
77 "phased global restart for any failure",
78};
79
/* Steps of a phased restart, in the order phase_check() walks them.
   The numeric values index phase_str[]. */
typedef enum
{
  PHASE_NONE                  = 0,  /* no phased restart in progress */
  PHASE_STOPS_PENDING         = 1,  /* stop jobs have been launched */
  PHASE_WAITING_DOWN          = 2,  /* waiting for the daemons to go down */
  PHASE_ZEBRA_RESTART_PENDING = 3,  /* restart job of the special daemon */
  PHASE_WAITING_ZEBRA_UP      = 4   /* waiting for the special daemon */
} restart_phase_t;
88
/* Human-readable description of each restart_phase_t value, indexed by
   the enum.  NOTE(review): the last entry has no matching enumerator in
   restart_phase_t as declared in this file -- confirm whether it is
   still needed. */
static const char *phase_str[] =
{
  /* 0 */ "None",
  /* 1 */ "Stop jobs running",
  /* 2 */ "Waiting for other daemons to come down",
  /* 3 */ "Zebra restart job running",
  /* 4 */ "Waiting for zebra to come up",
  /* 5 */ "Start jobs running",
};
98
/* Seconds a single restart phase may last before phase_hanging() aborts
   the phased restart: three times the configured restart timeout. */
#define PHASE_TIMEOUT (3*gs.restart_timeout)
100
/* Bookkeeping for one background job (restart, start or stop command). */
struct restart_info
{
  const char *name;       /* substituted into the command; used in logs */
  const char *what;       /* kind of job currently running, for logs */
  pid_t pid;              /* pid of the running job, 0 when none */
  struct timeval time;    /* when the job was last launched or reaped */
  long interval;          /* seconds to wait before the next job */
  struct thread *t_kill;  /* timer that signals a job running too long */
  int kills;              /* signals already sent to the current job */
};
111
112static struct global_state
113{
ajs8b886ca2004-12-22 02:56:38 +0000114 watch_mode_t mode;
115 restart_phase_t phase;
116 struct thread *t_phase_hanging;
117 const char *vtydir;
118 long period;
119 long timeout;
120 long restart_timeout;
121 long min_restart_interval;
122 long max_restart_interval;
123 int do_ping;
124 struct daemon *daemons;
125 const char *restart_command;
126 const char *start_command;
127 const char *stop_command;
ajs098e2402004-12-22 17:00:46 +0000128 struct restart_info restart;
ajs8b886ca2004-12-22 02:56:38 +0000129 int unresponsive_restart;
130 int loglevel;
131 struct daemon *special; /* points to zebra when doing phased restart */
132 int numdaemons;
133 int numpids;
134 int numdown; /* # of daemons that are not UP or UNRESPONSIVE */
135} gs = {
136 .mode = MODE_MONITOR,
137 .phase = PHASE_NONE,
ajs16f65112004-12-22 15:37:44 +0000138 .vtydir = VTYDIR,
ajs8b886ca2004-12-22 02:56:38 +0000139 .period = 1000*DEFAULT_PERIOD,
140 .timeout = DEFAULT_TIMEOUT,
141 .restart_timeout = DEFAULT_RESTART_TIMEOUT,
142 .loglevel = DEFAULT_LOGLEVEL,
143 .min_restart_interval = DEFAULT_MIN_RESTART,
144 .max_restart_interval = DEFAULT_MAX_RESTART,
145 .do_ping = 1,
ajs8b886ca2004-12-22 02:56:38 +0000146};
147
/* Connection state of a monitored daemon.  The numeric values index
   state_str[]. */
typedef enum
{
  DAEMON_INIT         = 0,  /* no connection attempt made yet */
  DAEMON_DOWN         = 1,
  DAEMON_CONNECTING   = 2,  /* non-blocking connect in progress */
  DAEMON_UP           = 3,
  DAEMON_UNRESPONSIVE = 4   /* connected, but an echo went unanswered */
} daemon_state_t;
156
/* True when the daemon is up or unresponsive.  DMN is evaluated twice. */
#define IS_UP(DMN) \
  (((DMN)->state == DAEMON_UP) || ((DMN)->state == DAEMON_UNRESPONSIVE))
159
160static const char *state_str[] =
161{
162 "Init",
163 "Down",
164 "Connecting",
165 "Up",
166 "Unresponsive",
167};
168
169struct daemon {
170 const char *name;
171 daemon_state_t state;
172 int fd;
173 struct timeval echo_sent;
174 u_int connect_tries;
175 struct thread *t_wakeup;
176 struct thread *t_read;
177 struct thread *t_write;
178 struct daemon *next;
179 struct restart_info restart;
180};
181
/* Long command-line options and the short option letter each maps to.
   The table ends with the all-zero entry that getopt_long() requires. */
static const struct option longopts[] =
{
  { "daemon",               no_argument,       NULL, 'd'},
  { "statedir",             required_argument, NULL, 'S'},
  { "no-echo",              no_argument,       NULL, 'e'},
  { "loglevel",             required_argument, NULL, 'l'},
  { "interval",             required_argument, NULL, 'i'},
  { "timeout",              required_argument, NULL, 't'},
  { "restart-timeout",      required_argument, NULL, 'T'},
  { "restart",              required_argument, NULL, 'r'},
  { "start-command",        required_argument, NULL, 's'},
  { "kill-command",         required_argument, NULL, 'k'},
  { "restart-all",          required_argument, NULL, 'R'},
  { "all-restart",          no_argument,       NULL, 'a'},
  { "always-all-restart",   no_argument,       NULL, 'A'},
  { "unresponsive-restart", no_argument,       NULL, 'z'},
  { "min-restart-interval", required_argument, NULL, 'm'},
  { "max-restart-interval", required_argument, NULL, 'M'},
  { "pid-file",             required_argument, NULL, 'p'},
  { "blank-string",         required_argument, NULL, 'b'},
  { "help",                 no_argument,       NULL, 'h'},
  { "version",              no_argument,       NULL, 'v'},
  { NULL, 0, NULL, 0 }
};
206
/* Forward declarations of functions that are used before they are
   defined further down in this file. */
static void phase_check(void);
static void try_restart(struct daemon *dmn);
static int try_connect(struct daemon *dmn);
static int wakeup_send_echo(struct thread *t_wakeup);
211
212static int
213usage(const char *progname, int status)
214{
215 if (status != 0)
216 fprintf(stderr, "Try `%s --help' for more information.\n", progname);
217 else
David Lamparter33b96632015-03-03 08:57:25 +0100218 {
219 printf("Usage : %s [OPTION...] <daemon name> ...\n\n\
ajs8b886ca2004-12-22 02:56:38 +0000220Watchdog program to monitor status of quagga daemons and try to restart\n\
221them if they are down or unresponsive. It determines whether a daemon is\n\
222up based on whether it can connect to the daemon's vty unix stream socket.\n\
223It then repeatedly sends echo commands over that socket to determine whether\n\
224the daemon is responsive. If the daemon crashes, we will receive an EOF\n\
225on the socket connection and know immediately that the daemon is down.\n\n\
226The daemons to be monitored should be listed on the command line.\n\n\
227This program can run in one of 5 modes:\n\n\
2280. Mode: %s.\n\
229 Just monitor and report on status changes. Example:\n\
230 %s -d zebra ospfd bgpd\n\n\
2311. Mode: %s.\n\
232 Whenever any daemon hangs or crashes, use the given command to restart\n\
233 them all. Example:\n\
234 %s -dz \\\n\
235 -R '/sbin/service zebra restart; /sbin/service ospfd restart' \\\n\
236 zebra ospfd\n\n\
2372. Mode: %s.\n\
238 When any single daemon hangs or crashes, restart only the daemon that's\n\
239 in trouble using the supplied restart command. Example:\n\
240 %s -dz -r '/sbin/service %%s restart' zebra ospfd bgpd\n\n\
2413. Mode: %s.\n\
242 The same as the previous mode, except that there is special treatment when\n\
243 the zebra daemon is in trouble. In that case, a phased restart approach\n\
244 is used: 1. stop all other daemons; 2. restart zebra; 3. start the other\n\
245 daemons. Example:\n\
246 %s -adz -r '/sbin/service %%s restart' \\\n\
247 -s '/sbin/service %%s start' \\\n\
248 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
2494. Mode: %s.\n\
250 This is the same as the previous mode, except that the phased restart\n\
251 procedure is used whenever any of the daemons hangs or crashes. Example:\n\
252 %s -Adz -r '/sbin/service %%s restart' \\\n\
253 -s '/sbin/service %%s start' \\\n\
254 -k '/sbin/service %%s stop' zebra ospfd bgpd\n\n\
255As of this writing, it is believed that mode 2 [%s]\n\
256is not safe, and mode 3 [%s] may not be safe with some of the\n\
257routing daemons.\n\n\
258In order to avoid attempting to restart the daemons in a fast loop,\n\
259the -m and -M options allow you to control the minimum delay between\n\
260restart commands. The minimum restart delay is recalculated each time\n\
261a restart is attempted: if the time since the last restart attempt exceeds\n\
262twice the -M value, then the restart delay is set to the -m value.\n\
David Lamparter33b96632015-03-03 08:57:25 +0100263Otherwise, the interval is doubled (but capped at the -M value).\n\n",
264 progname,mode_str[0],progname,mode_str[1],progname,mode_str[2],
265 progname,mode_str[3],progname,mode_str[4],progname,mode_str[2],
266 mode_str[3]);
267
268 printf("Options:\n\
ajs8b886ca2004-12-22 02:56:38 +0000269-d, --daemon Run in daemon mode. In this mode, error messages are sent\n\
270 to syslog instead of stdout.\n\
271-S, --statedir Set the vty socket directory (default is %s)\n\
272-e, --no-echo Do not ping the daemons to test responsiveness (this\n\
273 option is necessary if the daemons do not support the\n\
274 echo command)\n\
275-l, --loglevel Set the logging level (default is %d).\n\
276 The value should range from %d (LOG_EMERG) to %d (LOG_DEBUG),\n\
277 but it can be set higher than %d if extra-verbose debugging\n\
278 messages are desired.\n\
279-m, --min-restart-interval\n\
280 Set the minimum seconds to wait between invocations of daemon\n\
281 restart commands (default is %d).\n\
282-M, --max-restart-interval\n\
283 Set the maximum seconds to wait between invocations of daemon\n\
284 restart commands (default is %d).\n\
285-i, --interval Set the status polling interval in seconds (default is %d)\n\
286-t, --timeout Set the unresponsiveness timeout in seconds (default is %d)\n\
287-T, --restart-timeout\n\
288 Set the restart (kill) timeout in seconds (default is %d).\n\
289 If any background jobs are still running after this much\n\
290 time has elapsed, they will be killed.\n\
291-r, --restart Supply a Bourne shell command to use to restart a single\n\
292 daemon. The command string should include '%%s' where the\n\
293 name of the daemon should be substituted.\n\
294 Note that -r and -R are incompatible.\n\
295-s, --start-command\n\
296 Supply a Bourne shell to command to use to start a single\n\
297 daemon. The command string should include '%%s' where the\n\
298 name of the daemon should be substituted.\n\
299-k, --kill-command\n\
300 Supply a Bourne shell to command to use to stop a single\n\
301 daemon. The command string should include '%%s' where the\n\
302 name of the daemon should be substituted.\n\
303-R, --restart-all\n\
304 When one or more daemons is down, try to restart everything\n\
305 using the Bourne shell command supplied as the argument.\n\
306 Note that -r and -R are incompatible.\n\
307-z, --unresponsive-restart\n\
308 When a daemon is unresponsive, treat it as being down for\n\
309 restart purposes.\n\
310-a, --all-restart\n\
311 When zebra hangs or crashes, restart all daemons using\n\
312 this phased approach: 1. stop all other daemons; 2. restart\n\
313 zebra; 3. start other daemons. Requires -r, -s, and -k.\n\
314-A, --always-all-restart\n\
315 When any daemon (not just zebra) hangs or crashes, use the\n\
316 same phased restart mechanism described above for -a.\n\
317 Requires -r, -s, and -k.\n\
318-p, --pid-file Set process identifier file name\n\
319 (default is %s).\n\
ajsc8b40f82004-12-22 16:17:16 +0000320-b, --blank-string\n\
321 When the supplied argument string is found in any of the\n\
322 various shell command arguments (-r, -s, -k, or -R), replace\n\
323 it with a space. This is an ugly hack to circumvent problems\n\
324 passing command-line arguments with embedded spaces.\n\
ajs8b886ca2004-12-22 02:56:38 +0000325-v, --version Print program version\n\
David Lamparter33b96632015-03-03 08:57:25 +0100326-h, --help Display this help and exit\n",
327 VTYDIR,DEFAULT_LOGLEVEL,LOG_EMERG,LOG_DEBUG,LOG_DEBUG,
328 DEFAULT_MIN_RESTART,DEFAULT_MAX_RESTART,
329 DEFAULT_PERIOD,DEFAULT_TIMEOUT,DEFAULT_RESTART_TIMEOUT,
330 DEFAULT_PIDFILE);
331 }
ajs8b886ca2004-12-22 02:56:38 +0000332
333 return status;
334}
335
/* Launch shell_cmd through "/bin/sh -c" in a forked child.
 *
 * The child places itself in a process group of its own so that the
 * job and its descendants can be signalled together.
 *
 * Returns the child's pid, or -1 when fork() fails.  The child is not
 * waited for here.
 */
static pid_t
run_background(const char *shell_cmd)
{
  pid_t child = fork();

  if (child == -1)
    {
      zlog_err("fork failed, cannot run command [%s]: %s",
               shell_cmd,safe_strerror(errno));
      return -1;
    }

  if (child == 0)
    {
      /* Child process. */
      const char *argv[4] = { "sh", "-c", shell_cmd, NULL};

      /* Use separate process group so child processes can be killed easily. */
      if (setpgid(0,0) < 0)
        zlog_warn("warning: setpgid(0,0) failed: %s",safe_strerror(errno));
      execv("/bin/sh",(char *const *)argv);
      /* Reached only when execv() itself failed. */
      zlog_err("execv(/bin/sh -c '%s') failed: %s",
               shell_cmd,safe_strerror(errno));
      _exit(127);
    }

  /* Parent process: we will reap the child later. */
  zlog_err("Forked background command [pid %d]: %s",(int)child,shell_cmd);
  return child;
}
365
/* Store in *result the time that has passed since *start_time and
 * return result.  tv_usec of the result is brought back to a
 * non-negative value by borrowing whole seconds from tv_sec.
 */
static struct timeval *
time_elapsed(struct timeval *result, const struct timeval *start_time)
{
  gettimeofday(result,NULL);
  result->tv_sec -= start_time->tv_sec;
  result->tv_usec -= start_time->tv_usec;
  /* Borrow one second for every step needed to make tv_usec non-negative. */
  for (; result->tv_usec < 0; result->tv_sec--)
    result->tv_usec += 1000000L;
  return result;
}
379
380static int
381restart_kill(struct thread *t_kill)
382{
383 struct restart_info *restart = THREAD_ARG(t_kill);
384 struct timeval delay;
385
386 time_elapsed(&delay,&restart->time);
387 zlog_warn("Warning: %s %s child process %d still running after "
388 "%ld seconds, sending signal %d",
David Lamparteref008d22015-03-03 08:48:11 +0100389 restart->what,restart->name,(int)restart->pid, (long)delay.tv_sec,
ajs8b886ca2004-12-22 02:56:38 +0000390 (restart->kills ? SIGKILL : SIGTERM));
391 kill(-restart->pid,(restart->kills ? SIGKILL : SIGTERM));
392 restart->kills++;
393 restart->t_kill = thread_add_timer(master,restart_kill,restart,
394 gs.restart_timeout);
395 return 0;
396}
397
398static struct restart_info *
399find_child(pid_t child)
400{
401 if (gs.mode == MODE_GLOBAL_RESTART)
402 {
403 if (gs.restart.pid == child)
404 return &gs.restart;
405 }
406 else
407 {
408 struct daemon *dmn;
409 for (dmn = gs.daemons; dmn; dmn = dmn->next)
410 {
411 if (dmn->restart.pid == child)
412 return &dmn->restart;
413 }
414 }
415 return NULL;
416}
417
418static void
419sigchild(void)
420{
421 pid_t child;
422 int status;
423 const char *name;
424 const char *what;
425 struct restart_info *restart;
426
427 switch (child = waitpid(-1,&status,WNOHANG))
428 {
429 case -1:
430 zlog_err("waitpid failed: %s",safe_strerror(errno));
431 return;
432 case 0:
433 zlog_warn("SIGCHLD received, but waitpid did not reap a child");
434 return;
435 }
436
437 if ((restart = find_child(child)) != NULL)
438 {
439 name = restart->name;
440 what = restart->what;
441 restart->pid = 0;
442 gs.numpids--;
443 thread_cancel(restart->t_kill);
444 restart->t_kill = NULL;
445 /* Update restart time to reflect the time the command completed. */
446 gettimeofday(&restart->time,NULL);
447 }
448 else
449 {
450 zlog_err("waitpid returned status for an unknown child process %d",
ajsf2d82572004-12-29 17:45:08 +0000451 (int)child);
ajs8b886ca2004-12-22 02:56:38 +0000452 name = "(unknown)";
453 what = "background";
454 }
455 if (WIFSTOPPED(status))
456 zlog_warn("warning: %s %s process %d is stopped",
ajsf2d82572004-12-29 17:45:08 +0000457 what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000458 else if (WIFSIGNALED(status))
459 zlog_warn("%s %s process %d terminated due to signal %d",
ajsf2d82572004-12-29 17:45:08 +0000460 what,name,(int)child,WTERMSIG(status));
ajs8b886ca2004-12-22 02:56:38 +0000461 else if (WIFEXITED(status))
462 {
463 if (WEXITSTATUS(status) != 0)
464 zlog_warn("%s %s process %d exited with non-zero status %d",
ajsf2d82572004-12-29 17:45:08 +0000465 what,name,(int)child,WEXITSTATUS(status));
ajs8b886ca2004-12-22 02:56:38 +0000466 else
ajsf2d82572004-12-29 17:45:08 +0000467 zlog_debug("%s %s process %d exited normally",what,name,(int)child);
ajs8b886ca2004-12-22 02:56:38 +0000468 }
469 else
470 zlog_err("cannot interpret %s %s process %d wait status 0x%x",
ajsf2d82572004-12-29 17:45:08 +0000471 what,name,(int)child,status);
ajs8b886ca2004-12-22 02:56:38 +0000472 phase_check();
473}
474
475static int
476run_job(struct restart_info *restart, const char *cmdtype, const char *command,
477 int force, int update_interval)
478{
479 struct timeval delay;
480
481 if (gs.loglevel > LOG_DEBUG+1)
482 zlog_debug("attempting to %s %s",cmdtype,restart->name);
483
484 if (restart->pid)
485 {
486 if (gs.loglevel > LOG_DEBUG+1)
487 zlog_debug("cannot %s %s, previous pid %d still running",
ajsf2d82572004-12-29 17:45:08 +0000488 cmdtype,restart->name,(int)restart->pid);
ajs8b886ca2004-12-22 02:56:38 +0000489 return -1;
490 }
491
ajsa8a8ddc2005-01-12 16:24:51 +0000492 /* Note: time_elapsed test must come before the force test, since we need
493 to make sure that delay is initialized for use below in updating the
494 restart interval. */
495 if ((time_elapsed(&delay,&restart->time)->tv_sec < restart->interval) &&
496 !force)
ajs8b886ca2004-12-22 02:56:38 +0000497 {
498 if (gs.loglevel > LOG_DEBUG+1)
499 zlog_debug("postponing %s %s: "
500 "elapsed time %ld < retry interval %ld",
501 cmdtype,restart->name,(long)delay.tv_sec,restart->interval);
502 return -1;
503 }
504
505 gettimeofday(&restart->time,NULL);
506 restart->kills = 0;
507 {
508 char cmd[strlen(command)+strlen(restart->name)+1];
509 snprintf(cmd,sizeof(cmd),command,restart->name);
510 if ((restart->pid = run_background(cmd)) > 0)
511 {
512 restart->t_kill = thread_add_timer(master,restart_kill,restart,
513 gs.restart_timeout);
514 restart->what = cmdtype;
515 gs.numpids++;
516 }
517 else
518 restart->pid = 0;
519 }
520
521 /* Calculate the new restart interval. */
522 if (update_interval)
523 {
524 if (delay.tv_sec > 2*gs.max_restart_interval)
525 restart->interval = gs.min_restart_interval;
526 else if ((restart->interval *= 2) > gs.max_restart_interval)
527 restart->interval = gs.max_restart_interval;
528 if (gs.loglevel > LOG_DEBUG+1)
529 zlog_debug("restart %s interval is now %ld",
530 restart->name,restart->interval);
531 }
532 return restart->pid;
533}
534
/* Scheduling helpers.  Each one stores the newly created thread in the
   daemon structure so that it can be cancelled later.  DMN is evaluated
   more than once. */

/* Call handle_read() when the daemon's socket becomes readable. */
#define SET_READ_HANDLER(DMN) \
  (DMN)->t_read = thread_add_read(master,handle_read,(DMN),(DMN)->fd)

/* The three wakeups below fire after one randomized polling period. */

/* Retry the connection to a daemon that is down. */
#define SET_WAKEUP_DOWN(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_down,(DMN), \
                                          FUZZY(gs.period))

/* Look again at a daemon that is unresponsive. */
#define SET_WAKEUP_UNRESPONSIVE(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_unresponsive,(DMN), \
                                          FUZZY(gs.period))

/* Send the next echo request. */
#define SET_WAKEUP_ECHO(DMN) \
  (DMN)->t_wakeup = thread_add_timer_msec(master,wakeup_send_echo,(DMN), \
                                          FUZZY(gs.period))
549
550static int
551wakeup_down(struct thread *t_wakeup)
552{
553 struct daemon *dmn = THREAD_ARG(t_wakeup);
554
555 dmn->t_wakeup = NULL;
556 if (try_connect(dmn) < 0)
557 SET_WAKEUP_DOWN(dmn);
558 if ((dmn->connect_tries > 1) && (dmn->state != DAEMON_UP))
559 try_restart(dmn);
560 return 0;
561}
562
563static int
564wakeup_init(struct thread *t_wakeup)
565{
566 struct daemon *dmn = THREAD_ARG(t_wakeup);
567
568 dmn->t_wakeup = NULL;
569 if (try_connect(dmn) < 0)
570 {
571 SET_WAKEUP_DOWN(dmn);
572 zlog_err("%s state -> down : initial connection attempt failed",
573 dmn->name);
574 dmn->state = DAEMON_DOWN;
575 }
576 return 0;
577}
578
579static void
580daemon_down(struct daemon *dmn, const char *why)
581{
582 if (IS_UP(dmn) || (dmn->state == DAEMON_INIT))
583 zlog_err("%s state -> down : %s",dmn->name,why);
584 else if (gs.loglevel > LOG_DEBUG)
585 zlog_debug("%s still down : %s",dmn->name,why);
586 if (IS_UP(dmn))
587 gs.numdown++;
588 dmn->state = DAEMON_DOWN;
589 if (dmn->fd >= 0)
590 {
591 close(dmn->fd);
592 dmn->fd = -1;
593 }
594 THREAD_OFF(dmn->t_read);
595 THREAD_OFF(dmn->t_write);
596 THREAD_OFF(dmn->t_wakeup);
597 if (try_connect(dmn) < 0)
598 SET_WAKEUP_DOWN(dmn);
599 phase_check();
600}
601
602static int
603handle_read(struct thread *t_read)
604{
605 struct daemon *dmn = THREAD_ARG(t_read);
606 static const char resp[sizeof(PING_TOKEN)+4] = PING_TOKEN "\n";
607 char buf[sizeof(resp)+100];
608 ssize_t rc;
609 struct timeval delay;
610
611 dmn->t_read = NULL;
612 if ((rc = read(dmn->fd,buf,sizeof(buf))) < 0)
613 {
614 char why[100];
615
ajs518cde82005-02-17 20:11:58 +0000616 if (ERRNO_IO_RETRY(errno))
ajs8b886ca2004-12-22 02:56:38 +0000617 {
618 /* Pretend it never happened. */
619 SET_READ_HANDLER(dmn);
620 return 0;
621 }
622 snprintf(why,sizeof(why),"unexpected read error: %s",
623 safe_strerror(errno));
624 daemon_down(dmn,why);
625 return 0;
626 }
627 if (rc == 0)
628 {
629 daemon_down(dmn,"read returned EOF");
630 return 0;
631 }
632 if (!dmn->echo_sent.tv_sec)
633 {
634 char why[sizeof(buf)+100];
ajs098e2402004-12-22 17:00:46 +0000635 snprintf(why,sizeof(why),"unexpected read returns %d bytes: %.*s",
636 (int)rc,(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000637 daemon_down(dmn,why);
638 return 0;
639 }
640
641 /* We are expecting an echo response: is there any chance that the
642 response would not be returned entirely in the first read? That
643 seems inconceivable... */
644 if ((rc != sizeof(resp)) || memcmp(buf,resp,sizeof(resp)))
645 {
646 char why[100+sizeof(buf)];
ajs098e2402004-12-22 17:00:46 +0000647 snprintf(why,sizeof(why),"read returned bad echo response of %d bytes "
648 "(expecting %u): %.*s",
649 (int)rc,(u_int)sizeof(resp),(int)rc,buf);
ajs8b886ca2004-12-22 02:56:38 +0000650 daemon_down(dmn,why);
651 return 0;
652 }
653
654 time_elapsed(&delay,&dmn->echo_sent);
655 dmn->echo_sent.tv_sec = 0;
656 if (dmn->state == DAEMON_UNRESPONSIVE)
657 {
658 if (delay.tv_sec < gs.timeout)
659 {
660 dmn->state = DAEMON_UP;
661 zlog_warn("%s state -> up : echo response received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100662 "seconds", dmn->name,
663 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000664 }
665 else
666 zlog_warn("%s: slow echo response finally received after %ld.%06ld "
David Lamparteref008d22015-03-03 08:48:11 +0100667 "seconds", dmn->name,
668 (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000669 }
670 else if (gs.loglevel > LOG_DEBUG+1)
671 zlog_debug("%s: echo response received after %ld.%06ld seconds",
David Lamparteref008d22015-03-03 08:48:11 +0100672 dmn->name, (long)delay.tv_sec, (long)delay.tv_usec);
ajs8b886ca2004-12-22 02:56:38 +0000673
674 SET_READ_HANDLER(dmn);
675 if (dmn->t_wakeup)
676 thread_cancel(dmn->t_wakeup);
677 SET_WAKEUP_ECHO(dmn);
678
679 return 0;
680}
681
682static void
683daemon_up(struct daemon *dmn, const char *why)
684{
685 dmn->state = DAEMON_UP;
686 gs.numdown--;
687 dmn->connect_tries = 0;
688 zlog_notice("%s state -> up : %s",dmn->name,why);
689 if (gs.do_ping)
690 SET_WAKEUP_ECHO(dmn);
691 phase_check();
692}
693
694static int
695check_connect(struct thread *t_write)
696{
697 struct daemon *dmn = THREAD_ARG(t_write);
698 int sockerr;
699 socklen_t reslen = sizeof(sockerr);
700
701 dmn->t_write = NULL;
702 if (getsockopt(dmn->fd,SOL_SOCKET,SO_ERROR,(char *)&sockerr,&reslen) < 0)
703 {
704 zlog_warn("%s: check_connect: getsockopt failed: %s",
705 dmn->name,safe_strerror(errno));
706 daemon_down(dmn,"getsockopt failed checking connection success");
707 return 0;
708 }
709 if ((reslen == sizeof(sockerr)) && sockerr)
710 {
711 char why[100];
712 snprintf(why,sizeof(why),
713 "getsockopt reports that connection attempt failed: %s",
714 safe_strerror(sockerr));
715 daemon_down(dmn,why);
716 return 0;
717 }
718
719 daemon_up(dmn,"delayed connect succeeded");
720 return 0;
721}
722
723static int
724wakeup_connect_hanging(struct thread *t_wakeup)
725{
726 struct daemon *dmn = THREAD_ARG(t_wakeup);
727 char why[100];
728
729 dmn->t_wakeup = NULL;
730 snprintf(why,sizeof(why),"connection attempt timed out after %ld seconds",
731 gs.timeout);
732 daemon_down(dmn,why);
733 return 0;
734}
735
736/* Making connection to protocol daemon. */
737static int
738try_connect(struct daemon *dmn)
739{
740 int sock;
741 struct sockaddr_un addr;
742 socklen_t len;
ajs8b886ca2004-12-22 02:56:38 +0000743
744 if (gs.loglevel > LOG_DEBUG+1)
745 zlog_debug("%s: attempting to connect",dmn->name);
746 dmn->connect_tries++;
747
748 memset (&addr, 0, sizeof (struct sockaddr_un));
749 addr.sun_family = AF_UNIX;
750 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s.vty",
751 gs.vtydir,dmn->name);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000752#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
ajs8b886ca2004-12-22 02:56:38 +0000753 len = addr.sun_len = SUN_LEN(&addr);
754#else
755 len = sizeof (addr.sun_family) + strlen (addr.sun_path);
Paul Jakma6f0e3f62007-05-10 02:38:51 +0000756#endif /* HAVE_STRUCT_SOCKADDR_UN_SUN_LEN */
ajs8b886ca2004-12-22 02:56:38 +0000757
758 /* Quick check to see if we might succeed before we go to the trouble
759 of creating a socket. */
760 if (access(addr.sun_path, W_OK) < 0)
761 {
762 if (errno != ENOENT)
763 zlog_err("%s: access to socket %s denied: %s",
764 dmn->name,addr.sun_path,safe_strerror(errno));
765 return -1;
766 }
767
768 if ((sock = socket (AF_UNIX, SOCK_STREAM, 0)) < 0)
769 {
770 zlog_err("%s(%s): cannot make socket: %s",
771 __func__,addr.sun_path, safe_strerror(errno));
772 return -1;
773 }
774
ajs52e66292005-02-16 20:40:25 +0000775 if (set_nonblocking(sock) < 0)
ajs8b886ca2004-12-22 02:56:38 +0000776 {
ajs52e66292005-02-16 20:40:25 +0000777 zlog_err("%s(%s): set_nonblocking(%d) failed",
778 __func__, addr.sun_path, sock);
ajs8b886ca2004-12-22 02:56:38 +0000779 close(sock);
780 return -1;
781 }
782
783 if (connect (sock, (struct sockaddr *) &addr, len) < 0)
784 {
785 if ((errno != EINPROGRESS) && (errno != EWOULDBLOCK))
786 {
787 if (gs.loglevel > LOG_DEBUG)
788 zlog_debug("%s(%s): connect failed: %s",
789 __func__,addr.sun_path, safe_strerror(errno));
790 close (sock);
791 return -1;
792 }
793 if (gs.loglevel > LOG_DEBUG)
794 zlog_debug("%s: connection in progress",dmn->name);
795 dmn->state = DAEMON_CONNECTING;
796 dmn->fd = sock;
797 dmn->t_write = thread_add_write(master,check_connect,dmn,dmn->fd);
798 dmn->t_wakeup = thread_add_timer(master,wakeup_connect_hanging,dmn,
799 gs.timeout);
800 SET_READ_HANDLER(dmn);
801 return 0;
802 }
803
804 dmn->fd = sock;
805 SET_READ_HANDLER(dmn);
806 daemon_up(dmn,"connect succeeded");
807 return 1;
808}
809
810static int
811phase_hanging(struct thread *t_hanging)
812{
813 gs.t_phase_hanging = NULL;
814 zlog_err("Phase [%s] hanging for %ld seconds, aborting phased restart",
815 phase_str[gs.phase],PHASE_TIMEOUT);
816 gs.phase = PHASE_NONE;
817 return 0;
818}
819
820static void
821set_phase(restart_phase_t new_phase)
822{
823 gs.phase = new_phase;
824 if (gs.t_phase_hanging)
825 thread_cancel(gs.t_phase_hanging);
826 gs.t_phase_hanging = thread_add_timer(master,phase_hanging,NULL,
827 PHASE_TIMEOUT);
828}
829
830static void
831phase_check(void)
832{
833 switch (gs.phase)
834 {
835 case PHASE_NONE:
836 break;
837 case PHASE_STOPS_PENDING:
838 if (gs.numpids)
839 break;
840 zlog_info("Phased restart: all routing daemon stop jobs have completed.");
841 set_phase(PHASE_WAITING_DOWN);
842 /*FALLTHRU*/
843 case PHASE_WAITING_DOWN:
844 if (gs.numdown+IS_UP(gs.special) < gs.numdaemons)
845 break;
846 zlog_info("Phased restart: all routing daemons now down.");
847 run_job(&gs.special->restart,"restart",gs.restart_command,1,1);
848 set_phase(PHASE_ZEBRA_RESTART_PENDING);
849 /*FALLTHRU*/
850 case PHASE_ZEBRA_RESTART_PENDING:
851 if (gs.special->restart.pid)
852 break;
853 zlog_info("Phased restart: %s restart job completed.",gs.special->name);
854 set_phase(PHASE_WAITING_ZEBRA_UP);
855 /*FALLTHRU*/
856 case PHASE_WAITING_ZEBRA_UP:
857 if (!IS_UP(gs.special))
858 break;
859 zlog_info("Phased restart: %s is now up.",gs.special->name);
860 {
861 struct daemon *dmn;
862 for (dmn = gs.daemons; dmn; dmn = dmn->next)
863 {
864 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000865 run_job(&dmn->restart,"start",gs.start_command,1,0);
ajs8b886ca2004-12-22 02:56:38 +0000866 }
867 }
868 gs.phase = PHASE_NONE;
869 THREAD_OFF(gs.t_phase_hanging);
870 zlog_notice("Phased global restart has completed.");
871 break;
872 }
873}
874
875static void
876try_restart(struct daemon *dmn)
877{
878 switch (gs.mode)
879 {
880 case MODE_MONITOR:
881 return;
882 case MODE_GLOBAL_RESTART:
883 run_job(&gs.restart,"restart",gs.restart_command,0,1);
884 break;
885 case MODE_SEPARATE_RESTART:
886 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
887 break;
888 case MODE_PHASED_ZEBRA_RESTART:
889 if (dmn != gs.special)
890 {
891 if ((gs.special->state == DAEMON_UP) && (gs.phase == PHASE_NONE))
892 run_job(&dmn->restart,"restart",gs.restart_command,0,1);
893 else
894 zlog_debug("%s: postponing restart attempt because master %s daemon "
895 "not up [%s], or phased restart in progress",
896 dmn->name,gs.special->name,state_str[gs.special->state]);
897 break;
898 }
899 /*FALLTHRU*/
900 case MODE_PHASED_ALL_RESTART:
901 if ((gs.phase != PHASE_NONE) || gs.numpids)
902 {
903 if (gs.loglevel > LOG_DEBUG+1)
904 zlog_debug("postponing phased global restart: restart already in "
905 "progress [%s], or outstanding child processes [%d]",
906 phase_str[gs.phase],gs.numpids);
907 break;
908 }
909 /* Is it too soon for a restart? */
910 {
911 struct timeval delay;
912 if (time_elapsed(&delay,&gs.special->restart.time)->tv_sec <
913 gs.special->restart.interval)
914 {
915 if (gs.loglevel > LOG_DEBUG+1)
916 zlog_debug("postponing phased global restart: "
917 "elapsed time %ld < retry interval %ld",
918 (long)delay.tv_sec,gs.special->restart.interval);
919 break;
920 }
921 }
922 zlog_info("Phased restart: stopping all routing daemons.");
923 /* First step: stop all other daemons. */
924 for (dmn = gs.daemons; dmn; dmn = dmn->next)
925 {
926 if (dmn != gs.special)
ajsa8a8ddc2005-01-12 16:24:51 +0000927 run_job(&dmn->restart,"stop",gs.stop_command,1,1);
ajs8b886ca2004-12-22 02:56:38 +0000928 }
929 set_phase(PHASE_STOPS_PENDING);
930 break;
931 default:
932 zlog_err("error: unknown restart mode %d",gs.mode);
933 break;
934 }
935}
936
937static int
938wakeup_unresponsive(struct thread *t_wakeup)
939{
940 struct daemon *dmn = THREAD_ARG(t_wakeup);
941
942 dmn->t_wakeup = NULL;
943 if (dmn->state != DAEMON_UNRESPONSIVE)
944 zlog_err("%s: no longer unresponsive (now %s), "
945 "wakeup should have been cancelled!",
946 dmn->name,state_str[dmn->state]);
947 else
948 {
949 SET_WAKEUP_UNRESPONSIVE(dmn);
950 try_restart(dmn);
951 }
952 return 0;
953}
954
955static int
956wakeup_no_answer(struct thread *t_wakeup)
957{
958 struct daemon *dmn = THREAD_ARG(t_wakeup);
959
960 dmn->t_wakeup = NULL;
961 dmn->state = DAEMON_UNRESPONSIVE;
962 zlog_err("%s state -> unresponsive : no response yet to ping "
963 "sent %ld seconds ago",dmn->name,gs.timeout);
964 if (gs.unresponsive_restart)
965 {
966 SET_WAKEUP_UNRESPONSIVE(dmn);
967 try_restart(dmn);
968 }
969 return 0;
970}
971
972static int
973wakeup_send_echo(struct thread *t_wakeup)
974{
975 static const char echocmd[] = "echo " PING_TOKEN;
976 ssize_t rc;
977 struct daemon *dmn = THREAD_ARG(t_wakeup);
978
979 dmn->t_wakeup = NULL;
980 if (((rc = write(dmn->fd,echocmd,sizeof(echocmd))) < 0) ||
981 ((size_t)rc != sizeof(echocmd)))
982 {
983 char why[100+sizeof(echocmd)];
ajs098e2402004-12-22 17:00:46 +0000984 snprintf(why,sizeof(why),"write '%s' returned %d instead of %u",
985 echocmd,(int)rc,(u_int)sizeof(echocmd));
ajs8b886ca2004-12-22 02:56:38 +0000986 daemon_down(dmn,why);
987 }
988 else
989 {
990 gettimeofday(&dmn->echo_sent,NULL);
991 dmn->t_wakeup = thread_add_timer(master,wakeup_no_answer,dmn,gs.timeout);
992 }
993 return 0;
994}
995
/*
 * Handler registered in main() for SIGINT and SIGTERM: log a notice and
 * terminate the process with a success status.
 */
static void
sigint(void)
{
  zlog_notice("Terminating on signal");
  exit(EXIT_SUCCESS);
}
1002
/*
 * Check that a command template contains exactly one '%' character and
 * that this character is immediately followed by 's' (i.e. a single "%s"
 * placeholder and no other conversion).
 *
 * cmd: NUL-terminated command template.
 *
 * Returns 1 if the template is acceptable, 0 otherwise.
 */
static int
valid_command(const char *cmd)
{
  const char *percent = strchr(cmd,'%');

  /* There must be a placeholder at all ... */
  if (percent == NULL)
    return 0;

  /* ... it must be "%s" ... */
  if (percent[1] != 's')
    return 0;

  /* ... and no further '%' may follow it. */
  return strchr(percent+1,'%') == NULL;
}
1010
/* This is an ugly hack to circumvent problems with passing command-line
   arguments that contain spaces.  The fix is to use a configuration file.

   Returns a newly allocated copy of cmd in which every occurrence of the
   token blankstr has been replaced by a single space character.  The
   caller owns the returned string.  If the copy cannot be allocated, an
   error is printed and the program exits with status 1.

   The string is scanned once from left to right, and the search resumes
   just after each inserted blank.  Consequently a token that itself
   contains a space can never re-match the blank that was just written
   (which would otherwise loop forever), and an empty token, which matches
   at every position, is treated as "nothing to translate". */
static char *
translate_blanks(const char *cmd, const char *blankstr)
{
  char *res;
  char *p;
  size_t bslen = strlen(blankstr);

  if (!(res = strdup(cmd)))
    {
      perror("strdup");
      exit(1);
    }

  /* strstr() finds an empty needle everywhere; replacing it would never
     terminate and would grow the string beyond its allocation. */
  if (bslen == 0)
    return res;

  p = res;
  while ((p = strstr(p,blankstr)) != NULL)
    {
      /* Overwrite the first byte of the token and step past it ... */
      *p++ = ' ';
      /* ... then close the gap left by the remaining bslen-1 bytes
         (the +1 moves the terminating NUL as well). */
      if (bslen != 1)
        memmove(p,p+bslen-1,strlen(p+bslen-1)+1);
    }
  return res;
}
1033
ajs8b886ca2004-12-22 02:56:38 +00001034int
1035main(int argc, char **argv)
1036{
1037 const char *progname;
1038 int opt;
1039 int daemon_mode = 0;
1040 const char *pidfile = DEFAULT_PIDFILE;
1041 const char *special = "zebra";
ajsc8b40f82004-12-22 16:17:16 +00001042 const char *blankstr = NULL;
ajs8b886ca2004-12-22 02:56:38 +00001043 static struct quagga_signal_t my_signals[] =
1044 {
1045 {
1046 .signal = SIGINT,
1047 .handler = sigint,
1048 },
1049 {
1050 .signal = SIGTERM,
1051 .handler = sigint,
1052 },
1053 {
1054 .signal = SIGCHLD,
1055 .handler = sigchild,
1056 },
1057 };
1058
1059 if ((progname = strrchr (argv[0], '/')) != NULL)
1060 progname++;
1061 else
1062 progname = argv[0];
1063
ajs098e2402004-12-22 17:00:46 +00001064 gs.restart.name = "all";
ajsc8b40f82004-12-22 16:17:16 +00001065 while ((opt = getopt_long(argc, argv, "aAb:dek:l:m:M:i:p:r:R:S:s:t:T:zvh",
ajs8b886ca2004-12-22 02:56:38 +00001066 longopts, 0)) != EOF)
1067 {
1068 switch (opt)
1069 {
1070 case 0:
1071 break;
1072 case 'a':
1073 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1074 {
1075 fputs("Ambiguous operating mode selected.\n",stderr);
1076 return usage(progname,1);
1077 }
1078 gs.mode = MODE_PHASED_ZEBRA_RESTART;
1079 break;
1080 case 'A':
1081 if ((gs.mode != MODE_MONITOR) && (gs.mode != MODE_SEPARATE_RESTART))
1082 {
1083 fputs("Ambiguous operating mode selected.\n",stderr);
1084 return usage(progname,1);
1085 }
1086 gs.mode = MODE_PHASED_ALL_RESTART;
1087 break;
ajsc8b40f82004-12-22 16:17:16 +00001088 case 'b':
1089 blankstr = optarg;
1090 break;
ajs8b886ca2004-12-22 02:56:38 +00001091 case 'd':
1092 daemon_mode = 1;
1093 break;
1094 case 'e':
1095 gs.do_ping = 0;
1096 break;
1097 case 'k':
1098 if (!valid_command(optarg))
1099 {
1100 fprintf(stderr,"Invalid kill command, must contain '%%s': %s\n",
1101 optarg);
1102 return usage(progname,1);
1103 }
1104 gs.stop_command = optarg;
1105 break;
1106 case 'l':
1107 {
1108 char garbage[3];
1109 if ((sscanf(optarg,"%d%1s",&gs.loglevel,garbage) != 1) ||
1110 (gs.loglevel < LOG_EMERG))
1111 {
1112 fprintf(stderr,"Invalid loglevel argument: %s\n",optarg);
1113 return usage(progname,1);
1114 }
1115 }
1116 break;
1117 case 'm':
1118 {
1119 char garbage[3];
1120 if ((sscanf(optarg,"%ld%1s",
1121 &gs.min_restart_interval,garbage) != 1) ||
1122 (gs.min_restart_interval < 0))
1123 {
1124 fprintf(stderr,"Invalid min_restart_interval argument: %s\n",
1125 optarg);
1126 return usage(progname,1);
1127 }
1128 }
1129 break;
1130 case 'M':
1131 {
1132 char garbage[3];
1133 if ((sscanf(optarg,"%ld%1s",
1134 &gs.max_restart_interval,garbage) != 1) ||
1135 (gs.max_restart_interval < 0))
1136 {
1137 fprintf(stderr,"Invalid max_restart_interval argument: %s\n",
1138 optarg);
1139 return usage(progname,1);
1140 }
1141 }
1142 break;
1143 case 'i':
1144 {
1145 char garbage[3];
1146 int period;
1147 if ((sscanf(optarg,"%d%1s",&period,garbage) != 1) ||
1148 (gs.period < 1))
1149 {
1150 fprintf(stderr,"Invalid interval argument: %s\n",optarg);
1151 return usage(progname,1);
1152 }
1153 gs.period = 1000*period;
1154 }
1155 break;
1156 case 'p':
1157 pidfile = optarg;
1158 break;
1159 case 'r':
1160 if ((gs.mode == MODE_GLOBAL_RESTART) ||
1161 (gs.mode == MODE_SEPARATE_RESTART))
1162 {
1163 fputs("Ambiguous operating mode selected.\n",stderr);
1164 return usage(progname,1);
1165 }
1166 if (!valid_command(optarg))
1167 {
1168 fprintf(stderr,
1169 "Invalid restart command, must contain '%%s': %s\n",
1170 optarg);
1171 return usage(progname,1);
1172 }
1173 gs.restart_command = optarg;
1174 if (gs.mode == MODE_MONITOR)
1175 gs.mode = MODE_SEPARATE_RESTART;
1176 break;
1177 case 'R':
1178 if (gs.mode != MODE_MONITOR)
1179 {
1180 fputs("Ambiguous operating mode selected.\n",stderr);
1181 return usage(progname,1);
1182 }
1183 if (strchr(optarg,'%'))
1184 {
1185 fprintf(stderr,
1186 "Invalid restart-all arg, must not contain '%%s': %s\n",
1187 optarg);
1188 return usage(progname,1);
1189 }
1190 gs.restart_command = optarg;
1191 gs.mode = MODE_GLOBAL_RESTART;
1192 break;
1193 case 's':
1194 if (!valid_command(optarg))
1195 {
1196 fprintf(stderr,"Invalid start command, must contain '%%s': %s\n",
1197 optarg);
1198 return usage(progname,1);
1199 }
1200 gs.start_command = optarg;
1201 break;
1202 case 'S':
1203 gs.vtydir = optarg;
1204 break;
1205 case 't':
1206 {
1207 char garbage[3];
1208 if ((sscanf(optarg,"%ld%1s",&gs.timeout,garbage) != 1) ||
1209 (gs.timeout < 1))
1210 {
1211 fprintf(stderr,"Invalid timeout argument: %s\n",optarg);
1212 return usage(progname,1);
1213 }
1214 }
1215 break;
1216 case 'T':
1217 {
1218 char garbage[3];
1219 if ((sscanf(optarg,"%ld%1s",&gs.restart_timeout,garbage) != 1) ||
1220 (gs.restart_timeout < 1))
1221 {
1222 fprintf(stderr,"Invalid restart timeout argument: %s\n",optarg);
1223 return usage(progname,1);
1224 }
1225 }
1226 break;
1227 case 'z':
1228 gs.unresponsive_restart = 1;
1229 break;
1230 case 'v':
1231 printf ("%s version %s\n", progname, QUAGGA_VERSION);
1232 puts("Copyright 2004 Andrew J. Schorr");
1233 return 0;
1234 case 'h':
1235 return usage(progname,0);
1236 default:
1237 fputs("Invalid option.\n",stderr);
1238 return usage(progname,1);
1239 }
1240 }
1241
1242 if (gs.unresponsive_restart && (gs.mode == MODE_MONITOR))
1243 {
1244 fputs("Option -z requires a -r or -R restart option.\n",stderr);
1245 return usage(progname,1);
1246 }
1247 switch (gs.mode)
1248 {
1249 case MODE_MONITOR:
1250 if (gs.restart_command || gs.start_command || gs.stop_command)
1251 {
1252 fprintf(stderr,"No kill/(re)start commands needed for %s mode.\n",
1253 mode_str[gs.mode]);
1254 return usage(progname,1);
1255 }
1256 break;
1257 case MODE_GLOBAL_RESTART:
1258 case MODE_SEPARATE_RESTART:
1259 if (!gs.restart_command || gs.start_command || gs.stop_command)
1260 {
1261 fprintf(stderr,"No start/kill commands needed in [%s] mode.\n",
1262 mode_str[gs.mode]);
1263 return usage(progname,1);
1264 }
1265 break;
1266 case MODE_PHASED_ZEBRA_RESTART:
1267 case MODE_PHASED_ALL_RESTART:
1268 if (!gs.restart_command || !gs.start_command || !gs.stop_command)
1269 {
1270 fprintf(stderr,
1271 "Need start, kill, and restart commands in [%s] mode.\n",
1272 mode_str[gs.mode]);
1273 return usage(progname,1);
1274 }
1275 break;
1276 }
1277
ajsc8b40f82004-12-22 16:17:16 +00001278 if (blankstr)
1279 {
1280 if (gs.restart_command)
1281 gs.restart_command = translate_blanks(gs.restart_command,blankstr);
1282 if (gs.start_command)
1283 gs.start_command = translate_blanks(gs.start_command,blankstr);
1284 if (gs.stop_command)
1285 gs.stop_command = translate_blanks(gs.stop_command,blankstr);
1286 }
1287
ajs8b886ca2004-12-22 02:56:38 +00001288 gs.restart.interval = gs.min_restart_interval;
1289 master = thread_master_create();
Balaji.G837d16c2012-09-26 14:09:10 +05301290 signal_init (master, array_size(my_signals), my_signals);
ajs8b886ca2004-12-22 02:56:38 +00001291 srandom(time(NULL));
1292
1293 {
1294 int i;
1295 struct daemon *tail = NULL;
1296
1297 for (i = optind; i < argc; i++)
1298 {
1299 struct daemon *dmn;
1300
1301 if (!(dmn = (struct daemon *)calloc(1,sizeof(*dmn))))
1302 {
ajs098e2402004-12-22 17:00:46 +00001303 fprintf(stderr,"calloc(1,%u) failed: %s\n",
1304 (u_int)sizeof(*dmn), safe_strerror(errno));
ajs8b886ca2004-12-22 02:56:38 +00001305 return 1;
1306 }
1307 dmn->name = dmn->restart.name = argv[i];
1308 dmn->state = DAEMON_INIT;
1309 gs.numdaemons++;
1310 gs.numdown++;
1311 dmn->fd = -1;
1312 dmn->t_wakeup = thread_add_timer_msec(master,wakeup_init,dmn,
1313 100+(random() % 900));
1314 dmn->restart.interval = gs.min_restart_interval;
1315 if (tail)
1316 tail->next = dmn;
1317 else
1318 gs.daemons = dmn;
1319 tail = dmn;
1320
1321 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1322 (gs.mode == MODE_PHASED_ALL_RESTART)) &&
1323 !strcmp(dmn->name,special))
1324 gs.special = dmn;
1325 }
1326 }
1327 if (!gs.daemons)
1328 {
1329 fputs("Must specify one or more daemons to monitor.\n",stderr);
1330 return usage(progname,1);
1331 }
1332 if (((gs.mode == MODE_PHASED_ZEBRA_RESTART) ||
1333 (gs.mode == MODE_PHASED_ALL_RESTART)) && !gs.special)
1334 {
1335 fprintf(stderr,"In mode [%s], but cannot find master daemon %s\n",
1336 mode_str[gs.mode],special);
1337 return usage(progname,1);
1338 }
1339 if (gs.special && (gs.numdaemons < 2))
1340 {
1341 fprintf(stderr,"Mode [%s] does not make sense with only 1 daemon "
1342 "to watch.\n",mode_str[gs.mode]);
1343 return usage(progname,1);
1344 }
1345
1346 zlog_default = openzlog(progname, ZLOG_NONE,
1347 LOG_CONS|LOG_NDELAY|LOG_PID, LOG_DAEMON);
1348 zlog_set_level(NULL, ZLOG_DEST_MONITOR, ZLOG_DISABLED);
1349 if (daemon_mode)
1350 {
1351 zlog_set_level(NULL, ZLOG_DEST_SYSLOG, MIN(gs.loglevel,LOG_DEBUG));
Stephen Hemminger065de902009-08-07 11:13:49 -07001352 if (daemon (0, 0) < 0)
1353 {
1354 fprintf(stderr, "Watchquagga daemon failed: %s", strerror(errno));
1355 exit (1);
1356 }
ajs8b886ca2004-12-22 02:56:38 +00001357 }
1358 else
1359 zlog_set_level(NULL, ZLOG_DEST_STDOUT, MIN(gs.loglevel,LOG_DEBUG));
1360
1361 /* Make sure we're not already running. */
1362 pid_output (pidfile);
1363
1364 /* Announce which daemons are being monitored. */
1365 {
1366 struct daemon *dmn;
1367 size_t len = 0;
1368
1369 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1370 len += strlen(dmn->name)+1;
1371
1372 {
1373 char buf[len+1];
1374 char *p = buf;
1375
1376 for (dmn = gs.daemons; dmn; dmn = dmn->next)
1377 {
1378 if (p != buf)
1379 *p++ = ' ';
1380 strcpy(p,dmn->name);
1381 p += strlen(p);
1382 }
1383 zlog_notice("%s %s watching [%s], mode [%s]",
1384 progname, QUAGGA_VERSION, buf, mode_str[gs.mode]);
1385 }
1386 }
1387
1388 {
1389 struct thread thread;
1390
1391 while (thread_fetch (master, &thread))
1392 thread_call (&thread);
1393 }
1394
1395 /* Not reached. */
1396 return 0;
1397}