]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - usr.sbin/watchdogd/watchdogd.c
DTrace: print() should try to resolve function pointers
[FreeBSD/FreeBSD.git] / usr.sbin / watchdogd / watchdogd.c
1 /*-
2  * Copyright (c) 2003-2004  Sean M. Kelly <smkelly@FreeBSD.org>
3  * Copyright (c) 2013 iXsystems.com,
4  *                    author: Alfred Perlstein <alfred@freebsd.org>
5  *
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29
30 /*
31  * Software watchdog daemon.
32  */
33
34 #include <sys/types.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/mman.h>
38 #include <sys/param.h>
39 #include <sys/rtprio.h>
40 #include <sys/stat.h>
41 #include <sys/time.h>
42 #include <sys/watchdog.h>
43
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libutil.h>
#include <limits.h>
#include <math.h>
#include <paths.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sysexits.h>
#include <syslog.h>
#include <unistd.h>
58
59 #include <getopt.h>
60
61 static void     parseargs(int, char *[]);
62 static void     sighandler(int);
63 static void     watchdog_loop(void);
64 static int      watchdog_init(void);
65 static int      watchdog_onoff(int onoff);
66 static int      watchdog_patpat(u_int timeout);
67 static void     usage(void);
68
69 static int debugging = 0;
70 static int end_program = 0;
71 static const char *pidfile = _PATH_VARRUN "watchdogd.pid";
72 static u_int timeout = WD_TO_128SEC;
73 static u_int pretimeout = 0;
74 static u_int passive = 0;
75 static int is_daemon = 0;
76 static int is_dry_run = 0;  /* do not arm the watchdog, only
77                                report on timing of the watch
78                                program */
79 static int do_timedog = 0;
80 static int do_syslog = 1;
81 static int fd = -1;
82 static int nap = 1;
83 static int carp_thresh_seconds = -1;
84 static char *test_cmd = NULL;
85
86 static const char *getopt_shortopts;
87
88 static int pretimeout_set;
89 static int pretimeout_act;
90 static int pretimeout_act_set;
91
92 static int softtimeout_set;
93 static int softtimeout_act;
94 static int softtimeout_act_set;
95
96 static struct option longopts[] = {
97         { "debug", no_argument, &debugging, 1 },
98         { "pretimeout", required_argument, &pretimeout_set, 1 },
99         { "pretimeout-action", required_argument, &pretimeout_act_set, 1 },
100         { "softtimeout", no_argument, &softtimeout_set, 1 },
101         { "softtimeout-action", required_argument, &softtimeout_act_set, 1 },
102         { NULL, 0, NULL, 0}
103 };
104
/*
 * Have malloc() map virtual address space in minimum-sized chunks so that
 * mlockall() does not needlessly wire megabytes of unused memory into the
 * process.  The setting has to be supplied through the malloc_conf string
 * because it must be in place before the first allocation, which happens
 * before main() is entered.
 */
const char *malloc_conf = "lg_chunk:0";
112
113 /*
114  * Periodically pat the watchdog, preventing it from firing.
115  */
116 int
117 main(int argc, char *argv[])
118 {
119         struct rtprio rtp;
120         struct pidfh *pfh;
121         pid_t otherpid;
122
123         if (getuid() != 0)
124                 errx(EX_SOFTWARE, "not super user");
125                 
126         parseargs(argc, argv);
127
128         if (do_syslog)
129                 openlog("watchdogd", LOG_CONS|LOG_NDELAY|LOG_PERROR,
130                     LOG_DAEMON);
131
132         rtp.type = RTP_PRIO_REALTIME;
133         rtp.prio = 0;
134         if (rtprio(RTP_SET, 0, &rtp) == -1)
135                 err(EX_OSERR, "rtprio");
136
137         if (!is_dry_run && watchdog_init() == -1)
138                 errx(EX_SOFTWARE, "unable to initialize watchdog");
139
140         if (is_daemon) {
141                 if (watchdog_onoff(1) == -1)
142                         err(EX_OSERR, "patting the dog");
143
144                 pfh = pidfile_open(pidfile, 0600, &otherpid);
145                 if (pfh == NULL) {
146                         if (errno == EEXIST) {
147                                 watchdog_onoff(0);
148                                 errx(EX_SOFTWARE, "%s already running, pid: %d",
149                                     getprogname(), otherpid);
150                         }
151                         warn("Cannot open or create pidfile");
152                 }
153
154                 if (debugging == 0 && daemon(0, 0) == -1) {
155                         watchdog_onoff(0);
156                         pidfile_remove(pfh);
157                         err(EX_OSERR, "daemon");
158                 }
159
160                 signal(SIGHUP, SIG_IGN);
161                 signal(SIGINT, sighandler);
162                 signal(SIGTERM, sighandler);
163
164                 pidfile_write(pfh);
165                 if (madvise(0, 0, MADV_PROTECT) != 0)
166                         warn("madvise failed");
167                 if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
168                         warn("mlockall failed");
169
170                 watchdog_loop();
171
172                 /* exiting */
173                 pidfile_remove(pfh);
174                 return (EX_OK);
175         } else {
176                 if (passive)
177                         timeout |= WD_PASSIVE;
178                 else
179                         timeout |= WD_ACTIVE;
180                 if (watchdog_patpat(timeout) < 0)
181                         err(EX_OSERR, "patting the dog");
182                 return (EX_OK);
183         }
184 }
185
186 /*
187  * Catch signals and begin shutdown process.
188  */
189 static void
190 sighandler(int signum)
191 {
192
193         if (signum == SIGINT || signum == SIGTERM)
194                 end_program = 1;
195 }
196
197 /*
198  * Open the watchdog device.
199  */
200 static int
201 watchdog_init(void)
202 {
203
204         if (is_dry_run)
205                 return 0;
206
207         fd = open("/dev/" _PATH_WATCHDOG, O_RDWR);
208         if (fd >= 0)
209                 return (0);
210         warn("Could not open watchdog device");
211         return (-1);
212 }
213
214 /*
215  * If we are doing timing, then get the time.
216  */
217 static int
218 watchdog_getuptime(struct timespec *tp)
219 {
220         int error;
221
222         if (!do_timedog)
223                 return 0;
224
225         error = clock_gettime(CLOCK_UPTIME_FAST, tp);
226         if (error)
227                 warn("clock_gettime");
228         return (error);
229 }
230
231 static long
232 watchdog_check_dogfunction_time(struct timespec *tp_start,
233     struct timespec *tp_end)
234 {
235         struct timeval tv_start, tv_end, tv_now, tv;
236         const char *cmd_prefix, *cmd;
237         struct timespec tp_now;
238         int sec;
239
240         if (!do_timedog)
241                 return (0);
242
243         TIMESPEC_TO_TIMEVAL(&tv_start, tp_start);
244         TIMESPEC_TO_TIMEVAL(&tv_end, tp_end);
245         timersub(&tv_end, &tv_start, &tv);
246         sec = tv.tv_sec;
247         if (sec < carp_thresh_seconds)
248                 return (sec);
249
250         if (test_cmd) {
251                 cmd_prefix = "Watchdog program";
252                 cmd = test_cmd;
253         } else {
254                 cmd_prefix = "Watchdog operation";
255                 cmd = "stat(\"/etc\", &sb)";
256         }
257         if (do_syslog)
258                 syslog(LOG_CRIT, "%s: '%s' took too long: "
259                     "%d.%06ld seconds >= %d seconds threshold",
260                     cmd_prefix, cmd, sec, (long)tv.tv_usec,
261                     carp_thresh_seconds);
262         else
263                 warnx("%s: '%s' took too long: "
264                     "%d.%06ld seconds >= %d seconds threshold",
265                     cmd_prefix, cmd, sec, (long)tv.tv_usec,
266                     carp_thresh_seconds);
267
268         /*
269          * Adjust the sleep interval again in case syslog(3) took a non-trivial
270          * amount of time to run.
271          */
272         if (watchdog_getuptime(&tp_now))
273                 return (sec);
274         TIMESPEC_TO_TIMEVAL(&tv_now, &tp_now);
275         timersub(&tv_now, &tv_start, &tv);
276         sec = tv.tv_sec;
277
278         return (sec);
279 }
280
281 /*
282  * Main program loop which is iterated every second.
283  */
284 static void
285 watchdog_loop(void)
286 {
287         struct timespec ts_start, ts_end;
288         struct stat sb;
289         long waited;
290         int error, failed;
291
292         while (end_program != 2) {
293                 failed = 0;
294
295                 error = watchdog_getuptime(&ts_start);
296                 if (error) {
297                         end_program = 1;
298                         goto try_end;
299                 }
300
301                 if (test_cmd != NULL)
302                         failed = system(test_cmd);
303                 else
304                         failed = stat("/etc", &sb);
305
306                 error = watchdog_getuptime(&ts_end);
307                 if (error) {
308                         end_program = 1;
309                         goto try_end;
310                 }
311
312                 if (failed == 0)
313                         watchdog_patpat(timeout|WD_ACTIVE);
314
315                 waited = watchdog_check_dogfunction_time(&ts_start, &ts_end);
316                 if (nap - waited > 0)
317                         sleep(nap - waited);
318
319 try_end:
320                 if (end_program != 0) {
321                         if (watchdog_onoff(0) == 0) {
322                                 end_program = 2;
323                         } else {
324                                 warnx("Could not stop the watchdog, not exiting");
325                                 end_program = 0;
326                         }
327                 }
328         }
329 }
330
331 /*
332  * Reset the watchdog timer. This function must be called periodically
333  * to keep the watchdog from firing.
334  */
335 static int
336 watchdog_patpat(u_int t)
337 {
338
339         if (is_dry_run)
340                 return 0;
341
342         return ioctl(fd, WDIOCPATPAT, &t);
343 }
344
345 /*
346  * Toggle the kernel's watchdog. This routine is used to enable and
347  * disable the watchdog.
348  */
349 static int
350 watchdog_onoff(int onoff)
351 {
352         int error;
353
354         /* fake successful watchdog op if a dry run */
355         if (is_dry_run)
356                 return 0;
357
358         if (onoff) {
359                 /*
360                  * Call the WDIOC_SETSOFT regardless of softtimeout_set
361                  * because we'll need to turn it off if someone had turned
362                  * it on.
363                  */
364                 error = ioctl(fd, WDIOC_SETSOFT, &softtimeout_set);
365                 if (error) {
366                         warn("setting WDIOC_SETSOFT %d", softtimeout_set);
367                         return (error);
368                 }
369                 error = watchdog_patpat((timeout|WD_ACTIVE));
370                 if (error) {
371                         warn("watchdog_patpat failed");
372                         goto failsafe;
373                 }
374                 if (softtimeout_act_set) {
375                         error = ioctl(fd, WDIOC_SETSOFTTIMEOUTACT,
376                             &softtimeout_act);
377                         if (error) {
378                                 warn("setting WDIOC_SETSOFTTIMEOUTACT %d",
379                                     softtimeout_act);
380                                 goto failsafe;
381                         }
382                 }
383                 if (pretimeout_set) {
384                         error = ioctl(fd, WDIOC_SETPRETIMEOUT, &pretimeout);
385                         if (error) {
386                                 warn("setting WDIOC_SETPRETIMEOUT %d",
387                                     pretimeout);
388                                 goto failsafe;
389                         }
390                 }
391                 if (pretimeout_act_set) {
392                         error = ioctl(fd, WDIOC_SETPRETIMEOUTACT,
393                             &pretimeout_act);
394                         if (error) {
395                                 warn("setting WDIOC_SETPRETIMEOUTACT %d",
396                                     pretimeout_act);
397                                 goto failsafe;
398                         }
399                 }
400                 /* pat one more time for good measure */
401                 return watchdog_patpat((timeout|WD_ACTIVE));
402          } else {
403                 return watchdog_patpat(0);
404          }
405 failsafe:
406         watchdog_patpat(0);
407         return (error);
408 }
409
410 /*
411  * Tell user how to use the program.
412  */
413 static void
414 usage(void)
415 {
416         if (is_daemon)
417                 fprintf(stderr, "usage:\n"
418 "  watchdogd [-dnSw] [-e cmd] [-I file] [-s sleep] [-t timeout]\n"
419 "            [-T script_timeout]\n"
420 "            [--debug]\n"
421 "            [--pretimeout seconds] [-pretimeout-action action]\n"
422 "            [--softtimeout] [-softtimeout-action action]\n"
423 );
424         else
425                 fprintf(stderr, "usage: watchdog [-d] [-t timeout]\n");
426         exit(EX_USAGE);
427 }
428
/*
 * Parse a strictly positive integer option argument.
 *
 * opt is the short option character and is used in the error message when
 * longopt is NULL; otherwise longopt names the long option the argument
 * belongs to.  myoptarg is the text to parse; strtol() base 0 is used, so
 * decimal, octal and hexadecimal notations are accepted.
 *
 * Returns the parsed value, which is always in the range 1..INT_MAX because
 * the callers in this file store it in int-sized variables.  Any other
 * input terminates the program through errx(EX_USAGE, ...).
 */
static long
fetchtimeout(int opt, const char *longopt, const char *myoptarg)
{
        const char *errstr;
        char *p;
        long rv;

        errstr = NULL;
        p = NULL;
        errno = 0;
        rv = strtol(myoptarg, &p, 0);
        /*
         * The checks are ordered so that the most specific complaint wins:
         * text that is not a number must not be reported as "not greater
         * than zero" merely because strtol() returned 0 for it.
         */
        if (p == myoptarg || *p != '\0' || (errno != 0 && errno != ERANGE))
                errstr = "is not a number";
        else if (rv <= 0)
                errstr = "must be greater than zero";
        else if (errno == ERANGE || rv > INT_MAX)
                errstr = "is too large";
        if (errstr != NULL) {
                if (longopt != NULL)
                        errx(EX_USAGE, "--%s argument %s", longopt, errstr);
                else
                        errx(EX_USAGE, "-%c argument %s", opt, errstr);
        }
        return (rv);
}
452
453 struct act_tbl {
454         const char *at_act;
455         int at_value;
456 };
457
458 static const struct act_tbl act_tbl[] = {
459         { "panic", WD_SOFT_PANIC },
460         { "ddb", WD_SOFT_DDB },
461         { "log", WD_SOFT_LOG },
462         { "printf", WD_SOFT_PRINTF },
463         { NULL, 0 }
464 };
465
466 static void
467 timeout_act_error(const char *lopt, const char *badact)
468 {
469         char *opts, *oldopts;
470         int i;
471
472         opts = NULL;
473         for (i = 0; act_tbl[i].at_act != NULL; i++) {
474                 oldopts = opts;
475                 if (asprintf(&opts, "%s%s%s",
476                     oldopts == NULL ? "" : oldopts,
477                     oldopts == NULL ? "" : ", ",
478                     act_tbl[i].at_act) == -1)
479                         err(EX_OSERR, "malloc");
480                 free(oldopts);
481         }
482         warnx("bad --%s argument '%s' must be one of (%s).",
483             lopt, badact, opts);
484         usage();
485 }
486
487 /*
488  * Take a comma separated list of actions and or the flags
489  * together for the ioctl.
490  */
491 static int
492 timeout_act_str2int(const char *lopt, const char *acts)
493 {
494         int i;
495         char *dupacts, *tofree;
496         char *o;
497         int rv = 0;
498
499         tofree = dupacts = strdup(acts);
500         if (!tofree)
501                 err(EX_OSERR, "malloc");
502         while ((o = strsep(&dupacts, ",")) != NULL) {
503                 for (i = 0; act_tbl[i].at_act != NULL; i++) {
504                         if (!strcmp(o, act_tbl[i].at_act)) {
505                                 rv |= act_tbl[i].at_value;
506                                 break;
507                         }
508                 }
509                 if (act_tbl[i].at_act == NULL)
510                         timeout_act_error(lopt, o);
511         }
512         free(tofree);
513         return rv;
514 }
515
516 /*
517  * Handle the few command line arguments supported.
518  */
519 static void
520 parseargs(int argc, char *argv[])
521 {
522         int longindex;
523         int c;
524         char *p;
525         const char *lopt;
526         double a;
527
528         /*
529          * if we end with a 'd' aka 'watchdogd' then we are the daemon program,
530          * otherwise run as a command line utility.
531          */
532         c = strlen(argv[0]);
533         if (argv[0][c - 1] == 'd')
534                 is_daemon = 1;
535
536         if (is_daemon)
537                 getopt_shortopts = "I:de:ns:t:ST:w?";
538         else
539                 getopt_shortopts = "dt:?";
540
541         while ((c = getopt_long(argc, argv, getopt_shortopts, longopts,
542                     &longindex)) != -1) {
543                 switch (c) {
544                 case 'I':
545                         pidfile = optarg;
546                         break;
547                 case 'd':
548                         debugging = 1;
549                         break;
550                 case 'e':
551                         test_cmd = strdup(optarg);
552                         break;
553                 case 'n':
554                         is_dry_run = 1;
555                         break;
556 #ifdef notyet
557                 case 'p':
558                         passive = 1;
559                         break;
560 #endif
561                 case 's':
562                         nap = fetchtimeout(c, NULL, optarg);
563                         break;
564                 case 'S':
565                         do_syslog = 0;
566                         break;
567                 case 't':
568                         p = NULL;
569                         errno = 0;
570                         a = strtod(optarg, &p);
571                         if ((p != NULL && *p != '\0') || errno != 0)
572                                 errx(EX_USAGE, "-t argument is not a number");
573                         if (a < 0)
574                                 errx(EX_USAGE, "-t argument must be positive");
575
576                         if (a == 0)
577                                 timeout = WD_TO_NEVER;
578                         else
579                                 timeout = flsll(a * 1e9);
580                         if (debugging)
581                                 printf("Timeout is 2^%d nanoseconds\n",
582                                     timeout);
583                         break;
584                 case 'T':
585                         carp_thresh_seconds = fetchtimeout(c, "NULL", optarg);
586                         break;
587                 case 'w':
588                         do_timedog = 1;
589                         break;
590                 case 0:
591                         lopt = longopts[longindex].name;
592                         if (!strcmp(lopt, "pretimeout")) {
593                                 pretimeout = fetchtimeout(0, lopt, optarg);
594                         } else if (!strcmp(lopt, "pretimeout-action")) {
595                                 pretimeout_act = timeout_act_str2int(lopt,
596                                     optarg);
597                         } else if (!strcmp(lopt, "softtimeout-action")) {
598                                 softtimeout_act = timeout_act_str2int(lopt,
599                                     optarg);
600                         } else {
601                 /*              warnx("bad option at index %d: %s", optind,
602                                     argv[optind]);
603                                 usage();
604                                 */
605                         }
606                         break;
607                 case '?':
608                 default:
609                         usage();
610                         /* NOTREACHED */
611                 }
612         }
613
614         if (carp_thresh_seconds == -1)
615                 carp_thresh_seconds = nap;
616
617         if (argc != optind)
618                 errx(EX_USAGE, "extra arguments.");
619         if (is_daemon && timeout < WD_TO_1SEC)
620                 errx(EX_USAGE, "-t argument is less than one second.");
621 }