]> CyberLeo.Net >> Repos - FreeBSD/releng/8.2.git/blob - sbin/hastd/hastd.c
MFS r217050: Make minidumps work on i386/XEN.
[FreeBSD/releng/8.2.git] / sbin / hastd / hastd.c
1 /*-
2  * Copyright (c) 2009-2010 The FreeBSD Foundation
3  * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
4  * All rights reserved.
5  *
6  * This software was developed by Pawel Jakub Dawidek under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/linker.h>
36 #include <sys/module.h>
37 #include <sys/wait.h>
38
39 #include <assert.h>
40 #include <err.h>
41 #include <errno.h>
42 #include <libutil.h>
43 #include <signal.h>
44 #include <stdbool.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <sysexits.h>
49 #include <unistd.h>
50
51 #include <activemap.h>
52 #include <pjdlog.h>
53
54 #include "control.h"
55 #include "event.h"
56 #include "hast.h"
57 #include "hast_proto.h"
58 #include "hastd.h"
59 #include "hooks.h"
60 #include "subr.h"
61
/* Path to configuration file; defaults to HAST_CONFIG, replaced by -c. */
const char *cfgpath = HAST_CONFIG;
/* Hastd configuration currently in effect (parsed in main()). */
static struct hastd_config *cfg;
/* Was SIGINT or SIGTERM signal received? Set by main_loop() before exiting. */
bool sigexit_received = false;
/* PID file handle from pidfile_open(); may be NULL if opening failed. */
struct pidfh *pfh;

/* How often (in seconds) to check for hooks running for too long. */
#define REPORT_INTERVAL 5
73
/*
 * Print the command-line synopsis and terminate with EX_USAGE.
 * errx(3) does not return.
 */
static void
usage(void)
{
	static const char synopsis[] = "[-dFh] [-c config] [-P pidfile]";

	errx(EX_USAGE, "%s", synopsis);
}
80
/*
 * Make sure the g_gate module is available, loading geom_gate if it is
 * not.  Exits with EX_OSERR when the module can neither be found nor
 * loaded, unless the failure was EEXIST.
 */
static void
g_gate_load(void)
{

	/* Already registered with the kernel - nothing to do. */
	if (modfind("g_gate") != -1)
		return;

	/* Not present in kernel, try loading it and look it up again. */
	if (kldload("geom_gate") != -1 && modfind("g_gate") != -1)
		return;

	/* EEXIST is the one failure that is tolerated here. */
	if (errno == EEXIST)
		return;

	pjdlog_exit(EX_OSERR, "Unable to load geom_gate module");
}
95
/*
 * Log how a worker process terminated.
 *
 * pid    - process ID of the worker, used only in the log message.
 * status - wait status as returned by wait3(2)/waitpid(2).
 *
 * A clean exit (code 0) is logged at debug level 1; death by signal and
 * any other exit are logged as errors.
 */
static void
child_exit_log(unsigned int pid, int status)
{
	int exitcode;

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
		pjdlog_debug(1, "Worker process exited gracefully (pid=%u).",
		    pid);
		return;
	}
	if (WIFSIGNALED(status)) {
		pjdlog_error("Worker process killed (pid=%u, signal=%d).",
		    pid, WTERMSIG(status));
		return;
	}
	/* Neither a clean exit nor a signal: report the exit code, or -1. */
	exitcode = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
	pjdlog_error("Worker process exited ungracefully (pid=%u, exitcode=%d).",
	    pid, exitcode);
}
111
112 static void
113 child_exit(void)
114 {
115         struct hast_resource *res;
116         int status;
117         pid_t pid;
118
119         while ((pid = wait3(&status, WNOHANG, NULL)) > 0) {
120                 /* Find resource related to the process that just exited. */
121                 TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
122                         if (pid == res->hr_workerpid)
123                                 break;
124                 }
125                 if (res == NULL) {
126                         /*
127                          * This can happen when new connection arrives and we
128                          * cancel child responsible for the old one or if this
129                          * was hook which we executed.
130                          */
131                         hook_check_one(pid, status);
132                         continue;
133                 }
134                 pjdlog_prefix_set("[%s] (%s) ", res->hr_name,
135                     role2str(res->hr_role));
136                 child_exit_log(pid, status);
137                 child_cleanup(res);
138                 if (res->hr_role == HAST_ROLE_PRIMARY) {
139                         /*
140                          * Restart child process if it was killed by signal
141                          * or exited because of temporary problem.
142                          */
143                         if (WIFSIGNALED(status) ||
144                             (WIFEXITED(status) &&
145                              WEXITSTATUS(status) == EX_TEMPFAIL)) {
146                                 sleep(1);
147                                 pjdlog_info("Restarting worker process.");
148                                 hastd_primary(res);
149                         } else {
150                                 res->hr_role = HAST_ROLE_INIT;
151                                 pjdlog_info("Changing resource role back to %s.",
152                                     role2str(res->hr_role));
153                         }
154                 }
155                 pjdlog_prefix_set("%s", "");
156         }
157 }
158
159 static bool
160 resource_needs_restart(const struct hast_resource *res0,
161     const struct hast_resource *res1)
162 {
163
164         assert(strcmp(res0->hr_name, res1->hr_name) == 0);
165
166         if (strcmp(res0->hr_provname, res1->hr_provname) != 0)
167                 return (true);
168         if (strcmp(res0->hr_localpath, res1->hr_localpath) != 0)
169                 return (true);
170         if (res0->hr_role == HAST_ROLE_INIT ||
171             res0->hr_role == HAST_ROLE_SECONDARY) {
172                 if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0)
173                         return (true);
174                 if (res0->hr_replication != res1->hr_replication)
175                         return (true);
176                 if (res0->hr_timeout != res1->hr_timeout)
177                         return (true);
178                 if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
179                         return (true);
180         }
181         return (false);
182 }
183
184 static bool
185 resource_needs_reload(const struct hast_resource *res0,
186     const struct hast_resource *res1)
187 {
188
189         assert(strcmp(res0->hr_name, res1->hr_name) == 0);
190         assert(strcmp(res0->hr_provname, res1->hr_provname) == 0);
191         assert(strcmp(res0->hr_localpath, res1->hr_localpath) == 0);
192
193         if (res0->hr_role != HAST_ROLE_PRIMARY)
194                 return (false);
195
196         if (strcmp(res0->hr_remoteaddr, res1->hr_remoteaddr) != 0)
197                 return (true);
198         if (res0->hr_replication != res1->hr_replication)
199                 return (true);
200         if (res0->hr_timeout != res1->hr_timeout)
201                 return (true);
202         if (strcmp(res0->hr_exec, res1->hr_exec) != 0)
203                 return (true);
204         return (false);
205 }
206
207 static void
208 hastd_reload(void)
209 {
210         struct hastd_config *newcfg;
211         struct hast_resource *nres, *cres, *tres;
212         uint8_t role;
213
214         pjdlog_info("Reloading configuration...");
215
216         newcfg = yy_config_parse(cfgpath, false);
217         if (newcfg == NULL)
218                 goto failed;
219
220         /*
221          * Check if control address has changed.
222          */
223         if (strcmp(cfg->hc_controladdr, newcfg->hc_controladdr) != 0) {
224                 if (proto_server(newcfg->hc_controladdr,
225                     &newcfg->hc_controlconn) < 0) {
226                         pjdlog_errno(LOG_ERR,
227                             "Unable to listen on control address %s",
228                             newcfg->hc_controladdr);
229                         goto failed;
230                 }
231         }
232         /*
233          * Check if listen address has changed.
234          */
235         if (strcmp(cfg->hc_listenaddr, newcfg->hc_listenaddr) != 0) {
236                 if (proto_server(newcfg->hc_listenaddr,
237                     &newcfg->hc_listenconn) < 0) {
238                         pjdlog_errno(LOG_ERR, "Unable to listen on address %s",
239                             newcfg->hc_listenaddr);
240                         goto failed;
241                 }
242         }
243         /*
244          * Only when both control and listen sockets are successfully
245          * initialized switch them to new configuration.
246          */
247         if (newcfg->hc_controlconn != NULL) {
248                 pjdlog_info("Control socket changed from %s to %s.",
249                     cfg->hc_controladdr, newcfg->hc_controladdr);
250                 proto_close(cfg->hc_controlconn);
251                 cfg->hc_controlconn = newcfg->hc_controlconn;
252                 newcfg->hc_controlconn = NULL;
253                 strlcpy(cfg->hc_controladdr, newcfg->hc_controladdr,
254                     sizeof(cfg->hc_controladdr));
255         }
256         if (newcfg->hc_listenconn != NULL) {
257                 pjdlog_info("Listen socket changed from %s to %s.",
258                     cfg->hc_listenaddr, newcfg->hc_listenaddr);
259                 proto_close(cfg->hc_listenconn);
260                 cfg->hc_listenconn = newcfg->hc_listenconn;
261                 newcfg->hc_listenconn = NULL;
262                 strlcpy(cfg->hc_listenaddr, newcfg->hc_listenaddr,
263                     sizeof(cfg->hc_listenaddr));
264         }
265
266         /*
267          * Stop and remove resources that were removed from the configuration.
268          */
269         TAILQ_FOREACH_SAFE(cres, &cfg->hc_resources, hr_next, tres) {
270                 TAILQ_FOREACH(nres, &newcfg->hc_resources, hr_next) {
271                         if (strcmp(cres->hr_name, nres->hr_name) == 0)
272                                 break;
273                 }
274                 if (nres == NULL) {
275                         control_set_role(cres, HAST_ROLE_INIT);
276                         TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next);
277                         pjdlog_info("Resource %s removed.", cres->hr_name);
278                         free(cres);
279                 }
280         }
281         /*
282          * Move new resources to the current configuration.
283          */
284         TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
285                 TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
286                         if (strcmp(cres->hr_name, nres->hr_name) == 0)
287                                 break;
288                 }
289                 if (cres == NULL) {
290                         TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next);
291                         TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next);
292                         pjdlog_info("Resource %s added.", nres->hr_name);
293                 }
294         }
295         /*
296          * Deal with modified resources.
297          * Depending on what has changed exactly we might want to perform
298          * different actions.
299          *
300          * We do full resource restart in the following situations:
301          * Resource role is INIT or SECONDARY.
302          * Resource role is PRIMARY and path to local component or provider
303          * name has changed.
304          * In case of PRIMARY, the worker process will be killed and restarted,
305          * which also means removing /dev/hast/<name> provider and
306          * recreating it.
307          *
308          * We do just reload (send SIGHUP to worker process) if we act as
309          * PRIMARY, but only remote address, replication mode and timeout
310          * has changed. For those, there is no need to restart worker process.
311          * If PRIMARY receives SIGHUP, it will reconnect if remote address or
312          * replication mode has changed or simply set new timeout if only
313          * timeout has changed.
314          */
315         TAILQ_FOREACH_SAFE(nres, &newcfg->hc_resources, hr_next, tres) {
316                 TAILQ_FOREACH(cres, &cfg->hc_resources, hr_next) {
317                         if (strcmp(cres->hr_name, nres->hr_name) == 0)
318                                 break;
319                 }
320                 assert(cres != NULL);
321                 if (resource_needs_restart(cres, nres)) {
322                         pjdlog_info("Resource %s configuration was modified, restarting it.",
323                             cres->hr_name);
324                         role = cres->hr_role;
325                         control_set_role(cres, HAST_ROLE_INIT);
326                         TAILQ_REMOVE(&cfg->hc_resources, cres, hr_next);
327                         free(cres);
328                         TAILQ_REMOVE(&newcfg->hc_resources, nres, hr_next);
329                         TAILQ_INSERT_TAIL(&cfg->hc_resources, nres, hr_next);
330                         control_set_role(nres, role);
331                 } else if (resource_needs_reload(cres, nres)) {
332                         pjdlog_info("Resource %s configuration was modified, reloading it.",
333                             cres->hr_name);
334                         strlcpy(cres->hr_remoteaddr, nres->hr_remoteaddr,
335                             sizeof(cres->hr_remoteaddr));
336                         cres->hr_replication = nres->hr_replication;
337                         cres->hr_timeout = nres->hr_timeout;
338                         if (cres->hr_workerpid != 0) {
339                                 if (kill(cres->hr_workerpid, SIGHUP) < 0) {
340                                         pjdlog_errno(LOG_WARNING,
341                                             "Unable to send SIGHUP to worker process %u",
342                                             (unsigned int)cres->hr_workerpid);
343                                 }
344                         }
345                 }
346         }
347
348         yy_config_free(newcfg);
349         pjdlog_info("Configuration reloaded successfully.");
350         return;
351 failed:
352         if (newcfg != NULL) {
353                 if (newcfg->hc_controlconn != NULL)
354                         proto_close(newcfg->hc_controlconn);
355                 if (newcfg->hc_listenconn != NULL)
356                         proto_close(newcfg->hc_listenconn);
357                 yy_config_free(newcfg);
358         }
359         pjdlog_warning("Configuration not reloaded.");
360 }
361
362 static void
363 terminate_workers(void)
364 {
365         struct hast_resource *res;
366
367         pjdlog_info("Termination signal received, exiting.");
368         TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
369                 if (res->hr_workerpid == 0)
370                         continue;
371                 pjdlog_info("Terminating worker process (resource=%s, role=%s, pid=%u).",
372                     res->hr_name, role2str(res->hr_role), res->hr_workerpid);
373                 if (kill(res->hr_workerpid, SIGTERM) == 0)
374                         continue;
375                 pjdlog_errno(LOG_WARNING,
376                     "Unable to send signal to worker process (resource=%s, role=%s, pid=%u).",
377                     res->hr_name, role2str(res->hr_role), res->hr_workerpid);
378         }
379 }
380
/*
 * Accept one connection on the listen socket and attach it to the
 * resource named in the header sent by the peer.
 *
 * The peer address must match at least one configured resource before
 * anything is read from it.  A header without a "token" field opens a
 * new session: any existing worker or half-open connection for the
 * resource is cancelled, a fresh random token is sent back and the
 * connection is kept as hr_remotein.  A header carrying the matching
 * token becomes hr_remoteout and hastd_secondary() is called.
 *
 * Failures detected once the header fields have been read are reported
 * to the peer in an "errmsg" field (label "fail"); earlier failures just
 * close the connection (label "close").
 */
static void
listen_accept(void)
{
	struct hast_resource *res;
	struct proto_conn *conn;
	struct nv *nvin, *nvout, *nverr;
	const char *resname;
	const unsigned char *token;
	char laddr[256], raddr[256];
	size_t size;
	pid_t pid;
	int status;

	proto_local_address(cfg->hc_listenconn, laddr, sizeof(laddr));
	pjdlog_debug(1, "Accepting connection to %s.", laddr);

	if (proto_accept(cfg->hc_listenconn, &conn) < 0) {
		pjdlog_errno(LOG_ERR, "Unable to accept connection %s", laddr);
		return;
	}

	/* From here on laddr describes the accepted connection itself. */
	proto_local_address(conn, laddr, sizeof(laddr));
	proto_remote_address(conn, raddr, sizeof(raddr));
	pjdlog_info("Connection from %s to %s.", raddr, laddr);

	/* Error in setting timeout is not critical, but why should it fail? */
	if (proto_timeout(conn, HAST_TIMEOUT) < 0)
		pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");

	/* Start from NULL so the "close" path knows what to free. */
	nvin = nvout = nverr = NULL;

	/*
	 * Before receiving any data see if remote host have access to any
	 * resource.
	 */
	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
		if (proto_address_match(conn, res->hr_remoteaddr))
			break;
	}
	if (res == NULL) {
		pjdlog_error("Client %s isn't known.", raddr);
		goto close;
	}
	/* Ok, remote host can access at least one resource. */

	if (hast_proto_recv_hdr(conn, &nvin) < 0) {
		pjdlog_errno(LOG_ERR, "Unable to receive header from %s",
		    raddr);
		goto close;
	}

	resname = nv_get_string(nvin, "resource");
	if (resname == NULL) {
		pjdlog_error("No 'resource' field in the header received from %s.",
		    raddr);
		goto close;
	}
	pjdlog_debug(2, "%s: resource=%s", raddr, resname);
	token = nv_get_uint8_array(nvin, &size, "token");
	/*
	 * NULL token means that this is first connection.
	 */
	if (token != NULL && size != sizeof(res->hr_token)) {
		pjdlog_error("Received token of invalid size from %s (expected %zu, got %zu).",
		    raddr, sizeof(res->hr_token), size);
		goto close;
	}

	/*
	 * From now on we want to send errors to the remote node.
	 */
	nverr = nv_alloc();

	/* Find resource related to this connection. */
	TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
		if (strcmp(resname, res->hr_name) == 0)
			break;
	}
	/* Have we found the resource? */
	if (res == NULL) {
		pjdlog_error("No resource '%s' as requested by %s.",
		    resname, raddr);
		nv_add_stringf(nverr, "errmsg", "Resource not configured.");
		goto fail;
	}

	/* Now that we know resource name setup log prefix. */
	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));

	/* Does the remote host have access to this resource? */
	if (!proto_address_match(conn, res->hr_remoteaddr)) {
		pjdlog_error("Client %s has no access to the resource.", raddr);
		nv_add_stringf(nverr, "errmsg", "No access to the resource.");
		goto fail;
	}
	/* Is the resource marked as secondary? */
	if (res->hr_role != HAST_ROLE_SECONDARY) {
		pjdlog_error("We act as %s for the resource and not as %s as requested by %s.",
		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY),
		    raddr);
		nv_add_stringf(nverr, "errmsg",
		    "Remote node acts as %s for the resource and not as %s.",
		    role2str(res->hr_role), role2str(HAST_ROLE_SECONDARY));
		goto fail;
	}
	/* Does token (if exists) match? */
	if (token != NULL && memcmp(token, res->hr_token,
	    sizeof(res->hr_token)) != 0) {
		pjdlog_error("Token received from %s doesn't match.", raddr);
		nv_add_stringf(nverr, "errmsg", "Token doesn't match.");
		goto fail;
	}
	/*
	 * If there is no token, but we have half-open connection
	 * (only remotein) or full connection (worker process is running)
	 * we have to cancel those and accept the new connection.
	 */
	if (token == NULL) {
		assert(res->hr_remoteout == NULL);
		pjdlog_debug(1, "Initial connection from %s.", raddr);
		if (res->hr_workerpid != 0) {
			assert(res->hr_remotein == NULL);
			pjdlog_debug(1,
			    "Worker process exists (pid=%u), stopping it.",
			    (unsigned int)res->hr_workerpid);
			/* Stop child process. */
			if (kill(res->hr_workerpid, SIGINT) < 0) {
				pjdlog_errno(LOG_ERR,
				    "Unable to stop worker process (pid=%u)",
				    (unsigned int)res->hr_workerpid);
				/*
				 * Other than logging the problem we
				 * ignore it - nothing smart to do.
				 */
			}
			/* Wait for it to exit. */
			else if ((pid = waitpid(res->hr_workerpid,
			    &status, 0)) != res->hr_workerpid) {
				/* We can only log the problem. */
				pjdlog_errno(LOG_ERR,
				    "Waiting for worker process (pid=%u) failed",
				    (unsigned int)res->hr_workerpid);
			} else {
				child_exit_log(res->hr_workerpid, status);
			}
			/* Cleanup runs whether or not the worker was reaped. */
			child_cleanup(res);
		} else if (res->hr_remotein != NULL) {
			char oaddr[256];

			proto_remote_address(res->hr_remotein, oaddr,
			    sizeof(oaddr));
			pjdlog_debug(1,
			    "Canceling half-open connection from %s on connection from %s.",
			    oaddr, raddr);
			proto_close(res->hr_remotein);
			res->hr_remotein = NULL;
		}
	}

	/*
	 * Checks and cleanups are done.
	 */

	if (token == NULL) {
		/* First connection: generate a token and send it back. */
		arc4random_buf(res->hr_token, sizeof(res->hr_token));
		nvout = nv_alloc();
		nv_add_uint8_array(nvout, res->hr_token,
		    sizeof(res->hr_token), "token");
		if (nv_error(nvout) != 0) {
			pjdlog_common(LOG_ERR, 0, nv_error(nvout),
			    "Unable to prepare return header for %s", raddr);
			nv_add_stringf(nverr, "errmsg",
			    "Remote node was unable to prepare return header: %s.",
			    strerror(nv_error(nvout)));
			goto fail;
		}
		if (hast_proto_send(NULL, conn, nvout, NULL, 0) < 0) {
			/* Save errno before logging, for the error reply. */
			int error = errno;

			pjdlog_errno(LOG_ERR, "Unable to send response to %s",
			    raddr);
			nv_add_stringf(nverr, "errmsg",
			    "Remote node was unable to send response: %s.",
			    strerror(error));
			goto fail;
		}
		/* The resource now owns conn; it is not closed here. */
		res->hr_remotein = conn;
		pjdlog_debug(1, "Incoming connection from %s configured.",
		    raddr);
	} else {
		/* Second connection with a valid token. */
		res->hr_remoteout = conn;
		pjdlog_debug(1, "Outgoing connection to %s configured.", raddr);
		hastd_secondary(res, nvin);
	}
	nv_free(nvin);
	nv_free(nvout);
	nv_free(nverr);
	pjdlog_prefix_set("%s", "");
	return;
fail:
	/* Try to tell the peer what went wrong, then fall into "close". */
	if (nv_error(nverr) != 0) {
		pjdlog_common(LOG_ERR, 0, nv_error(nverr),
		    "Unable to prepare error header for %s", raddr);
		goto close;
	}
	if (hast_proto_send(NULL, conn, nverr, NULL, 0) < 0) {
		pjdlog_errno(LOG_ERR, "Unable to send error to %s", raddr);
		goto close;
	}
close:
	/* Release whatever was allocated and drop the connection. */
	if (nvin != NULL)
		nv_free(nvin);
	if (nvout != NULL)
		nv_free(nvout);
	if (nverr != NULL)
		nv_free(nverr);
	proto_close(conn);
	pjdlog_prefix_set("%s", "");
}
600
601 static void
602 main_loop(void)
603 {
604         struct hast_resource *res;
605         struct timeval seltimeout;
606         struct timespec sigtimeout;
607         int fd, maxfd, ret, signo;
608         sigset_t mask;
609         fd_set rfds;
610
611         seltimeout.tv_sec = REPORT_INTERVAL;
612         seltimeout.tv_usec = 0;
613         sigtimeout.tv_sec = 0;
614         sigtimeout.tv_nsec = 0;
615
616         PJDLOG_VERIFY(sigemptyset(&mask) == 0);
617         PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0);
618         PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0);
619         PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0);
620         PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0);
621
622         for (;;) {
623                 while ((signo = sigtimedwait(&mask, NULL, &sigtimeout)) != -1) {
624                         switch (signo) {
625                         case SIGINT:
626                         case SIGTERM:
627                                 sigexit_received = true;
628                                 terminate_workers();
629                                 exit(EX_OK);
630                                 break;
631                         case SIGCHLD:
632                                 child_exit();
633                                 break;
634                         case SIGHUP:
635                                 hastd_reload();
636                                 break;
637                         default:
638                                 assert(!"invalid condition");
639                         }
640                 }
641
642                 /* Setup descriptors for select(2). */
643                 FD_ZERO(&rfds);
644                 maxfd = fd = proto_descriptor(cfg->hc_controlconn);
645                 assert(fd >= 0);
646                 FD_SET(fd, &rfds);
647                 fd = proto_descriptor(cfg->hc_listenconn);
648                 assert(fd >= 0);
649                 FD_SET(fd, &rfds);
650                 maxfd = fd > maxfd ? fd : maxfd;
651                 TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
652                         if (res->hr_event == NULL)
653                                 continue;
654                         fd = proto_descriptor(res->hr_event);
655                         assert(fd >= 0);
656                         FD_SET(fd, &rfds);
657                         maxfd = fd > maxfd ? fd : maxfd;
658                 }
659
660                 assert(maxfd + 1 <= (int)FD_SETSIZE);
661                 ret = select(maxfd + 1, &rfds, NULL, NULL, &seltimeout);
662                 if (ret == 0)
663                         hook_check();
664                 else if (ret == -1) {
665                         if (errno == EINTR)
666                                 continue;
667                         KEEP_ERRNO((void)pidfile_remove(pfh));
668                         pjdlog_exit(EX_OSERR, "select() failed");
669                 }
670
671                 if (FD_ISSET(proto_descriptor(cfg->hc_controlconn), &rfds))
672                         control_handle(cfg);
673                 if (FD_ISSET(proto_descriptor(cfg->hc_listenconn), &rfds))
674                         listen_accept();
675                 TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
676                         if (res->hr_event == NULL)
677                                 continue;
678                         if (FD_ISSET(proto_descriptor(res->hr_event), &rfds)) {
679                                 if (event_recv(res) == 0)
680                                         continue;
681                                 /* The worker process exited? */
682                                 proto_close(res->hr_event);
683                                 res->hr_event = NULL;
684                         }
685                 }
686         }
687 }
688
/*
 * Signal handler that deliberately does nothing.  main() installs it
 * for SIGCHLD so that the signal can be masked.
 */
static void
dummy_sighandler(int sig)
{

	/* Nothing to do; the argument is intentionally unused. */
	(void)sig;
}
694
695 int
696 main(int argc, char *argv[])
697 {
698         const char *pidfile;
699         pid_t otherpid;
700         bool foreground;
701         int debuglevel;
702         sigset_t mask;
703
704         foreground = false;
705         debuglevel = 0;
706         pidfile = HASTD_PIDFILE;
707
708         for (;;) {
709                 int ch;
710
711                 ch = getopt(argc, argv, "c:dFhP:");
712                 if (ch == -1)
713                         break;
714                 switch (ch) {
715                 case 'c':
716                         cfgpath = optarg;
717                         break;
718                 case 'd':
719                         debuglevel++;
720                         break;
721                 case 'F':
722                         foreground = true;
723                         break;
724                 case 'P':
725                         pidfile = optarg;
726                         break;
727                 case 'h':
728                 default:
729                         usage();
730                 }
731         }
732         argc -= optind;
733         argv += optind;
734
735         pjdlog_debug_set(debuglevel);
736
737         g_gate_load();
738
739         pfh = pidfile_open(pidfile, 0600, &otherpid);
740         if (pfh == NULL) {
741                 if (errno == EEXIST) {
742                         pjdlog_exitx(EX_TEMPFAIL,
743                             "Another hastd is already running, pid: %jd.",
744                             (intmax_t)otherpid);
745                 }
746                 /* If we cannot create pidfile from other reasons, only warn. */
747                 pjdlog_errno(LOG_WARNING, "Unable to open or create pidfile");
748         }
749
750         cfg = yy_config_parse(cfgpath, true);
751         assert(cfg != NULL);
752
753         /*
754          * Because SIGCHLD is ignored by default, setup dummy handler for it,
755          * so we can mask it.
756          */
757         PJDLOG_VERIFY(signal(SIGCHLD, dummy_sighandler) != SIG_ERR);
758         PJDLOG_VERIFY(sigemptyset(&mask) == 0);
759         PJDLOG_VERIFY(sigaddset(&mask, SIGHUP) == 0);
760         PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0);
761         PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0);
762         PJDLOG_VERIFY(sigaddset(&mask, SIGCHLD) == 0);
763         PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
764
765         /* Listen on control address. */
766         if (proto_server(cfg->hc_controladdr, &cfg->hc_controlconn) < 0) {
767                 KEEP_ERRNO((void)pidfile_remove(pfh));
768                 pjdlog_exit(EX_OSERR, "Unable to listen on control address %s",
769                     cfg->hc_controladdr);
770         }
771         /* Listen for remote connections. */
772         if (proto_server(cfg->hc_listenaddr, &cfg->hc_listenconn) < 0) {
773                 KEEP_ERRNO((void)pidfile_remove(pfh));
774                 pjdlog_exit(EX_OSERR, "Unable to listen on address %s",
775                     cfg->hc_listenaddr);
776         }
777
778         if (!foreground) {
779                 if (daemon(0, 0) < 0) {
780                         KEEP_ERRNO((void)pidfile_remove(pfh));
781                         pjdlog_exit(EX_OSERR, "Unable to daemonize");
782                 }
783
784                 /* Start logging to syslog. */
785                 pjdlog_mode_set(PJDLOG_MODE_SYSLOG);
786
787                 /* Write PID to a file. */
788                 if (pidfile_write(pfh) < 0) {
789                         pjdlog_errno(LOG_WARNING,
790                             "Unable to write PID to a file");
791                 }
792         }
793
794         hook_init();
795
796         main_loop();
797
798         exit(0);
799 }