sbin/hastd/secondary.c

   1 /*-
   2  * Copyright (c) 2009-2010 The FreeBSD Foundation
   3  * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
   4  * All rights reserved.
   5  *
   6  * This software was developed by Pawel Jakub Dawidek under sponsorship from
   7  * the FreeBSD Foundation.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in the
  16  *    documentation and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28  * SUCH DAMAGE.
  29  */
  30
  31 #include <sys/cdefs.h>
  32 __FBSDID("$FreeBSD$");
  33
  34 #include <sys/param.h>
  35 #include <sys/time.h>
  36 #include <sys/bio.h>
  37 #include <sys/disk.h>
  38 #include <sys/stat.h>
  39
  40 #include <assert.h>
  41 #include <err.h>
  42 #include <errno.h>
  43 #include <fcntl.h>
  44 #include <libgeom.h>
  45 #include <pthread.h>
  46 #include <signal.h>
  47 #include <stdint.h>
  48 #include <stdio.h>
  49 #include <string.h>
  50 #include <sysexits.h>
  51 #include <unistd.h>
  52
  53 #include <activemap.h>
  54 #include <nv.h>
  55 #include <pjdlog.h>
  56
  57 #include "control.h"
  58 #include "event.h"
  59 #include "hast.h"
  60 #include "hast_proto.h"
  61 #include "hastd.h"
  62 #include "hooks.h"
  63 #include "metadata.h"
  64 #include "proto.h"
  65 #include "subr.h"
  66 #include "synch.h"
  67
  68 struct hio {
  69         uint64_t         hio_seq;
  70         int              hio_error;
  71         struct nv       *hio_nv;
  72         void            *hio_data;
  73         uint8_t          hio_cmd;
  74         uint64_t         hio_offset;
  75         uint64_t         hio_length;
  76         TAILQ_ENTRY(hio) hio_next;
  77 };
  78
  79 static struct hast_resource *gres;
  80
  81 /*
  82  * Free list holds unused structures. When free list is empty, we have to wait
  83  * until some in-progress requests are freed.
  84  */
  85 static TAILQ_HEAD(, hio) hio_free_list;
  86 static pthread_mutex_t hio_free_list_lock;
  87 static pthread_cond_t hio_free_list_cond;
  88 /*
  89  * Disk thread (the one that do I/O requests) takes requests from this list.
  90  */
  91 static TAILQ_HEAD(, hio) hio_disk_list;
  92 static pthread_mutex_t hio_disk_list_lock;
  93 static pthread_cond_t hio_disk_list_cond;
  94 /*
  95  * There is one recv list for every component, although local components don't
  96  * use recv lists as local requests are done synchronously.
  97  */
  98 static TAILQ_HEAD(, hio) hio_send_list;
  99 static pthread_mutex_t hio_send_list_lock;
 100 static pthread_cond_t hio_send_list_cond;
 101
 102 /*
 103  * Maximum number of outstanding I/O requests.
 104  */
 105 #define HAST_HIO_MAX    256
 106
 107 static void *recv_thread(void *arg);
 108 static void *disk_thread(void *arg);
 109 static void *send_thread(void *arg);
 110
 111 #define QUEUE_INSERT(name, hio) do {                                    \
 112         bool _wakeup;                                                   \
 113                                                                         \
 114         mtx_lock(&hio_##name##_list_lock);                              \
 115         _wakeup = TAILQ_EMPTY(&hio_##name##_list);                      \
 116         TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_next);         \
 117         mtx_unlock(&hio_##name##_list_lock);                            \
 118         if (_wakeup)                                                    \
 119                 cv_signal(&hio_##name##_list_cond);                     \
 120 } while (0)
 121 #define QUEUE_TAKE(name, hio)   do {                                    \
 122         mtx_lock(&hio_##name##_list_lock);                              \
 123         while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) {     \
 124                 cv_wait(&hio_##name##_list_cond,                        \
 125                     &hio_##name##_list_lock);                           \
 126         }                                                               \
 127         TAILQ_REMOVE(&hio_##name##_list, (hio), hio_next);              \
 128         mtx_unlock(&hio_##name##_list_lock);                            \
 129 } while (0)
 130
 131 static void
 132 init_environment(void)
 133 {
 134         struct hio *hio;
 135         unsigned int ii;
 136
 137         /*
 138          * Initialize lists, their locks and theirs condition variables.
 139          */
 140         TAILQ_INIT(&hio_free_list);
 141         mtx_init(&hio_free_list_lock);
 142         cv_init(&hio_free_list_cond);
 143         TAILQ_INIT(&hio_disk_list);
 144         mtx_init(&hio_disk_list_lock);
 145         cv_init(&hio_disk_list_cond);
 146         TAILQ_INIT(&hio_send_list);
 147         mtx_init(&hio_send_list_lock);
 148         cv_init(&hio_send_list_cond);
 149
 150         /*
 151          * Allocate requests pool and initialize requests.
 152          */
 153         for (ii = 0; ii < HAST_HIO_MAX; ii++) {
 154                 hio = malloc(sizeof(*hio));
 155                 if (hio == NULL) {
 156                         pjdlog_exitx(EX_TEMPFAIL,
 157                             "Unable to allocate memory (%zu bytes) for hio request.",
 158                             sizeof(*hio));
 159                 }
 160                 hio->hio_error = 0;
 161                 hio->hio_data = malloc(MAXPHYS);
 162                 if (hio->hio_data == NULL) {
 163                         pjdlog_exitx(EX_TEMPFAIL,
 164                             "Unable to allocate memory (%zu bytes) for gctl_data.",
 165                             (size_t)MAXPHYS);
 166                 }
 167                 TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_next);
 168         }
 169 }
 170
 171 static void
 172 init_local(struct hast_resource *res)
 173 {
 174
 175         if (metadata_read(res, true) < 0)
 176                 exit(EX_NOINPUT);
 177 }
 178
 179 static void
 180 init_remote(struct hast_resource *res, struct nv *nvin)
 181 {
 182         uint64_t resuid;
 183         struct nv *nvout;
 184         unsigned char *map;
 185         size_t mapsize;
 186
 187         map = NULL;
 188         mapsize = 0;
 189         nvout = nv_alloc();
 190         nv_add_int64(nvout, (int64_t)res->hr_datasize, "datasize");
 191         nv_add_int32(nvout, (int32_t)res->hr_extentsize, "extentsize");
 192         resuid = nv_get_uint64(nvin, "resuid");
 193         res->hr_primary_localcnt = nv_get_uint64(nvin, "localcnt");
 194         res->hr_primary_remotecnt = nv_get_uint64(nvin, "remotecnt");
 195         nv_add_uint64(nvout, res->hr_secondary_localcnt, "localcnt");
 196         nv_add_uint64(nvout, res->hr_secondary_remotecnt, "remotecnt");
 197         mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
 198             METADATA_SIZE, res->hr_extentsize, res->hr_local_sectorsize);
 199         map = malloc(mapsize);
 200         if (map == NULL) {
 201                 pjdlog_exitx(EX_TEMPFAIL,
 202                     "Unable to allocate memory (%zu bytes) for activemap.",
 203                     mapsize);
 204         }
 205         nv_add_uint32(nvout, (uint32_t)mapsize, "mapsize");
 206         /*
 207          * When we work as primary and secondary is missing we will increase
 208          * localcnt in our metadata. When secondary is connected and synced
 209          * we make localcnt be equal to remotecnt, which means nodes are more
 210          * or less in sync.
 211          * Split-brain condition is when both nodes are not able to communicate
 212          * and are both configured as primary nodes. In turn, they can both
 213          * make incompatible changes to the data and we have to detect that.
 214          * Under split-brain condition we will increase our localcnt on first
 215          * write and remote node will increase its localcnt on first write.
 216          * When we connect we can see that primary's localcnt is greater than
 217          * our remotecnt (primary was modified while we weren't watching) and
 218          * our localcnt is greater than primary's remotecnt (we were modified
 219          * while primary wasn't watching).
 220          * There are many possible combinations which are all gathered below.
 221          * Don't pay too much attention to exact numbers, the more important
 222          * is to compare them. We compare secondary's local with primary's
 223          * remote and secondary's remote with primary's local.
 224          * Note that every case where primary's localcnt is smaller than
 225          * secondary's remotecnt and where secondary's localcnt is smaller than
 226          * primary's remotecnt should be impossible in practise. We will perform
 227          * full synchronization then. Those cases are marked with an asterisk.
 228          * Regular synchronization means that only extents marked as dirty are
 229          * synchronized (regular synchronization).
 230          *
 231          * SECONDARY METADATA PRIMARY METADATA
 232          * local=3 remote=3   local=2 remote=2*  ?! Full sync from secondary.
 233          * local=3 remote=3   local=2 remote=3*  ?! Full sync from primary.
 234          * local=3 remote=3   local=2 remote=4*  ?! Full sync from primary.
 235          * local=3 remote=3   local=3 remote=2   Primary is out-of-date,
 236          *                                       regular sync from secondary.
 237          * local=3 remote=3   local=3 remote=3   Regular sync just in case.
 238          * local=3 remote=3   local=3 remote=4*  ?! Full sync from primary.
 239          * local=3 remote=3   local=4 remote=2   Split-brain condition.
 240          * local=3 remote=3   local=4 remote=3   Secondary out-of-date,
 241          *                                       regular sync from primary.
 242          * local=3 remote=3   local=4 remote=4*  ?! Full sync from primary.
 243          */
 244         if (res->hr_resuid == 0) {
 245                 /*
 246                  * Provider is used for the first time. If primary node done no
 247                  * writes yet as well (we will find "virgin" argument) then
 248                  * there is no need to synchronize anything. If primary node
 249                  * done any writes already we have to synchronize everything.
 250                  */
 251                 assert(res->hr_secondary_localcnt == 0);
 252                 res->hr_resuid = resuid;
 253                 if (metadata_write(res) < 0)
 254                         exit(EX_NOINPUT);
 255                 if (nv_exists(nvin, "virgin")) {
 256                         free(map);
 257                         map = NULL;
 258                         mapsize = 0;
 259                 } else {
 260                         memset(map, 0xff, mapsize);
 261                 }
 262                 nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
 263         } else if (
 264             /* Is primary is out-of-date? */
 265             (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
 266              res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
 267             /* Node are more or less in sync? */
 268             (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
 269              res->hr_secondary_remotecnt == res->hr_primary_localcnt) ||
 270             /* Is secondary is out-of-date? */
 271             (res->hr_secondary_localcnt == res->hr_primary_remotecnt &&
 272              res->hr_secondary_remotecnt < res->hr_primary_localcnt)) {
 273                 /*
 274                  * Nodes are more or less in sync or one of the nodes is
 275                  * out-of-date.
 276                  * It doesn't matter at this point which one, we just have to
 277                  * send out local bitmap to the remote node.
 278                  */
 279                 if (pread(res->hr_localfd, map, mapsize, METADATA_SIZE) !=
 280                     (ssize_t)mapsize) {
 281                         pjdlog_exit(LOG_ERR, "Unable to read activemap");
 282                 }
 283                 if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
 284                      res->hr_secondary_remotecnt == res->hr_primary_localcnt) {
 285                         /* Primary is out-of-date, sync from secondary. */
 286                         nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
 287                 } else {
 288                         /*
 289                          * Secondary is out-of-date or counts match.
 290                          * Sync from primary.
 291                          */
 292                         nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
 293                 }
 294         } else if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
 295              res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
 296                 /*
 297                  * Not good, we have split-brain condition.
 298                  */
 299                 pjdlog_error("Split-brain detected, exiting.");
 300                 nv_add_string(nvout, "Split-brain condition!", "errmsg");
 301                 free(map);
 302                 map = NULL;
 303                 mapsize = 0;
 304         } else /* if (res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
 305             res->hr_primary_localcnt < res->hr_secondary_remotecnt) */ {
 306                 /*
 307                  * This should never happen in practise, but we will perform
 308                  * full synchronization.
 309                  */
 310                 assert(res->hr_secondary_localcnt < res->hr_primary_remotecnt ||
 311                     res->hr_primary_localcnt < res->hr_secondary_remotecnt);
 312                 mapsize = activemap_calc_ondisk_size(res->hr_local_mediasize -
 313                     METADATA_SIZE, res->hr_extentsize,
 314                     res->hr_local_sectorsize);
 315                 memset(map, 0xff, mapsize);
 316                 if (res->hr_secondary_localcnt > res->hr_primary_remotecnt) {
 317                         /* In this one of five cases sync from secondary. */
 318                         nv_add_uint8(nvout, HAST_SYNCSRC_SECONDARY, "syncsrc");
 319                 } else {
 320                         /* For the rest four cases sync from primary. */
 321                         nv_add_uint8(nvout, HAST_SYNCSRC_PRIMARY, "syncsrc");
 322                 }
 323                 pjdlog_warning("This should never happen, asking for full synchronization (primary(local=%ju, remote=%ju), secondary(local=%ju, remote=%ju)).",
 324                     (uintmax_t)res->hr_primary_localcnt,
 325                     (uintmax_t)res->hr_primary_remotecnt,
 326                     (uintmax_t)res->hr_secondary_localcnt,
 327                     (uintmax_t)res->hr_secondary_remotecnt);
 328         }
 329         if (hast_proto_send(res, res->hr_remotein, nvout, map, mapsize) < 0) {
 330                 pjdlog_exit(EX_TEMPFAIL, "Unable to send activemap to %s",
 331                     res->hr_remoteaddr);
 332         }
 333         if (map != NULL)
 334                 free(map);
 335         nv_free(nvout);
 336         if (res->hr_secondary_localcnt > res->hr_primary_remotecnt &&
 337              res->hr_primary_localcnt > res->hr_secondary_remotecnt) {
 338                 /* Exit on split-brain. */
 339                 event_send(res, EVENT_SPLITBRAIN);
 340                 exit(EX_CONFIG);
 341         }
 342 }
 343
 344 void
 345 hastd_secondary(struct hast_resource *res, struct nv *nvin)
 346 {
 347         sigset_t mask;
 348         pthread_t td;
 349         pid_t pid;
 350         int error;
 351
 352         /*
 353          * Create communication channel between parent and child.
 354          */
 355         if (proto_client("socketpair://", &res->hr_ctrl) < 0) {
 356                 KEEP_ERRNO((void)pidfile_remove(pfh));
 357                 pjdlog_exit(EX_OSERR,
 358                     "Unable to create control sockets between parent and child");
 359         }
 360         /*
 361          * Create communication channel between child and parent.
 362          */
 363         if (proto_client("socketpair://", &res->hr_event) < 0) {
 364                 KEEP_ERRNO((void)pidfile_remove(pfh));
 365                 pjdlog_exit(EX_OSERR,
 366                     "Unable to create event sockets between child and parent");
 367         }
 368
 369         pid = fork();
 370         if (pid < 0) {
 371                 KEEP_ERRNO((void)pidfile_remove(pfh));
 372                 pjdlog_exit(EX_OSERR, "Unable to fork");
 373         }
 374
 375         if (pid > 0) {
 376                 /* This is parent. */
 377                 proto_close(res->hr_remotein);
 378                 res->hr_remotein = NULL;
 379                 proto_close(res->hr_remoteout);
 380                 res->hr_remoteout = NULL;
 381                 /* Declare that we are receiver. */
 382                 proto_recv(res->hr_event, NULL, 0);
 383                 res->hr_workerpid = pid;
 384                 return;
 385         }
 386
 387         gres = res;
 388
 389         (void)pidfile_close(pfh);
 390         hook_fini();
 391
 392         setproctitle("%s (secondary)", res->hr_name);
 393
 394         PJDLOG_VERIFY(sigemptyset(&mask) == 0);
 395         PJDLOG_VERIFY(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
 396
 397         /* Declare that we are sender. */
 398         proto_send(res->hr_event, NULL, 0);
 399
 400         /* Error in setting timeout is not critical, but why should it fail? */
 401         if (proto_timeout(res->hr_remotein, 0) < 0)
 402                 pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
 403         if (proto_timeout(res->hr_remoteout, res->hr_timeout) < 0)
 404                 pjdlog_errno(LOG_WARNING, "Unable to set connection timeout");
 405
 406         init_local(res);
 407         init_environment();
 408
 409         /*
 410          * Create the control thread before sending any event to the parent,
 411          * as we can deadlock when parent sends control request to worker,
 412          * but worker has no control thread started yet, so parent waits.
 413          * In the meantime worker sends an event to the parent, but parent
 414          * is unable to handle the event, because it waits for control
 415          * request response.
 416          */
 417         error = pthread_create(&td, NULL, ctrl_thread, res);
 418         assert(error == 0);
 419
 420         init_remote(res, nvin);
 421         event_send(res, EVENT_CONNECT);
 422
 423         error = pthread_create(&td, NULL, recv_thread, res);
 424         assert(error == 0);
 425         error = pthread_create(&td, NULL, disk_thread, res);
 426         assert(error == 0);
 427         (void)send_thread(res);
 428 }
 429
 430 static void
 431 reqlog(int loglevel, int debuglevel, int error, struct hio *hio, const char *fmt, ...)
 432 {
 433         char msg[1024];
 434         va_list ap;
 435         int len;
 436
 437         va_start(ap, fmt);
 438         len = vsnprintf(msg, sizeof(msg), fmt, ap);
 439         va_end(ap);
 440         if ((size_t)len < sizeof(msg)) {
 441                 switch (hio->hio_cmd) {
 442                 case HIO_READ:
 443                         (void)snprintf(msg + len, sizeof(msg) - len,
 444                             "READ(%ju, %ju).", (uintmax_t)hio->hio_offset,
 445                             (uintmax_t)hio->hio_length);
 446                         break;
 447                 case HIO_DELETE:
 448                         (void)snprintf(msg + len, sizeof(msg) - len,
 449                             "DELETE(%ju, %ju).", (uintmax_t)hio->hio_offset,
 450                             (uintmax_t)hio->hio_length);
 451                         break;
 452                 case HIO_FLUSH:
 453                         (void)snprintf(msg + len, sizeof(msg) - len, "FLUSH.");
 454                         break;
 455                 case HIO_WRITE:
 456                         (void)snprintf(msg + len, sizeof(msg) - len,
 457                             "WRITE(%ju, %ju).", (uintmax_t)hio->hio_offset,
 458                             (uintmax_t)hio->hio_length);
 459                         break;
 460                 case HIO_KEEPALIVE:
 461                         (void)snprintf(msg + len, sizeof(msg) - len, "KEEPALIVE.");
 462                         break;
 463                 default:
 464                         (void)snprintf(msg + len, sizeof(msg) - len,
 465                             "UNKNOWN(%u).", (unsigned int)hio->hio_cmd);
 466                         break;
 467                 }
 468         }
 469         pjdlog_common(loglevel, debuglevel, error, "%s", msg);
 470 }
 471
 472 static int
 473 requnpack(struct hast_resource *res, struct hio *hio)
 474 {
 475
 476         hio->hio_cmd = nv_get_uint8(hio->hio_nv, "cmd");
 477         if (hio->hio_cmd == 0) {
 478                 pjdlog_error("Header contains no 'cmd' field.");
 479                 hio->hio_error = EINVAL;
 480                 goto end;
 481         }
 482         switch (hio->hio_cmd) {
 483         case HIO_KEEPALIVE:
 484                 break;
 485         case HIO_READ:
 486         case HIO_WRITE:
 487         case HIO_DELETE:
 488                 hio->hio_offset = nv_get_uint64(hio->hio_nv, "offset");
 489                 if (nv_error(hio->hio_nv) != 0) {
 490                         pjdlog_error("Header is missing 'offset' field.");
 491                         hio->hio_error = EINVAL;
 492                         goto end;
 493                 }
 494                 hio->hio_length = nv_get_uint64(hio->hio_nv, "length");
 495                 if (nv_error(hio->hio_nv) != 0) {
 496                         pjdlog_error("Header is missing 'length' field.");
 497                         hio->hio_error = EINVAL;
 498                         goto end;
 499                 }
 500                 if (hio->hio_length == 0) {
 501                         pjdlog_error("Data length is zero.");
 502                         hio->hio_error = EINVAL;
 503                         goto end;
 504                 }
 505                 if (hio->hio_length > MAXPHYS) {
 506                         pjdlog_error("Data length is too large (%ju > %ju).",
 507                             (uintmax_t)hio->hio_length, (uintmax_t)MAXPHYS);
 508                         hio->hio_error = EINVAL;
 509                         goto end;
 510                 }
 511                 if ((hio->hio_offset % res->hr_local_sectorsize) != 0) {
 512                         pjdlog_error("Offset %ju is not multiple of sector size.",
 513                             (uintmax_t)hio->hio_offset);
 514                         hio->hio_error = EINVAL;
 515                         goto end;
 516                 }
 517                 if ((hio->hio_length % res->hr_local_sectorsize) != 0) {
 518                         pjdlog_error("Length %ju is not multiple of sector size.",
 519                             (uintmax_t)hio->hio_length);
 520                         hio->hio_error = EINVAL;
 521                         goto end;
 522                 }
 523                 if (hio->hio_offset + hio->hio_length >
 524                     (uint64_t)res->hr_datasize) {
 525                         pjdlog_error("Data offset is too large (%ju > %ju).",
 526                             (uintmax_t)(hio->hio_offset + hio->hio_length),
 527                             (uintmax_t)res->hr_datasize);
 528                         hio->hio_error = EINVAL;
 529                         goto end;
 530                 }
 531                 break;
 532         default:
 533                 pjdlog_error("Header contains invalid 'cmd' (%hhu).",
 534                     hio->hio_cmd);
 535                 hio->hio_error = EINVAL;
 536                 goto end;
 537         }
 538         hio->hio_error = 0;
 539 end:
 540         return (hio->hio_error);
 541 }
 542
 543 static __dead2 void
 544 secondary_exit(int exitcode, const char *fmt, ...)
 545 {
 546         va_list ap;
 547
 548         assert(exitcode != EX_OK);
 549         va_start(ap, fmt);
 550         pjdlogv_errno(LOG_ERR, fmt, ap);
 551         va_end(ap);
 552         event_send(gres, EVENT_DISCONNECT);
 553         exit(exitcode);
 554 }
 555
 556 /*
 557  * Thread receives requests from the primary node.
 558  */
 559 static void *
 560 recv_thread(void *arg)
 561 {
 562         struct hast_resource *res = arg;
 563         struct hio *hio;
 564
 565         for (;;) {
 566                 pjdlog_debug(2, "recv: Taking free request.");
 567                 QUEUE_TAKE(free, hio);
 568                 pjdlog_debug(2, "recv: (%p) Got request.", hio);
 569                 if (hast_proto_recv_hdr(res->hr_remotein, &hio->hio_nv) < 0) {
 570                         secondary_exit(EX_TEMPFAIL,
 571                             "Unable to receive request header");
 572                 }
 573                 if (requnpack(res, hio) != 0) {
 574                         pjdlog_debug(2,
 575                             "recv: (%p) Moving request to the send queue.",
 576                             hio);
 577                         QUEUE_INSERT(send, hio);
 578                         continue;
 579                 }
 580                 reqlog(LOG_DEBUG, 2, -1, hio,
 581                     "recv: (%p) Got request header: ", hio);
 582                 if (hio->hio_cmd == HIO_KEEPALIVE) {
 583                         pjdlog_debug(2,
 584                             "recv: (%p) Moving request to the free queue.",
 585                             hio);
 586                         nv_free(hio->hio_nv);
 587                         QUEUE_INSERT(free, hio);
 588                         continue;
 589                 } else if (hio->hio_cmd == HIO_WRITE) {
 590                         if (hast_proto_recv_data(res, res->hr_remotein,
 591                             hio->hio_nv, hio->hio_data, MAXPHYS) < 0) {
 592                                 secondary_exit(EX_TEMPFAIL,
 593                                     "Unable to receive request data");
 594                         }
 595                 }
 596                 pjdlog_debug(2, "recv: (%p) Moving request to the disk queue.",
 597                     hio);
 598                 QUEUE_INSERT(disk, hio);
 599         }
 600         /* NOTREACHED */
 601         return (NULL);
 602 }
 603
 604 /*
 605  * Thread reads from or writes to local component and also handles DELETE and
 606  * FLUSH requests.
 607  */
 608 static void *
 609 disk_thread(void *arg)
 610 {
 611         struct hast_resource *res = arg;
 612         struct hio *hio;
 613         ssize_t ret;
 614         bool clear_activemap;
 615
 616         clear_activemap = true;
 617
 618         for (;;) {
 619                 pjdlog_debug(2, "disk: Taking request.");
 620                 QUEUE_TAKE(disk, hio);
 621                 while (clear_activemap) {
 622                         unsigned char *map;
 623                         size_t mapsize;
 624
 625                         /*
 626                          * When first request is received, it means that primary
 627                          * already received our activemap, merged it and stored
 628                          * locally. We can now safely clear our activemap.
 629                          */
 630                         mapsize =
 631                             activemap_calc_ondisk_size(res->hr_local_mediasize -
 632                             METADATA_SIZE, res->hr_extentsize,
 633                             res->hr_local_sectorsize);
 634                         map = calloc(1, mapsize);
 635                         if (map == NULL) {
 636                                 pjdlog_warning("Unable to allocate memory to clear local activemap.");
 637                                 break;
 638                         }
 639                         if (pwrite(res->hr_localfd, map, mapsize,
 640                             METADATA_SIZE) != (ssize_t)mapsize) {
 641                                 pjdlog_errno(LOG_WARNING,
 642                                     "Unable to store cleared activemap");
 643                                 free(map);
 644                                 break;
 645                         }
 646                         free(map);
 647                         clear_activemap = false;
 648                         pjdlog_debug(1, "Local activemap cleared.");
 649                 }
 650                 reqlog(LOG_DEBUG, 2, -1, hio, "disk: (%p) Got request: ", hio);
 651                 /* Handle the actual request. */
 652                 switch (hio->hio_cmd) {
 653                 case HIO_READ:
 654                         ret = pread(res->hr_localfd, hio->hio_data,
 655                             hio->hio_length,
 656                             hio->hio_offset + res->hr_localoff);
 657                         if (ret < 0)
 658                                 hio->hio_error = errno;
 659                         else if (ret != (int64_t)hio->hio_length)
 660                                 hio->hio_error = EIO;
 661                         else
 662                                 hio->hio_error = 0;
 663                         break;
 664                 case HIO_WRITE:
 665                         ret = pwrite(res->hr_localfd, hio->hio_data,
 666                             hio->hio_length,
 667                             hio->hio_offset + res->hr_localoff);
 668                         if (ret < 0)
 669                                 hio->hio_error = errno;
 670                         else if (ret != (int64_t)hio->hio_length)
 671                                 hio->hio_error = EIO;
 672                         else
 673                                 hio->hio_error = 0;
 674                         break;
 675                 case HIO_DELETE:
 676                         ret = g_delete(res->hr_localfd,
 677                             hio->hio_offset + res->hr_localoff,
 678                             hio->hio_length);
 679                         if (ret < 0)
 680                                 hio->hio_error = errno;
 681                         else
 682                                 hio->hio_error = 0;
 683                         break;
 684                 case HIO_FLUSH:
 685                         ret = g_flush(res->hr_localfd);
 686                         if (ret < 0)
 687                                 hio->hio_error = errno;
 688                         else
 689                                 hio->hio_error = 0;
 690                         break;
 691                 }
 692                 if (hio->hio_error != 0) {
 693                         reqlog(LOG_ERR, 0, hio->hio_error, hio,
 694                             "Request failed: ");
 695                 }
 696                 pjdlog_debug(2, "disk: (%p) Moving request to the send queue.",
 697                     hio);
 698                 QUEUE_INSERT(send, hio);
 699         }
 700         /* NOTREACHED */
 701         return (NULL);
 702 }
 703
 704 /*
 705  * Thread sends requests back to primary node.
 706  */
 707 static void *
 708 send_thread(void *arg)
 709 {
 710         struct hast_resource *res = arg;
 711         struct nv *nvout;
 712         struct hio *hio;
 713         void *data;
 714         size_t length;
 715
 716         for (;;) {
 717                 pjdlog_debug(2, "send: Taking request.");
 718                 QUEUE_TAKE(send, hio);
 719                 reqlog(LOG_DEBUG, 2, -1, hio, "send: (%p) Got request: ", hio);
 720                 nvout = nv_alloc();
 721                 /* Copy sequence number. */
 722                 nv_add_uint64(nvout, nv_get_uint64(hio->hio_nv, "seq"), "seq");
 723                 switch (hio->hio_cmd) {
 724                 case HIO_READ:
 725                         if (hio->hio_error == 0) {
 726                                 data = hio->hio_data;
 727                                 length = hio->hio_length;
 728                                 break;
 729                         }
 730                         /*
 731                          * We send no data in case of an error.
 732                          */
 733                         /* FALLTHROUGH */
 734                 case HIO_DELETE:
 735                 case HIO_FLUSH:
 736                 case HIO_WRITE:
 737                         data = NULL;
 738                         length = 0;
 739                         break;
 740                 default:
 741                         abort();
 742                         break;
 743                 }
 744                 if (hio->hio_error != 0)
 745                         nv_add_int16(nvout, hio->hio_error, "error");
 746                 if (hast_proto_send(res, res->hr_remoteout, nvout, data,
 747                     length) < 0) {
 748                         secondary_exit(EX_TEMPFAIL, "Unable to send reply.");
 749                 }
 750                 nv_free(nvout);
 751                 pjdlog_debug(2, "send: (%p) Moving request to the free queue.",
 752                     hio);
 753                 nv_free(hio->hio_nv);
 754                 hio->hio_error = 0;
 755                 QUEUE_INSERT(free, hio);
 756         }
 757         /* NOTREACHED */
 758         return (NULL);
 759 }