]> CyberLeo.Net >> Repos - FreeBSD/stable/9.git/blob - contrib/bind9/lib/dns/journal.c
MFC r254651:
[FreeBSD/stable/9.git] / contrib / bind9 / lib / dns / journal.c
1 /*
2  * Copyright (C) 2004, 2005, 2007-2011, 2013  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2002  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: journal.c,v 1.120 2011/12/22 07:32:41 each Exp $ */
19
20 #include <config.h>
21
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <errno.h>
25
26 #include <isc/file.h>
27 #include <isc/mem.h>
28 #include <isc/stdio.h>
29 #include <isc/string.h>
30 #include <isc/util.h>
31
32 #include <dns/compress.h>
33 #include <dns/db.h>
34 #include <dns/dbiterator.h>
35 #include <dns/diff.h>
36 #include <dns/fixedname.h>
37 #include <dns/journal.h>
38 #include <dns/log.h>
39 #include <dns/rdataset.h>
40 #include <dns/rdatasetiter.h>
41 #include <dns/result.h>
42 #include <dns/soa.h>
43
44 /*! \file
45  * \brief Journaling.
46  *
47  * A journal file consists of
48  *
49  *   \li A fixed-size header of type journal_rawheader_t.
50  *
51  *   \li The index.  This is an unordered array of index entries
52  *     of type journal_rawpos_t giving the locations
53  *     of some arbitrary subset of the journal's addressable
54  *     transactions.  The index entries are used as hints to
55  *     speed up the process of locating a transaction with a given
56  *     serial number.  Unused index entries have an "offset"
57  *     field of zero.  The size of the index can vary between
58  *     journal files, but does not change during the lifetime
59  *     of a file.  The size can be zero.
60  *
61  *   \li The journal data.  This consists of one or more transactions.
62  *     Each transaction begins with a transaction header of type
63  *     journal_rawxhdr_t.  The transaction header is followed by a
64  *     sequence of RRs, similar in structure to an IXFR difference
65  *     sequence (RFC1995).  That is, the pre-transaction SOA,
66  *     zero or more other deleted RRs, the post-transaction SOA,
67  *     and zero or more other added RRs.  Unlike in IXFR, each RR
68  *     is prefixed with a 32-bit length.
69  *
70  *     The journal data part grows as new transactions are
71  *     appended to the file.  Only those transactions
72  *     whose serial number is current-(2^31-1) to current
73  *     are considered "addressable" and may be pointed
74  *     to from the header or index.  They may be preceded
75  *     by old transactions that are no longer addressable,
76  *     and they may be followed by transactions that were
77  *     appended to the journal but never committed by updating
78  *     the "end" position in the header.  The latter will
79  *     be overwritten when new transactions are added.
80  */
81 /*%
82  * When true, accept IXFR difference sequences where the
83  * SOA serial number does not change (BIND 8 sends such
84  * sequences).
85  */
86 static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
87
88 /**************************************************************************/
89 /*
90  * Miscellaneous utilities.
91  */
92
/*%
 * Leading arguments passed to isc_log_write() by the journal code:
 * log context, category and module.
 */
#define JOURNAL_COMMON_LOGARGS \
        dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*%
 * The common arguments followed by the debug log level 'n'.
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
        JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
98
/*%
 * Error-propagation helpers.  Each stores its argument in the local
 * variable 'result' and jumps to the local label 'failure' unless the
 * value is ISC_R_SUCCESS.
 *
 * It would be nonsensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code)                                              \
        do {                                                    \
                result = (code);                                \
                if (result != ISC_R_SUCCESS)                    \
                        goto failure;                           \
        } while (0)

/*% Same expansion as FAIL(); used for operations that may succeed. */
#define CHECK(op)       FAIL(op)
113
114 #define JOURNAL_SERIALSET       0x01U
115
116 static isc_result_t index_to_disk(dns_journal_t *);
117
118 static inline isc_uint32_t
119 decode_uint32(unsigned char *p) {
120         return ((p[0] << 24) +
121                 (p[1] << 16) +
122                 (p[2] <<  8) +
123                 (p[3] <<  0));
124 }
125
126 static inline void
127 encode_uint32(isc_uint32_t val, unsigned char *p) {
128         p[0] = (isc_uint8_t)(val >> 24);
129         p[1] = (isc_uint8_t)(val >> 16);
130         p[2] = (isc_uint8_t)(val >>  8);
131         p[3] = (isc_uint8_t)(val >>  0);
132 }
133
134 isc_result_t
135 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
136                       dns_diffop_t op, dns_difftuple_t **tp)
137 {
138         isc_result_t result;
139         dns_dbnode_t *node;
140         dns_rdataset_t rdataset;
141         dns_rdata_t rdata = DNS_RDATA_INIT;
142         dns_name_t *zonename;
143
144         zonename = dns_db_origin(db);
145
146         node = NULL;
147         result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
148         if (result != ISC_R_SUCCESS)
149                 goto nonode;
150
151         dns_rdataset_init(&rdataset);
152         result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
153                                      (isc_stdtime_t)0, &rdataset, NULL);
154         if (result != ISC_R_SUCCESS)
155                 goto freenode;
156
157         result = dns_rdataset_first(&rdataset);
158         if (result != ISC_R_SUCCESS)
159                 goto freenode;
160
161         dns_rdataset_current(&rdataset, &rdata);
162
163         result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
164                                       &rdata, tp);
165
166         dns_rdataset_disassociate(&rdataset);
167         dns_db_detachnode(db, &node);
168         return (result);
169
170  freenode:
171         dns_db_detachnode(db, &node);
172  nonode:
173         UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
174         return (result);
175 }
176
177 /* Journaling */
178
/*%
 * A "pointer" to a journal entry as it is laid out on disk.
 * The journal header uses these to locate the beginning and end of
 * the journal, and the journal index uses them to locate other
 * transactions.
 */
typedef struct {
        /*% SOA serial before update. */
        unsigned char   serial[4];
        /*%
         * Offset from beginning of file.
         *
         * XXXRTH  Should offset be 8 bytes?
         * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
         * XXXAG  ... but we will not be able to seek >2G anyway on many
         *            platforms as long as we are using fseek() rather
         *            than lseek().
         */
        unsigned char   offset[4];
} journal_rawpos_t;
196
197
/*%
 * Size of the on-disk journal header.  The size is fixed and leaves
 * some spare room for future extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */
203
204 /*%
205  * The on-disk representation of the journal header.
206  * All numbers are stored in big-endian order.
207  */
208 typedef union {
209         struct {
210                 /*% File format version ID. */
211                 unsigned char           format[16];
212                 /*% Position of the first addressable transaction */
213                 journal_rawpos_t        begin;
214                 /*% Position of the next (yet nonexistent) transaction. */
215                 journal_rawpos_t        end;
216                 /*% Number of index entries following the header. */
217                 unsigned char           index_size[4];
218                 /*% Source serial number. */
219                 unsigned char           sourceserial[4];
220                 unsigned char           flags;
221         } h;
222         /* Pad the header to a fixed size. */
223         unsigned char pad[JOURNAL_HEADER_SIZE];
224 } journal_rawheader_t;
225
/*%
 * The transaction header as it is laid out on disk.
 * Every transaction begins with one of these.
 */
typedef struct {
        /*% Size in bytes, excluding header. */
        unsigned char   size[4];
        /*% SOA serial before update. */
        unsigned char   serial0[4];
        /*% SOA serial after update. */
        unsigned char   serial1[4];
} journal_rawxhdr_t;
235
/*%
 * The RR header as it is laid out on disk.
 * Every RR begins with one of these.
 */
typedef struct {
        /*% Size in bytes, excluding header. */
        unsigned char   size[4];
} journal_rawrrhdr_t;
243
244 /*%
245  * The in-core representation of the journal header.
246  */
247 typedef struct {
248         isc_uint32_t    serial;
249         isc_offset_t    offset;
250 } journal_pos_t;
251
/*% A position is valid when its offset is nonzero. */
#define POS_VALID(pos)          ((pos).offset != 0)
/*% Mark a position as invalid by zeroing its offset and serial. */
#define POS_INVALIDATE(pos)     ((pos).offset = 0, (pos).serial = 0)
254
255 typedef struct {
256         unsigned char   format[16];
257         journal_pos_t   begin;
258         journal_pos_t   end;
259         isc_uint32_t    index_size;
260         isc_uint32_t    sourceserial;
261         isc_boolean_t   serialset;
262 } journal_header_t;
263
264 /*%
265  * The in-core representation of the transaction header.
266  */
267
268 typedef struct {
269         isc_uint32_t    size;
270         isc_uint32_t    serial0;
271         isc_uint32_t    serial1;
272 } journal_xhdr_t;
273
274 /*%
275  * The in-core representation of the RR header.
276  */
277 typedef struct {
278         isc_uint32_t    size;
279 } journal_rrhdr_t;
280
281
282 /*%
283  * Initial contents to store in the header of a newly created
284  * journal file.
285  *
286  * The header starts with the magic string ";BIND LOG V9\n"
287  * to identify the file as a BIND 9 journal file.  An ASCII
288  * identification string is used rather than a binary magic
289  * number to be consistent with BIND 8 (BIND 8 journal files
290  * are ASCII text files).
291  */
292
293 static journal_header_t
294 initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0 };
295
/*% True when the header's begin and end positions have the same offset. */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
297
/*%
 * State of a journal object.  journal_open() starts a journal in
 * JOURNAL_STATE_INVALID and, once the file has been opened and its
 * header read, moves it to JOURNAL_STATE_WRITE or JOURNAL_STATE_READ
 * according to its 'write' argument.
 */
typedef enum {
        JOURNAL_STATE_INVALID,
        JOURNAL_STATE_READ,
        JOURNAL_STATE_WRITE,
        JOURNAL_STATE_TRANSACTION,
        JOURNAL_STATE_INLINE
} journal_state_t;
305
306 struct dns_journal {
307         unsigned int            magic;          /*%< JOUR */
308         isc_mem_t               *mctx;          /*%< Memory context */
309         journal_state_t         state;
310         const char              *filename;      /*%< Journal file name */
311         FILE *                  fp;             /*%< File handle */
312         isc_offset_t            offset;         /*%< Current file offset */
313         journal_header_t        header;         /*%< In-core journal header */
314         unsigned char           *rawindex;      /*%< In-core buffer for journal index in on-disk format */
315         journal_pos_t           *index;         /*%< In-core journal index */
316
317         /*% Current transaction state (when writing). */
318         struct {
319                 unsigned int    n_soa;          /*%< Number of SOAs seen */
320                 journal_pos_t   pos[2];         /*%< Begin/end position */
321         } x;
322
323         /*% Iteration state (when reading). */
324         struct {
325                 /* These define the part of the journal we iterate over. */
326                 journal_pos_t bpos;             /*%< Position before first, */
327                 journal_pos_t epos;             /*%< and after last transaction */
328                 /* The rest is iterator state. */
329                 isc_uint32_t current_serial;    /*%< Current SOA serial */
330                 isc_buffer_t source;            /*%< Data from disk */
331                 isc_buffer_t target;            /*%< Data from _fromwire check */
332                 dns_decompress_t dctx;          /*%< Dummy decompression ctx */
333                 dns_name_t name;                /*%< Current domain name */
334                 dns_rdata_t rdata;              /*%< Current rdata */
335                 isc_uint32_t ttl;               /*%< Current TTL */
336                 unsigned int xsize;             /*%< Size of transaction data */
337                 unsigned int xpos;              /*%< Current position in it */
338                 isc_result_t result;            /*%< Result of last call */
339         } it;
340 };
341
/*% Magic number ('JOUR') stored in the 'magic' field of a journal. */
#define DNS_JOURNAL_MAGIC       ISC_MAGIC('J', 'O', 'U', 'R')
/*% Check that 't' carries the journal magic number. */
#define DNS_JOURNAL_VALID(t)    ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
344
345 static void
346 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
347         cooked->serial = decode_uint32(raw->serial);
348         cooked->offset = decode_uint32(raw->offset);
349 }
350
351 static void
352 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
353         encode_uint32(cooked->serial, raw->serial);
354         encode_uint32(cooked->offset, raw->offset);
355 }
356
357 static void
358 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
359         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
360         memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
361         journal_pos_decode(&raw->h.begin, &cooked->begin);
362         journal_pos_decode(&raw->h.end, &cooked->end);
363         cooked->index_size = decode_uint32(raw->h.index_size);
364         cooked->sourceserial = decode_uint32(raw->h.sourceserial);
365         cooked->serialset = ISC_TF(raw->h.flags & JOURNAL_SERIALSET);
366 }
367
368 static void
369 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
370         unsigned char flags = 0;
371
372         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
373         memset(raw->pad, 0, sizeof(raw->pad));
374         memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
375         journal_pos_encode(&raw->h.begin, &cooked->begin);
376         journal_pos_encode(&raw->h.end, &cooked->end);
377         encode_uint32(cooked->index_size, raw->h.index_size);
378         encode_uint32(cooked->sourceserial, raw->h.sourceserial);
379         if (cooked->serialset)
380                 flags |= JOURNAL_SERIALSET;
381         raw->h.flags = flags;
382 }
383
384 /*
385  * Journal file I/O subroutines, with error checking and reporting.
386  */
387 static isc_result_t
388 journal_seek(dns_journal_t *j, isc_uint32_t offset) {
389         isc_result_t result;
390         result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
391         if (result != ISC_R_SUCCESS) {
392                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
393                               "%s: seek: %s", j->filename,
394                               isc_result_totext(result));
395                 return (ISC_R_UNEXPECTED);
396         }
397         j->offset = offset;
398         return (ISC_R_SUCCESS);
399 }
400
401 static isc_result_t
402 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
403         isc_result_t result;
404
405         result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
406         if (result != ISC_R_SUCCESS) {
407                 if (result == ISC_R_EOF)
408                         return (ISC_R_NOMORE);
409                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
410                               "%s: read: %s",
411                               j->filename, isc_result_totext(result));
412                 return (ISC_R_UNEXPECTED);
413         }
414         j->offset += nbytes;
415         return (ISC_R_SUCCESS);
416 }
417
418 static isc_result_t
419 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
420         isc_result_t result;
421
422         result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
423         if (result != ISC_R_SUCCESS) {
424                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
425                               "%s: write: %s",
426                               j->filename, isc_result_totext(result));
427                 return (ISC_R_UNEXPECTED);
428         }
429         j->offset += nbytes;
430         return (ISC_R_SUCCESS);
431 }
432
433 static isc_result_t
434 journal_fsync(dns_journal_t *j) {
435         isc_result_t result;
436         result = isc_stdio_flush(j->fp);
437         if (result != ISC_R_SUCCESS) {
438                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
439                               "%s: flush: %s",
440                               j->filename, isc_result_totext(result));
441                 return (ISC_R_UNEXPECTED);
442         }
443         result = isc_stdio_sync(j->fp);
444         if (result != ISC_R_SUCCESS) {
445                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
446                               "%s: fsync: %s",
447                               j->filename, isc_result_totext(result));
448                 return (ISC_R_UNEXPECTED);
449         }
450         return (ISC_R_SUCCESS);
451 }
452
453 /*
454  * Read/write a transaction header at the current file position.
455  */
456
457 static isc_result_t
458 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
459         journal_rawxhdr_t raw;
460         isc_result_t result;
461         result = journal_read(j, &raw, sizeof(raw));
462         if (result != ISC_R_SUCCESS)
463                 return (result);
464         xhdr->size = decode_uint32(raw.size);
465         xhdr->serial0 = decode_uint32(raw.serial0);
466         xhdr->serial1 = decode_uint32(raw.serial1);
467         return (ISC_R_SUCCESS);
468 }
469
470 static isc_result_t
471 journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
472                    isc_uint32_t serial0, isc_uint32_t serial1)
473 {
474         journal_rawxhdr_t raw;
475         encode_uint32(size, raw.size);
476         encode_uint32(serial0, raw.serial0);
477         encode_uint32(serial1, raw.serial1);
478         return (journal_write(j, &raw, sizeof(raw)));
479 }
480
481
482 /*
483  * Read an RR header at the current file position.
484  */
485
486 static isc_result_t
487 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
488         journal_rawrrhdr_t raw;
489         isc_result_t result;
490         result = journal_read(j, &raw, sizeof(raw));
491         if (result != ISC_R_SUCCESS)
492                 return (result);
493         rrhdr->size = decode_uint32(raw.size);
494         return (ISC_R_SUCCESS);
495 }
496
497 static isc_result_t
498 journal_file_create(isc_mem_t *mctx, const char *filename) {
499         FILE *fp = NULL;
500         isc_result_t result;
501         journal_header_t header;
502         journal_rawheader_t rawheader;
503         int index_size = 56; /* XXX configurable */
504         int size;
505         void *mem; /* Memory for temporary index image. */
506
507         INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
508
509         result = isc_stdio_open(filename, "wb", &fp);
510         if (result != ISC_R_SUCCESS) {
511                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
512                               "%s: create: %s",
513                               filename, isc_result_totext(result));
514                 return (ISC_R_UNEXPECTED);
515         }
516
517         header = initial_journal_header;
518         header.index_size = index_size;
519         journal_header_encode(&header, &rawheader);
520
521         size = sizeof(journal_rawheader_t) +
522                 index_size * sizeof(journal_rawpos_t);
523
524         mem = isc_mem_get(mctx, size);
525         if (mem == NULL) {
526                 (void)isc_stdio_close(fp);
527                 (void)isc_file_remove(filename);
528                 return (ISC_R_NOMEMORY);
529         }
530         memset(mem, 0, size);
531         memcpy(mem, &rawheader, sizeof(rawheader));
532
533         result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
534         if (result != ISC_R_SUCCESS) {
535                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
536                                  "%s: write: %s",
537                                  filename, isc_result_totext(result));
538                 (void)isc_stdio_close(fp);
539                 (void)isc_file_remove(filename);
540                 isc_mem_put(mctx, mem, size);
541                 return (ISC_R_UNEXPECTED);
542         }
543         isc_mem_put(mctx, mem, size);
544
545         result = isc_stdio_close(fp);
546         if (result != ISC_R_SUCCESS) {
547                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
548                                  "%s: close: %s",
549                                  filename, isc_result_totext(result));
550                 (void)isc_file_remove(filename);
551                 return (ISC_R_UNEXPECTED);
552         }
553
554         return (ISC_R_SUCCESS);
555 }
556
557 static isc_result_t
558 journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
559              isc_boolean_t create, dns_journal_t **journalp)
560 {
561         FILE *fp = NULL;
562         isc_result_t result;
563         journal_rawheader_t rawheader;
564         dns_journal_t *j;
565
566         INSIST(journalp != NULL && *journalp == NULL);
567         j = isc_mem_get(mctx, sizeof(*j));
568         if (j == NULL)
569                 return (ISC_R_NOMEMORY);
570
571         j->mctx = NULL;
572         isc_mem_attach(mctx, &j->mctx);
573         j->state = JOURNAL_STATE_INVALID;
574         j->fp = NULL;
575         j->filename = filename;
576         j->index = NULL;
577         j->rawindex = NULL;
578
579         result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
580
581         if (result == ISC_R_FILENOTFOUND) {
582                 if (create) {
583                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
584                                       "journal file %s does not exist, "
585                                       "creating it", j->filename);
586                         CHECK(journal_file_create(mctx, filename));
587                         /*
588                          * Retry.
589                          */
590                         result = isc_stdio_open(j->filename, "rb+", &fp);
591                 } else {
592                         FAIL(ISC_R_NOTFOUND);
593                 }
594         }
595         if (result != ISC_R_SUCCESS) {
596                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
597                               "%s: open: %s",
598                               j->filename, isc_result_totext(result));
599                 FAIL(ISC_R_UNEXPECTED);
600         }
601
602         j->fp = fp;
603
604         /*
605          * Set magic early so that seek/read can succeed.
606          */
607         j->magic = DNS_JOURNAL_MAGIC;
608
609         CHECK(journal_seek(j, 0));
610         CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
611
612         if (memcmp(rawheader.h.format, initial_journal_header.format,
613                    sizeof(initial_journal_header.format)) != 0) {
614                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
615                                  "%s: journal format not recognized",
616                                  j->filename);
617                 FAIL(ISC_R_UNEXPECTED);
618         }
619         journal_header_decode(&rawheader, &j->header);
620
621         /*
622          * If there is an index, read the raw index into a dynamically
623          * allocated buffer and then convert it into a cooked index.
624          */
625         if (j->header.index_size != 0) {
626                 unsigned int i;
627                 unsigned int rawbytes;
628                 unsigned char *p;
629
630                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
631                 j->rawindex = isc_mem_get(mctx, rawbytes);
632                 if (j->rawindex == NULL)
633                         FAIL(ISC_R_NOMEMORY);
634
635                 CHECK(journal_read(j, j->rawindex, rawbytes));
636
637                 j->index = isc_mem_get(mctx, j->header.index_size *
638                                        sizeof(journal_pos_t));
639                 if (j->index == NULL)
640                         FAIL(ISC_R_NOMEMORY);
641
642                 p = j->rawindex;
643                 for (i = 0; i < j->header.index_size; i++) {
644                         j->index[i].serial = decode_uint32(p);
645                         p += 4;
646                         j->index[i].offset = decode_uint32(p);
647                         p += 4;
648                 }
649                 INSIST(p == j->rawindex + rawbytes);
650         }
651         j->offset = -1; /* Invalid, must seek explicitly. */
652
653         /*
654          * Initialize the iterator.
655          */
656         dns_name_init(&j->it.name, NULL);
657         dns_rdata_init(&j->it.rdata);
658
659         /*
660          * Set up empty initial buffers for unchecked and checked
661          * wire format RR data.  They will be reallocated
662          * later.
663          */
664         isc_buffer_init(&j->it.source, NULL, 0);
665         isc_buffer_init(&j->it.target, NULL, 0);
666         dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
667
668         j->state =
669                 write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
670
671         *journalp = j;
672         return (ISC_R_SUCCESS);
673
674  failure:
675         j->magic = 0;
676         if (j->index != NULL) {
677                 isc_mem_put(j->mctx, j->index, j->header.index_size *
678                             sizeof(journal_rawpos_t));
679                 j->index = NULL;
680         }
681         if (j->fp != NULL)
682                 (void)isc_stdio_close(j->fp);
683         isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
684         return (result);
685 }
686
687 isc_result_t
688 dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
689                  dns_journal_t **journalp)
690 {
691         isc_result_t result;
692         int namelen;
693         char backup[1024];
694         isc_boolean_t write, create;
695
696         create = ISC_TF(mode & DNS_JOURNAL_CREATE);
697         write = ISC_TF(mode & (DNS_JOURNAL_WRITE|DNS_JOURNAL_CREATE));
698
699         result = journal_open(mctx, filename, write, create, journalp);
700         if (result == ISC_R_NOTFOUND) {
701                 namelen = strlen(filename);
702                 if (namelen > 4 && strcmp(filename + namelen - 4, ".jnl") == 0)
703                         namelen -= 4;
704
705                 result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
706                                            namelen, filename);
707                 if (result != ISC_R_SUCCESS)
708                         return (result);
709                 result = journal_open(mctx, backup, write, write, journalp);
710         }
711         return (result);
712 }
713
714 /*
715  * A comparison function defining the sorting order for
716  * entries in the IXFR-style journal file.
717  *
718  * The IXFR format requires that deletions are sorted before
719  * additions, and within either one, SOA records are sorted
720  * before others.
721  *
722  * Also sort the non-SOA records by type as a courtesy to the
723  * server receiving the IXFR - it may help reduce the amount of
724  * rdataset merging it has to do.
725  */
726 static int
727 ixfr_order(const void *av, const void *bv) {
728         dns_difftuple_t const * const *ap = av;
729         dns_difftuple_t const * const *bp = bv;
730         dns_difftuple_t const *a = *ap;
731         dns_difftuple_t const *b = *bp;
732         int r;
733         int bop = 0, aop = 0;
734
735         switch (a->op) {
736         case DNS_DIFFOP_DEL:
737         case DNS_DIFFOP_DELRESIGN:
738                 aop = 1;
739                 break;
740         case DNS_DIFFOP_ADD:
741         case DNS_DIFFOP_ADDRESIGN:
742                 aop = 0;
743                 break;
744         default:
745                 INSIST(0);
746         }
747
748         switch (b->op) {
749         case DNS_DIFFOP_DEL:
750         case DNS_DIFFOP_DELRESIGN:
751                 bop = 1;
752                 break;
753         case DNS_DIFFOP_ADD:
754         case DNS_DIFFOP_ADDRESIGN:
755                 bop = 0;
756                 break;
757         default:
758                 INSIST(0);
759         }
760
761         r = bop - aop;
762         if (r != 0)
763                 return (r);
764
765         r = (b->rdata.type == dns_rdatatype_soa) -
766                 (a->rdata.type == dns_rdatatype_soa);
767         if (r != 0)
768                 return (r);
769
770         r = (a->rdata.type - b->rdata.type);
771         return (r);
772 }
773
774 /*
775  * Advance '*pos' to the next journal transaction.
776  *
777  * Requires:
778  *      *pos refers to a valid journal transaction.
779  *
780  * Ensures:
781  *      When ISC_R_SUCCESS is returned,
782  *      *pos refers to the next journal transaction.
783  *
784  * Returns one of:
785  *
786  *    ISC_R_SUCCESS
787  *    ISC_R_NOMORE      *pos pointed at the last transaction
788  *    Other results due to file errors are possible.
789  */
790 static isc_result_t
791 journal_next(dns_journal_t *j, journal_pos_t *pos) {
792         isc_result_t result;
793         journal_xhdr_t xhdr;
794         REQUIRE(DNS_JOURNAL_VALID(j));
795
796         result = journal_seek(j, pos->offset);
797         if (result != ISC_R_SUCCESS)
798                 return (result);
799
800         if (pos->serial == j->header.end.serial)
801                 return (ISC_R_NOMORE);
802         /*
803          * Read the header of the current transaction.
804          * This will return ISC_R_NOMORE if we are at EOF.
805          */
806         result = journal_read_xhdr(j, &xhdr);
807         if (result != ISC_R_SUCCESS)
808                 return (result);
809
810         /*
811          * Check serial number consistency.
812          */
813         if (xhdr.serial0 != pos->serial) {
814                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
815                               "%s: journal file corrupt: "
816                               "expected serial %u, got %u",
817                               j->filename, pos->serial, xhdr.serial0);
818                 return (ISC_R_UNEXPECTED);
819         }
820
821         /*
822          * Check for offset wraparound.
823          */
824         if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
825             < pos->offset) {
826                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
827                               "%s: offset too large", j->filename);
828                 return (ISC_R_UNEXPECTED);
829         }
830
831         pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
832         pos->serial = xhdr.serial1;
833         return (ISC_R_SUCCESS);
834 }
835
836 /*
837  * If the index of the journal 'j' contains an entry "better"
838  * than '*best_guess', replace '*best_guess' with it.
839  *
840  * "Better" means having a serial number closer to 'serial'
841  * but not greater than 'serial'.
842  */
843 static void
844 index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
845         unsigned int i;
846         if (j->index == NULL)
847                 return;
848         for (i = 0; i < j->header.index_size; i++) {
849                 if (POS_VALID(j->index[i]) &&
850                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
851                     DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
852                         *best_guess = j->index[i];
853         }
854 }
855
856 /*
857  * Add a new index entry.  If there is no room, make room by removing
858  * the odd-numbered entries and compacting the others into the first
859  * half of the index.  This decimates old index entries exponentially
860  * over time, so that the index always contains a much larger fraction
861  * of recent serial numbers than of old ones.  This is deliberate -
862  * most index searches are for outgoing IXFR, and IXFR tends to request
863  * recent versions more often than old ones.
864  */
865 static void
866 index_add(dns_journal_t *j, journal_pos_t *pos) {
867         unsigned int i;
868         if (j->index == NULL)
869                 return;
870         /*
871          * Search for a vacant position.
872          */
873         for (i = 0; i < j->header.index_size; i++) {
874                 if (! POS_VALID(j->index[i]))
875                         break;
876         }
877         if (i == j->header.index_size) {
878                 unsigned int k = 0;
879                 /*
880                  * Found no vacant position.  Make some room.
881                  */
882                 for (i = 0; i < j->header.index_size; i += 2) {
883                         j->index[k++] = j->index[i];
884                 }
885                 i = k; /* 'i' identifies the first vacant position. */
886                 while (k < j->header.index_size) {
887                         POS_INVALIDATE(j->index[k]);
888                         k++;
889                 }
890         }
891         INSIST(i < j->header.index_size);
892         INSIST(! POS_VALID(j->index[i]));
893
894         /*
895          * Store the new index entry.
896          */
897         j->index[i] = *pos;
898 }
899
900 /*
901  * Invalidate any existing index entries that could become
902  * ambiguous when a new transaction with number 'serial' is added.
903  */
904 static void
905 index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
906         unsigned int i;
907         if (j->index == NULL)
908                 return;
909         for (i = 0; i < j->header.index_size; i++) {
910                 if (! DNS_SERIAL_GT(serial, j->index[i].serial))
911                         POS_INVALIDATE(j->index[i]);
912         }
913 }
914
915 /*
916  * Try to find a transaction with initial serial number 'serial'
917  * in the journal 'j'.
918  *
919  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
920  *
921  * If 'serial' is current (= the ending serial number of the
922  * last transaction in the journal), set '*pos' to
923  * the position immediately following the last transaction and
924  * return ISC_R_SUCCESS.
925  *
926  * If 'serial' is within the range of addressable serial numbers
927  * covered by the journal but that particular serial number is missing
928  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
929  *
930  * If 'serial' is outside the range of addressable serial numbers
931  * covered by the journal, return ISC_R_RANGE.
932  *
933  */
934 static isc_result_t
935 journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
936         isc_result_t result;
937         journal_pos_t current_pos;
938         REQUIRE(DNS_JOURNAL_VALID(j));
939
940         if (DNS_SERIAL_GT(j->header.begin.serial, serial))
941                 return (ISC_R_RANGE);
942         if (DNS_SERIAL_GT(serial, j->header.end.serial))
943                 return (ISC_R_RANGE);
944         if (serial == j->header.end.serial) {
945                 *pos = j->header.end;
946                 return (ISC_R_SUCCESS);
947         }
948
949         current_pos = j->header.begin;
950         index_find(j, serial, &current_pos);
951
952         while (current_pos.serial != serial) {
953                 if (DNS_SERIAL_GT(current_pos.serial, serial))
954                         return (ISC_R_NOTFOUND);
955                 result = journal_next(j, &current_pos);
956                 if (result != ISC_R_SUCCESS)
957                         return (result);
958         }
959         *pos = current_pos;
960         return (ISC_R_SUCCESS);
961 }
962
963 isc_result_t
964 dns_journal_begin_transaction(dns_journal_t *j) {
965         isc_uint32_t offset;
966         isc_result_t result;
967         journal_rawxhdr_t hdr;
968
969         REQUIRE(DNS_JOURNAL_VALID(j));
970         REQUIRE(j->state == JOURNAL_STATE_WRITE ||
971                 j->state == JOURNAL_STATE_INLINE);
972
973         /*
974          * Find the file offset where the new transaction should
975          * be written, and seek there.
976          */
977         if (JOURNAL_EMPTY(&j->header)) {
978                 offset = sizeof(journal_rawheader_t) +
979                         j->header.index_size * sizeof(journal_rawpos_t);
980         } else {
981                 offset = j->header.end.offset;
982         }
983         j->x.pos[0].offset = offset;
984         j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
985         j->x.n_soa = 0;
986
987         CHECK(journal_seek(j, offset));
988
989         /*
990          * Write a dummy transaction header of all zeroes to reserve
991          * space.  It will be filled in when the transaction is
992          * finished.
993          */
994         memset(&hdr, 0, sizeof(hdr));
995         CHECK(journal_write(j, &hdr, sizeof(hdr)));
996         j->x.pos[1].offset = j->offset;
997
998         j->state = JOURNAL_STATE_TRANSACTION;
999         result = ISC_R_SUCCESS;
1000  failure:
1001         return (result);
1002 }
1003
1004 isc_result_t
1005 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
1006         dns_difftuple_t *t;
1007         isc_buffer_t buffer;
1008         void *mem = NULL;
1009         unsigned int size;
1010         isc_result_t result;
1011         isc_region_t used;
1012
1013         REQUIRE(DNS_DIFF_VALID(diff));
1014         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1015
1016         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
1017         (void)dns_diff_print(diff, NULL);
1018
1019         /*
1020          * Pass 1: determine the buffer size needed, and
1021          * keep track of SOA serial numbers.
1022          */
1023         size = 0;
1024         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1025              t = ISC_LIST_NEXT(t, link))
1026         {
1027                 if (t->rdata.type == dns_rdatatype_soa) {
1028                         if (j->x.n_soa < 2)
1029                                 j->x.pos[j->x.n_soa].serial =
1030                                         dns_soa_getserial(&t->rdata);
1031                         j->x.n_soa++;
1032                 }
1033                 size += sizeof(journal_rawrrhdr_t);
1034                 size += t->name.length; /* XXX should have access macro? */
1035                 size += 10;
1036                 size += t->rdata.length;
1037         }
1038
1039         mem = isc_mem_get(j->mctx, size);
1040         if (mem == NULL)
1041                 return (ISC_R_NOMEMORY);
1042
1043         isc_buffer_init(&buffer, mem, size);
1044
1045         /*
1046          * Pass 2.  Write RRs to buffer.
1047          */
1048         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1049              t = ISC_LIST_NEXT(t, link))
1050         {
1051                 /*
1052                  * Write the RR header.
1053                  */
1054                 isc_buffer_putuint32(&buffer, t->name.length + 10 +
1055                                      t->rdata.length);
1056                 /*
1057                  * Write the owner name, RR header, and RR data.
1058                  */
1059                 isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1060                 isc_buffer_putuint16(&buffer, t->rdata.type);
1061                 isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1062                 isc_buffer_putuint32(&buffer, t->ttl);
1063                 INSIST(t->rdata.length < 65536);
1064                 isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1065                 INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1066                 isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1067         }
1068
1069         isc_buffer_usedregion(&buffer, &used);
1070         INSIST(used.length == size);
1071
1072         j->x.pos[1].offset += used.length;
1073
1074         /*
1075          * Write the buffer contents to the journal file.
1076          */
1077         CHECK(journal_write(j, used.base, used.length));
1078
1079         result = ISC_R_SUCCESS;
1080
1081  failure:
1082         if (mem != NULL)
1083                 isc_mem_put(j->mctx, mem, size);
1084         return (result);
1085
1086 }
1087
1088 isc_result_t
1089 dns_journal_commit(dns_journal_t *j) {
1090         isc_result_t result;
1091         journal_rawheader_t rawheader;
1092
1093         REQUIRE(DNS_JOURNAL_VALID(j));
1094         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
1095                 j->state == JOURNAL_STATE_INLINE);
1096
1097         /*
1098          * Just write out a updated header.
1099          */
1100         if (j->state == JOURNAL_STATE_INLINE) {
1101                 CHECK(journal_fsync(j));
1102                 journal_header_encode(&j->header, &rawheader);
1103                 CHECK(journal_seek(j, 0));
1104                 CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1105                 CHECK(journal_fsync(j));
1106                 j->state = JOURNAL_STATE_WRITE;
1107                 return (ISC_R_SUCCESS);
1108         }
1109
1110         /*
1111          * Perform some basic consistency checks.
1112          */
1113         if (j->x.n_soa != 2) {
1114                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1115                               "%s: malformed transaction: %d SOAs",
1116                               j->filename, j->x.n_soa);
1117                 return (ISC_R_UNEXPECTED);
1118         }
1119         if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1120                (bind8_compat &&
1121                 j->x.pos[1].serial == j->x.pos[0].serial)))
1122         {
1123                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1124                               "%s: malformed transaction: serial number "
1125                               "would decrease", j->filename);
1126                 return (ISC_R_UNEXPECTED);
1127         }
1128         if (! JOURNAL_EMPTY(&j->header)) {
1129                 if (j->x.pos[0].serial != j->header.end.serial) {
1130                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1131                                          "malformed transaction: "
1132                                          "%s last serial %u != "
1133                                          "transaction first serial %u",
1134                                          j->filename,
1135                                          j->header.end.serial,
1136                                          j->x.pos[0].serial);
1137                         return (ISC_R_UNEXPECTED);
1138                 }
1139         }
1140
1141         /*
1142          * Some old journal entries may become non-addressable
1143          * when we increment the current serial number.  Purge them
1144          * by stepping header.begin forward to the first addressable
1145          * transaction.  Also purge them from the index.
1146          */
1147         if (! JOURNAL_EMPTY(&j->header)) {
1148                 while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1149                                        j->header.begin.serial)) {
1150                         CHECK(journal_next(j, &j->header.begin));
1151                 }
1152                 index_invalidate(j, j->x.pos[1].serial);
1153         }
1154 #ifdef notyet
1155         if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1156                 force_dump(...);
1157         }
1158 #endif
1159
1160         /*
1161          * Commit the transaction data to stable storage.
1162          */
1163         CHECK(journal_fsync(j));
1164
1165         if (j->state == JOURNAL_STATE_TRANSACTION) {
1166                 isc_offset_t offset;
1167                 offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
1168                                  sizeof(journal_rawxhdr_t);
1169                 /*
1170                  * Update the transaction header.
1171                  */
1172                 CHECK(journal_seek(j, j->x.pos[0].offset));
1173                 CHECK(journal_write_xhdr(j, offset, j->x.pos[0].serial,
1174                                          j->x.pos[1].serial));
1175         }
1176
1177         /*
1178          * Update the journal header.
1179          */
1180         if (JOURNAL_EMPTY(&j->header))
1181                 j->header.begin = j->x.pos[0];
1182         j->header.end = j->x.pos[1];
1183         journal_header_encode(&j->header, &rawheader);
1184         CHECK(journal_seek(j, 0));
1185         CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1186
1187         /*
1188          * Update the index.
1189          */
1190         index_add(j, &j->x.pos[0]);
1191
1192         /*
1193          * Convert the index into on-disk format and write
1194          * it to disk.
1195          */
1196         CHECK(index_to_disk(j));
1197
1198         /*
1199          * Commit the header to stable storage.
1200          */
1201         CHECK(journal_fsync(j));
1202
1203         /*
1204          * We no longer have a transaction open.
1205          */
1206         j->state = JOURNAL_STATE_WRITE;
1207
1208         result = ISC_R_SUCCESS;
1209
1210  failure:
1211         return (result);
1212 }
1213
1214 isc_result_t
1215 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1216         isc_result_t result;
1217         CHECK(dns_diff_sort(diff, ixfr_order));
1218         CHECK(dns_journal_begin_transaction(j));
1219         CHECK(dns_journal_writediff(j, diff));
1220         CHECK(dns_journal_commit(j));
1221         result = ISC_R_SUCCESS;
1222  failure:
1223         return (result);
1224 }
1225
1226 void
1227 dns_journal_destroy(dns_journal_t **journalp) {
1228         dns_journal_t *j = *journalp;
1229         REQUIRE(DNS_JOURNAL_VALID(j));
1230
1231         j->it.result = ISC_R_FAILURE;
1232         dns_name_invalidate(&j->it.name);
1233         dns_decompress_invalidate(&j->it.dctx);
1234         if (j->rawindex != NULL)
1235                 isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1236                             sizeof(journal_rawpos_t));
1237         if (j->index != NULL)
1238                 isc_mem_put(j->mctx, j->index, j->header.index_size *
1239                             sizeof(journal_pos_t));
1240         if (j->it.target.base != NULL)
1241                 isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1242         if (j->it.source.base != NULL)
1243                 isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1244
1245         if (j->fp != NULL)
1246                 (void)isc_stdio_close(j->fp);
1247         j->magic = 0;
1248         isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
1249         *journalp = NULL;
1250 }
1251
1252 /*
1253  * Roll the open journal 'j' into the database 'db'.
1254  * A new database version will be created.
1255  */
1256
1257 /* XXX Share code with incoming IXFR? */
1258
1259 static isc_result_t
1260 roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options,
1261              isc_uint32_t resign)
1262 {
1263         isc_buffer_t source;            /* Transaction data from disk */
1264         isc_buffer_t target;            /* Ditto after _fromwire check */
1265         isc_uint32_t db_serial;         /* Database SOA serial */
1266         isc_uint32_t end_serial;        /* Last journal SOA serial */
1267         isc_result_t result;
1268         dns_dbversion_t *ver = NULL;
1269         journal_pos_t pos;
1270         dns_diff_t diff;
1271         unsigned int n_soa = 0;
1272         unsigned int n_put = 0;
1273         dns_diffop_t op;
1274
1275         REQUIRE(DNS_JOURNAL_VALID(j));
1276         REQUIRE(DNS_DB_VALID(db));
1277
1278         dns_diff_init(j->mctx, &diff);
1279         diff.resign = resign;
1280
1281         /*
1282          * Set up empty initial buffers for unchecked and checked
1283          * wire format transaction data.  They will be reallocated
1284          * later.
1285          */
1286         isc_buffer_init(&source, NULL, 0);
1287         isc_buffer_init(&target, NULL, 0);
1288
1289         /*
1290          * Create the new database version.
1291          */
1292         CHECK(dns_db_newversion(db, &ver));
1293
1294         /*
1295          * Get the current database SOA serial number.
1296          */
1297         CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1298
1299         /*
1300          * Locate a journal entry for the current database serial.
1301          */
1302         CHECK(journal_find(j, db_serial, &pos));
1303         /*
1304          * XXX do more drastic things, like marking zone stale,
1305          * if this fails?
1306          */
1307         /*
1308          * XXXRTH  The zone code should probably mark the zone as bad and
1309          *         scream loudly into the log if this is a dynamic update
1310          *         log reply that failed.
1311          */
1312
1313         end_serial = dns_journal_last_serial(j);
1314         if (db_serial == end_serial)
1315                 CHECK(DNS_R_UPTODATE);
1316
1317         CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1318
1319         for (result = dns_journal_first_rr(j);
1320              result == ISC_R_SUCCESS;
1321              result = dns_journal_next_rr(j))
1322         {
1323                 dns_name_t *name;
1324                 isc_uint32_t ttl;
1325                 dns_rdata_t *rdata;
1326                 dns_difftuple_t *tuple = NULL;
1327
1328                 name = NULL;
1329                 rdata = NULL;
1330                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1331
1332                 if (rdata->type == dns_rdatatype_soa) {
1333                         n_soa++;
1334                         if (n_soa == 2)
1335                                 db_serial = j->it.current_serial;
1336                 }
1337
1338                 if (n_soa == 3)
1339                         n_soa = 1;
1340                 if (n_soa == 0) {
1341                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1342                                          "%s: journal file corrupt: missing "
1343                                          "initial SOA", j->filename);
1344                         FAIL(ISC_R_UNEXPECTED);
1345                 }
1346                 if ((options & DNS_JOURNALOPT_RESIGN) != 0)
1347                         op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN :
1348                                             DNS_DIFFOP_ADDRESIGN;
1349                 else
1350                         op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
1351
1352                 CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
1353                                            &tuple));
1354                 dns_diff_append(&diff, &tuple);
1355
1356                 if (++n_put > 100)  {
1357                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1358                                       "%s: applying diff to database (%u)",
1359                                       j->filename, db_serial);
1360                         (void)dns_diff_print(&diff, NULL);
1361                         CHECK(dns_diff_apply(&diff, db, ver));
1362                         dns_diff_clear(&diff);
1363                         n_put = 0;
1364                 }
1365         }
1366         if (result == ISC_R_NOMORE)
1367                 result = ISC_R_SUCCESS;
1368         CHECK(result);
1369
1370         if (n_put != 0) {
1371                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1372                               "%s: applying final diff to database (%u)",
1373                               j->filename, db_serial);
1374                 (void)dns_diff_print(&diff, NULL);
1375                 CHECK(dns_diff_apply(&diff, db, ver));
1376                 dns_diff_clear(&diff);
1377         }
1378
1379  failure:
1380         if (ver != NULL)
1381                 dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1382                                     ISC_TRUE : ISC_FALSE);
1383
1384         if (source.base != NULL)
1385                 isc_mem_put(j->mctx, source.base, source.length);
1386         if (target.base != NULL)
1387                 isc_mem_put(j->mctx, target.base, target.length);
1388
1389         dns_diff_clear(&diff);
1390
1391         return (result);
1392 }
1393
1394 isc_result_t
1395 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db,
1396                         unsigned int options, const char *filename)
1397 {
1398         REQUIRE((options & DNS_JOURNALOPT_RESIGN) == 0);
1399         return (dns_journal_rollforward2(mctx, db, options, 0, filename));
1400 }
1401
1402 isc_result_t
1403 dns_journal_rollforward2(isc_mem_t *mctx, dns_db_t *db, unsigned int options,
1404                          isc_uint32_t resign, const char *filename)
1405 {
1406         dns_journal_t *j;
1407         isc_result_t result;
1408
1409         REQUIRE(DNS_DB_VALID(db));
1410         REQUIRE(filename != NULL);
1411
1412         j = NULL;
1413         result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1414         if (result == ISC_R_NOTFOUND) {
1415                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1416                               "no journal file, but that's OK");
1417                 return (DNS_R_NOJOURNAL);
1418         }
1419         if (result != ISC_R_SUCCESS)
1420                 return (result);
1421         if (JOURNAL_EMPTY(&j->header))
1422                 result = DNS_R_UPTODATE;
1423         else
1424                 result = roll_forward(j, db, options, resign);
1425
1426         dns_journal_destroy(&j);
1427
1428         return (result);
1429 }
1430
1431 isc_result_t
1432 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1433         dns_journal_t *j;
1434         isc_buffer_t source;            /* Transaction data from disk */
1435         isc_buffer_t target;            /* Ditto after _fromwire check */
1436         isc_uint32_t start_serial;              /* Database SOA serial */
1437         isc_uint32_t end_serial;        /* Last journal SOA serial */
1438         isc_result_t result;
1439         dns_diff_t diff;
1440         unsigned int n_soa = 0;
1441         unsigned int n_put = 0;
1442
1443         REQUIRE(filename != NULL);
1444
1445         j = NULL;
1446         result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1447         if (result == ISC_R_NOTFOUND) {
1448                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1449                 return (DNS_R_NOJOURNAL);
1450         }
1451
1452         if (result != ISC_R_SUCCESS) {
1453                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1454                               "journal open failure: %s: %s",
1455                               isc_result_totext(result), filename);
1456                 return (result);
1457         }
1458
1459         if (j->header.serialset)
1460                 fprintf(file, "Source serial = %u\n", j->header.sourceserial);
1461         dns_diff_init(j->mctx, &diff);
1462
1463         /*
1464          * Set up empty initial buffers for unchecked and checked
1465          * wire format transaction data.  They will be reallocated
1466          * later.
1467          */
1468         isc_buffer_init(&source, NULL, 0);
1469         isc_buffer_init(&target, NULL, 0);
1470
1471         start_serial = dns_journal_first_serial(j);
1472         end_serial = dns_journal_last_serial(j);
1473
1474         CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1475
1476         for (result = dns_journal_first_rr(j);
1477              result == ISC_R_SUCCESS;
1478              result = dns_journal_next_rr(j))
1479         {
1480                 dns_name_t *name;
1481                 isc_uint32_t ttl;
1482                 dns_rdata_t *rdata;
1483                 dns_difftuple_t *tuple = NULL;
1484
1485                 name = NULL;
1486                 rdata = NULL;
1487                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1488
1489                 if (rdata->type == dns_rdatatype_soa)
1490                         n_soa++;
1491
1492                 if (n_soa == 3)
1493                         n_soa = 1;
1494                 if (n_soa == 0) {
1495                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1496                                       "%s: journal file corrupt: missing "
1497                                       "initial SOA", j->filename);
1498                         FAIL(ISC_R_UNEXPECTED);
1499                 }
1500                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1501                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1502                                            name, ttl, rdata, &tuple));
1503                 dns_diff_append(&diff, &tuple);
1504
1505                 if (++n_put > 100)  {
1506                         result = dns_diff_print(&diff, file);
1507                         dns_diff_clear(&diff);
1508                         n_put = 0;
1509                         if (result != ISC_R_SUCCESS)
1510                                 break;
1511                 }
1512         }
1513         if (result == ISC_R_NOMORE)
1514                 result = ISC_R_SUCCESS;
1515         CHECK(result);
1516
1517         if (n_put != 0) {
1518                 result = dns_diff_print(&diff, file);
1519                 dns_diff_clear(&diff);
1520         }
1521         goto cleanup;
1522
1523  failure:
1524         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1525                       "%s: cannot print: journal file corrupt", j->filename);
1526
1527  cleanup:
1528         if (source.base != NULL)
1529                 isc_mem_put(j->mctx, source.base, source.length);
1530         if (target.base != NULL)
1531                 isc_mem_put(j->mctx, target.base, target.length);
1532
1533         dns_diff_clear(&diff);
1534         dns_journal_destroy(&j);
1535
1536         return (result);
1537 }
1538
1539 /**************************************************************************/
1540 /*
1541  * Miscellaneous accessors.
1542  */
1543 isc_uint32_t
1544 dns_journal_first_serial(dns_journal_t *j) {
1545         return (j->header.begin.serial);
1546 }
1547
1548 isc_uint32_t
1549 dns_journal_last_serial(dns_journal_t *j) {
1550         return (j->header.end.serial);
1551 }
1552
1553 void
1554 dns_journal_set_sourceserial(dns_journal_t *j, isc_uint32_t sourceserial) {
1555
1556         REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1557                 j->state == JOURNAL_STATE_INLINE ||
1558                 j->state == JOURNAL_STATE_TRANSACTION);
1559
1560         j->header.sourceserial = sourceserial;
1561         j->header.serialset = ISC_TRUE;
1562         if (j->state == JOURNAL_STATE_WRITE)
1563                 j->state = JOURNAL_STATE_INLINE;
1564 }
1565
1566 isc_boolean_t
1567 dns_journal_get_sourceserial(dns_journal_t *j, isc_uint32_t *sourceserial) {
1568         REQUIRE(sourceserial != NULL);
1569
1570         if (!j->header.serialset)
1571                 return (ISC_FALSE);
1572         *sourceserial = j->header.sourceserial;
1573         return (ISC_TRUE);
1574 }
1575
1576 /**************************************************************************/
1577 /*
1578  * Iteration support.
1579  *
1580  * When serving an outgoing IXFR, we transmit a part of the journal starting
1581  * at the serial number in the IXFR request and ending at the serial
1582  * number that is current when the IXFR request arrives.  The ending
1583  * serial number is not necessarily at the end of the journal:
1584  * the journal may grow while the IXFR is in progress, but we stop
1585  * when we reach the serial number that was current when the IXFR started.
1586  */
1587
1588 static isc_result_t read_one_rr(dns_journal_t *j);
1589
1590 /*
1591  * Make sure the buffer 'b' is has at least 'size' bytes
1592  * allocated, and clear it.
1593  *
1594  * Requires:
1595  *      Either b->base is NULL, or it points to b->length bytes of memory
1596  *      previously allocated by isc_mem_get().
1597  */
1598
1599 static isc_result_t
1600 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1601         if (b->length < size) {
1602                 void *mem = isc_mem_get(mctx, size);
1603                 if (mem == NULL)
1604                         return (ISC_R_NOMEMORY);
1605                 if (b->base != NULL)
1606                         isc_mem_put(mctx, b->base, b->length);
1607                 b->base = mem;
1608                 b->length = size;
1609         }
1610         isc_buffer_clear(b);
1611         return (ISC_R_SUCCESS);
1612 }
1613
1614 isc_result_t
1615 dns_journal_iter_init(dns_journal_t *j,
1616                       isc_uint32_t begin_serial, isc_uint32_t end_serial)
1617 {
1618         isc_result_t result;
1619
1620         CHECK(journal_find(j, begin_serial, &j->it.bpos));
1621         INSIST(j->it.bpos.serial == begin_serial);
1622
1623         CHECK(journal_find(j, end_serial, &j->it.epos));
1624         INSIST(j->it.epos.serial == end_serial);
1625
1626         result = ISC_R_SUCCESS;
1627  failure:
1628         j->it.result = result;
1629         return (j->it.result);
1630 }
1631
1632
1633 isc_result_t
1634 dns_journal_first_rr(dns_journal_t *j) {
1635         isc_result_t result;
1636
1637         /*
1638          * Seek to the beginning of the first transaction we are
1639          * interested in.
1640          */
1641         CHECK(journal_seek(j, j->it.bpos.offset));
1642         j->it.current_serial = j->it.bpos.serial;
1643
1644         j->it.xsize = 0;  /* We have no transaction data yet... */
1645         j->it.xpos = 0;   /* ...and haven't used any of it. */
1646
1647         return (read_one_rr(j));
1648
1649  failure:
1650         return (result);
1651 }
1652
1653 static isc_result_t
1654 read_one_rr(dns_journal_t *j) {
1655         isc_result_t result;
1656
1657         dns_rdatatype_t rdtype;
1658         dns_rdataclass_t rdclass;
1659         unsigned int rdlen;
1660         isc_uint32_t ttl;
1661         journal_xhdr_t xhdr;
1662         journal_rrhdr_t rrhdr;
1663
1664         INSIST(j->offset <= j->it.epos.offset);
1665         if (j->offset == j->it.epos.offset)
1666                 return (ISC_R_NOMORE);
1667         if (j->it.xpos == j->it.xsize) {
1668                 /*
1669                  * We are at a transaction boundary.
1670                  * Read another transaction header.
1671                  */
1672                 CHECK(journal_read_xhdr(j, &xhdr));
1673                 if (xhdr.size == 0) {
1674                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1675                                       "%s: journal corrupt: empty transaction",
1676                                       j->filename);
1677                         FAIL(ISC_R_UNEXPECTED);
1678                 }
1679                 if (xhdr.serial0 != j->it.current_serial) {
1680                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1681                                          "%s: journal file corrupt: "
1682                                          "expected serial %u, got %u",
1683                                          j->filename,
1684                                          j->it.current_serial, xhdr.serial0);
1685                         FAIL(ISC_R_UNEXPECTED);
1686                 }
1687                 j->it.xsize = xhdr.size;
1688                 j->it.xpos = 0;
1689         }
1690         /*
1691          * Read an RR.
1692          */
1693         CHECK(journal_read_rrhdr(j, &rrhdr));
1694         /*
1695          * Perform a sanity check on the journal RR size.
1696          * The smallest possible RR has a 1-byte owner name
1697          * and a 10-byte header.  The largest possible
1698          * RR has 65535 bytes of data, a header, and a maximum-
1699          * size owner name, well below 70 k total.
1700          */
1701         if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1702                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1703                                  "%s: journal corrupt: impossible RR size "
1704                                  "(%d bytes)", j->filename, rrhdr.size);
1705                 FAIL(ISC_R_UNEXPECTED);
1706         }
1707
1708         CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1709         CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1710         isc_buffer_add(&j->it.source, rrhdr.size);
1711
1712         /*
1713          * The target buffer is made the same size
1714          * as the source buffer, with the assumption that when
1715          * no compression in present, the output of dns_*_fromwire()
1716          * is no larger than the input.
1717          */
1718         CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1719
1720         /*
1721          * Parse the owner name.  We don't know where it
1722          * ends yet, so we make the entire "remaining"
1723          * part of the buffer "active".
1724          */
1725         isc_buffer_setactive(&j->it.source,
1726                              j->it.source.used - j->it.source.current);
1727         CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1728                                 &j->it.dctx, 0, &j->it.target));
1729
1730         /*
1731          * Check that the RR header is there, and parse it.
1732          */
1733         if (isc_buffer_remaininglength(&j->it.source) < 10)
1734                 FAIL(DNS_R_FORMERR);
1735
1736         rdtype = isc_buffer_getuint16(&j->it.source);
1737         rdclass = isc_buffer_getuint16(&j->it.source);
1738         ttl = isc_buffer_getuint32(&j->it.source);
1739         rdlen = isc_buffer_getuint16(&j->it.source);
1740
1741         /*
1742          * Parse the rdata.
1743          */
1744         if (isc_buffer_remaininglength(&j->it.source) != rdlen)
1745                 FAIL(DNS_R_FORMERR);
1746         isc_buffer_setactive(&j->it.source, rdlen);
1747         dns_rdata_reset(&j->it.rdata);
1748         CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1749                                  rdtype, &j->it.source, &j->it.dctx,
1750                                  0, &j->it.target));
1751         j->it.ttl = ttl;
1752
1753         j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1754         if (rdtype == dns_rdatatype_soa) {
1755                 /* XXX could do additional consistency checks here */
1756                 j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1757         }
1758
1759         result = ISC_R_SUCCESS;
1760
1761  failure:
1762         j->it.result = result;
1763         return (result);
1764 }
1765
1766 isc_result_t
1767 dns_journal_next_rr(dns_journal_t *j) {
1768         j->it.result = read_one_rr(j);
1769         return (j->it.result);
1770 }
1771
1772 void
1773 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1774                    dns_rdata_t **rdata)
1775 {
1776         REQUIRE(j->it.result == ISC_R_SUCCESS);
1777         *name = &j->it.name;
1778         *ttl = j->it.ttl;
1779         *rdata = &j->it.rdata;
1780 }
1781
1782 /**************************************************************************/
1783 /*
1784  * Generating diffs from databases
1785  */
1786
1787 /*
1788  * Construct a diff containing all the RRs at the current name of the
1789  * database iterator 'dbit' in database 'db', version 'ver'.
1790  * Set '*name' to the current name, and append the diff to 'diff'.
1791  * All new tuples will have the operation 'op'.
1792  *
1793  * Requires: 'name' must have buffer large enough to hold the name.
1794  * Typically, a dns_fixedname_t would be used.
1795  */
1796 static isc_result_t
1797 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1798               dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1799               dns_diff_t *diff)
1800 {
1801         isc_result_t result;
1802         dns_dbnode_t *node = NULL;
1803         dns_rdatasetiter_t *rdsiter = NULL;
1804         dns_difftuple_t *tuple = NULL;
1805
1806         result = dns_dbiterator_current(dbit, &node, name);
1807         if (result != ISC_R_SUCCESS)
1808                 return (result);
1809
1810         result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1811         if (result != ISC_R_SUCCESS)
1812                 goto cleanup_node;
1813
1814         for (result = dns_rdatasetiter_first(rdsiter);
1815              result == ISC_R_SUCCESS;
1816              result = dns_rdatasetiter_next(rdsiter))
1817         {
1818                 dns_rdataset_t rdataset;
1819
1820                 dns_rdataset_init(&rdataset);
1821                 dns_rdatasetiter_current(rdsiter, &rdataset);
1822
1823                 for (result = dns_rdataset_first(&rdataset);
1824                      result == ISC_R_SUCCESS;
1825                      result = dns_rdataset_next(&rdataset))
1826                 {
1827                         dns_rdata_t rdata = DNS_RDATA_INIT;
1828                         dns_rdataset_current(&rdataset, &rdata);
1829                         result = dns_difftuple_create(diff->mctx, op, name,
1830                                                       rdataset.ttl, &rdata,
1831                                                       &tuple);
1832                         if (result != ISC_R_SUCCESS) {
1833                                 dns_rdataset_disassociate(&rdataset);
1834                                 goto cleanup_iterator;
1835                         }
1836                         dns_diff_append(diff, &tuple);
1837                 }
1838                 dns_rdataset_disassociate(&rdataset);
1839                 if (result != ISC_R_NOMORE)
1840                         goto cleanup_iterator;
1841         }
1842         if (result != ISC_R_NOMORE)
1843                 goto cleanup_iterator;
1844
1845         result = ISC_R_SUCCESS;
1846
1847  cleanup_iterator:
1848         dns_rdatasetiter_destroy(&rdsiter);
1849
1850  cleanup_node:
1851         dns_db_detachnode(db, &node);
1852
1853         return (result);
1854 }
1855
1856 /*
1857  * Comparison function for use by dns_diff_subtract when sorting
1858  * the diffs to be subtracted.  The sort keys are the rdata type
1859  * and the rdata itself.  The owner name is ignored, because
1860  * it is known to be the same for all tuples.
1861  */
1862 static int
1863 rdata_order(const void *av, const void *bv) {
1864         dns_difftuple_t const * const *ap = av;
1865         dns_difftuple_t const * const *bp = bv;
1866         dns_difftuple_t const *a = *ap;
1867         dns_difftuple_t const *b = *bp;
1868         int r;
1869         r = (b->rdata.type - a->rdata.type);
1870         if (r != 0)
1871                 return (r);
1872         r = dns_rdata_compare(&a->rdata, &b->rdata);
1873         return (r);
1874 }
1875
1876 static isc_result_t
1877 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1878         isc_result_t result;
1879         dns_difftuple_t *p[2];
1880         int i, t;
1881         isc_boolean_t append;
1882
1883         CHECK(dns_diff_sort(&diff[0], rdata_order));
1884         CHECK(dns_diff_sort(&diff[1], rdata_order));
1885
1886         for (;;) {
1887                 p[0] = ISC_LIST_HEAD(diff[0].tuples);
1888                 p[1] = ISC_LIST_HEAD(diff[1].tuples);
1889                 if (p[0] == NULL && p[1] == NULL)
1890                         break;
1891
1892                 for (i = 0; i < 2; i++)
1893                         if (p[!i] == NULL) {
1894                                 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1895                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1896                                 goto next;
1897                         }
1898                 t = rdata_order(&p[0], &p[1]);
1899                 if (t < 0) {
1900                         ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1901                         ISC_LIST_APPEND(r->tuples, p[0], link);
1902                         goto next;
1903                 }
1904                 if (t > 0) {
1905                         ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1906                         ISC_LIST_APPEND(r->tuples, p[1], link);
1907                         goto next;
1908                 }
1909                 INSIST(t == 0);
1910                 /*
1911                  * Identical RRs in both databases; skip them both
1912                  * if the ttl differs.
1913                  */
1914                 append = ISC_TF(p[0]->ttl != p[1]->ttl);
1915                 for (i = 0; i < 2; i++) {
1916                         ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1917                         if (append) {
1918                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1919                         } else {
1920                                 dns_difftuple_free(&p[i]);
1921                         }
1922                 }
1923         next: ;
1924         }
1925         result = ISC_R_SUCCESS;
1926  failure:
1927         return (result);
1928 }
1929
1930 static isc_result_t
1931 diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera,
1932                dns_db_t *dbb, dns_dbversion_t *dbverb,
1933                unsigned int options, dns_diff_t *resultdiff)
1934 {
1935         dns_db_t *db[2];
1936         dns_dbversion_t *ver[2];
1937         dns_dbiterator_t *dbit[2] = { NULL, NULL };
1938         isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1939         dns_fixedname_t fixname[2];
1940         isc_result_t result, itresult[2];
1941         dns_diff_t diff[2];
1942         int i, t;
1943
1944         db[0] = dba, db[1] = dbb;
1945         ver[0] = dbvera, ver[1] = dbverb;
1946
1947         dns_diff_init(resultdiff->mctx, &diff[0]);
1948         dns_diff_init(resultdiff->mctx, &diff[1]);
1949
1950         dns_fixedname_init(&fixname[0]);
1951         dns_fixedname_init(&fixname[1]);
1952
1953         result = dns_db_createiterator(db[0], options, &dbit[0]);
1954         if (result != ISC_R_SUCCESS)
1955                 return (result);
1956         result = dns_db_createiterator(db[1], options, &dbit[1]);
1957         if (result != ISC_R_SUCCESS)
1958                 goto cleanup_iterator;
1959
1960         itresult[0] = dns_dbiterator_first(dbit[0]);
1961         itresult[1] = dns_dbiterator_first(dbit[1]);
1962
1963         for (;;) {
1964                 for (i = 0; i < 2; i++) {
1965                         if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1966                                 CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1967                                             dns_fixedname_name(&fixname[i]),
1968                                             i == 0 ?
1969                                             DNS_DIFFOP_ADD :
1970                                             DNS_DIFFOP_DEL,
1971                                             &diff[i]));
1972                                 itresult[i] = dns_dbiterator_next(dbit[i]);
1973                                 have[i] = ISC_TRUE;
1974                         }
1975                 }
1976
1977                 if (! have[0] && ! have[1]) {
1978                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1979                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1980                         break;
1981                 }
1982
1983                 for (i = 0; i < 2; i++) {
1984                         if (! have[!i]) {
1985                                 ISC_LIST_APPENDLIST(resultdiff->tuples,
1986                                                     diff[i].tuples, link);
1987                                 INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1988                                 have[i] = ISC_FALSE;
1989                                 goto next;
1990                         }
1991                 }
1992
1993                 t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1994                                      dns_fixedname_name(&fixname[1]));
1995                 if (t < 0) {
1996                         ISC_LIST_APPENDLIST(resultdiff->tuples,
1997                                             diff[0].tuples, link);
1998                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1999                         have[0] = ISC_FALSE;
2000                         continue;
2001                 }
2002                 if (t > 0) {
2003                         ISC_LIST_APPENDLIST(resultdiff->tuples,
2004                                             diff[1].tuples, link);
2005                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2006                         have[1] = ISC_FALSE;
2007                         continue;
2008                 }
2009                 INSIST(t == 0);
2010                 CHECK(dns_diff_subtract(diff, resultdiff));
2011                 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2012                 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2013                 have[0] = have[1] = ISC_FALSE;
2014         next: ;
2015         }
2016         if (itresult[0] != ISC_R_NOMORE)
2017                 FAIL(itresult[0]);
2018         if (itresult[1] != ISC_R_NOMORE)
2019                 FAIL(itresult[1]);
2020
2021         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2022         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2023
2024  failure:
2025         dns_dbiterator_destroy(&dbit[1]);
2026
2027  cleanup_iterator:
2028         dns_dbiterator_destroy(&dbit[0]);
2029         dns_diff_clear(&diff[0]);
2030         dns_diff_clear(&diff[1]);
2031         return (result);
2032 }
2033
2034 /*
2035  * Compare the databases 'dba' and 'dbb' and generate a journal
2036  * entry containing the changes to make 'dba' from 'dbb' (note
2037  * the order).  This journal entry will consist of a single,
2038  * possibly very large transaction.
2039  */
2040 isc_result_t
2041 dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
2042             dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
2043 {
2044         isc_result_t result;
2045         dns_diff_t diff;
2046
2047         dns_diff_init(mctx, &diff);
2048
2049         result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
2050
2051         dns_diff_clear(&diff);
2052
2053         return (result);
2054 }
2055
2056 isc_result_t
2057 dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
2058              dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
2059 {
2060         isc_result_t result;
2061         dns_journal_t *journal = NULL;
2062
2063         if (filename != NULL) {
2064                 result = dns_journal_open(diff->mctx, filename,
2065                                           DNS_JOURNAL_CREATE, &journal);
2066                 if (result != ISC_R_SUCCESS)
2067                         return (result);
2068         }
2069
2070         CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
2071         CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
2072
2073         if (journal != NULL) {
2074                 if (ISC_LIST_EMPTY(diff->tuples))
2075                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
2076                 else
2077                         CHECK(dns_journal_write_transaction(journal, diff));
2078         }
2079
2080  failure:
2081         if (journal != NULL)
2082                 dns_journal_destroy(&journal);
2083         return (result);
2084 }
2085
2086 isc_result_t
2087 dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
2088                     isc_uint32_t target_size)
2089 {
2090         unsigned int i;
2091         journal_pos_t best_guess;
2092         journal_pos_t current_pos;
2093         dns_journal_t *j = NULL;
2094         dns_journal_t *new = NULL;
2095         journal_rawheader_t rawheader;
2096         unsigned int copy_length;
2097         int namelen;
2098         char *buf = NULL;
2099         unsigned int size = 0;
2100         isc_result_t result;
2101         unsigned int indexend;
2102         char newname[1024];
2103         char backup[1024];
2104         isc_boolean_t is_backup = ISC_FALSE;
2105
2106         namelen = strlen(filename);
2107         if (namelen > 4 && strcmp(filename + namelen - 4, ".jnl") == 0)
2108                 namelen -= 4;
2109
2110         result = isc_string_printf(newname, sizeof(newname), "%.*s.jnw",
2111                                    namelen, filename);
2112         if (result != ISC_R_SUCCESS)
2113                 return (result);
2114
2115         result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
2116                                    namelen, filename);
2117         if (result != ISC_R_SUCCESS)
2118                 return (result);
2119
2120         result = journal_open(mctx, filename, ISC_FALSE, ISC_FALSE, &j);
2121         if (result == ISC_R_NOTFOUND) {
2122                 is_backup = ISC_TRUE;
2123                 result = journal_open(mctx, backup, ISC_FALSE, ISC_FALSE, &j);
2124         }
2125         if (result != ISC_R_SUCCESS)
2126                 return (result);
2127
2128         if (JOURNAL_EMPTY(&j->header)) {
2129                 dns_journal_destroy(&j);
2130                 return (ISC_R_SUCCESS);
2131         }
2132
2133         if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
2134             DNS_SERIAL_GT(serial, j->header.end.serial)) {
2135                 dns_journal_destroy(&j);
2136                 return (ISC_R_RANGE);
2137         }
2138
2139         /*
2140          * Cope with very small target sizes.
2141          */
2142         indexend = sizeof(journal_rawheader_t) +
2143                    j->header.index_size * sizeof(journal_rawpos_t);
2144         if (target_size < indexend * 2)
2145                 target_size = target_size/2 + indexend;
2146
2147         /*
2148          * See if there is any work to do.
2149          */
2150         if ((isc_uint32_t) j->header.end.offset < target_size) {
2151                 dns_journal_destroy(&j);
2152                 return (ISC_R_SUCCESS);
2153         }
2154
2155         CHECK(journal_open(mctx, newname, ISC_TRUE, ISC_TRUE, &new));
2156
2157         /*
2158          * Remove overhead so space test below can succeed.
2159          */
2160         if (target_size >= indexend)
2161                 target_size -= indexend;
2162
2163         /*
2164          * Find if we can create enough free space.
2165          */
2166         best_guess = j->header.begin;
2167         for (i = 0; i < j->header.index_size; i++) {
2168                 if (POS_VALID(j->index[i]) &&
2169                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
2170                     ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
2171                      >= target_size / 2) &&
2172                     j->index[i].offset > best_guess.offset)
2173                         best_guess = j->index[i];
2174         }
2175
2176         current_pos = best_guess;
2177         while (current_pos.serial != serial) {
2178                 CHECK(journal_next(j, &current_pos));
2179                 if (current_pos.serial == j->header.end.serial)
2180                         break;
2181
2182                 if (DNS_SERIAL_GE(serial, current_pos.serial) &&
2183                    ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
2184                      >= (target_size / 2)) &&
2185                     current_pos.offset > best_guess.offset)
2186                         best_guess = current_pos;
2187                 else
2188                         break;
2189         }
2190
2191         INSIST(best_guess.serial != j->header.end.serial);
2192         if (best_guess.serial != serial)
2193                 CHECK(journal_next(j, &best_guess));
2194
2195         /*
2196          * We should now be roughly half target_size provided
2197          * we did not reach 'serial'.  If not we will just copy
2198          * all uncommitted deltas regardless of the size.
2199          */
2200         copy_length = j->header.end.offset - best_guess.offset;
2201
2202         if (copy_length != 0) {
2203                 /*
2204                  * Copy best_guess to end into space just freed.
2205                  */
2206                 size = 64*1024;
2207                 if (copy_length < size)
2208                         size = copy_length;
2209                 buf = isc_mem_get(mctx, size);
2210                 if (buf == NULL) {
2211                         result = ISC_R_NOMEMORY;
2212                         goto failure;
2213                 }
2214
2215                 CHECK(journal_seek(j, best_guess.offset));
2216                 CHECK(journal_seek(new, indexend));
2217                 for (i = 0; i < copy_length; i += size) {
2218                         unsigned int len = (copy_length - i) > size ? size :
2219                                                          (copy_length - i);
2220                         CHECK(journal_read(j, buf, len));
2221                         CHECK(journal_write(new, buf, len));
2222                 }
2223
2224                 CHECK(journal_fsync(new));
2225
2226                 /*
2227                  * Compute new header.
2228                  */
2229                 new->header.begin.serial = best_guess.serial;
2230                 new->header.begin.offset = indexend;
2231                 new->header.end.serial = j->header.end.serial;
2232                 new->header.end.offset = indexend + copy_length;
2233                 new->header.sourceserial = j->header.sourceserial;
2234                 new->header.serialset = j->header.serialset;
2235
2236                 /*
2237                  * Update the journal header.
2238                  */
2239                 journal_header_encode(&new->header, &rawheader);
2240                 CHECK(journal_seek(new, 0));
2241                 CHECK(journal_write(new, &rawheader, sizeof(rawheader)));
2242                 CHECK(journal_fsync(new));
2243
2244                 /*
2245                  * Build new index.
2246                  */
2247                 current_pos = new->header.begin;
2248                 while (current_pos.serial != new->header.end.serial) {
2249                         index_add(new, &current_pos);
2250                         CHECK(journal_next(new, &current_pos));
2251                 }
2252
2253                 /*
2254                  * Write index.
2255                  */
2256                 CHECK(index_to_disk(new));
2257                 CHECK(journal_fsync(new));
2258
2259                 indexend = new->header.end.offset;
2260                 POST(indexend);
2261         }
2262
2263         /*
2264          * Close both journals before trying to rename files (this is
2265          * necessary on WIN32).
2266          */
2267         dns_journal_destroy(&j);
2268         dns_journal_destroy(&new);
2269
2270         /*
2271          * With a UFS file system this should just succeed and be atomic.
2272          * Any IXFR outs will just continue and the old journal will be
2273          * removed on final close.
2274          *
2275          * With MSDOS / NTFS we need to do a two stage rename, triggered
2276          * by EEXIST.  (If any IXFR's are running in other threads, however,
2277          * this will fail, and the journal will not be compacted.  But
2278          * if so, hopefully they'll be finished by the next time we
2279          * compact.)
2280          */
2281         if (rename(newname, filename) == -1) {
2282                 if (errno == EEXIST && !is_backup) {
2283                         result = isc_file_remove(backup);
2284                         if (result != ISC_R_SUCCESS &&
2285                             result != ISC_R_FILENOTFOUND)
2286                                 goto failure;
2287                         if (rename(filename, backup) == -1)
2288                                 goto maperrno;
2289                         if (rename(newname, filename) == -1)
2290                                 goto maperrno;
2291                         (void)isc_file_remove(backup);
2292                 } else {
2293  maperrno:
2294                         result = ISC_R_FAILURE;
2295                         goto failure;
2296                 }
2297         }
2298
2299         result = ISC_R_SUCCESS;
2300
2301  failure:
2302         (void)isc_file_remove(newname);
2303         if (buf != NULL)
2304                 isc_mem_put(mctx, buf, size);
2305         if (j != NULL)
2306                 dns_journal_destroy(&j);
2307         if (new != NULL)
2308                 dns_journal_destroy(&new);
2309         return (result);
2310 }
2311
2312 static isc_result_t
2313 index_to_disk(dns_journal_t *j) {
2314         isc_result_t result = ISC_R_SUCCESS;
2315
2316         if (j->header.index_size != 0) {
2317                 unsigned int i;
2318                 unsigned char *p;
2319                 unsigned int rawbytes;
2320
2321                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2322
2323                 p = j->rawindex;
2324                 for (i = 0; i < j->header.index_size; i++) {
2325                         encode_uint32(j->index[i].serial, p);
2326                         p += 4;
2327                         encode_uint32(j->index[i].offset, p);
2328                         p += 4;
2329                 }
2330                 INSIST(p == j->rawindex + rawbytes);
2331
2332                 CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2333                 CHECK(journal_write(j, j->rawindex, rawbytes));
2334         }
2335 failure:
2336         return (result);
2337 }