]> CyberLeo.Net >> Repos - FreeBSD/releng/9.3.git/blob - contrib/bind9/lib/dns/journal.c
Fix resource exhaustion in TCP reassembly. [SA-15:15]
[FreeBSD/releng/9.3.git] / contrib / bind9 / lib / dns / journal.c
1 /*
2  * Copyright (C) 2004, 2005, 2007-2011, 2013, 2014  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2002  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: journal.c,v 1.120 2011/12/22 07:32:41 each Exp $ */
19
20 #include <config.h>
21
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <errno.h>
25
26 #include <isc/file.h>
27 #include <isc/mem.h>
28 #include <isc/stdio.h>
29 #include <isc/string.h>
30 #include <isc/util.h>
31
32 #include <dns/compress.h>
33 #include <dns/db.h>
34 #include <dns/dbiterator.h>
35 #include <dns/diff.h>
36 #include <dns/fixedname.h>
37 #include <dns/journal.h>
38 #include <dns/log.h>
39 #include <dns/rdataset.h>
40 #include <dns/rdatasetiter.h>
41 #include <dns/result.h>
42 #include <dns/soa.h>
43
44 /*! \file
45  * \brief Journaling.
46  *
47  * A journal file consists of
48  *
49  *   \li A fixed-size header of type journal_rawheader_t.
50  *
51  *   \li The index.  This is an unordered array of index entries
52  *     of type journal_rawpos_t giving the locations
53  *     of some arbitrary subset of the journal's addressable
54  *     transactions.  The index entries are used as hints to
55  *     speed up the process of locating a transaction with a given
56  *     serial number.  Unused index entries have an "offset"
57  *     field of zero.  The size of the index can vary between
58  *     journal files, but does not change during the lifetime
59  *     of a file.  The size can be zero.
60  *
61  *   \li The journal data.  This  consists of one or more transactions.
62  *     Each transaction begins with a transaction header of type
63  *     journal_rawxhdr_t.  The transaction header is followed by a
64  *     sequence of RRs, similar in structure to an IXFR difference
65  *     sequence (RFC1995).  That is, the pre-transaction SOA,
66  *     zero or more other deleted RRs, the post-transaction SOA,
67  *     and zero or more other added RRs.  Unlike in IXFR, each RR
68  *     is prefixed with a 32-bit length.
69  *
70  *     The journal data part grows as new transactions are
71  *     appended to the file.  Only those transactions
72  *     whose serial number is current-(2^31-1) to current
73  *     are considered "addressable" and may be pointed
74  *     to from the header or index.  They may be preceded
75  *     by old transactions that are no longer addressable,
76  *     and they may be followed by transactions that were
77  *     appended to the journal but never committed by updating
78  *     the "end" position in the header.  The latter will
79  *     be overwritten when new transactions are added.
80  */
81 /*%
82  * When true, accept IXFR difference sequences where the
83  * SOA serial number does not change (BIND 8 sends such
84  * sequences).
85  */
86 static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
87
88 /**************************************************************************/
89 /*
90  * Miscellaneous utilities.
91  */
92
/*%
 * Leading arguments shared by every isc_log_write() call in this file:
 * the log context, category and module.
 */
#define JOURNAL_COMMON_LOGARGS \
        dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*%
 * JOURNAL_COMMON_LOGARGS followed by a debug log level of 'n'.
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
        JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*%
 * Store 'code' in the local variable 'result' and jump to the local
 * label 'failure'.  The enclosing function must provide both.
 *
 * It would be non-sensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code) \
        do { result = (code);                                   \
                if (result != ISC_R_SUCCESS) goto failure;      \
        } while (0)

/*%
 * Evaluate 'op', store its value in the local variable 'result', and
 * jump to the local label 'failure' unless it is ISC_R_SUCCESS.
 */
#define CHECK(op) \
        do { result = (op);                                     \
                if (result != ISC_R_SUCCESS) goto failure;      \
        } while (0)
113
114 #define JOURNAL_SERIALSET       0x01U
115
116 static isc_result_t index_to_disk(dns_journal_t *);
117
118 static inline isc_uint32_t
119 decode_uint32(unsigned char *p) {
120         return ((p[0] << 24) +
121                 (p[1] << 16) +
122                 (p[2] <<  8) +
123                 (p[3] <<  0));
124 }
125
126 static inline void
127 encode_uint32(isc_uint32_t val, unsigned char *p) {
128         p[0] = (isc_uint8_t)(val >> 24);
129         p[1] = (isc_uint8_t)(val >> 16);
130         p[2] = (isc_uint8_t)(val >>  8);
131         p[3] = (isc_uint8_t)(val >>  0);
132 }
133
134 isc_result_t
135 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
136                       dns_diffop_t op, dns_difftuple_t **tp)
137 {
138         isc_result_t result;
139         dns_dbnode_t *node;
140         dns_rdataset_t rdataset;
141         dns_rdata_t rdata = DNS_RDATA_INIT;
142         dns_name_t *zonename;
143
144         zonename = dns_db_origin(db);
145
146         node = NULL;
147         result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
148         if (result != ISC_R_SUCCESS)
149                 goto nonode;
150
151         dns_rdataset_init(&rdataset);
152         result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
153                                      (isc_stdtime_t)0, &rdataset, NULL);
154         if (result != ISC_R_SUCCESS)
155                 goto freenode;
156
157         result = dns_rdataset_first(&rdataset);
158         if (result != ISC_R_SUCCESS)
159                 goto freenode;
160
161         dns_rdataset_current(&rdataset, &rdata);
162
163         result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
164                                       &rdata, tp);
165
166         dns_rdataset_disassociate(&rdataset);
167         dns_db_detachnode(db, &node);
168         return (result);
169
170  freenode:
171         dns_db_detachnode(db, &node);
172  nonode:
173         UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
174         return (result);
175 }
176
177 /* Journaling */
178
179 /*%
180  * On-disk representation of a "pointer" to a journal entry.
181  * These are used in the journal header to locate the beginning
182  * and end of the journal, and in the journal index to locate
183  * other transactions.
184  */
185 typedef struct {
186         unsigned char   serial[4];  /*%< SOA serial before update. */
187         /*
188          * XXXRTH  Should offset be 8 bytes?
189          * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
190          * XXXAG  ... but we will not be able to seek >2G anyway on many
191          *            platforms as long as we are using fseek() rather
192          *            than lseek().
193          */
194         unsigned char   offset[4];  /*%< Offset from beginning of file. */
195 } journal_rawpos_t;
196
197
198 /*%
199  * The header is of a fixed size, with some spare room for future
200  * extensions.
201  */
202 #define JOURNAL_HEADER_SIZE 64 /* Bytes. */
203
/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The named fields live in 'h'; the union with 'pad' makes the whole
 * object JOURNAL_HEADER_SIZE bytes long.
 */
typedef union {
        struct {
                /*% File format version ID. */
                unsigned char           format[16];
                /*% Position of the first addressable transaction */
                journal_rawpos_t        begin;
                /*% Position of the next (yet nonexistent) transaction. */
                journal_rawpos_t        end;
                /*% Number of index entries following the header. */
                unsigned char           index_size[4];
                /*% Source serial number. */
                unsigned char           sourceserial[4];
                /*%
                 * Flag bits.  The JOURNAL_SERIALSET bit is what
                 * journal_header_decode() turns into the in-core
                 * 'serialset' boolean.
                 */
                unsigned char           flags;
        } h;
        /* Pad the header to a fixed size. */
        unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
225
226 /*%
227  * The on-disk representation of the transaction header.
228  * There is one of these at the beginning of each transaction.
229  */
230 typedef struct {
231         unsigned char   size[4];        /*%< In bytes, excluding header. */
232         unsigned char   serial0[4];     /*%< SOA serial before update. */
233         unsigned char   serial1[4];     /*%< SOA serial after update. */
234 } journal_rawxhdr_t;
235
236 /*%
237  * The on-disk representation of the RR header.
238  * There is one of these at the beginning of each RR.
239  */
240 typedef struct {
241         unsigned char   size[4];        /*%< In bytes, excluding header. */
242 } journal_rawrrhdr_t;
243
/*%
 * The in-core representation of a journal position: a serial number
 * paired with a file offset.  This is the decoded form of the on-disk
 * journal_rawpos_t.
 */
typedef struct {
        isc_uint32_t    serial;
        isc_offset_t    offset;
} journal_pos_t;

/*% A position is considered valid when its offset is non-zero. */
#define POS_VALID(pos)          ((pos).offset != 0)
/*% Mark a position as invalid by zeroing both of its fields. */
#define POS_INVALIDATE(pos)     ((pos).offset = 0, (pos).serial = 0)

/*%
 * The in-core representation of the journal header (the decoded form
 * of journal_rawheader_t).
 */
typedef struct {
        unsigned char   format[16];
        journal_pos_t   begin;
        journal_pos_t   end;
        isc_uint32_t    index_size;
        isc_uint32_t    sourceserial;
        isc_boolean_t   serialset;
} journal_header_t;
263
264 /*%
265  * The in-core representation of the transaction header.
266  */
267
268 typedef struct {
269         isc_uint32_t    size;
270         isc_uint32_t    serial0;
271         isc_uint32_t    serial1;
272 } journal_xhdr_t;
273
274 /*%
275  * The in-core representation of the RR header.
276  */
277 typedef struct {
278         isc_uint32_t    size;
279 } journal_rrhdr_t;
280
281
282 /*%
283  * Initial contents to store in the header of a newly created
284  * journal file.
285  *
286  * The header starts with the magic string ";BIND LOG V9\n"
287  * to identify the file as a BIND 9 journal file.  An ASCII
288  * identification string is used rather than a binary magic
289  * number to be consistent with BIND 8 (BIND 8 journal files
290  * are ASCII text files).
291  */
292
293 static journal_header_t
294 initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0 };
295
296 #define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
297
298 typedef enum {
299         JOURNAL_STATE_INVALID,
300         JOURNAL_STATE_READ,
301         JOURNAL_STATE_WRITE,
302         JOURNAL_STATE_TRANSACTION,
303         JOURNAL_STATE_INLINE
304 } journal_state_t;
305
306 struct dns_journal {
307         unsigned int            magic;          /*%< JOUR */
308         isc_mem_t               *mctx;          /*%< Memory context */
309         journal_state_t         state;
310         char                    *filename;      /*%< Journal file name */
311         FILE *                  fp;             /*%< File handle */
312         isc_offset_t            offset;         /*%< Current file offset */
313         journal_header_t        header;         /*%< In-core journal header */
314         unsigned char           *rawindex;      /*%< In-core buffer for journal index in on-disk format */
315         journal_pos_t           *index;         /*%< In-core journal index */
316
317         /*% Current transaction state (when writing). */
318         struct {
319                 unsigned int    n_soa;          /*%< Number of SOAs seen */
320                 journal_pos_t   pos[2];         /*%< Begin/end position */
321         } x;
322
323         /*% Iteration state (when reading). */
324         struct {
325                 /* These define the part of the journal we iterate over. */
326                 journal_pos_t bpos;             /*%< Position before first, */
327                 journal_pos_t epos;             /*%< and after last transaction */
328                 /* The rest is iterator state. */
329                 isc_uint32_t current_serial;    /*%< Current SOA serial */
330                 isc_buffer_t source;            /*%< Data from disk */
331                 isc_buffer_t target;            /*%< Data from _fromwire check */
332                 dns_decompress_t dctx;          /*%< Dummy decompression ctx */
333                 dns_name_t name;                /*%< Current domain name */
334                 dns_rdata_t rdata;              /*%< Current rdata */
335                 isc_uint32_t ttl;               /*%< Current TTL */
336                 unsigned int xsize;             /*%< Size of transaction data */
337                 unsigned int xpos;              /*%< Current position in it */
338                 isc_result_t result;            /*%< Result of last call */
339         } it;
340 };
341
342 #define DNS_JOURNAL_MAGIC       ISC_MAGIC('J', 'O', 'U', 'R')
343 #define DNS_JOURNAL_VALID(t)    ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
344
345 static void
346 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
347         cooked->serial = decode_uint32(raw->serial);
348         cooked->offset = decode_uint32(raw->offset);
349 }
350
351 static void
352 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
353         encode_uint32(cooked->serial, raw->serial);
354         encode_uint32(cooked->offset, raw->offset);
355 }
356
357 static void
358 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
359         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
360         memmove(cooked->format, raw->h.format, sizeof(cooked->format));
361         journal_pos_decode(&raw->h.begin, &cooked->begin);
362         journal_pos_decode(&raw->h.end, &cooked->end);
363         cooked->index_size = decode_uint32(raw->h.index_size);
364         cooked->sourceserial = decode_uint32(raw->h.sourceserial);
365         cooked->serialset = ISC_TF(raw->h.flags & JOURNAL_SERIALSET);
366 }
367
368 static void
369 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
370         unsigned char flags = 0;
371
372         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
373         memset(raw->pad, 0, sizeof(raw->pad));
374         memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
375         journal_pos_encode(&raw->h.begin, &cooked->begin);
376         journal_pos_encode(&raw->h.end, &cooked->end);
377         encode_uint32(cooked->index_size, raw->h.index_size);
378         encode_uint32(cooked->sourceserial, raw->h.sourceserial);
379         if (cooked->serialset)
380                 flags |= JOURNAL_SERIALSET;
381         raw->h.flags = flags;
382 }
383
384 /*
385  * Journal file I/O subroutines, with error checking and reporting.
386  */
387 static isc_result_t
388 journal_seek(dns_journal_t *j, isc_uint32_t offset) {
389         isc_result_t result;
390         result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
391         if (result != ISC_R_SUCCESS) {
392                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
393                               "%s: seek: %s", j->filename,
394                               isc_result_totext(result));
395                 return (ISC_R_UNEXPECTED);
396         }
397         j->offset = offset;
398         return (ISC_R_SUCCESS);
399 }
400
401 static isc_result_t
402 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
403         isc_result_t result;
404
405         result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
406         if (result != ISC_R_SUCCESS) {
407                 if (result == ISC_R_EOF)
408                         return (ISC_R_NOMORE);
409                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
410                               "%s: read: %s",
411                               j->filename, isc_result_totext(result));
412                 return (ISC_R_UNEXPECTED);
413         }
414         j->offset += (isc_offset_t)nbytes;
415         return (ISC_R_SUCCESS);
416 }
417
418 static isc_result_t
419 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
420         isc_result_t result;
421
422         result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
423         if (result != ISC_R_SUCCESS) {
424                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
425                               "%s: write: %s",
426                               j->filename, isc_result_totext(result));
427                 return (ISC_R_UNEXPECTED);
428         }
429         j->offset += (isc_offset_t)nbytes;
430         return (ISC_R_SUCCESS);
431 }
432
433 static isc_result_t
434 journal_fsync(dns_journal_t *j) {
435         isc_result_t result;
436         result = isc_stdio_flush(j->fp);
437         if (result != ISC_R_SUCCESS) {
438                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
439                               "%s: flush: %s",
440                               j->filename, isc_result_totext(result));
441                 return (ISC_R_UNEXPECTED);
442         }
443         result = isc_stdio_sync(j->fp);
444         if (result != ISC_R_SUCCESS) {
445                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
446                               "%s: fsync: %s",
447                               j->filename, isc_result_totext(result));
448                 return (ISC_R_UNEXPECTED);
449         }
450         return (ISC_R_SUCCESS);
451 }
452
453 /*
454  * Read/write a transaction header at the current file position.
455  */
456
457 static isc_result_t
458 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
459         journal_rawxhdr_t raw;
460         isc_result_t result;
461         result = journal_read(j, &raw, sizeof(raw));
462         if (result != ISC_R_SUCCESS)
463                 return (result);
464         xhdr->size = decode_uint32(raw.size);
465         xhdr->serial0 = decode_uint32(raw.serial0);
466         xhdr->serial1 = decode_uint32(raw.serial1);
467         return (ISC_R_SUCCESS);
468 }
469
470 static isc_result_t
471 journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
472                    isc_uint32_t serial0, isc_uint32_t serial1)
473 {
474         journal_rawxhdr_t raw;
475         encode_uint32(size, raw.size);
476         encode_uint32(serial0, raw.serial0);
477         encode_uint32(serial1, raw.serial1);
478         return (journal_write(j, &raw, sizeof(raw)));
479 }
480
481
482 /*
483  * Read an RR header at the current file position.
484  */
485
486 static isc_result_t
487 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
488         journal_rawrrhdr_t raw;
489         isc_result_t result;
490         result = journal_read(j, &raw, sizeof(raw));
491         if (result != ISC_R_SUCCESS)
492                 return (result);
493         rrhdr->size = decode_uint32(raw.size);
494         return (ISC_R_SUCCESS);
495 }
496
497 static isc_result_t
498 journal_file_create(isc_mem_t *mctx, const char *filename) {
499         FILE *fp = NULL;
500         isc_result_t result;
501         journal_header_t header;
502         journal_rawheader_t rawheader;
503         int index_size = 56; /* XXX configurable */
504         int size;
505         void *mem; /* Memory for temporary index image. */
506
507         INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
508
509         result = isc_stdio_open(filename, "wb", &fp);
510         if (result != ISC_R_SUCCESS) {
511                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
512                               "%s: create: %s",
513                               filename, isc_result_totext(result));
514                 return (ISC_R_UNEXPECTED);
515         }
516
517         header = initial_journal_header;
518         header.index_size = index_size;
519         journal_header_encode(&header, &rawheader);
520
521         size = sizeof(journal_rawheader_t) +
522                 index_size * sizeof(journal_rawpos_t);
523
524         mem = isc_mem_get(mctx, size);
525         if (mem == NULL) {
526                 (void)isc_stdio_close(fp);
527                 (void)isc_file_remove(filename);
528                 return (ISC_R_NOMEMORY);
529         }
530         memset(mem, 0, size);
531         memmove(mem, &rawheader, sizeof(rawheader));
532
533         result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
534         if (result != ISC_R_SUCCESS) {
535                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
536                                  "%s: write: %s",
537                                  filename, isc_result_totext(result));
538                 (void)isc_stdio_close(fp);
539                 (void)isc_file_remove(filename);
540                 isc_mem_put(mctx, mem, size);
541                 return (ISC_R_UNEXPECTED);
542         }
543         isc_mem_put(mctx, mem, size);
544
545         result = isc_stdio_close(fp);
546         if (result != ISC_R_SUCCESS) {
547                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
548                                  "%s: close: %s",
549                                  filename, isc_result_totext(result));
550                 (void)isc_file_remove(filename);
551                 return (ISC_R_UNEXPECTED);
552         }
553
554         return (ISC_R_SUCCESS);
555 }
556
557 static isc_result_t
558 journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
559              isc_boolean_t create, dns_journal_t **journalp)
560 {
561         FILE *fp = NULL;
562         isc_result_t result;
563         journal_rawheader_t rawheader;
564         dns_journal_t *j;
565
566         INSIST(journalp != NULL && *journalp == NULL);
567         j = isc_mem_get(mctx, sizeof(*j));
568         if (j == NULL)
569                 return (ISC_R_NOMEMORY);
570
571         j->mctx = NULL;
572         isc_mem_attach(mctx, &j->mctx);
573         j->state = JOURNAL_STATE_INVALID;
574         j->fp = NULL;
575         j->filename = isc_mem_strdup(mctx, filename);
576         j->index = NULL;
577         j->rawindex = NULL;
578
579         if (j->filename == NULL)
580                 FAIL(ISC_R_NOMEMORY);
581
582         result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
583
584         if (result == ISC_R_FILENOTFOUND) {
585                 if (create) {
586                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
587                                       "journal file %s does not exist, "
588                                       "creating it", j->filename);
589                         CHECK(journal_file_create(mctx, filename));
590                         /*
591                          * Retry.
592                          */
593                         result = isc_stdio_open(j->filename, "rb+", &fp);
594                 } else {
595                         FAIL(ISC_R_NOTFOUND);
596                 }
597         }
598         if (result != ISC_R_SUCCESS) {
599                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
600                               "%s: open: %s",
601                               j->filename, isc_result_totext(result));
602                 FAIL(ISC_R_UNEXPECTED);
603         }
604
605         j->fp = fp;
606
607         /*
608          * Set magic early so that seek/read can succeed.
609          */
610         j->magic = DNS_JOURNAL_MAGIC;
611
612         CHECK(journal_seek(j, 0));
613         CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
614
615         if (memcmp(rawheader.h.format, initial_journal_header.format,
616                    sizeof(initial_journal_header.format)) != 0) {
617                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
618                                  "%s: journal format not recognized",
619                                  j->filename);
620                 FAIL(ISC_R_UNEXPECTED);
621         }
622         journal_header_decode(&rawheader, &j->header);
623
624         /*
625          * If there is an index, read the raw index into a dynamically
626          * allocated buffer and then convert it into a cooked index.
627          */
628         if (j->header.index_size != 0) {
629                 unsigned int i;
630                 unsigned int rawbytes;
631                 unsigned char *p;
632
633                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
634                 j->rawindex = isc_mem_get(mctx, rawbytes);
635                 if (j->rawindex == NULL)
636                         FAIL(ISC_R_NOMEMORY);
637
638                 CHECK(journal_read(j, j->rawindex, rawbytes));
639
640                 j->index = isc_mem_get(mctx, j->header.index_size *
641                                        sizeof(journal_pos_t));
642                 if (j->index == NULL)
643                         FAIL(ISC_R_NOMEMORY);
644
645                 p = j->rawindex;
646                 for (i = 0; i < j->header.index_size; i++) {
647                         j->index[i].serial = decode_uint32(p);
648                         p += 4;
649                         j->index[i].offset = decode_uint32(p);
650                         p += 4;
651                 }
652                 INSIST(p == j->rawindex + rawbytes);
653         }
654         j->offset = -1; /* Invalid, must seek explicitly. */
655
656         /*
657          * Initialize the iterator.
658          */
659         dns_name_init(&j->it.name, NULL);
660         dns_rdata_init(&j->it.rdata);
661
662         /*
663          * Set up empty initial buffers for unchecked and checked
664          * wire format RR data.  They will be reallocated
665          * later.
666          */
667         isc_buffer_init(&j->it.source, NULL, 0);
668         isc_buffer_init(&j->it.target, NULL, 0);
669         dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
670
671         j->state =
672                 write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
673
674         *journalp = j;
675         return (ISC_R_SUCCESS);
676
677  failure:
678         j->magic = 0;
679         if (j->index != NULL) {
680                 isc_mem_put(j->mctx, j->index, j->header.index_size *
681                             sizeof(journal_rawpos_t));
682                 j->index = NULL;
683         }
684         if (j->filename != NULL)
685                 isc_mem_free(j->mctx, j->filename);
686         if (j->fp != NULL)
687                 (void)isc_stdio_close(j->fp);
688         isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
689         return (result);
690 }
691
692 isc_result_t
693 dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
694                  dns_journal_t **journalp)
695 {
696         isc_result_t result;
697         size_t namelen;
698         char backup[1024];
699         isc_boolean_t write, create;
700
701         create = ISC_TF(mode & DNS_JOURNAL_CREATE);
702         write = ISC_TF(mode & (DNS_JOURNAL_WRITE|DNS_JOURNAL_CREATE));
703
704         result = journal_open(mctx, filename, write, create, journalp);
705         if (result == ISC_R_NOTFOUND) {
706                 namelen = strlen(filename);
707                 if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
708                         namelen -= 4;
709
710                 result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
711                                            (int)namelen, filename);
712                 if (result != ISC_R_SUCCESS)
713                         return (result);
714                 result = journal_open(mctx, backup, write, write, journalp);
715         }
716         return (result);
717 }
718
719 /*
720  * A comparison function defining the sorting order for
721  * entries in the IXFR-style journal file.
722  *
723  * The IXFR format requires that deletions are sorted before
724  * additions, and within either one, SOA records are sorted
725  * before others.
726  *
727  * Also sort the non-SOA records by type as a courtesy to the
728  * server receiving the IXFR - it may help reduce the amount of
729  * rdataset merging it has to do.
730  */
731 static int
732 ixfr_order(const void *av, const void *bv) {
733         dns_difftuple_t const * const *ap = av;
734         dns_difftuple_t const * const *bp = bv;
735         dns_difftuple_t const *a = *ap;
736         dns_difftuple_t const *b = *bp;
737         int r;
738         int bop = 0, aop = 0;
739
740         switch (a->op) {
741         case DNS_DIFFOP_DEL:
742         case DNS_DIFFOP_DELRESIGN:
743                 aop = 1;
744                 break;
745         case DNS_DIFFOP_ADD:
746         case DNS_DIFFOP_ADDRESIGN:
747                 aop = 0;
748                 break;
749         default:
750                 INSIST(0);
751         }
752
753         switch (b->op) {
754         case DNS_DIFFOP_DEL:
755         case DNS_DIFFOP_DELRESIGN:
756                 bop = 1;
757                 break;
758         case DNS_DIFFOP_ADD:
759         case DNS_DIFFOP_ADDRESIGN:
760                 bop = 0;
761                 break;
762         default:
763                 INSIST(0);
764         }
765
766         r = bop - aop;
767         if (r != 0)
768                 return (r);
769
770         r = (b->rdata.type == dns_rdatatype_soa) -
771                 (a->rdata.type == dns_rdatatype_soa);
772         if (r != 0)
773                 return (r);
774
775         r = (a->rdata.type - b->rdata.type);
776         return (r);
777 }
778
779 /*
780  * Advance '*pos' to the next journal transaction.
781  *
782  * Requires:
783  *      *pos refers to a valid journal transaction.
784  *
785  * Ensures:
786  *      When ISC_R_SUCCESS is returned,
787  *      *pos refers to the next journal transaction.
788  *
789  * Returns one of:
790  *
791  *    ISC_R_SUCCESS
792  *    ISC_R_NOMORE      *pos pointed at the last transaction
793  *    Other results due to file errors are possible.
794  */
795 static isc_result_t
796 journal_next(dns_journal_t *j, journal_pos_t *pos) {
797         isc_result_t result;
798         journal_xhdr_t xhdr;
799         REQUIRE(DNS_JOURNAL_VALID(j));
800
801         result = journal_seek(j, pos->offset);
802         if (result != ISC_R_SUCCESS)
803                 return (result);
804
805         if (pos->serial == j->header.end.serial)
806                 return (ISC_R_NOMORE);
807         /*
808          * Read the header of the current transaction.
809          * This will return ISC_R_NOMORE if we are at EOF.
810          */
811         result = journal_read_xhdr(j, &xhdr);
812         if (result != ISC_R_SUCCESS)
813                 return (result);
814
815         /*
816          * Check serial number consistency.
817          */
818         if (xhdr.serial0 != pos->serial) {
819                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
820                               "%s: journal file corrupt: "
821                               "expected serial %u, got %u",
822                               j->filename, pos->serial, xhdr.serial0);
823                 return (ISC_R_UNEXPECTED);
824         }
825
826         /*
827          * Check for offset wraparound.
828          */
829         if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
830             < pos->offset) {
831                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
832                               "%s: offset too large", j->filename);
833                 return (ISC_R_UNEXPECTED);
834         }
835
836         pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
837         pos->serial = xhdr.serial1;
838         return (ISC_R_SUCCESS);
839 }
840
841 /*
842  * If the index of the journal 'j' contains an entry "better"
843  * than '*best_guess', replace '*best_guess' with it.
844  *
845  * "Better" means having a serial number closer to 'serial'
846  * but not greater than 'serial'.
847  */
848 static void
849 index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
850         unsigned int i;
851         if (j->index == NULL)
852                 return;
853         for (i = 0; i < j->header.index_size; i++) {
854                 if (POS_VALID(j->index[i]) &&
855                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
856                     DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
857                         *best_guess = j->index[i];
858         }
859 }
860
861 /*
862  * Add a new index entry.  If there is no room, make room by removing
863  * the odd-numbered entries and compacting the others into the first
864  * half of the index.  This decimates old index entries exponentially
865  * over time, so that the index always contains a much larger fraction
866  * of recent serial numbers than of old ones.  This is deliberate -
867  * most index searches are for outgoing IXFR, and IXFR tends to request
868  * recent versions more often than old ones.
869  */
870 static void
871 index_add(dns_journal_t *j, journal_pos_t *pos) {
872         unsigned int i;
873         if (j->index == NULL)
874                 return;
875         /*
876          * Search for a vacant position.
877          */
878         for (i = 0; i < j->header.index_size; i++) {
879                 if (! POS_VALID(j->index[i]))
880                         break;
881         }
882         if (i == j->header.index_size) {
883                 unsigned int k = 0;
884                 /*
885                  * Found no vacant position.  Make some room.
886                  */
887                 for (i = 0; i < j->header.index_size; i += 2) {
888                         j->index[k++] = j->index[i];
889                 }
890                 i = k; /* 'i' identifies the first vacant position. */
891                 while (k < j->header.index_size) {
892                         POS_INVALIDATE(j->index[k]);
893                         k++;
894                 }
895         }
896         INSIST(i < j->header.index_size);
897         INSIST(! POS_VALID(j->index[i]));
898
899         /*
900          * Store the new index entry.
901          */
902         j->index[i] = *pos;
903 }
904
905 /*
906  * Invalidate any existing index entries that could become
907  * ambiguous when a new transaction with number 'serial' is added.
908  */
909 static void
910 index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
911         unsigned int i;
912         if (j->index == NULL)
913                 return;
914         for (i = 0; i < j->header.index_size; i++) {
915                 if (! DNS_SERIAL_GT(serial, j->index[i].serial))
916                         POS_INVALIDATE(j->index[i]);
917         }
918 }
919
920 /*
921  * Try to find a transaction with initial serial number 'serial'
922  * in the journal 'j'.
923  *
924  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
925  *
926  * If 'serial' is current (= the ending serial number of the
927  * last transaction in the journal), set '*pos' to
928  * the position immediately following the last transaction and
929  * return ISC_R_SUCCESS.
930  *
931  * If 'serial' is within the range of addressable serial numbers
932  * covered by the journal but that particular serial number is missing
933  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
934  *
935  * If 'serial' is outside the range of addressable serial numbers
936  * covered by the journal, return ISC_R_RANGE.
937  *
938  */
939 static isc_result_t
940 journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
941         isc_result_t result;
942         journal_pos_t current_pos;
943         REQUIRE(DNS_JOURNAL_VALID(j));
944
945         if (DNS_SERIAL_GT(j->header.begin.serial, serial))
946                 return (ISC_R_RANGE);
947         if (DNS_SERIAL_GT(serial, j->header.end.serial))
948                 return (ISC_R_RANGE);
949         if (serial == j->header.end.serial) {
950                 *pos = j->header.end;
951                 return (ISC_R_SUCCESS);
952         }
953
954         current_pos = j->header.begin;
955         index_find(j, serial, &current_pos);
956
957         while (current_pos.serial != serial) {
958                 if (DNS_SERIAL_GT(current_pos.serial, serial))
959                         return (ISC_R_NOTFOUND);
960                 result = journal_next(j, &current_pos);
961                 if (result != ISC_R_SUCCESS)
962                         return (result);
963         }
964         *pos = current_pos;
965         return (ISC_R_SUCCESS);
966 }
967
968 isc_result_t
969 dns_journal_begin_transaction(dns_journal_t *j) {
970         isc_uint32_t offset;
971         isc_result_t result;
972         journal_rawxhdr_t hdr;
973
974         REQUIRE(DNS_JOURNAL_VALID(j));
975         REQUIRE(j->state == JOURNAL_STATE_WRITE ||
976                 j->state == JOURNAL_STATE_INLINE);
977
978         /*
979          * Find the file offset where the new transaction should
980          * be written, and seek there.
981          */
982         if (JOURNAL_EMPTY(&j->header)) {
983                 offset = sizeof(journal_rawheader_t) +
984                         j->header.index_size * sizeof(journal_rawpos_t);
985         } else {
986                 offset = j->header.end.offset;
987         }
988         j->x.pos[0].offset = offset;
989         j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
990         j->x.n_soa = 0;
991
992         CHECK(journal_seek(j, offset));
993
994         /*
995          * Write a dummy transaction header of all zeroes to reserve
996          * space.  It will be filled in when the transaction is
997          * finished.
998          */
999         memset(&hdr, 0, sizeof(hdr));
1000         CHECK(journal_write(j, &hdr, sizeof(hdr)));
1001         j->x.pos[1].offset = j->offset;
1002
1003         j->state = JOURNAL_STATE_TRANSACTION;
1004         result = ISC_R_SUCCESS;
1005  failure:
1006         return (result);
1007 }
1008
1009 isc_result_t
1010 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
1011         dns_difftuple_t *t;
1012         isc_buffer_t buffer;
1013         void *mem = NULL;
1014         unsigned int size;
1015         isc_result_t result;
1016         isc_region_t used;
1017
1018         REQUIRE(DNS_DIFF_VALID(diff));
1019         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1020
1021         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
1022         (void)dns_diff_print(diff, NULL);
1023
1024         /*
1025          * Pass 1: determine the buffer size needed, and
1026          * keep track of SOA serial numbers.
1027          */
1028         size = 0;
1029         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1030              t = ISC_LIST_NEXT(t, link))
1031         {
1032                 if (t->rdata.type == dns_rdatatype_soa) {
1033                         if (j->x.n_soa < 2)
1034                                 j->x.pos[j->x.n_soa].serial =
1035                                         dns_soa_getserial(&t->rdata);
1036                         j->x.n_soa++;
1037                 }
1038                 size += sizeof(journal_rawrrhdr_t);
1039                 size += t->name.length; /* XXX should have access macro? */
1040                 size += 10;
1041                 size += t->rdata.length;
1042         }
1043
1044         mem = isc_mem_get(j->mctx, size);
1045         if (mem == NULL)
1046                 return (ISC_R_NOMEMORY);
1047
1048         isc_buffer_init(&buffer, mem, size);
1049
1050         /*
1051          * Pass 2.  Write RRs to buffer.
1052          */
1053         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1054              t = ISC_LIST_NEXT(t, link))
1055         {
1056                 /*
1057                  * Write the RR header.
1058                  */
1059                 isc_buffer_putuint32(&buffer, t->name.length + 10 +
1060                                      t->rdata.length);
1061                 /*
1062                  * Write the owner name, RR header, and RR data.
1063                  */
1064                 isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1065                 isc_buffer_putuint16(&buffer, t->rdata.type);
1066                 isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1067                 isc_buffer_putuint32(&buffer, t->ttl);
1068                 INSIST(t->rdata.length < 65536);
1069                 isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1070                 INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1071                 isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1072         }
1073
1074         isc_buffer_usedregion(&buffer, &used);
1075         INSIST(used.length == size);
1076
1077         j->x.pos[1].offset += used.length;
1078
1079         /*
1080          * Write the buffer contents to the journal file.
1081          */
1082         CHECK(journal_write(j, used.base, used.length));
1083
1084         result = ISC_R_SUCCESS;
1085
1086  failure:
1087         if (mem != NULL)
1088                 isc_mem_put(j->mctx, mem, size);
1089         return (result);
1090
1091 }
1092
/*
 * Commit the work pending on journal 'j'.
 *
 * In JOURNAL_STATE_INLINE only the journal header is rewritten and
 * synced.  In JOURNAL_STATE_TRANSACTION the SOA bookkeeping in j->x
 * is sanity checked, header.begin is stepped past transactions that
 * are no longer addressable, the transaction data is synced, the
 * reserved transaction header is filled in, and finally the journal
 * header and index are written and synced.
 *
 * On success the journal is left in JOURNAL_STATE_WRITE.
 *
 * Requires:
 *      'j' is a valid journal in JOURNAL_STATE_TRANSACTION or
 *      JOURNAL_STATE_INLINE.
 *
 * Returns:
 *      ISC_R_SUCCESS
 *      ISC_R_UNEXPECTED        a consistency check failed
 *      Other results from the seek/write/fsync/journal_next helpers
 *      (propagated through CHECK) are possible.
 */
isc_result_t
dns_journal_commit(dns_journal_t *j) {
        isc_result_t result;
        journal_rawheader_t rawheader;

        REQUIRE(DNS_JOURNAL_VALID(j));
        REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
                j->state == JOURNAL_STATE_INLINE);

        /*
         * Just write out an updated header.
         */
        if (j->state == JOURNAL_STATE_INLINE) {
                CHECK(journal_fsync(j));
                journal_header_encode(&j->header, &rawheader);
                CHECK(journal_seek(j, 0));
                CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
                CHECK(journal_fsync(j));
                j->state = JOURNAL_STATE_WRITE;
                return (ISC_R_SUCCESS);
        }

        /*
         * Perform some basic consistency checks.
         */
        /* A transaction must have seen exactly two SOA tuples. */
        if (j->x.n_soa != 2) {
                isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
                              "%s: malformed transaction: %d SOAs",
                              j->filename, j->x.n_soa);
                return (ISC_R_UNEXPECTED);
        }
        /*
         * The ending serial must be greater than the starting one;
         * equal serials are tolerated only when bind8_compat is set.
         */
        if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
               (bind8_compat &&
                j->x.pos[1].serial == j->x.pos[0].serial)))
        {
                isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
                              "%s: malformed transaction: serial number "
                              "would decrease", j->filename);
                return (ISC_R_UNEXPECTED);
        }
        /*
         * In a non-empty journal the new transaction must begin at
         * the serial the journal currently ends at.
         */
        if (! JOURNAL_EMPTY(&j->header)) {
                if (j->x.pos[0].serial != j->header.end.serial) {
                        isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
                                         "malformed transaction: "
                                         "%s last serial %u != "
                                         "transaction first serial %u",
                                         j->filename,
                                         j->header.end.serial,
                                         j->x.pos[0].serial);
                        return (ISC_R_UNEXPECTED);
                }
        }

        /*
         * Some old journal entries may become non-addressable
         * when we increment the current serial number.  Purge them
         * by stepping header.begin forward to the first addressable
         * transaction.  Also purge them from the index.
         */
        if (! JOURNAL_EMPTY(&j->header)) {
                while (! DNS_SERIAL_GT(j->x.pos[1].serial,
                                       j->header.begin.serial)) {
                        CHECK(journal_next(j, &j->header.begin));
                }
                index_invalidate(j, j->x.pos[1].serial);
        }
#ifdef notyet
        if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
                force_dump(...);
        }
#endif

        /*
         * Commit the transaction data to stable storage.
         */
        CHECK(journal_fsync(j));

        /*
         * The INLINE case returned above, so the state checked here
         * is JOURNAL_STATE_TRANSACTION.
         */
        if (j->state == JOURNAL_STATE_TRANSACTION) {
                isc_offset_t offset;
                /*
                 * Size of the transaction body: the distance between
                 * its two positions, less the transaction header.
                 */
                offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
                                 sizeof(journal_rawxhdr_t);
                /*
                 * Update the transaction header.
                 */
                CHECK(journal_seek(j, j->x.pos[0].offset));
                CHECK(journal_write_xhdr(j, offset, j->x.pos[0].serial,
                                         j->x.pos[1].serial));
        }

        /*
         * Update the journal header.
         */
        if (JOURNAL_EMPTY(&j->header))
                j->header.begin = j->x.pos[0];
        j->header.end = j->x.pos[1];
        journal_header_encode(&j->header, &rawheader);
        CHECK(journal_seek(j, 0));
        CHECK(journal_write(j, &rawheader, sizeof(rawheader)));

        /*
         * Update the index.
         */
        index_add(j, &j->x.pos[0]);

        /*
         * Convert the index into on-disk format and write
         * it to disk.
         */
        CHECK(index_to_disk(j));

        /*
         * Commit the header to stable storage.
         */
        CHECK(journal_fsync(j));

        /*
         * We no longer have a transaction open.
         */
        j->state = JOURNAL_STATE_WRITE;

        result = ISC_R_SUCCESS;

 failure:
        return (result);
}
1218
1219 isc_result_t
1220 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1221         isc_result_t result;
1222         CHECK(dns_diff_sort(diff, ixfr_order));
1223         CHECK(dns_journal_begin_transaction(j));
1224         CHECK(dns_journal_writediff(j, diff));
1225         CHECK(dns_journal_commit(j));
1226         result = ISC_R_SUCCESS;
1227  failure:
1228         return (result);
1229 }
1230
1231 void
1232 dns_journal_destroy(dns_journal_t **journalp) {
1233         dns_journal_t *j = *journalp;
1234         REQUIRE(DNS_JOURNAL_VALID(j));
1235
1236         j->it.result = ISC_R_FAILURE;
1237         dns_name_invalidate(&j->it.name);
1238         dns_decompress_invalidate(&j->it.dctx);
1239         if (j->rawindex != NULL)
1240                 isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1241                             sizeof(journal_rawpos_t));
1242         if (j->index != NULL)
1243                 isc_mem_put(j->mctx, j->index, j->header.index_size *
1244                             sizeof(journal_pos_t));
1245         if (j->it.target.base != NULL)
1246                 isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1247         if (j->it.source.base != NULL)
1248                 isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1249         if (j->filename != NULL)
1250                 isc_mem_free(j->mctx, j->filename);
1251         if (j->fp != NULL)
1252                 (void)isc_stdio_close(j->fp);
1253         j->magic = 0;
1254         isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
1255         *journalp = NULL;
1256 }
1257
1258 /*
1259  * Roll the open journal 'j' into the database 'db'.
1260  * A new database version will be created.
1261  */
1262
1263 /* XXX Share code with incoming IXFR? */
1264
1265 static isc_result_t
1266 roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
1267         isc_buffer_t source;            /* Transaction data from disk */
1268         isc_buffer_t target;            /* Ditto after _fromwire check */
1269         isc_uint32_t db_serial;         /* Database SOA serial */
1270         isc_uint32_t end_serial;        /* Last journal SOA serial */
1271         isc_result_t result;
1272         dns_dbversion_t *ver = NULL;
1273         journal_pos_t pos;
1274         dns_diff_t diff;
1275         unsigned int n_soa = 0;
1276         unsigned int n_put = 0;
1277         dns_diffop_t op;
1278
1279         REQUIRE(DNS_JOURNAL_VALID(j));
1280         REQUIRE(DNS_DB_VALID(db));
1281
1282         dns_diff_init(j->mctx, &diff);
1283
1284         /*
1285          * Set up empty initial buffers for unchecked and checked
1286          * wire format transaction data.  They will be reallocated
1287          * later.
1288          */
1289         isc_buffer_init(&source, NULL, 0);
1290         isc_buffer_init(&target, NULL, 0);
1291
1292         /*
1293          * Create the new database version.
1294          */
1295         CHECK(dns_db_newversion(db, &ver));
1296
1297         /*
1298          * Get the current database SOA serial number.
1299          */
1300         CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1301
1302         /*
1303          * Locate a journal entry for the current database serial.
1304          */
1305         CHECK(journal_find(j, db_serial, &pos));
1306         /*
1307          * XXX do more drastic things, like marking zone stale,
1308          * if this fails?
1309          */
1310         /*
1311          * XXXRTH  The zone code should probably mark the zone as bad and
1312          *         scream loudly into the log if this is a dynamic update
1313          *         log reply that failed.
1314          */
1315
1316         end_serial = dns_journal_last_serial(j);
1317         if (db_serial == end_serial)
1318                 CHECK(DNS_R_UPTODATE);
1319
1320         CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1321
1322         for (result = dns_journal_first_rr(j);
1323              result == ISC_R_SUCCESS;
1324              result = dns_journal_next_rr(j))
1325         {
1326                 dns_name_t *name;
1327                 isc_uint32_t ttl;
1328                 dns_rdata_t *rdata;
1329                 dns_difftuple_t *tuple = NULL;
1330
1331                 name = NULL;
1332                 rdata = NULL;
1333                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1334
1335                 if (rdata->type == dns_rdatatype_soa) {
1336                         n_soa++;
1337                         if (n_soa == 2)
1338                                 db_serial = j->it.current_serial;
1339                 }
1340
1341                 if (n_soa == 3)
1342                         n_soa = 1;
1343                 if (n_soa == 0) {
1344                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1345                                          "%s: journal file corrupt: missing "
1346                                          "initial SOA", j->filename);
1347                         FAIL(ISC_R_UNEXPECTED);
1348                 }
1349                 if ((options & DNS_JOURNALOPT_RESIGN) != 0)
1350                         op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN :
1351                                             DNS_DIFFOP_ADDRESIGN;
1352                 else
1353                         op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
1354
1355                 CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
1356                                            &tuple));
1357                 dns_diff_append(&diff, &tuple);
1358
1359                 if (++n_put > 100)  {
1360                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1361                                       "%s: applying diff to database (%u)",
1362                                       j->filename, db_serial);
1363                         (void)dns_diff_print(&diff, NULL);
1364                         CHECK(dns_diff_apply(&diff, db, ver));
1365                         dns_diff_clear(&diff);
1366                         n_put = 0;
1367                 }
1368         }
1369         if (result == ISC_R_NOMORE)
1370                 result = ISC_R_SUCCESS;
1371         CHECK(result);
1372
1373         if (n_put != 0) {
1374                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1375                               "%s: applying final diff to database (%u)",
1376                               j->filename, db_serial);
1377                 (void)dns_diff_print(&diff, NULL);
1378                 CHECK(dns_diff_apply(&diff, db, ver));
1379                 dns_diff_clear(&diff);
1380         }
1381
1382  failure:
1383         if (ver != NULL)
1384                 dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1385                                     ISC_TRUE : ISC_FALSE);
1386
1387         if (source.base != NULL)
1388                 isc_mem_put(j->mctx, source.base, source.length);
1389         if (target.base != NULL)
1390                 isc_mem_put(j->mctx, target.base, target.length);
1391
1392         dns_diff_clear(&diff);
1393
1394         return (result);
1395 }
1396
1397 isc_result_t
1398 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db,
1399                         unsigned int options, const char *filename)
1400 {
1401         REQUIRE((options & DNS_JOURNALOPT_RESIGN) == 0);
1402         return (dns_journal_rollforward2(mctx, db, options, 0, filename));
1403 }
1404
1405 isc_result_t
1406 dns_journal_rollforward2(isc_mem_t *mctx, dns_db_t *db, unsigned int options,
1407                          isc_uint32_t resign, const char *filename)
1408 {
1409         dns_journal_t *j;
1410         isc_result_t result;
1411
1412         REQUIRE(DNS_DB_VALID(db));
1413         REQUIRE(filename != NULL);
1414
1415         UNUSED(resign);
1416
1417         j = NULL;
1418         result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1419         if (result == ISC_R_NOTFOUND) {
1420                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1421                               "no journal file, but that's OK");
1422                 return (DNS_R_NOJOURNAL);
1423         }
1424         if (result != ISC_R_SUCCESS)
1425                 return (result);
1426         if (JOURNAL_EMPTY(&j->header))
1427                 result = DNS_R_UPTODATE;
1428         else
1429                 result = roll_forward(j, db, options);
1430
1431         dns_journal_destroy(&j);
1432
1433         return (result);
1434 }
1435
1436 isc_result_t
1437 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1438         dns_journal_t *j;
1439         isc_buffer_t source;            /* Transaction data from disk */
1440         isc_buffer_t target;            /* Ditto after _fromwire check */
1441         isc_uint32_t start_serial;              /* Database SOA serial */
1442         isc_uint32_t end_serial;        /* Last journal SOA serial */
1443         isc_result_t result;
1444         dns_diff_t diff;
1445         unsigned int n_soa = 0;
1446         unsigned int n_put = 0;
1447
1448         REQUIRE(filename != NULL);
1449
1450         j = NULL;
1451         result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1452         if (result == ISC_R_NOTFOUND) {
1453                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1454                 return (DNS_R_NOJOURNAL);
1455         }
1456
1457         if (result != ISC_R_SUCCESS) {
1458                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1459                               "journal open failure: %s: %s",
1460                               isc_result_totext(result), filename);
1461                 return (result);
1462         }
1463
1464         if (j->header.serialset)
1465                 fprintf(file, "Source serial = %u\n", j->header.sourceserial);
1466         dns_diff_init(j->mctx, &diff);
1467
1468         /*
1469          * Set up empty initial buffers for unchecked and checked
1470          * wire format transaction data.  They will be reallocated
1471          * later.
1472          */
1473         isc_buffer_init(&source, NULL, 0);
1474         isc_buffer_init(&target, NULL, 0);
1475
1476         start_serial = dns_journal_first_serial(j);
1477         end_serial = dns_journal_last_serial(j);
1478
1479         CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1480
1481         for (result = dns_journal_first_rr(j);
1482              result == ISC_R_SUCCESS;
1483              result = dns_journal_next_rr(j))
1484         {
1485                 dns_name_t *name;
1486                 isc_uint32_t ttl;
1487                 dns_rdata_t *rdata;
1488                 dns_difftuple_t *tuple = NULL;
1489
1490                 name = NULL;
1491                 rdata = NULL;
1492                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1493
1494                 if (rdata->type == dns_rdatatype_soa)
1495                         n_soa++;
1496
1497                 if (n_soa == 3)
1498                         n_soa = 1;
1499                 if (n_soa == 0) {
1500                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1501                                       "%s: journal file corrupt: missing "
1502                                       "initial SOA", j->filename);
1503                         FAIL(ISC_R_UNEXPECTED);
1504                 }
1505                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1506                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1507                                            name, ttl, rdata, &tuple));
1508                 dns_diff_append(&diff, &tuple);
1509
1510                 if (++n_put > 100)  {
1511                         result = dns_diff_print(&diff, file);
1512                         dns_diff_clear(&diff);
1513                         n_put = 0;
1514                         if (result != ISC_R_SUCCESS)
1515                                 break;
1516                 }
1517         }
1518         if (result == ISC_R_NOMORE)
1519                 result = ISC_R_SUCCESS;
1520         CHECK(result);
1521
1522         if (n_put != 0) {
1523                 result = dns_diff_print(&diff, file);
1524                 dns_diff_clear(&diff);
1525         }
1526         goto cleanup;
1527
1528  failure:
1529         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1530                       "%s: cannot print: journal file corrupt", j->filename);
1531
1532  cleanup:
1533         if (source.base != NULL)
1534                 isc_mem_put(j->mctx, source.base, source.length);
1535         if (target.base != NULL)
1536                 isc_mem_put(j->mctx, target.base, target.length);
1537
1538         dns_diff_clear(&diff);
1539         dns_journal_destroy(&j);
1540
1541         return (result);
1542 }
1543
1544 /**************************************************************************/
1545 /*
1546  * Miscellaneous accessors.
1547  */
1548 isc_uint32_t
1549 dns_journal_first_serial(dns_journal_t *j) {
1550         return (j->header.begin.serial);
1551 }
1552
1553 isc_uint32_t
1554 dns_journal_last_serial(dns_journal_t *j) {
1555         return (j->header.end.serial);
1556 }
1557
1558 void
1559 dns_journal_set_sourceserial(dns_journal_t *j, isc_uint32_t sourceserial) {
1560
1561         REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1562                 j->state == JOURNAL_STATE_INLINE ||
1563                 j->state == JOURNAL_STATE_TRANSACTION);
1564
1565         j->header.sourceserial = sourceserial;
1566         j->header.serialset = ISC_TRUE;
1567         if (j->state == JOURNAL_STATE_WRITE)
1568                 j->state = JOURNAL_STATE_INLINE;
1569 }
1570
1571 isc_boolean_t
1572 dns_journal_get_sourceserial(dns_journal_t *j, isc_uint32_t *sourceserial) {
1573         REQUIRE(sourceserial != NULL);
1574
1575         if (!j->header.serialset)
1576                 return (ISC_FALSE);
1577         *sourceserial = j->header.sourceserial;
1578         return (ISC_TRUE);
1579 }
1580
1581 /**************************************************************************/
1582 /*
1583  * Iteration support.
1584  *
1585  * When serving an outgoing IXFR, we transmit a part of the journal starting
1586  * at the serial number in the IXFR request and ending at the serial
1587  * number that is current when the IXFR request arrives.  The ending
1588  * serial number is not necessarily at the end of the journal:
1589  * the journal may grow while the IXFR is in progress, but we stop
1590  * when we reach the serial number that was current when the IXFR started.
1591  */
1592
1593 static isc_result_t read_one_rr(dns_journal_t *j);
1594
1595 /*
1596  * Make sure the buffer 'b' is has at least 'size' bytes
1597  * allocated, and clear it.
1598  *
1599  * Requires:
1600  *      Either b->base is NULL, or it points to b->length bytes of memory
1601  *      previously allocated by isc_mem_get().
1602  */
1603
1604 static isc_result_t
1605 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1606         if (b->length < size) {
1607                 void *mem = isc_mem_get(mctx, size);
1608                 if (mem == NULL)
1609                         return (ISC_R_NOMEMORY);
1610                 if (b->base != NULL)
1611                         isc_mem_put(mctx, b->base, b->length);
1612                 b->base = mem;
1613                 b->length = size;
1614         }
1615         isc_buffer_clear(b);
1616         return (ISC_R_SUCCESS);
1617 }
1618
1619 isc_result_t
1620 dns_journal_iter_init(dns_journal_t *j,
1621                       isc_uint32_t begin_serial, isc_uint32_t end_serial)
1622 {
1623         isc_result_t result;
1624
1625         CHECK(journal_find(j, begin_serial, &j->it.bpos));
1626         INSIST(j->it.bpos.serial == begin_serial);
1627
1628         CHECK(journal_find(j, end_serial, &j->it.epos));
1629         INSIST(j->it.epos.serial == end_serial);
1630
1631         result = ISC_R_SUCCESS;
1632  failure:
1633         j->it.result = result;
1634         return (j->it.result);
1635 }
1636
1637
1638 isc_result_t
1639 dns_journal_first_rr(dns_journal_t *j) {
1640         isc_result_t result;
1641
1642         /*
1643          * Seek to the beginning of the first transaction we are
1644          * interested in.
1645          */
1646         CHECK(journal_seek(j, j->it.bpos.offset));
1647         j->it.current_serial = j->it.bpos.serial;
1648
1649         j->it.xsize = 0;  /* We have no transaction data yet... */
1650         j->it.xpos = 0;   /* ...and haven't used any of it. */
1651
1652         return (read_one_rr(j));
1653
1654  failure:
1655         return (result);
1656 }
1657
1658 static isc_result_t
1659 read_one_rr(dns_journal_t *j) {
1660         isc_result_t result;
1661
1662         dns_rdatatype_t rdtype;
1663         dns_rdataclass_t rdclass;
1664         unsigned int rdlen;
1665         isc_uint32_t ttl;
1666         journal_xhdr_t xhdr;
1667         journal_rrhdr_t rrhdr;
1668
1669         INSIST(j->offset <= j->it.epos.offset);
1670         if (j->offset == j->it.epos.offset)
1671                 return (ISC_R_NOMORE);
1672         if (j->it.xpos == j->it.xsize) {
1673                 /*
1674                  * We are at a transaction boundary.
1675                  * Read another transaction header.
1676                  */
1677                 CHECK(journal_read_xhdr(j, &xhdr));
1678                 if (xhdr.size == 0) {
1679                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1680                                       "%s: journal corrupt: empty transaction",
1681                                       j->filename);
1682                         FAIL(ISC_R_UNEXPECTED);
1683                 }
1684                 if (xhdr.serial0 != j->it.current_serial) {
1685                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1686                                          "%s: journal file corrupt: "
1687                                          "expected serial %u, got %u",
1688                                          j->filename,
1689                                          j->it.current_serial, xhdr.serial0);
1690                         FAIL(ISC_R_UNEXPECTED);
1691                 }
1692                 j->it.xsize = xhdr.size;
1693                 j->it.xpos = 0;
1694         }
1695         /*
1696          * Read an RR.
1697          */
1698         CHECK(journal_read_rrhdr(j, &rrhdr));
1699         /*
1700          * Perform a sanity check on the journal RR size.
1701          * The smallest possible RR has a 1-byte owner name
1702          * and a 10-byte header.  The largest possible
1703          * RR has 65535 bytes of data, a header, and a maximum-
1704          * size owner name, well below 70 k total.
1705          */
1706         if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1707                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1708                                  "%s: journal corrupt: impossible RR size "
1709                                  "(%d bytes)", j->filename, rrhdr.size);
1710                 FAIL(ISC_R_UNEXPECTED);
1711         }
1712
1713         CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1714         CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1715         isc_buffer_add(&j->it.source, rrhdr.size);
1716
1717         /*
1718          * The target buffer is made the same size
1719          * as the source buffer, with the assumption that when
1720          * no compression in present, the output of dns_*_fromwire()
1721          * is no larger than the input.
1722          */
1723         CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1724
1725         /*
1726          * Parse the owner name.  We don't know where it
1727          * ends yet, so we make the entire "remaining"
1728          * part of the buffer "active".
1729          */
1730         isc_buffer_setactive(&j->it.source,
1731                              j->it.source.used - j->it.source.current);
1732         CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1733                                 &j->it.dctx, 0, &j->it.target));
1734
1735         /*
1736          * Check that the RR header is there, and parse it.
1737          */
1738         if (isc_buffer_remaininglength(&j->it.source) < 10)
1739                 FAIL(DNS_R_FORMERR);
1740
1741         rdtype = isc_buffer_getuint16(&j->it.source);
1742         rdclass = isc_buffer_getuint16(&j->it.source);
1743         ttl = isc_buffer_getuint32(&j->it.source);
1744         rdlen = isc_buffer_getuint16(&j->it.source);
1745
1746         /*
1747          * Parse the rdata.
1748          */
1749         if (isc_buffer_remaininglength(&j->it.source) != rdlen)
1750                 FAIL(DNS_R_FORMERR);
1751         isc_buffer_setactive(&j->it.source, rdlen);
1752         dns_rdata_reset(&j->it.rdata);
1753         CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1754                                  rdtype, &j->it.source, &j->it.dctx,
1755                                  0, &j->it.target));
1756         j->it.ttl = ttl;
1757
1758         j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1759         if (rdtype == dns_rdatatype_soa) {
1760                 /* XXX could do additional consistency checks here */
1761                 j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1762         }
1763
1764         result = ISC_R_SUCCESS;
1765
1766  failure:
1767         j->it.result = result;
1768         return (result);
1769 }
1770
1771 isc_result_t
1772 dns_journal_next_rr(dns_journal_t *j) {
1773         j->it.result = read_one_rr(j);
1774         return (j->it.result);
1775 }
1776
1777 void
1778 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1779                    dns_rdata_t **rdata)
1780 {
1781         REQUIRE(j->it.result == ISC_R_SUCCESS);
1782         *name = &j->it.name;
1783         *ttl = j->it.ttl;
1784         *rdata = &j->it.rdata;
1785 }
1786
1787 /**************************************************************************/
1788 /*
1789  * Generating diffs from databases
1790  */
1791
1792 /*
1793  * Construct a diff containing all the RRs at the current name of the
1794  * database iterator 'dbit' in database 'db', version 'ver'.
1795  * Set '*name' to the current name, and append the diff to 'diff'.
1796  * All new tuples will have the operation 'op'.
1797  *
1798  * Requires: 'name' must have buffer large enough to hold the name.
1799  * Typically, a dns_fixedname_t would be used.
1800  */
1801 static isc_result_t
1802 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1803               dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1804               dns_diff_t *diff)
1805 {
1806         isc_result_t result;
1807         dns_dbnode_t *node = NULL;
1808         dns_rdatasetiter_t *rdsiter = NULL;
1809         dns_difftuple_t *tuple = NULL;
1810
1811         result = dns_dbiterator_current(dbit, &node, name);
1812         if (result != ISC_R_SUCCESS)
1813                 return (result);
1814
1815         result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1816         if (result != ISC_R_SUCCESS)
1817                 goto cleanup_node;
1818
1819         for (result = dns_rdatasetiter_first(rdsiter);
1820              result == ISC_R_SUCCESS;
1821              result = dns_rdatasetiter_next(rdsiter))
1822         {
1823                 dns_rdataset_t rdataset;
1824
1825                 dns_rdataset_init(&rdataset);
1826                 dns_rdatasetiter_current(rdsiter, &rdataset);
1827
1828                 for (result = dns_rdataset_first(&rdataset);
1829                      result == ISC_R_SUCCESS;
1830                      result = dns_rdataset_next(&rdataset))
1831                 {
1832                         dns_rdata_t rdata = DNS_RDATA_INIT;
1833                         dns_rdataset_current(&rdataset, &rdata);
1834                         result = dns_difftuple_create(diff->mctx, op, name,
1835                                                       rdataset.ttl, &rdata,
1836                                                       &tuple);
1837                         if (result != ISC_R_SUCCESS) {
1838                                 dns_rdataset_disassociate(&rdataset);
1839                                 goto cleanup_iterator;
1840                         }
1841                         dns_diff_append(diff, &tuple);
1842                 }
1843                 dns_rdataset_disassociate(&rdataset);
1844                 if (result != ISC_R_NOMORE)
1845                         goto cleanup_iterator;
1846         }
1847         if (result != ISC_R_NOMORE)
1848                 goto cleanup_iterator;
1849
1850         result = ISC_R_SUCCESS;
1851
1852  cleanup_iterator:
1853         dns_rdatasetiter_destroy(&rdsiter);
1854
1855  cleanup_node:
1856         dns_db_detachnode(db, &node);
1857
1858         return (result);
1859 }
1860
1861 /*
1862  * Comparison function for use by dns_diff_subtract when sorting
1863  * the diffs to be subtracted.  The sort keys are the rdata type
1864  * and the rdata itself.  The owner name is ignored, because
1865  * it is known to be the same for all tuples.
1866  */
1867 static int
1868 rdata_order(const void *av, const void *bv) {
1869         dns_difftuple_t const * const *ap = av;
1870         dns_difftuple_t const * const *bp = bv;
1871         dns_difftuple_t const *a = *ap;
1872         dns_difftuple_t const *b = *bp;
1873         int r;
1874         r = (b->rdata.type - a->rdata.type);
1875         if (r != 0)
1876                 return (r);
1877         r = dns_rdata_compare(&a->rdata, &b->rdata);
1878         return (r);
1879 }
1880
1881 static isc_result_t
1882 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1883         isc_result_t result;
1884         dns_difftuple_t *p[2];
1885         int i, t;
1886         isc_boolean_t append;
1887
1888         CHECK(dns_diff_sort(&diff[0], rdata_order));
1889         CHECK(dns_diff_sort(&diff[1], rdata_order));
1890
1891         for (;;) {
1892                 p[0] = ISC_LIST_HEAD(diff[0].tuples);
1893                 p[1] = ISC_LIST_HEAD(diff[1].tuples);
1894                 if (p[0] == NULL && p[1] == NULL)
1895                         break;
1896
1897                 for (i = 0; i < 2; i++)
1898                         if (p[!i] == NULL) {
1899                                 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1900                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1901                                 goto next;
1902                         }
1903                 t = rdata_order(&p[0], &p[1]);
1904                 if (t < 0) {
1905                         ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1906                         ISC_LIST_APPEND(r->tuples, p[0], link);
1907                         goto next;
1908                 }
1909                 if (t > 0) {
1910                         ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1911                         ISC_LIST_APPEND(r->tuples, p[1], link);
1912                         goto next;
1913                 }
1914                 INSIST(t == 0);
1915                 /*
1916                  * Identical RRs in both databases; skip them both
1917                  * if the ttl differs.
1918                  */
1919                 append = ISC_TF(p[0]->ttl != p[1]->ttl);
1920                 for (i = 0; i < 2; i++) {
1921                         ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1922                         if (append) {
1923                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1924                         } else {
1925                                 dns_difftuple_free(&p[i]);
1926                         }
1927                 }
1928         next: ;
1929         }
1930         result = ISC_R_SUCCESS;
1931  failure:
1932         return (result);
1933 }
1934
1935 static isc_result_t
1936 diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera,
1937                dns_db_t *dbb, dns_dbversion_t *dbverb,
1938                unsigned int options, dns_diff_t *resultdiff)
1939 {
1940         dns_db_t *db[2];
1941         dns_dbversion_t *ver[2];
1942         dns_dbiterator_t *dbit[2] = { NULL, NULL };
1943         isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1944         dns_fixedname_t fixname[2];
1945         isc_result_t result, itresult[2];
1946         dns_diff_t diff[2];
1947         int i, t;
1948
1949         db[0] = dba, db[1] = dbb;
1950         ver[0] = dbvera, ver[1] = dbverb;
1951
1952         dns_diff_init(resultdiff->mctx, &diff[0]);
1953         dns_diff_init(resultdiff->mctx, &diff[1]);
1954
1955         dns_fixedname_init(&fixname[0]);
1956         dns_fixedname_init(&fixname[1]);
1957
1958         result = dns_db_createiterator(db[0], options, &dbit[0]);
1959         if (result != ISC_R_SUCCESS)
1960                 return (result);
1961         result = dns_db_createiterator(db[1], options, &dbit[1]);
1962         if (result != ISC_R_SUCCESS)
1963                 goto cleanup_iterator;
1964
1965         itresult[0] = dns_dbiterator_first(dbit[0]);
1966         itresult[1] = dns_dbiterator_first(dbit[1]);
1967
1968         for (;;) {
1969                 for (i = 0; i < 2; i++) {
1970                         if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1971                                 CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1972                                             dns_fixedname_name(&fixname[i]),
1973                                             i == 0 ?
1974                                             DNS_DIFFOP_ADD :
1975                                             DNS_DIFFOP_DEL,
1976                                             &diff[i]));
1977                                 itresult[i] = dns_dbiterator_next(dbit[i]);
1978                                 have[i] = ISC_TRUE;
1979                         }
1980                 }
1981
1982                 if (! have[0] && ! have[1]) {
1983                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1984                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1985                         break;
1986                 }
1987
1988                 for (i = 0; i < 2; i++) {
1989                         if (! have[!i]) {
1990                                 ISC_LIST_APPENDLIST(resultdiff->tuples,
1991                                                     diff[i].tuples, link);
1992                                 INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1993                                 have[i] = ISC_FALSE;
1994                                 goto next;
1995                         }
1996                 }
1997
1998                 t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1999                                      dns_fixedname_name(&fixname[1]));
2000                 if (t < 0) {
2001                         ISC_LIST_APPENDLIST(resultdiff->tuples,
2002                                             diff[0].tuples, link);
2003                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2004                         have[0] = ISC_FALSE;
2005                         continue;
2006                 }
2007                 if (t > 0) {
2008                         ISC_LIST_APPENDLIST(resultdiff->tuples,
2009                                             diff[1].tuples, link);
2010                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2011                         have[1] = ISC_FALSE;
2012                         continue;
2013                 }
2014                 INSIST(t == 0);
2015                 CHECK(dns_diff_subtract(diff, resultdiff));
2016                 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2017                 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2018                 have[0] = have[1] = ISC_FALSE;
2019         next: ;
2020         }
2021         if (itresult[0] != ISC_R_NOMORE)
2022                 FAIL(itresult[0]);
2023         if (itresult[1] != ISC_R_NOMORE)
2024                 FAIL(itresult[1]);
2025
2026         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2027         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2028
2029  failure:
2030         dns_dbiterator_destroy(&dbit[1]);
2031
2032  cleanup_iterator:
2033         dns_dbiterator_destroy(&dbit[0]);
2034         dns_diff_clear(&diff[0]);
2035         dns_diff_clear(&diff[1]);
2036         return (result);
2037 }
2038
2039 /*
2040  * Compare the databases 'dba' and 'dbb' and generate a journal
2041  * entry containing the changes to make 'dba' from 'dbb' (note
2042  * the order).  This journal entry will consist of a single,
2043  * possibly very large transaction.
2044  */
2045 isc_result_t
2046 dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
2047             dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
2048 {
2049         isc_result_t result;
2050         dns_diff_t diff;
2051
2052         dns_diff_init(mctx, &diff);
2053
2054         result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
2055
2056         dns_diff_clear(&diff);
2057
2058         return (result);
2059 }
2060
2061 isc_result_t
2062 dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
2063              dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename)
2064 {
2065         isc_result_t result;
2066         dns_journal_t *journal = NULL;
2067
2068         if (filename != NULL) {
2069                 result = dns_journal_open(diff->mctx, filename,
2070                                           DNS_JOURNAL_CREATE, &journal);
2071                 if (result != ISC_R_SUCCESS)
2072                         return (result);
2073         }
2074
2075         CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
2076         CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
2077
2078         if (journal != NULL) {
2079                 if (ISC_LIST_EMPTY(diff->tuples))
2080                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
2081                 else
2082                         CHECK(dns_journal_write_transaction(journal, diff));
2083         }
2084
2085  failure:
2086         if (journal != NULL)
2087                 dns_journal_destroy(&journal);
2088         return (result);
2089 }
2090
/*
 * Compact the journal file 'filename' by copying its tail into a new
 * ".jnw" file and renaming that file over the original.
 *
 * 'serial' must lie within the journal's begin..end serial range,
 * otherwise ISC_R_RANGE is returned.  'target_size' is the size the
 * journal is to be reduced towards; nothing is done (ISC_R_SUCCESS)
 * when the journal is empty or already ends below the adjusted target.
 *
 * If 'filename' cannot be found, the ".jbk" backup name is opened
 * instead.  Returns ISC_R_SUCCESS on success, ISC_R_NOMEMORY if the
 * copy buffer cannot be allocated, ISC_R_FAILURE if a rename fails,
 * or the error reported by the failing journal/file helper.
 */
2091 isc_result_t
2092 dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
2093                     isc_uint32_t target_size)
2094 {
2095         unsigned int i;
2096         journal_pos_t best_guess;
2097         journal_pos_t current_pos;
2098         dns_journal_t *j = NULL;
2099         dns_journal_t *new = NULL;
2100         journal_rawheader_t rawheader;
2101         unsigned int copy_length;
2102         size_t namelen;
2103         char *buf = NULL;
2104         unsigned int size = 0;
2105         isc_result_t result;
2106         unsigned int indexend;
2107         char newname[1024];
2108         char backup[1024];
2109         isc_boolean_t is_backup = ISC_FALSE;
2110
             /*
              * Derive the ".jnw" (new file) and ".jbk" (backup) names from
              * 'filename'; a trailing ".jnl" is dropped first so that the
              * new suffix replaces it.
              */
2111         namelen = strlen(filename);
2112         if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
2113                 namelen -= 4;
2114
2115         result = isc_string_printf(newname, sizeof(newname), "%.*s.jnw",
2116                                    (int)namelen, filename);
2117         if (result != ISC_R_SUCCESS)
2118                 return (result);
2119
2120         result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
2121                                    (int)namelen, filename);
2122         if (result != ISC_R_SUCCESS)
2123                 return (result);
2124
             /*
              * Open the journal; if it is not found, fall back to the
              * backup name and remember that the backup is in use.
              */
2125         result = journal_open(mctx, filename, ISC_FALSE, ISC_FALSE, &j);
2126         if (result == ISC_R_NOTFOUND) {
2127                 is_backup = ISC_TRUE;
2128                 result = journal_open(mctx, backup, ISC_FALSE, ISC_FALSE, &j);
2129         }
2130         if (result != ISC_R_SUCCESS)
2131                 return (result);
2132
             /* An empty journal needs no compaction. */
2133         if (JOURNAL_EMPTY(&j->header)) {
2134                 dns_journal_destroy(&j);
2135                 return (ISC_R_SUCCESS);
2136         }
2137
             /* 'serial' has to fall between the begin and end serials. */
2138         if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
2139             DNS_SERIAL_GT(serial, j->header.end.serial)) {
2140                 dns_journal_destroy(&j);
2141                 return (ISC_R_RANGE);
2142         }
2143
2144         /*
2145          * Cope with very small target sizes.
2146          */
2147         indexend = sizeof(journal_rawheader_t) +
2148                    j->header.index_size * sizeof(journal_rawpos_t);
2149         if (target_size < indexend * 2)
2150                 target_size = target_size/2 + indexend;
2151
2152         /*
2153          * See if there is any work to do.
2154          */
2155         if ((isc_uint32_t) j->header.end.offset < target_size) {
2156                 dns_journal_destroy(&j);
2157                 return (ISC_R_SUCCESS);
2158         }
2159
             /*
              * From here on every error path goes through 'failure', which
              * also removes the new file again.
              */
2160         CHECK(journal_open(mctx, newname, ISC_TRUE, ISC_TRUE, &new));
2161
2162         /*
2163          * Remove overhead so space test below can succeed.
2164          */
2165         if (target_size >= indexend)
2166                 target_size -= indexend;
2167
2168         /*
2169          * Find if we can create enough free space.
2170          */
             /*
              * Start from the journal's first transaction and prefer the
              * valid index entry with the greatest offset that satisfies
              * DNS_SERIAL_GE(serial, entry serial) and still leaves at
              * least target_size / 2 bytes between it and the journal end.
              */
2171         best_guess = j->header.begin;
2172         for (i = 0; i < j->header.index_size; i++) {
2173                 if (POS_VALID(j->index[i]) &&
2174                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
2175                     ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
2176                      >= target_size / 2) &&
2177                     j->index[i].offset > best_guess.offset)
2178                         best_guess = j->index[i];
2179         }
2180
             /*
              * Refine the guess by stepping through the journal with
              * journal_next() for as long as the same criteria hold.
              */
2181         current_pos = best_guess;
2182         while (current_pos.serial != serial) {
2183                 CHECK(journal_next(j, &current_pos));
2184                 if (current_pos.serial == j->header.end.serial)
2185                         break;
2186
2187                 if (DNS_SERIAL_GE(serial, current_pos.serial) &&
2188                    ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
2189                      >= (target_size / 2)) &&
2190                     current_pos.offset > best_guess.offset)
2191                         best_guess = current_pos;
2192                 else
2193                         break;
2194         }
2195
2196         INSIST(best_guess.serial != j->header.end.serial);
2197         if (best_guess.serial != serial)
2198                 CHECK(journal_next(j, &best_guess));
2199
2200         /*
2201          * We should now be roughly half target_size provided
2202          * we did not reach 'serial'.  If not we will just copy
2203          * all uncommitted deltas regardless of the size.
2204          */
2205         copy_length = j->header.end.offset - best_guess.offset;
2206
2207         if (copy_length != 0) {
2208                 /*
2209                  * Copy best_guess to end into space just freed.
2210                  */
                     /* The copy goes through a buffer of at most 64 KiB. */
2211                 size = 64*1024;
2212                 if (copy_length < size)
2213                         size = copy_length;
2214                 buf = isc_mem_get(mctx, size);
2215                 if (buf == NULL) {
2216                         result = ISC_R_NOMEMORY;
2217                         goto failure;
2218                 }
2219
2220                 CHECK(journal_seek(j, best_guess.offset));
2221                 CHECK(journal_seek(new, indexend));
2222                 for (i = 0; i < copy_length; i += size) {
2223                         unsigned int len = (copy_length - i) > size ? size :
2224                                                          (copy_length - i);
2225                         CHECK(journal_read(j, buf, len));
2226                         CHECK(journal_write(new, buf, len));
2227                 }
2228
2229                 CHECK(journal_fsync(new));
2230
2231                 /*
2232                  * Compute new header.
2233                  */
2234                 new->header.begin.serial = best_guess.serial;
2235                 new->header.begin.offset = indexend;
2236                 new->header.end.serial = j->header.end.serial;
2237                 new->header.end.offset = indexend + copy_length;
2238                 new->header.sourceserial = j->header.sourceserial;
2239                 new->header.serialset = j->header.serialset;
2240
2241                 /*
2242                  * Update the journal header.
2243                  */
2244                 journal_header_encode(&new->header, &rawheader);
2245                 CHECK(journal_seek(new, 0));
2246                 CHECK(journal_write(new, &rawheader, sizeof(rawheader)));
2247                 CHECK(journal_fsync(new));
2248
2249                 /*
2250                  * Build new index.
2251                  */
2252                 current_pos = new->header.begin;
2253                 while (current_pos.serial != new->header.end.serial) {
2254                         index_add(new, &current_pos);
2255                         CHECK(journal_next(new, &current_pos));
2256                 }
2257
2258                 /*
2259                  * Write index.
2260                  */
2261                 CHECK(index_to_disk(new));
2262                 CHECK(journal_fsync(new));
2263
                     /* 'indexend' is not read again after this assignment. */
2264                 indexend = new->header.end.offset;
2265                 POST(indexend);
2266         }
2267
2268         /*
2269          * Close both journals before trying to rename files (this is
2270          * necessary on WIN32).
2271          */
2272         dns_journal_destroy(&j);
2273         dns_journal_destroy(&new);
2274
2275         /*
2276          * With a UFS file system this should just succeed and be atomic.
2277          * Any IXFR outs will just continue and the old journal will be
2278          * removed on final close.
2279          *
2280          * With MSDOS / NTFS we need to do a two stage rename, triggered
2281          * by EEXIST.  (If any IXFR's are running in other threads, however,
2282          * this will fail, and the journal will not be compacted.  But
2283          * if so, hopefully they'll be finished by the next time we
2284          * compact.)
2285          */
2286         if (rename(newname, filename) == -1) {
2287                 if (errno == EEXIST && !is_backup) {
2288                         result = isc_file_remove(backup);
2289                         if (result != ISC_R_SUCCESS &&
2290                             result != ISC_R_FILENOTFOUND)
2291                                 goto failure;
2292                         if (rename(filename, backup) == -1)
2293                                 goto maperrno;
2294                         if (rename(newname, filename) == -1)
2295                                 goto maperrno;
2296                         (void)isc_file_remove(backup);
2297                 } else {
                             /*
                              * Despite the label name, errno is not examined
                              * here: every rename failure is reported as
                              * ISC_R_FAILURE.
                              */
2298  maperrno:
2299                         result = ISC_R_FAILURE;
2300                         goto failure;
2301                 }
2302         }
2303
2304         result = ISC_R_SUCCESS;
2305
             /*
              * Common exit, also reached on success: remove the new file if
              * it is still there, release the copy buffer and close any
              * journal that is still open.
              */
2306  failure:
2307         (void)isc_file_remove(newname);
2308         if (buf != NULL)
2309                 isc_mem_put(mctx, buf, size);
2310         if (j != NULL)
2311                 dns_journal_destroy(&j);
2312         if (new != NULL)
2313                 dns_journal_destroy(&new);
2314         return (result);
2315 }
2316
2317 static isc_result_t
2318 index_to_disk(dns_journal_t *j) {
2319         isc_result_t result = ISC_R_SUCCESS;
2320
2321         if (j->header.index_size != 0) {
2322                 unsigned int i;
2323                 unsigned char *p;
2324                 unsigned int rawbytes;
2325
2326                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2327
2328                 p = j->rawindex;
2329                 for (i = 0; i < j->header.index_size; i++) {
2330                         encode_uint32(j->index[i].serial, p);
2331                         p += 4;
2332                         encode_uint32(j->index[i].offset, p);
2333                         p += 4;
2334                 }
2335                 INSIST(p == j->rawindex + rawbytes);
2336
2337                 CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2338                 CHECK(journal_write(j, j->rawindex, rawbytes));
2339         }
2340 failure:
2341         return (result);
2342 }