]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/bind9/lib/dns/journal.c
This commit was generated by cvs2svn to compensate for changes in r171164,
[FreeBSD/FreeBSD.git] / contrib / bind9 / lib / dns / journal.c
1 /*
2  * Copyright (C) 2004, 2005  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2002  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: journal.c,v 1.86.18.8 2005/11/03 23:02:23 marka Exp $ */
19
20 #include <config.h>
21
22 #include <stdlib.h>
23 #include <unistd.h>
24
25 #include <isc/file.h>
26 #include <isc/mem.h>
27 #include <isc/stdio.h>
28 #include <isc/string.h>
29 #include <isc/util.h>
30
31 #include <dns/compress.h>
32 #include <dns/db.h>
33 #include <dns/dbiterator.h>
34 #include <dns/diff.h>
35 #include <dns/fixedname.h>
36 #include <dns/journal.h>
37 #include <dns/log.h>
38 #include <dns/rdataset.h>
39 #include <dns/rdatasetiter.h>
40 #include <dns/result.h>
41 #include <dns/soa.h>
42
43 /*! \file 
44  * \brief Journalling.
45  *
46  * A journal file consists of
47  *
48  *   \li A fixed-size header of type journal_rawheader_t.
49  *
50  *   \li The index.  This is an unordered array of index entries
51  *     of type journal_rawpos_t giving the locations
52  *     of some arbitrary subset of the journal's addressable
53  *     transactions.  The index entries are used as hints to
54  *     speed up the process of locating a transaction with a given
55  *     serial number.  Unused index entries have an "offset"
56  *     field of zero.  The size of the index can vary between
57  *     journal files, but does not change during the lifetime
58  *     of a file.  The size can be zero.
59  *
60  *   \li The journal data.  This  consists of one or more transactions.
61  *     Each transaction begins with a transaction header of type
62  *     journal_rawxhdr_t.  The transaction header is followed by a
63  *     sequence of RRs, similar in structure to an IXFR difference
64  *     sequence (RFC1995).  That is, the pre-transaction SOA,
65  *     zero or more other deleted RRs, the post-transaction SOA,
66  *     and zero or more other added RRs.  Unlike in IXFR, each RR
67  *     is prefixed with a 32-bit length.
68  *
69  *     The journal data part grows as new transactions are
70  *     appended to the file.  Only those transactions
71  *     whose serial number is current-(2^31-1) to current
72  *     are considered "addressable" and may be pointed
73  *     to from the header or index.  They may be preceded
74  *     by old transactions that are no longer addressable,
75  *     and they may be followed by transactions that were
76  *     appended to the journal but never committed by updating
77  *     the "end" position in the header.  The latter will
78  *     be overwritten when new transactions are added.
79  */
80 /*%
81  * When true, accept IXFR difference sequences where the
82  * SOA serial number does not change (BIND 8 sends such
83  * sequences).
84  */
85 static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
86
87 /**************************************************************************/
88 /*
89  * Miscellaneous utilities.
90  */
91
/*% Leading isc_log_write() arguments shared by every message in this file. */
#define JOURNAL_COMMON_LOGARGS			\
	dns_lctx,				\
	DNS_LOGCATEGORY_GENERAL,		\
	DNS_LOGMODULE_JOURNAL

/*% The common arguments followed by debug level 'n'. */
#define JOURNAL_DEBUG_LOGARGS(n)		\
	JOURNAL_COMMON_LOGARGS,			\
	ISC_LOG_DEBUG(n)
97
98 /*%
99  * It would be non-sensical (or at least obtuse) to use FAIL() with an
100  * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
101  * from complaining about "end-of-loop code not reached".
102  */
#define FAIL(code)						\
	do {							\
		result = (code);				\
		if (result != ISC_R_SUCCESS)			\
			goto failure;				\
	} while (0)

/*%
 * Evaluate 'op', store its value in the local variable 'result' and
 * jump to the local label 'failure' unless it is ISC_R_SUCCESS.
 */
#define CHECK(op)						\
	do {							\
		result = (op);					\
		if (result != ISC_R_SUCCESS)			\
			goto failure;				\
	} while (0)
112
113 static isc_result_t index_to_disk(dns_journal_t *);
114
115 static inline isc_uint32_t
116 decode_uint32(unsigned char *p) {
117         return ((p[0] << 24) +
118                 (p[1] << 16) +
119                 (p[2] <<  8) +
120                 (p[3] <<  0));
121 }
122
123 static inline void
124 encode_uint32(isc_uint32_t val, unsigned char *p) {
125         p[0] = (isc_uint8_t)(val >> 24);
126         p[1] = (isc_uint8_t)(val >> 16);
127         p[2] = (isc_uint8_t)(val >>  8);
128         p[3] = (isc_uint8_t)(val >>  0);
129 }
130
131 isc_result_t
132 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
133                       dns_diffop_t op, dns_difftuple_t **tp)
134 {
135         isc_result_t result;
136         dns_dbnode_t *node;
137         dns_rdataset_t rdataset;
138         dns_rdata_t rdata = DNS_RDATA_INIT;
139         dns_name_t *zonename;
140
141         zonename = dns_db_origin(db);
142
143         node = NULL;
144         result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
145         if (result != ISC_R_SUCCESS)
146                 goto nonode;
147
148         dns_rdataset_init(&rdataset);
149         result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
150                                      (isc_stdtime_t)0, &rdataset, NULL);
151         if (result != ISC_R_SUCCESS)
152                 goto freenode;
153
154         result = dns_rdataset_first(&rdataset);
155         if (result != ISC_R_SUCCESS)
156                 goto freenode;
157
158         dns_rdataset_current(&rdataset, &rdata);
159
160         result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
161                                       &rdata, tp);
162
163         dns_rdataset_disassociate(&rdataset);
164         dns_db_detachnode(db, &node);
165         return (ISC_R_SUCCESS);
166
167  freenode:
168         dns_db_detachnode(db, &node);
169  nonode:
170         UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
171         return (result);
172 }
173
174 /* Journalling */
175
/*%
 * A "pointer" to a journal entry as it is laid out on disk.  The
 * journal header holds two of these to mark the beginning and end of
 * the journal, and the journal index uses them to locate other
 * transactions.
 */
typedef struct {
	/*% SOA serial before update. */
	unsigned char	serial[4];
	/*%
	 * Offset from beginning of file.
	 *
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];
} journal_rawpos_t;
193
194
/*%
 * Size in bytes of the on-disk journal header.  The size is fixed and
 * leaves some spare room for future extensions.
 */
#define JOURNAL_HEADER_SIZE	64
200
201 /*%
202  * The on-disk representation of the journal header.
203  * All numbers are stored in big-endian order.
204  */
205 typedef union {
206         struct {
207                 /*% File format version ID. */
208                 unsigned char           format[16];
209                 /*% Position of the first addressable transaction */
210                 journal_rawpos_t        begin;
211                 /*% Position of the next (yet nonexistent) transaction. */
212                 journal_rawpos_t        end;
213                 /*% Number of index entries following the header. */
214                 unsigned char           index_size[4];
215         } h;
216         /* Pad the header to a fixed size. */
217         unsigned char pad[JOURNAL_HEADER_SIZE];
218 } journal_rawheader_t;
219
/*%
 * The transaction header as it is laid out on disk.  Every transaction
 * in the journal starts with one of these.
 */
typedef struct {
	/*% In bytes, excluding header. */
	unsigned char	size[4];
	/*% SOA serial before update. */
	unsigned char	serial0[4];
	/*% SOA serial after update. */
	unsigned char	serial1[4];
} journal_rawxhdr_t;
229
/*%
 * The RR header as it is laid out on disk.  Every RR in the journal
 * starts with one of these.
 */
typedef struct {
	/*% In bytes, excluding header. */
	unsigned char	size[4];
} journal_rawrrhdr_t;
237
238 /*%
239  * The in-core representation of the journal header.
240  */
241 typedef struct {
242         isc_uint32_t    serial;
243         isc_offset_t    offset;
244 } journal_pos_t;
245
/*% A position is valid when its offset is nonzero. */
#define POS_VALID(pos) \
	((pos).offset != 0)
/*% Mark a position as unused by zeroing its offset and serial. */
#define POS_INVALIDATE(pos) \
	((pos).offset = 0, (pos).serial = 0)
248
249 typedef struct {
250         unsigned char   format[16];
251         journal_pos_t   begin;
252         journal_pos_t   end;
253         isc_uint32_t    index_size;
254 } journal_header_t;
255
256 /*%
257  * The in-core representation of the transaction header.
258  */
259
260 typedef struct {
261         isc_uint32_t    size;
262         isc_uint32_t    serial0;
263         isc_uint32_t    serial1;
264 } journal_xhdr_t;
265
266 /*%
267  * The in-core representation of the RR header.
268  */
269 typedef struct {
270         isc_uint32_t    size;
271 } journal_rrhdr_t;
272
273
274 /*%
275  * Initial contents to store in the header of a newly created
276  * journal file.
277  *
278  * The header starts with the magic string ";BIND LOG V9\n"
279  * to identify the file as a BIND 9 journal file.  An ASCII
280  * identification string is used rather than a binary magic
281  * number to be consistent with BIND 8 (BIND 8 journal files
282  * are ASCII text files).
283  */
284
285 static journal_header_t
286 initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };
287
/*% True when the header's begin and end offsets are equal. */
#define JOURNAL_EMPTY(h) \
	((h)->begin.offset == (h)->end.offset)
289
/*% States recorded in the 'state' member of a dns_journal. */
typedef enum {
	JOURNAL_STATE_INVALID = 0,
	JOURNAL_STATE_READ = 1,
	JOURNAL_STATE_WRITE = 2,
	JOURNAL_STATE_TRANSACTION = 3
} journal_state_t;
296
297 struct dns_journal {
298         unsigned int            magic;          /*%< JOUR */
299         isc_mem_t               *mctx;          /*%< Memory context */
300         journal_state_t         state;
301         const char              *filename;      /*%< Journal file name */
302         FILE *                  fp;             /*%< File handle */
303         isc_offset_t            offset;         /*%< Current file offset */
304         journal_header_t        header;         /*%< In-core journal header */
305         unsigned char           *rawindex;      /*%< In-core buffer for journal index in on-disk format */
306         journal_pos_t           *index;         /*%< In-core journal index */
307
308         /*% Current transaction state (when writing). */
309         struct {
310                 unsigned int    n_soa;          /*%< Number of SOAs seen */
311                 journal_pos_t   pos[2];         /*%< Begin/end position */
312         } x;
313
314         /*% Iteration state (when reading). */
315         struct {
316                 /* These define the part of the journal we iterate over. */
317                 journal_pos_t bpos;             /*%< Position before first, */
318                 journal_pos_t epos;             /*%< and after last transaction */
319                 /* The rest is iterator state. */
320                 isc_uint32_t current_serial;    /*%< Current SOA serial */
321                 isc_buffer_t source;            /*%< Data from disk */
322                 isc_buffer_t target;            /*%< Data from _fromwire check */
323                 dns_decompress_t dctx;          /*%< Dummy decompression ctx */
324                 dns_name_t name;                /*%< Current domain name */
325                 dns_rdata_t rdata;              /*%< Current rdata */
326                 isc_uint32_t ttl;               /*%< Current TTL */
327                 unsigned int xsize;             /*%< Size of transaction data */
328                 unsigned int xpos;              /*%< Current position in it */
329                 isc_result_t result;            /*%< Result of last call */
330         } it;
331 };
332
/*% Magic number ('JOUR') stored in the 'magic' member of a dns_journal. */
#define DNS_JOURNAL_MAGIC \
	ISC_MAGIC('J', 'O', 'U', 'R')
/*% Check that 't' carries the journal magic number. */
#define DNS_JOURNAL_VALID(t) \
	ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
335
336 static void
337 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
338         cooked->serial = decode_uint32(raw->serial);
339         cooked->offset = decode_uint32(raw->offset);
340 }
341
342 static void
343 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
344         encode_uint32(cooked->serial, raw->serial);
345         encode_uint32(cooked->offset, raw->offset);
346 }
347
348 static void
349 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
350         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
351         memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
352         journal_pos_decode(&raw->h.begin, &cooked->begin);
353         journal_pos_decode(&raw->h.end, &cooked->end);
354         cooked->index_size = decode_uint32(raw->h.index_size);
355 }
356
357 static void
358 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
359         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
360         memset(raw->pad, 0, sizeof(raw->pad));
361         memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
362         journal_pos_encode(&raw->h.begin, &cooked->begin);
363         journal_pos_encode(&raw->h.end, &cooked->end);
364         encode_uint32(cooked->index_size, raw->h.index_size);
365 }
366
367 /*
368  * Journal file I/O subroutines, with error checking and reporting.
369  */
370 static isc_result_t
371 journal_seek(dns_journal_t *j, isc_uint32_t offset) {
372         isc_result_t result;
373         result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
374         if (result != ISC_R_SUCCESS) {
375                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
376                               "%s: seek: %s", j->filename,
377                               isc_result_totext(result));
378                 return (ISC_R_UNEXPECTED);
379         }
380         j->offset = offset;
381         return (ISC_R_SUCCESS);
382 }
383
384 static isc_result_t
385 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
386         isc_result_t result;
387
388         result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
389         if (result != ISC_R_SUCCESS) {
390                 if (result == ISC_R_EOF)
391                         return (ISC_R_NOMORE);
392                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
393                               "%s: read: %s",
394                               j->filename, isc_result_totext(result));
395                 return (ISC_R_UNEXPECTED);
396         }
397         j->offset += nbytes;
398         return (ISC_R_SUCCESS);
399 }
400
401 static isc_result_t
402 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
403         isc_result_t result;
404
405         result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
406         if (result != ISC_R_SUCCESS) {
407                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
408                               "%s: write: %s",
409                               j->filename, isc_result_totext(result));
410                 return (ISC_R_UNEXPECTED);
411         }
412         j->offset += nbytes;
413         return (ISC_R_SUCCESS);
414 }
415
416 static isc_result_t
417 journal_fsync(dns_journal_t *j) {
418         isc_result_t result;
419         result = isc_stdio_flush(j->fp);
420         if (result != ISC_R_SUCCESS) {
421                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
422                               "%s: flush: %s",
423                               j->filename, isc_result_totext(result));
424                 return (ISC_R_UNEXPECTED);
425         }
426         result = isc_stdio_sync(j->fp);
427         if (result != ISC_R_SUCCESS) {
428                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
429                               "%s: fsync: %s",
430                               j->filename, isc_result_totext(result));
431                 return (ISC_R_UNEXPECTED);
432         }
433         return (ISC_R_SUCCESS);
434 }
435
436 /*
437  * Read/write a transaction header at the current file position.
438  */
439
440 static isc_result_t
441 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
442         journal_rawxhdr_t raw;
443         isc_result_t result;
444         result = journal_read(j, &raw, sizeof(raw));
445         if (result != ISC_R_SUCCESS)
446                 return (result);
447         xhdr->size = decode_uint32(raw.size);
448         xhdr->serial0 = decode_uint32(raw.serial0);
449         xhdr->serial1 = decode_uint32(raw.serial1);
450         return (ISC_R_SUCCESS);
451 }
452
453 static isc_result_t
454 journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
455                    isc_uint32_t serial0, isc_uint32_t serial1)
456 {
457         journal_rawxhdr_t raw;
458         encode_uint32(size, raw.size);
459         encode_uint32(serial0, raw.serial0);
460         encode_uint32(serial1, raw.serial1);
461         return (journal_write(j, &raw, sizeof(raw)));
462 }
463
464
465 /*
466  * Read an RR header at the current file position.
467  */
468
469 static isc_result_t
470 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
471         journal_rawrrhdr_t raw;
472         isc_result_t result;
473         result = journal_read(j, &raw, sizeof(raw));
474         if (result != ISC_R_SUCCESS)
475                 return (result);
476         rrhdr->size = decode_uint32(raw.size);
477         return (ISC_R_SUCCESS);
478 }
479
480 static isc_result_t
481 journal_file_create(isc_mem_t *mctx, const char *filename) {
482         FILE *fp = NULL;
483         isc_result_t result;
484         journal_header_t header;
485         journal_rawheader_t rawheader;
486         int index_size = 56; /* XXX configurable */
487         int size;
488         void *mem; /* Memory for temporary index image. */
489
490         INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
491
492         result = isc_stdio_open(filename, "wb", &fp);
493         if (result != ISC_R_SUCCESS) {
494                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
495                               "%s: create: %s",
496                               filename, isc_result_totext(result));
497                 return (ISC_R_UNEXPECTED);
498         }
499
500         header = initial_journal_header;
501         header.index_size = index_size;
502         journal_header_encode(&header, &rawheader);
503
504         size = sizeof(journal_rawheader_t) +
505                 index_size * sizeof(journal_rawpos_t);
506
507         mem = isc_mem_get(mctx, size);
508         if (mem == NULL) {
509                 (void)isc_stdio_close(fp);
510                 (void)isc_file_remove(filename);
511                 return (ISC_R_NOMEMORY);
512         }
513         memset(mem, 0, size);
514         memcpy(mem, &rawheader, sizeof(rawheader));
515
516         result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
517         if (result != ISC_R_SUCCESS) {
518                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
519                                  "%s: write: %s",
520                                  filename, isc_result_totext(result));
521                 (void)isc_stdio_close(fp);
522                 (void)isc_file_remove(filename);
523                 isc_mem_put(mctx, mem, size);
524                 return (ISC_R_UNEXPECTED);
525         }
526         isc_mem_put(mctx, mem, size);
527
528         result = isc_stdio_close(fp);
529         if (result != ISC_R_SUCCESS) {
530                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
531                                  "%s: close: %s",
532                                  filename, isc_result_totext(result));
533                 (void)isc_file_remove(filename);
534                 return (ISC_R_UNEXPECTED);
535         }
536
537         return (ISC_R_SUCCESS);
538 }
539
540 static isc_result_t
541 journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
542              isc_boolean_t create, dns_journal_t **journalp) {
543         FILE *fp = NULL;
544         isc_result_t result;
545         journal_rawheader_t rawheader;
546         dns_journal_t *j;
547
548         INSIST(journalp != NULL && *journalp == NULL);
549         j = isc_mem_get(mctx, sizeof(*j));
550         if (j == NULL)
551                 return (ISC_R_NOMEMORY);
552
553         j->mctx = mctx;
554         j->state = JOURNAL_STATE_INVALID;
555         j->fp = NULL;
556         j->filename = filename;
557         j->index = NULL;
558         j->rawindex = NULL;
559
560         result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
561
562         if (result == ISC_R_FILENOTFOUND) {
563                 if (create) {
564                         isc_log_write(JOURNAL_COMMON_LOGARGS,
565                                       ISC_LOG_INFO,
566                                       "journal file %s does not exist, "
567                                       "creating it",
568                                       j->filename);
569                         CHECK(journal_file_create(mctx, filename));
570                         /*
571                          * Retry.
572                          */
573                         result = isc_stdio_open(j->filename, "rb+", &fp);
574                 } else {
575                         FAIL(ISC_R_NOTFOUND);
576                 }
577         }
578         if (result != ISC_R_SUCCESS) {
579                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
580                               "%s: open: %s",
581                               j->filename, isc_result_totext(result));
582                 FAIL(ISC_R_UNEXPECTED);
583         }
584
585         j->fp = fp;
586
587         /*
588          * Set magic early so that seek/read can succeed.
589          */
590         j->magic = DNS_JOURNAL_MAGIC;
591
592         CHECK(journal_seek(j, 0));
593         CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
594
595         if (memcmp(rawheader.h.format, initial_journal_header.format,
596                    sizeof(initial_journal_header.format)) != 0) {
597                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
598                                  "%s: journal format not recognized",
599                                  j->filename);
600                 FAIL(ISC_R_UNEXPECTED);
601         }
602         journal_header_decode(&rawheader, &j->header);
603
604         /*
605          * If there is an index, read the raw index into a dynamically
606          * allocated buffer and then convert it into a cooked index.
607          */
608         if (j->header.index_size != 0) {
609                 unsigned int i;
610                 unsigned int rawbytes;
611                 unsigned char *p;
612
613                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
614                 j->rawindex = isc_mem_get(mctx, rawbytes);
615                 if (j->rawindex == NULL)
616                         FAIL(ISC_R_NOMEMORY);
617
618                 CHECK(journal_read(j, j->rawindex, rawbytes));
619
620                 j->index = isc_mem_get(mctx, j->header.index_size *
621                                        sizeof(journal_pos_t));
622                 if (j->index == NULL)
623                         FAIL(ISC_R_NOMEMORY);
624
625                 p = j->rawindex;
626                 for (i = 0; i < j->header.index_size; i++) {
627                         j->index[i].serial = decode_uint32(p);
628                         p += 4;
629                         j->index[i].offset = decode_uint32(p);
630                         p += 4;
631                 }
632                 INSIST(p == j->rawindex + rawbytes);
633         }
634         j->offset = -1; /* Invalid, must seek explicitly. */
635
636         /*
637          * Initialize the iterator.
638          */
639         dns_name_init(&j->it.name, NULL);
640         dns_rdata_init(&j->it.rdata);
641
642         /*
643          * Set up empty initial buffers for uncheched and checked
644          * wire format RR data.  They will be reallocated
645          * later.
646          */
647         isc_buffer_init(&j->it.source, NULL, 0);
648         isc_buffer_init(&j->it.target, NULL, 0);
649         dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
650
651         j->state =
652                 write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
653
654         *journalp = j;
655         return (ISC_R_SUCCESS);
656
657  failure:
658         j->magic = 0;
659         if (j->index != NULL) {
660                 isc_mem_put(j->mctx, j->index, j->header.index_size *
661                             sizeof(journal_rawpos_t));
662                 j->index = NULL;
663         }
664         if (j->fp != NULL)
665                 (void)isc_stdio_close(j->fp);
666         isc_mem_put(j->mctx, j, sizeof(*j));
667         return (result);
668 }
669
670 isc_result_t
671 dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
672                  dns_journal_t **journalp) {
673         return (journal_open(mctx, filename, write, write, journalp));
674 }
675
676 /*
677  * A comparison function defining the sorting order for
678  * entries in the IXFR-style journal file.
679  *
680  * The IXFR format requires that deletions are sorted before
681  * additions, and within either one, SOA records are sorted
682  * before others.
683  *
684  * Also sort the non-SOA records by type as a courtesy to the
685  * server receiving the IXFR - it may help reduce the amount of
686  * rdataset merging it has to do.
687  */
688 static int
689 ixfr_order(const void *av, const void *bv) {
690         dns_difftuple_t const * const *ap = av;
691         dns_difftuple_t const * const *bp = bv;
692         dns_difftuple_t const *a = *ap;
693         dns_difftuple_t const *b = *bp;
694         int r;
695
696         r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
697         if (r != 0)
698                 return (r);
699
700         r = (b->rdata.type == dns_rdatatype_soa) -
701                 (a->rdata.type == dns_rdatatype_soa);
702         if (r != 0)
703                 return (r);
704
705         r = (a->rdata.type - b->rdata.type);
706         return (r);
707 }
708
709 /*
710  * Advance '*pos' to the next journal transaction.
711  *
712  * Requires:
713  *      *pos refers to a valid journal transaction.
714  *
715  * Ensures:
716  *      When ISC_R_SUCCESS is returned,
717  *      *pos refers to the next journal transaction.
718  *
719  * Returns one of:
720  *
721  *    ISC_R_SUCCESS
722  *    ISC_R_NOMORE      *pos pointed at the last transaction
723  *    Other results due to file errors are possible.
724  */
725 static isc_result_t
726 journal_next(dns_journal_t *j, journal_pos_t *pos) {
727         isc_result_t result;
728         journal_xhdr_t xhdr;
729         REQUIRE(DNS_JOURNAL_VALID(j));
730
731         result = journal_seek(j, pos->offset);
732         if (result != ISC_R_SUCCESS)
733                 return (result);
734
735         if (pos->serial == j->header.end.serial)
736                 return (ISC_R_NOMORE);
737         /*
738          * Read the header of the current transaction.
739          * This will return ISC_R_NOMORE if we are at EOF.
740          */
741         result = journal_read_xhdr(j, &xhdr);
742         if (result != ISC_R_SUCCESS)
743                 return (result);
744
745         /*
746          * Check serial number consistency.
747          */
748         if (xhdr.serial0 != pos->serial) {
749                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
750                               "%s: journal file corrupt: "
751                               "expected serial %u, got %u",
752                               j->filename, pos->serial, xhdr.serial0);
753                 return (ISC_R_UNEXPECTED);
754         }
755
756         /*
757          * Check for offset wraparound.
758          */
759         if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
760             < pos->offset) {
761                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
762                               "%s: offset too large", j->filename);
763                 return (ISC_R_UNEXPECTED);
764         }
765
766         pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
767         pos->serial = xhdr.serial1;
768         return (ISC_R_SUCCESS);
769 }
770
771 /*
772  * If the index of the journal 'j' contains an entry "better"
773  * than '*best_guess', replace '*best_guess' with it.
774  *
775  * "Better" means having a serial number closer to 'serial'
776  * but not greater than 'serial'.
777  */
778 static void
779 index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
780         unsigned int i;
781         if (j->index == NULL)
782                 return;
783         for (i = 0; i < j->header.index_size; i++) {
784                 if (POS_VALID(j->index[i]) &&
785                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
786                     DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
787                         *best_guess = j->index[i];
788         }
789 }
790
791 /*
792  * Add a new index entry.  If there is no room, make room by removing
793  * the odd-numbered entries and compacting the others into the first
794  * half of the index.  This decimates old index entries exponentially
795  * over time, so that the index always contains a much larger fraction
796  * of recent serial numbers than of old ones.  This is deliberate -
797  * most index searches are for outgoing IXFR, and IXFR tends to request
798  * recent versions more often than old ones.
799  */
800 static void
801 index_add(dns_journal_t *j, journal_pos_t *pos) {
802         unsigned int i;
803         if (j->index == NULL)
804                 return;
805         /*
806          * Search for a vacant position.
807          */
808         for (i = 0; i < j->header.index_size; i++) {
809                 if (! POS_VALID(j->index[i]))
810                         break;
811         }
812         if (i == j->header.index_size) {
813                 unsigned int k = 0;
814                 /*
815                  * Found no vacant position.  Make some room.
816                  */
817                 for (i = 0; i < j->header.index_size; i += 2) {
818                         j->index[k++] = j->index[i];
819                 }
820                 i = k; /* 'i' identifies the first vacant position. */
821                 while (k < j->header.index_size) {
822                         POS_INVALIDATE(j->index[k]);
823                         k++;
824                 }
825         }
826         INSIST(i < j->header.index_size);
827         INSIST(! POS_VALID(j->index[i]));
828
829         /*
830          * Store the new index entry.
831          */
832         j->index[i] = *pos;
833 }
834
835 /*
836  * Invalidate any existing index entries that could become
837  * ambiguous when a new transaction with number 'serial' is added.
838  */
839 static void
840 index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
841         unsigned int i;
842         if (j->index == NULL)
843                 return;
844         for (i = 0; i < j->header.index_size; i++) {
845                 if (! DNS_SERIAL_GT(serial, j->index[i].serial))
846                         POS_INVALIDATE(j->index[i]);
847         }
848 }
849
850 /*
851  * Try to find a transaction with initial serial number 'serial'
852  * in the journal 'j'.
853  *
854  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
855  *
856  * If 'serial' is current (= the ending serial number of the
857  * last transaction in the journal), set '*pos' to
858  * the position immediately following the last transaction and
859  * return ISC_R_SUCCESS.
860  *
861  * If 'serial' is within the range of addressable serial numbers
862  * covered by the journal but that particular serial number is missing
863  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
864  *
865  * If 'serial' is outside the range of addressable serial numbers
866  * covered by the journal, return ISC_R_RANGE.
867  *
868  */
869 static isc_result_t
870 journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
871         isc_result_t result;
872         journal_pos_t current_pos;
873         REQUIRE(DNS_JOURNAL_VALID(j));
874
875         if (DNS_SERIAL_GT(j->header.begin.serial, serial))
876                 return (ISC_R_RANGE);
877         if (DNS_SERIAL_GT(serial, j->header.end.serial))
878                 return (ISC_R_RANGE);
879         if (serial == j->header.end.serial) {
880                 *pos = j->header.end;
881                 return (ISC_R_SUCCESS);
882         }
883
884         current_pos = j->header.begin;
885         index_find(j, serial, &current_pos);
886
887         while (current_pos.serial != serial) {
888                 if (DNS_SERIAL_GT(current_pos.serial, serial))
889                         return (ISC_R_NOTFOUND);
890                 result = journal_next(j, &current_pos);
891                 if (result != ISC_R_SUCCESS)
892                         return (result);
893         }
894         *pos = current_pos;
895         return (ISC_R_SUCCESS);
896 }
897
898 isc_result_t
899 dns_journal_begin_transaction(dns_journal_t *j) {
900         isc_uint32_t offset;
901         isc_result_t result;
902         journal_rawxhdr_t hdr;
903
904         REQUIRE(DNS_JOURNAL_VALID(j));
905         REQUIRE(j->state == JOURNAL_STATE_WRITE);
906
907         /*
908          * Find the file offset where the new transaction should
909          * be written, and seek there.
910          */
911         if (JOURNAL_EMPTY(&j->header)) {
912                 offset = sizeof(journal_rawheader_t) +
913                         j->header.index_size * sizeof(journal_rawpos_t);
914         } else {
915                 offset = j->header.end.offset;
916         }
917         j->x.pos[0].offset = offset;
918         j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
919         j->x.n_soa = 0;
920
921         CHECK(journal_seek(j, offset));
922
923         /*
924          * Write a dummy transaction header of all zeroes to reserve
925          * space.  It will be filled in when the transaction is
926          * finished.
927          */
928         memset(&hdr, 0, sizeof(hdr));
929         CHECK(journal_write(j, &hdr, sizeof(hdr)));
930         j->x.pos[1].offset = j->offset;
931
932         j->state = JOURNAL_STATE_TRANSACTION;
933         result = ISC_R_SUCCESS;
934  failure:
935         return (result);
936 }
937
938 isc_result_t
939 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
940         dns_difftuple_t *t;
941         isc_buffer_t buffer;
942         void *mem = NULL;
943         unsigned int size;
944         isc_result_t result;
945         isc_region_t used;
946
947         REQUIRE(DNS_DIFF_VALID(diff));
948         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
949
950         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
951         (void)dns_diff_print(diff, NULL);
952
953         /*
954          * Pass 1: determine the buffer size needed, and
955          * keep track of SOA serial numbers.
956          */
957         size = 0;
958         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
959              t = ISC_LIST_NEXT(t, link))
960         {
961                 if (t->rdata.type == dns_rdatatype_soa) {
962                         if (j->x.n_soa < 2)
963                                 j->x.pos[j->x.n_soa].serial =
964                                         dns_soa_getserial(&t->rdata);
965                         j->x.n_soa++;
966                 }
967                 size += sizeof(journal_rawrrhdr_t);
968                 size += t->name.length; /* XXX should have access macro? */
969                 size += 10;
970                 size += t->rdata.length;
971         }
972
973         mem = isc_mem_get(j->mctx, size);
974         if (mem == NULL)
975                 return (ISC_R_NOMEMORY);
976
977         isc_buffer_init(&buffer, mem, size);
978
979         /*
980          * Pass 2.  Write RRs to buffer.
981          */
982         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
983              t = ISC_LIST_NEXT(t, link))
984         {
985                 /*
986                  * Write the RR header.
987                  */
988                 isc_buffer_putuint32(&buffer, t->name.length + 10 +
989                                      t->rdata.length);
990                 /*
991                  * Write the owner name, RR header, and RR data.
992                  */
993                 isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
994                 isc_buffer_putuint16(&buffer, t->rdata.type);
995                 isc_buffer_putuint16(&buffer, t->rdata.rdclass);
996                 isc_buffer_putuint32(&buffer, t->ttl);
997                 INSIST(t->rdata.length < 65536);
998                 isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
999                 INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1000                 isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1001         }
1002
1003         isc_buffer_usedregion(&buffer, &used);
1004         INSIST(used.length == size);
1005
1006         j->x.pos[1].offset += used.length;
1007
1008         /*
1009          * Write the buffer contents to the journal file.
1010          */
1011         CHECK(journal_write(j, used.base, used.length));
1012
1013         result = ISC_R_SUCCESS;
1014
1015  failure:
1016         if (mem != NULL)
1017                 isc_mem_put(j->mctx, mem, size);
1018         return (result);
1019
1020 }
1021
1022 isc_result_t
1023 dns_journal_commit(dns_journal_t *j) {
1024         isc_result_t result;
1025         journal_rawheader_t rawheader;
1026
1027         REQUIRE(DNS_JOURNAL_VALID(j));
1028         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1029
1030         /*
1031          * Perform some basic consistency checks.
1032          */
1033         if (j->x.n_soa != 2) {
1034                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1035                               "%s: malformed transaction: %d SOAs",
1036                               j->filename, j->x.n_soa);
1037                 return (ISC_R_UNEXPECTED);
1038         }
1039         if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1040                (bind8_compat &&
1041                 j->x.pos[1].serial == j->x.pos[0].serial)))
1042         {
1043                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1044                               "%s: malformed transaction: serial number "
1045                               "would decrease", j->filename);
1046                 return (ISC_R_UNEXPECTED);
1047         }
1048         if (! JOURNAL_EMPTY(&j->header)) {
1049                 if (j->x.pos[0].serial != j->header.end.serial) {
1050                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1051                                          "malformed transaction: "
1052                                          "%s last serial %u != "
1053                                          "transaction first serial %u",
1054                                          j->filename,
1055                                          j->header.end.serial,
1056                                          j->x.pos[0].serial);
1057                         return (ISC_R_UNEXPECTED);
1058                 }
1059         }
1060
1061         /*
1062          * Some old journal entries may become non-addressable
1063          * when we increment the current serial number.  Purge them
1064          * by stepping header.begin forward to the first addressable
1065          * transaction.  Also purge them from the index.
1066          */
1067         if (! JOURNAL_EMPTY(&j->header)) {
1068                 while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1069                                        j->header.begin.serial)) {
1070                         CHECK(journal_next(j, &j->header.begin));
1071                 }
1072                 index_invalidate(j, j->x.pos[1].serial);
1073         }
1074 #ifdef notyet
1075         if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1076                 force_dump(...);
1077         }
1078 #endif
1079
1080         /*
1081          * Commit the transaction data to stable storage.
1082          */
1083         CHECK(journal_fsync(j));
1084
1085         /*
1086          * Update the transaction header.
1087          */
1088         CHECK(journal_seek(j, j->x.pos[0].offset));
1089         CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1090                                  sizeof(journal_rawxhdr_t),
1091                                  j->x.pos[0].serial, j->x.pos[1].serial));
1092
1093         /*
1094          * Update the journal header.
1095          */
1096         if (JOURNAL_EMPTY(&j->header)) {
1097                 j->header.begin = j->x.pos[0];
1098         }
1099         j->header.end = j->x.pos[1];
1100         journal_header_encode(&j->header, &rawheader);
1101         CHECK(journal_seek(j, 0));
1102         CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1103
1104         /*
1105          * Update the index.
1106          */
1107         index_add(j, &j->x.pos[0]);
1108
1109         /*
1110          * Convert the index into on-disk format and write
1111          * it to disk.
1112          */
1113         CHECK(index_to_disk(j));
1114
1115         /*
1116          * Commit the header to stable storage.
1117          */
1118         CHECK(journal_fsync(j));
1119
1120         /*
1121          * We no longer have a transaction open.
1122          */
1123         j->state = JOURNAL_STATE_WRITE;
1124
1125         result = ISC_R_SUCCESS;
1126
1127  failure:
1128         return (result);
1129 }
1130
1131 isc_result_t
1132 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1133         isc_result_t result;
1134         CHECK(dns_diff_sort(diff, ixfr_order));
1135         CHECK(dns_journal_begin_transaction(j));
1136         CHECK(dns_journal_writediff(j, diff));
1137         CHECK(dns_journal_commit(j));
1138         result = ISC_R_SUCCESS;
1139  failure:
1140         return (result);
1141 }
1142
1143 void
1144 dns_journal_destroy(dns_journal_t **journalp) {
1145         dns_journal_t *j = *journalp;
1146         REQUIRE(DNS_JOURNAL_VALID(j));
1147
1148         j->it.result = ISC_R_FAILURE;
1149         dns_name_invalidate(&j->it.name);
1150         dns_decompress_invalidate(&j->it.dctx);
1151         if (j->rawindex != NULL)
1152                 isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1153                             sizeof(journal_rawpos_t));
1154         if (j->index != NULL)
1155                 isc_mem_put(j->mctx, j->index, j->header.index_size *
1156                             sizeof(journal_pos_t));
1157         if (j->it.target.base != NULL)
1158                 isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1159         if (j->it.source.base != NULL)
1160                 isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1161
1162         if (j->fp != NULL)
1163                 (void)isc_stdio_close(j->fp);
1164         j->magic = 0;
1165         isc_mem_put(j->mctx, j, sizeof(*j));
1166         *journalp = NULL;
1167 }
1168
1169 /*
1170  * Roll the open journal 'j' into the database 'db'.
1171  * A new database version will be created.
1172  */
1173
1174 /* XXX Share code with incoming IXFR? */
1175
1176 static isc_result_t
1177 roll_forward(dns_journal_t *j, dns_db_t *db) {
1178         isc_buffer_t source;            /* Transaction data from disk */
1179         isc_buffer_t target;            /* Ditto after _fromwire check */
1180         isc_uint32_t db_serial;         /* Database SOA serial */
1181         isc_uint32_t end_serial;        /* Last journal SOA serial */
1182         isc_result_t result;
1183         dns_dbversion_t *ver = NULL;
1184         journal_pos_t pos;
1185         dns_diff_t diff;
1186         unsigned int n_soa = 0;
1187         unsigned int n_put = 0;
1188
1189         REQUIRE(DNS_JOURNAL_VALID(j));
1190         REQUIRE(DNS_DB_VALID(db));
1191
1192         dns_diff_init(j->mctx, &diff);
1193
1194         /*
1195          * Set up empty initial buffers for uncheched and checked
1196          * wire format transaction data.  They will be reallocated
1197          * later.
1198          */
1199         isc_buffer_init(&source, NULL, 0);
1200         isc_buffer_init(&target, NULL, 0);
1201
1202         /*
1203          * Create the new database version.
1204          */
1205         CHECK(dns_db_newversion(db, &ver));
1206
1207         /*
1208          * Get the current database SOA serial number.
1209          */
1210         CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1211
1212         /*
1213          * Locate a journal entry for the current database serial.
1214          */
1215         CHECK(journal_find(j, db_serial, &pos));
1216         /*
1217          * XXX do more drastic things, like marking zone stale,
1218          * if this fails?
1219          */
1220         /*
1221          * XXXRTH  The zone code should probably mark the zone as bad and
1222          *         scream loudly into the log if this is a dynamic update
1223          *         log reply that failed.
1224          */
1225
1226         end_serial = dns_journal_last_serial(j);
1227         if (db_serial == end_serial)
1228                 CHECK(DNS_R_UPTODATE);
1229
1230         CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1231
1232         for (result = dns_journal_first_rr(j);
1233              result == ISC_R_SUCCESS;
1234              result = dns_journal_next_rr(j))
1235         {
1236                 dns_name_t *name;
1237                 isc_uint32_t ttl;
1238                 dns_rdata_t *rdata;
1239                 dns_difftuple_t *tuple = NULL;
1240
1241                 name = NULL;
1242                 rdata = NULL;
1243                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1244
1245                 if (rdata->type == dns_rdatatype_soa) {
1246                         n_soa++;
1247                         if (n_soa == 2)
1248                                 db_serial = j->it.current_serial;
1249                 }
1250
1251                 if (n_soa == 3)
1252                         n_soa = 1;
1253                 if (n_soa == 0) {
1254                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1255                                          "%s: journal file corrupt: missing "
1256                                          "initial SOA", j->filename);
1257                         FAIL(ISC_R_UNEXPECTED);
1258                 }
1259                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1260                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1261                                            name, ttl, rdata, &tuple));
1262                 dns_diff_append(&diff, &tuple);
1263
1264                 if (++n_put > 100)  {
1265                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1266                                       "%s: applying diff to database (%u)",
1267                                       j->filename, db_serial);
1268                         (void)dns_diff_print(&diff, NULL);
1269                         CHECK(dns_diff_apply(&diff, db, ver));
1270                         dns_diff_clear(&diff);
1271                         n_put = 0;
1272                 }
1273         }
1274         if (result == ISC_R_NOMORE)
1275                 result = ISC_R_SUCCESS;
1276         CHECK(result);
1277
1278         if (n_put != 0) {
1279                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1280                               "%s: applying final diff to database (%u)",
1281                               j->filename, db_serial);
1282                 (void)dns_diff_print(&diff, NULL);
1283                 CHECK(dns_diff_apply(&diff, db, ver));
1284                 dns_diff_clear(&diff);
1285         }
1286
1287  failure:
1288         if (ver != NULL)
1289                 dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1290                                     ISC_TRUE : ISC_FALSE);
1291
1292         if (source.base != NULL)
1293                 isc_mem_put(j->mctx, source.base, source.length);
1294         if (target.base != NULL)
1295                 isc_mem_put(j->mctx, target.base, target.length);
1296
1297         dns_diff_clear(&diff);
1298
1299         return (result);
1300 }
1301
1302 isc_result_t
1303 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1304         dns_journal_t *j;
1305         isc_result_t result;
1306
1307         REQUIRE(DNS_DB_VALID(db));
1308         REQUIRE(filename != NULL);
1309
1310         j = NULL;
1311         result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1312         if (result == ISC_R_NOTFOUND) {
1313                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1314                               "no journal file, but that's OK");
1315                 return (DNS_R_NOJOURNAL);
1316         }
1317         if (result != ISC_R_SUCCESS)
1318                 return (result);
1319         if (JOURNAL_EMPTY(&j->header))
1320                 result = DNS_R_UPTODATE;
1321         else
1322                 result = roll_forward(j, db);
1323
1324         dns_journal_destroy(&j);
1325
1326         return (result);
1327 }
1328
1329 isc_result_t
1330 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1331         dns_journal_t *j;
1332         isc_buffer_t source;            /* Transaction data from disk */
1333         isc_buffer_t target;            /* Ditto after _fromwire check */
1334         isc_uint32_t start_serial;              /* Database SOA serial */
1335         isc_uint32_t end_serial;        /* Last journal SOA serial */
1336         isc_result_t result;
1337         dns_diff_t diff;
1338         unsigned int n_soa = 0;
1339         unsigned int n_put = 0;
1340
1341         REQUIRE(filename != NULL);
1342
1343         j = NULL;
1344         result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1345         if (result == ISC_R_NOTFOUND) {
1346                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1347                 return (DNS_R_NOJOURNAL);
1348         }
1349
1350         if (result != ISC_R_SUCCESS) {
1351                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1352                               "journal open failure: %s: %s",
1353                               isc_result_totext(result), j->filename);
1354                 return (result);
1355         }
1356
1357         dns_diff_init(j->mctx, &diff);
1358
1359         /*
1360          * Set up empty initial buffers for uncheched and checked
1361          * wire format transaction data.  They will be reallocated
1362          * later.
1363          */
1364         isc_buffer_init(&source, NULL, 0);
1365         isc_buffer_init(&target, NULL, 0);
1366
1367         start_serial = dns_journal_first_serial(j);
1368         end_serial = dns_journal_last_serial(j);
1369
1370         CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1371
1372         for (result = dns_journal_first_rr(j);
1373              result == ISC_R_SUCCESS;
1374              result = dns_journal_next_rr(j))
1375         {
1376                 dns_name_t *name;
1377                 isc_uint32_t ttl;
1378                 dns_rdata_t *rdata;
1379                 dns_difftuple_t *tuple = NULL;
1380
1381                 name = NULL;
1382                 rdata = NULL;
1383                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1384
1385                 if (rdata->type == dns_rdatatype_soa)
1386                         n_soa++;
1387
1388                 if (n_soa == 3)
1389                         n_soa = 1;
1390                 if (n_soa == 0) {
1391                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1392                                          "%s: journal file corrupt: missing "
1393                                          "initial SOA", j->filename);
1394                         FAIL(ISC_R_UNEXPECTED);
1395                 }
1396                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1397                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1398                                            name, ttl, rdata, &tuple));
1399                 dns_diff_append(&diff, &tuple);
1400
1401                 if (++n_put > 100)  {
1402                         result = dns_diff_print(&diff, file);
1403                         dns_diff_clear(&diff);
1404                         n_put = 0;
1405                         if (result != ISC_R_SUCCESS)
1406                                 break;
1407                 }
1408         }
1409         if (result == ISC_R_NOMORE)
1410                 result = ISC_R_SUCCESS;
1411         CHECK(result);
1412
1413         if (n_put != 0) {
1414                 result = dns_diff_print(&diff, file);
1415                 dns_diff_clear(&diff);
1416         }
1417         goto cleanup;
1418
1419  failure:
1420         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1421                       "%s: cannot print: journal file corrupt", j->filename);
1422
1423  cleanup:
1424         if (source.base != NULL)
1425                 isc_mem_put(j->mctx, source.base, source.length);
1426         if (target.base != NULL)
1427                 isc_mem_put(j->mctx, target.base, target.length);
1428
1429         dns_diff_clear(&diff);
1430         dns_journal_destroy(&j);
1431
1432         return (result);
1433 }
1434
1435 /**************************************************************************/
1436 /*
1437  * Miscellaneous accessors.
1438  */
1439 isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1440         return (j->header.begin.serial);
1441 }
1442
1443 isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1444         return (j->header.end.serial);
1445 }
1446
1447 /**************************************************************************/
1448 /*
1449  * Iteration support.
1450  *
1451  * When serving an outgoing IXFR, we transmit a part of the journal starting
1452  * at the serial number in the IXFR request and ending at the serial
1453  * number that is current when the IXFR request arrives.  The ending
1454  * serial number is not necessarily at the end of the journal:
1455  * the journal may grow while the IXFR is in progress, but we stop
1456  * when we reach the serial number that was current when the IXFR started.
1457  */
1458
1459 static isc_result_t read_one_rr(dns_journal_t *j);
1460
1461 /*
1462  * Make sure the buffer 'b' is has at least 'size' bytes
1463  * allocated, and clear it.
1464  *
1465  * Requires:
1466  *      Either b->base is NULL, or it points to b->length bytes of memory
1467  *      previously allocated by isc_mem_get().
1468  */
1469
1470 static isc_result_t
1471 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1472         if (b->length < size) {
1473                 void *mem = isc_mem_get(mctx, size);
1474                 if (mem == NULL)
1475                         return (ISC_R_NOMEMORY);
1476                 if (b->base != NULL)
1477                         isc_mem_put(mctx, b->base, b->length);
1478                 b->base = mem;
1479                 b->length = size;
1480         }
1481         isc_buffer_clear(b);
1482         return (ISC_R_SUCCESS);
1483 }
1484
1485 isc_result_t
1486 dns_journal_iter_init(dns_journal_t *j,
1487                       isc_uint32_t begin_serial, isc_uint32_t end_serial)
1488 {
1489         isc_result_t result;
1490
1491         CHECK(journal_find(j, begin_serial, &j->it.bpos));
1492         INSIST(j->it.bpos.serial == begin_serial);
1493
1494         CHECK(journal_find(j, end_serial, &j->it.epos));
1495         INSIST(j->it.epos.serial == end_serial);
1496
1497         result = ISC_R_SUCCESS;
1498  failure:
1499         j->it.result = result;
1500         return (j->it.result);
1501 }
1502
1503
1504 isc_result_t
1505 dns_journal_first_rr(dns_journal_t *j) {
1506         isc_result_t result;
1507
1508         /*
1509          * Seek to the beginning of the first transaction we are
1510          * interested in.
1511          */
1512         CHECK(journal_seek(j, j->it.bpos.offset));
1513         j->it.current_serial = j->it.bpos.serial;
1514
1515         j->it.xsize = 0;  /* We have no transaction data yet... */
1516         j->it.xpos = 0;   /* ...and haven't used any of it. */
1517
1518         return (read_one_rr(j));
1519
1520  failure:
1521         return (result);
1522 }
1523
1524 static isc_result_t
1525 read_one_rr(dns_journal_t *j) {
1526         isc_result_t result;
1527
1528         dns_rdatatype_t rdtype;
1529         dns_rdataclass_t rdclass;
1530         unsigned int rdlen;
1531         isc_uint32_t ttl;
1532         journal_xhdr_t xhdr;
1533         journal_rrhdr_t rrhdr;
1534
1535         INSIST(j->offset <= j->it.epos.offset);
1536         if (j->offset == j->it.epos.offset)
1537                 return (ISC_R_NOMORE);
1538         if (j->it.xpos == j->it.xsize) {
1539                 /*
1540                  * We are at a transaction boundary.
1541                  * Read another transaction header.
1542                  */
1543                 CHECK(journal_read_xhdr(j, &xhdr));
1544                 if (xhdr.size == 0) {
1545                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1546                                       "%s: journal corrupt: empty transaction",
1547                                       j->filename);
1548                         FAIL(ISC_R_UNEXPECTED);
1549                 }
1550                 if (xhdr.serial0 != j->it.current_serial) {
1551                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1552                                          "%s: journal file corrupt: "
1553                                          "expected serial %u, got %u",
1554                                          j->filename,
1555                                          j->it.current_serial, xhdr.serial0);
1556                         FAIL(ISC_R_UNEXPECTED);
1557                 }
1558                 j->it.xsize = xhdr.size;
1559                 j->it.xpos = 0;
1560         }
1561         /*
1562          * Read an RR.
1563          */
1564         CHECK(journal_read_rrhdr(j, &rrhdr));
1565         /*
1566          * Perform a sanity check on the journal RR size.
1567          * The smallest possible RR has a 1-byte owner name
1568          * and a 10-byte header.  The largest possible
1569          * RR has 65535 bytes of data, a header, and a maximum-
1570          * size owner name, well below 70 k total.
1571          */
1572         if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1573                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1574                                  "%s: journal corrupt: impossible RR size "
1575                                  "(%d bytes)", j->filename, rrhdr.size);
1576                 FAIL(ISC_R_UNEXPECTED);
1577         }
1578
1579         CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1580         CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1581         isc_buffer_add(&j->it.source, rrhdr.size);
1582
1583         /*
1584          * The target buffer is made the same size
1585          * as the source buffer, with the assumption that when
1586          * no compression in present, the output of dns_*_fromwire()
1587          * is no larger than the input.
1588          */
1589         CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1590
1591         /*
1592          * Parse the owner name.  We don't know where it
1593          * ends yet, so we make the entire "remaining"
1594          * part of the buffer "active".
1595          */
1596         isc_buffer_setactive(&j->it.source,
1597                              j->it.source.used - j->it.source.current);
1598         CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1599                                 &j->it.dctx, 0, &j->it.target));
1600
1601         /*
1602          * Check that the RR header is there, and parse it.
1603          */
1604         if (isc_buffer_remaininglength(&j->it.source) < 10)
1605                 FAIL(DNS_R_FORMERR);
1606
1607         rdtype = isc_buffer_getuint16(&j->it.source);
1608         rdclass = isc_buffer_getuint16(&j->it.source);
1609         ttl = isc_buffer_getuint32(&j->it.source);
1610         rdlen = isc_buffer_getuint16(&j->it.source);
1611
1612         /*
1613          * Parse the rdata.
1614          */
1615         isc_buffer_setactive(&j->it.source, rdlen);
1616         dns_rdata_reset(&j->it.rdata);
1617         CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1618                                  rdtype, &j->it.source, &j->it.dctx,
1619                                  0, &j->it.target));
1620         j->it.ttl = ttl;
1621
1622         j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1623         if (rdtype == dns_rdatatype_soa) {
1624                 /* XXX could do additional consistency checks here */
1625                 j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1626         }
1627
1628         result = ISC_R_SUCCESS;
1629
1630  failure:
1631         j->it.result = result;
1632         return (result);
1633 }
1634
1635 isc_result_t
1636 dns_journal_next_rr(dns_journal_t *j) {
1637         j->it.result = read_one_rr(j);
1638         return (j->it.result);
1639 }
1640
1641 void
1642 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1643                    dns_rdata_t **rdata)
1644 {
1645         REQUIRE(j->it.result == ISC_R_SUCCESS);
1646         *name = &j->it.name;
1647         *ttl = j->it.ttl;
1648         *rdata = &j->it.rdata;
1649 }
1650
1651 /**************************************************************************/
1652 /*
1653  * Generating diffs from databases
1654  */
1655
1656 /*
1657  * Construct a diff containing all the RRs at the current name of the
1658  * database iterator 'dbit' in database 'db', version 'ver'.
1659  * Set '*name' to the current name, and append the diff to 'diff'.
1660  * All new tuples will have the operation 'op'.
1661  *
1662  * Requires: 'name' must have buffer large enough to hold the name.
1663  * Typically, a dns_fixedname_t would be used.
1664  */
1665 static isc_result_t
1666 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1667               dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1668               dns_diff_t *diff)
1669 {
1670         isc_result_t result;
1671         dns_dbnode_t *node = NULL;
1672         dns_rdatasetiter_t *rdsiter = NULL;
1673         dns_difftuple_t *tuple = NULL;
1674
1675         result = dns_dbiterator_current(dbit, &node, name);
1676         if (result != ISC_R_SUCCESS)
1677                 return (result);
1678
1679         result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1680         if (result != ISC_R_SUCCESS)
1681                 goto cleanup_node;
1682
1683         for (result = dns_rdatasetiter_first(rdsiter);
1684              result == ISC_R_SUCCESS;
1685              result = dns_rdatasetiter_next(rdsiter))
1686         {
1687                 dns_rdataset_t rdataset;
1688
1689                 dns_rdataset_init(&rdataset);
1690                 dns_rdatasetiter_current(rdsiter, &rdataset);
1691
1692                 for (result = dns_rdataset_first(&rdataset);
1693                      result == ISC_R_SUCCESS;
1694                      result = dns_rdataset_next(&rdataset))
1695                 {
1696                         dns_rdata_t rdata = DNS_RDATA_INIT;
1697                         dns_rdataset_current(&rdataset, &rdata);
1698                         result = dns_difftuple_create(diff->mctx, op, name,
1699                                                       rdataset.ttl, &rdata,
1700                                                       &tuple);
1701                         if (result != ISC_R_SUCCESS) {
1702                                 dns_rdataset_disassociate(&rdataset);
1703                                 goto cleanup_iterator;
1704                         }
1705                         dns_diff_append(diff, &tuple);
1706                 }
1707                 dns_rdataset_disassociate(&rdataset);
1708                 if (result != ISC_R_NOMORE)
1709                         goto cleanup_iterator;
1710         }
1711         if (result != ISC_R_NOMORE)
1712                 goto cleanup_iterator;
1713
1714         result = ISC_R_SUCCESS;
1715
1716  cleanup_iterator:
1717         dns_rdatasetiter_destroy(&rdsiter);
1718
1719  cleanup_node:
1720         dns_db_detachnode(db, &node);
1721
1722         return (result);
1723 }
1724
1725 /*
1726  * Comparison function for use by dns_diff_subtract when sorting
1727  * the diffs to be subtracted.  The sort keys are the rdata type
1728  * and the rdata itself.  The owner name is ignored, because
1729  * it is known to be the same for all tuples.
1730  */
1731 static int
1732 rdata_order(const void *av, const void *bv) {
1733         dns_difftuple_t const * const *ap = av;
1734         dns_difftuple_t const * const *bp = bv;
1735         dns_difftuple_t const *a = *ap;
1736         dns_difftuple_t const *b = *bp;
1737         int r;
1738         r = (b->rdata.type - a->rdata.type);
1739         if (r != 0)
1740                 return (r);
1741         r = dns_rdata_compare(&a->rdata, &b->rdata);
1742         return (r);
1743 }
1744
1745 static isc_result_t
1746 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1747         isc_result_t result;
1748         dns_difftuple_t *p[2];
1749         int i, t;
1750         isc_boolean_t append;
1751
1752         CHECK(dns_diff_sort(&diff[0], rdata_order));
1753         CHECK(dns_diff_sort(&diff[1], rdata_order));
1754
1755         for (;;) {
1756                 p[0] = ISC_LIST_HEAD(diff[0].tuples);
1757                 p[1] = ISC_LIST_HEAD(diff[1].tuples);
1758                 if (p[0] == NULL && p[1] == NULL)
1759                         break;
1760
1761                 for (i = 0; i < 2; i++)
1762                         if (p[!i] == NULL) {
1763                                 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1764                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1765                                 goto next;
1766                         }
1767                 t = rdata_order(&p[0], &p[1]);
1768                 if (t < 0) {
1769                         ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1770                         ISC_LIST_APPEND(r->tuples, p[0], link);
1771                         goto next;
1772                 }
1773                 if (t > 0) {
1774                         ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1775                         ISC_LIST_APPEND(r->tuples, p[1], link);
1776                         goto next;
1777                 }
1778                 INSIST(t == 0);
1779                 /*
1780                  * Identical RRs in both databases; skip them both
1781                  * if the ttl differs.
1782                  */
1783                 append = ISC_TF(p[0]->ttl != p[1]->ttl);
1784                 for (i = 0; i < 2; i++) {
1785                         ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1786                         if (append) {
1787                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1788                         } else {
1789                                 dns_difftuple_free(&p[i]);
1790                         }
1791                 }
1792         next: ;
1793         }
1794         result = ISC_R_SUCCESS;
1795  failure:
1796         return (result);
1797 }
1798
1799 /*
1800  * Compare the databases 'dba' and 'dbb' and generate a journal
1801  * entry containing the changes to make 'dba' from 'dbb' (note
1802  * the order).  This journal entry will consist of a single,
1803  * possibly very large transaction.
1804  */
1805
1806 isc_result_t
1807 dns_db_diff(isc_mem_t *mctx,
1808             dns_db_t *dba, dns_dbversion_t *dbvera,
1809             dns_db_t *dbb, dns_dbversion_t *dbverb,
1810             const char *journal_filename)
1811 {
1812         dns_db_t *db[2];
1813         dns_dbversion_t *ver[2];
1814         dns_dbiterator_t *dbit[2] = { NULL, NULL };
1815         isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1816         dns_fixedname_t fixname[2];
1817         isc_result_t result, itresult[2];
1818         dns_diff_t diff[2], resultdiff;
1819         int i, t;
1820         dns_journal_t *journal = NULL;
1821
1822         db[0] = dba, db[1] = dbb;
1823         ver[0] = dbvera, ver[1] = dbverb;
1824
1825         dns_diff_init(mctx, &diff[0]);
1826         dns_diff_init(mctx, &diff[1]);
1827         dns_diff_init(mctx, &resultdiff);
1828
1829         dns_fixedname_init(&fixname[0]);
1830         dns_fixedname_init(&fixname[1]);
1831
1832         result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1833         if (result != ISC_R_SUCCESS)
1834                 return (result);
1835
1836         result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1837         if (result != ISC_R_SUCCESS)
1838                 goto cleanup_journal;
1839         result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1840         if (result != ISC_R_SUCCESS)
1841                 goto cleanup_interator0;
1842
1843         itresult[0] = dns_dbiterator_first(dbit[0]);
1844         itresult[1] = dns_dbiterator_first(dbit[1]);
1845
1846         for (;;) {
1847                 for (i = 0; i < 2; i++) {
1848                         if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1849                                 CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1850                                             dns_fixedname_name(&fixname[i]),
1851                                             i == 0 ?
1852                                             DNS_DIFFOP_ADD :
1853                                             DNS_DIFFOP_DEL,
1854                                             &diff[i]));
1855                                 itresult[i] = dns_dbiterator_next(dbit[i]);
1856                                 have[i] = ISC_TRUE;
1857                         }
1858                 }
1859
1860                 if (! have[0] && ! have[1]) {
1861                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1862                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1863                         break;
1864                 }
1865
1866                 for (i = 0; i < 2; i++) {
1867                         if (! have[!i]) {
1868                                 ISC_LIST_APPENDLIST(resultdiff.tuples,
1869                                                     diff[i].tuples, link);
1870                                 INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1871                                 have[i] = ISC_FALSE;
1872                                 goto next;
1873                         }
1874                 }
1875
1876                 t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1877                                      dns_fixedname_name(&fixname[1]));
1878                 if (t < 0) {
1879                         ISC_LIST_APPENDLIST(resultdiff.tuples,
1880                                             diff[0].tuples, link);
1881                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1882                         have[0] = ISC_FALSE;
1883                         continue;
1884                 }
1885                 if (t > 0) {
1886                         ISC_LIST_APPENDLIST(resultdiff.tuples,
1887                                             diff[1].tuples, link);
1888                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1889                         have[1] = ISC_FALSE;
1890                         continue;
1891                 }
1892                 INSIST(t == 0);
1893                 CHECK(dns_diff_subtract(diff, &resultdiff));
1894                 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1895                 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1896                 have[0] = have[1] = ISC_FALSE;
1897         next: ;
1898         }
1899         if (itresult[0] != ISC_R_NOMORE)
1900                 FAIL(itresult[0]);
1901         if (itresult[1] != ISC_R_NOMORE)
1902                 FAIL(itresult[1]);
1903
1904         if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1905                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1906         } else {
1907                 CHECK(dns_journal_write_transaction(journal, &resultdiff));
1908         }
1909         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1910         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1911
1912  failure:
1913         dns_diff_clear(&resultdiff);
1914         dns_dbiterator_destroy(&dbit[1]);
1915  cleanup_interator0:
1916         dns_dbiterator_destroy(&dbit[0]);
1917  cleanup_journal:
1918         dns_journal_destroy(&journal);
1919         return (result);
1920 }
1921
1922 isc_result_t
1923 dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1924                     isc_uint32_t target_size)
1925 {
1926         unsigned int i;
1927         journal_pos_t best_guess;
1928         journal_pos_t current_pos;
1929         dns_journal_t *j = NULL;
1930         journal_rawheader_t rawheader;
1931         unsigned int copy_length;
1932         unsigned int len;
1933         char *buf = NULL;
1934         unsigned int size = 0;
1935         isc_result_t result;
1936         unsigned int indexend;
1937
1938         CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));
1939
1940         if (JOURNAL_EMPTY(&j->header)) {
1941                 dns_journal_destroy(&j);
1942                 return (ISC_R_SUCCESS);
1943         }
1944                 
1945         if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1946             DNS_SERIAL_GT(serial, j->header.end.serial)) {
1947                 dns_journal_destroy(&j);
1948                 return (ISC_R_RANGE);
1949         }
1950
1951         /*
1952          * Cope with very small target sizes.
1953          */
1954         indexend = sizeof(journal_rawheader_t) +
1955                    j->header.index_size * sizeof(journal_rawpos_t);
1956         if (target_size < indexend * 2)
1957                 target_size = target_size/2 + indexend;
1958
1959         /*
1960          * See if there is any work to do.
1961          */
1962         if ((isc_uint32_t) j->header.end.offset < target_size) {
1963                 dns_journal_destroy(&j);
1964                 return (ISC_R_SUCCESS);
1965         }
1966         
1967         /*
1968          * Remove overhead so space test below can succeed.
1969          */
1970         if (target_size >= indexend)
1971                 target_size -= indexend;
1972
1973         /*
1974          * Find if we can create enough free space.
1975          */
1976         best_guess = j->header.begin;
1977         for (i = 0; i < j->header.index_size; i++) {
1978                 if (POS_VALID(j->index[i]) &&
1979                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
1980                     ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
1981                      >= target_size / 2) &&
1982                     j->index[i].offset > best_guess.offset)
1983                         best_guess = j->index[i];
1984         }
1985
1986         current_pos = best_guess;
1987         while (current_pos.serial != serial) {
1988                 CHECK(journal_next(j, &current_pos));
1989                 if (current_pos.serial == j->header.end.serial)
1990                         break;
1991
1992                 if (DNS_SERIAL_GE(serial, current_pos.serial) &&
1993                    ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
1994                      >= (target_size / 2)) &&
1995                     current_pos.offset > best_guess.offset)
1996                         best_guess = current_pos;
1997                 else
1998                         break;
1999         }
2000
2001         INSIST(best_guess.serial != j->header.end.serial);
2002         if (best_guess.serial != serial)
2003                 CHECK(journal_next(j, &best_guess));
2004
2005         /*
2006          * Enough space to proceed?
2007          */
2008         if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
2009              (isc_uint32_t) (best_guess.offset - indexend)) {
2010                 dns_journal_destroy(&j);
2011                 return (ISC_R_NOSPACE);
2012         }
2013
2014         copy_length = j->header.end.offset - best_guess.offset;
2015
2016         /*
2017          * Invalidate entire index, will be rebuilt at end.
2018          */
2019         for (i = 0; i < j->header.index_size; i++) {
2020                 if (POS_VALID(j->index[i]))
2021                         POS_INVALIDATE(j->index[i]);
2022         }
2023
2024         /*
2025          * Convert the index into on-disk format and write
2026          * it to disk.
2027          */
2028         CHECK(index_to_disk(j));
2029         CHECK(journal_fsync(j));
2030
2031         /*
2032          * Update the journal header.
2033          */
2034         if (copy_length == 0) {
2035                 j->header.begin.serial = 0;
2036                 j->header.end.serial = 0;
2037                 j->header.begin.offset = 0;
2038                 j->header.end.offset = 0;
2039         } else {
2040                 j->header.begin = best_guess;
2041         }
2042         journal_header_encode(&j->header, &rawheader);
2043         CHECK(journal_seek(j, 0));
2044         CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2045         CHECK(journal_fsync(j));
2046
2047         if (copy_length != 0) {
2048                 /*
2049                  * Copy best_guess to end into space just freed.
2050                  */
2051                 size = 64*1024;
2052                 if (copy_length < size)
2053                         size = copy_length;
2054                 buf = isc_mem_get(mctx, size);
2055                 if (buf == NULL) {
2056                         result = ISC_R_NOMEMORY;
2057                         goto failure;
2058                 }
2059         
2060                 for (i = 0; i < copy_length; i += size) {
2061                         len = (copy_length - i) > size ? size :
2062                                                          (copy_length - i);
2063                         CHECK(journal_seek(j, best_guess.offset + i));
2064                         CHECK(journal_read(j, buf, len));
2065                         CHECK(journal_seek(j, indexend + i));
2066                         CHECK(journal_write(j, buf, len));
2067                 }
2068
2069                 CHECK(journal_fsync(j));
2070
2071                 /*
2072                  * Compute new header.
2073                  */
2074                 j->header.begin.offset = indexend;
2075                 j->header.end.offset = indexend + copy_length;
2076                 /*
2077                  * Update the journal header.
2078                  */
2079                 journal_header_encode(&j->header, &rawheader);
2080                 CHECK(journal_seek(j, 0));
2081                 CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2082                 CHECK(journal_fsync(j));
2083
2084                 /*
2085                  * Build new index.
2086                  */
2087                 current_pos = j->header.begin;
2088                 while (current_pos.serial != j->header.end.serial) {
2089                         index_add(j, &current_pos);
2090                         CHECK(journal_next(j, &current_pos));
2091                 }
2092
2093                 /*
2094                  * Write index.
2095                  */
2096                 CHECK(index_to_disk(j));
2097                 CHECK(journal_fsync(j));
2098
2099                 indexend = j->header.end.offset;
2100         }
2101         dns_journal_destroy(&j);
2102         (void)isc_file_truncate(filename, (isc_offset_t)indexend);
2103         result = ISC_R_SUCCESS;
2104
2105  failure:
2106         if (buf != NULL)
2107                 isc_mem_put(mctx, buf, size);
2108         if (j != NULL)
2109                 dns_journal_destroy(&j);
2110         return (result);
2111 }
2112
2113 static isc_result_t
2114 index_to_disk(dns_journal_t *j) {
2115         isc_result_t result = ISC_R_SUCCESS;
2116
2117         if (j->header.index_size != 0) {
2118                 unsigned int i;
2119                 unsigned char *p;
2120                 unsigned int rawbytes;
2121
2122                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2123
2124                 p = j->rawindex;
2125                 for (i = 0; i < j->header.index_size; i++) {
2126                         encode_uint32(j->index[i].serial, p);
2127                         p += 4;
2128                         encode_uint32(j->index[i].offset, p);
2129                         p += 4;
2130                 }
2131                 INSIST(p == j->rawindex + rawbytes);
2132
2133                 CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2134                 CHECK(journal_write(j, j->rawindex, rawbytes));
2135         }
2136 failure:
2137         return (result);
2138 }