/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>

#ifdef __FreeBSD__
#undef dump_write
#define dump_write dmu_dump_write
#endif

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

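/*
 * Write "len" bytes of stream data to the output file, folding them into
 * the stream's running fletcher-4 checksum and advancing the caller's
 * stream offset.  "len" must be a multiple of 8.
 */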
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
        dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
        struct uio auio;
        struct iovec aiov;
        ASSERT0(len % 8);

        fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
        aiov.iov_base = buf;
        aiov.iov_len = len;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = len;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_rw = UIO_WRITE;
        auio.uio_offset = (off_t)-1;
        auio.uio_td = dsp->dsa_td;
#ifdef _KERNEL
        if (dsp->dsa_fp->f_type == DTYPE_VNODE)
                bwillwrite();
        dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
            dsp->dsa_td);
#else
        fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
        dsp->dsa_err = EOPNOTSUPP;
#endif
        mutex_enter(&ds->ds_sendstream_lock);
        *dsp->dsa_off += len;
        mutex_exit(&ds->ds_sendstream_lock);

        return (dsp->dsa_err);
}

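/*
 * Emit a DRR_FREE record for "length" bytes at "offset" in "object".
 * The record is left pending so that a subsequent contiguous free can
 * be aggregated into it rather than sent separately.
 */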
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
        struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

        /*
         * When we receive a free record, dbuf_free_range() assumes
         * that the receiving system doesn't have any dbufs in the range
         * being freed.  This is always true because there is a one-record
         * constraint: we only send one WRITE record for any given
         * object+offset.  We know that the one-record constraint is
         * true because we always send data in increasing order by
         * object,offset.
         *
         * If the increasing-order constraint ever changes, we should find
         * another way to assert that the one-record constraint is still
         * satisfied.
         */
        ASSERT(object > dsp->dsa_last_data_object ||
            (object == dsp->dsa_last_data_object &&
            offset > dsp->dsa_last_data_offset));

        /*
         * If we are doing a non-incremental send, then there can't
         * be any data in the dataset we're receiving into.  Therefore
         * a free record would simply be a no-op.  Save space by not
         * sending it to begin with.
         */
        if (!dsp->dsa_incremental)
                return (0);

        if (length != -1ULL && offset + length < offset)
                length = -1ULL;

        /*
         * If there is a pending op, but it's not PENDING_FREE, push it out,
         * since free block aggregation can only be done for blocks of the
         * same type (i.e., DRR_FREE records can only be aggregated with
         * other DRR_FREE records, and DRR_FREEOBJECTS records can only be
         * aggregated with other DRR_FREEOBJECTS records).
         */
        if (dsp->dsa_pending_op != PENDING_NONE &&
            dsp->dsa_pending_op != PENDING_FREE) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }

        if (dsp->dsa_pending_op == PENDING_FREE) {
                /*
                 * There should never be a PENDING_FREE if length is -1
                 * (because dump_dnode is the only place where this
                 * function is called with a -1, and only after flushing
                 * any pending record).
                 */
                ASSERT(length != -1ULL);
                /*
                 * Check to see whether this free block can be aggregated
                 * with the pending one.
                 */
                if (drrf->drr_object == object && drrf->drr_offset +
                    drrf->drr_length == offset) {
                        drrf->drr_length += length;
                        return (0);
                } else {
                        /* not a continuation.  Push out pending record */
                        if (dump_bytes(dsp, dsp->dsa_drr,
                            sizeof (dmu_replay_record_t)) != 0)
                                return (SET_ERROR(EINTR));
                        dsp->dsa_pending_op = PENDING_NONE;
                }
        }
        /* create a FREE record and make it pending */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_FREE;
        drrf->drr_object = object;
        drrf->drr_offset = offset;
        drrf->drr_length = length;
        drrf->drr_toguid = dsp->dsa_toguid;
        if (length == -1ULL) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
        } else {
                dsp->dsa_pending_op = PENDING_FREE;
        }

        return (0);
}

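/*
 * Emit a DRR_WRITE record followed by the block's payload.  Any pending
 * free aggregation is flushed first, since records of different types
 * cannot be aggregated.
 */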
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
        struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

        /*
         * We send data in increasing object, offset order.
         * See comment in dump_free() for details.
         */
        ASSERT(object > dsp->dsa_last_data_object ||
            (object == dsp->dsa_last_data_object &&
            offset > dsp->dsa_last_data_offset));
        dsp->dsa_last_data_object = object;
        dsp->dsa_last_data_offset = offset + blksz - 1;

        /*
         * If there is any kind of pending aggregation (currently either
         * a grouping of free objects or free blocks), push it out to
         * the stream, since aggregation can't be done across operations
         * of different types.
         */
        if (dsp->dsa_pending_op != PENDING_NONE) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
        /* write a DATA record */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_WRITE;
        drrw->drr_object = object;
        drrw->drr_type = type;
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
        if (BP_IS_EMBEDDED(bp)) {
                /*
                 * There's no pre-computed checksum for embedded BPs, so
                 * (like fletcher4-checksummed blocks) userland will have
                 * to compute a dedup-capable checksum itself.
                 */
                drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
        } else {
                drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
                if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
                        drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
                DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
                DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
                DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
                drrw->drr_key.ddk_cksum = bp->blk_cksum;
        }

        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));
        if (dump_bytes(dsp, data, blksz) != 0)
                return (SET_ERROR(EINTR));
        return (0);
}

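/*
 * Emit a DRR_WRITE_EMBEDDED record for a block whose data is stored
 * directly in the block pointer, sending the compressed payload rounded
 * up to 8 bytes.
 */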
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
        char buf[BPE_PAYLOAD_SIZE];
        struct drr_write_embedded *drrw =
            &(dsp->dsa_drr->drr_u.drr_write_embedded);

        if (dsp->dsa_pending_op != PENDING_NONE) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
                dsp->dsa_pending_op = PENDING_NONE;
        }

        ASSERT(BP_IS_EMBEDDED(bp));

        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
        drrw->drr_object = object;
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
        drrw->drr_compression = BP_GET_COMPRESS(bp);
        drrw->drr_etype = BPE_GET_ETYPE(bp);
        drrw->drr_lsize = BPE_GET_LSIZE(bp);
        drrw->drr_psize = BPE_GET_PSIZE(bp);

        decode_embedded_bp_compressed(bp, buf);

        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (EINTR);
        if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
                return (EINTR);
        return (0);
}

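/*
 * Emit a DRR_SPILL record followed by the contents of the object's
 * spill block.
 */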
static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
        struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

        if (dsp->dsa_pending_op != PENDING_NONE) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }

        /* write a SPILL record */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_SPILL;
        drrs->drr_object = object;
        drrs->drr_length = blksz;
        drrs->drr_toguid = dsp->dsa_toguid;

        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
                return (SET_ERROR(EINTR));
        if (dump_bytes(dsp, data, blksz))
                return (SET_ERROR(EINTR));
        return (0);
}

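/*
 * Emit a DRR_FREEOBJECTS record for "numobjs" objects starting at
 * "firstobj".  Like dump_free(), consecutive ranges are aggregated
 * into a single pending record.
 */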
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
        struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

        /* See comment in dump_free(). */
        if (!dsp->dsa_incremental)
                return (0);

        /*
         * If there is a pending op, but it's not PENDING_FREEOBJECTS,
         * push it out, since free block aggregation can only be done for
         * blocks of the same type (i.e., DRR_FREE records can only be
         * aggregated with other DRR_FREE records, and DRR_FREEOBJECTS
         * records can only be aggregated with other DRR_FREEOBJECTS
         * records).
         */
        if (dsp->dsa_pending_op != PENDING_NONE &&
            dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
        if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
                /*
                 * See whether this free object array can be aggregated
                 * with the pending one.
                 */
                if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
                        drrfo->drr_numobjs += numobjs;
                        return (0);
                } else {
                        /* can't be aggregated.  Push out pending record */
                        if (dump_bytes(dsp, dsp->dsa_drr,
                            sizeof (dmu_replay_record_t)) != 0)
                                return (SET_ERROR(EINTR));
                        dsp->dsa_pending_op = PENDING_NONE;
                }
        }

        /* write a FREEOBJECTS record */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
        drrfo->drr_firstobj = firstobj;
        drrfo->drr_numobjs = numobjs;
        drrfo->drr_toguid = dsp->dsa_toguid;

        dsp->dsa_pending_op = PENDING_FREEOBJECTS;

        return (0);
}

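/*
 * Emit a DRR_OBJECT record describing the dnode, followed by its bonus
 * buffer, then free anything past the object's end.  A hole or absent
 * dnode is sent as a one-object DRR_FREEOBJECTS instead.
 */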
static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
        struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

        if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
                return (dump_freeobjects(dsp, object, 1));

        if (dsp->dsa_pending_op != PENDING_NONE) {
                if (dump_bytes(dsp, dsp->dsa_drr,
                    sizeof (dmu_replay_record_t)) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }

        /* write an OBJECT record */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_OBJECT;
        drro->drr_object = object;
        drro->drr_type = dnp->dn_type;
        drro->drr_bonustype = dnp->dn_bonustype;
        drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
        drro->drr_bonuslen = dnp->dn_bonuslen;
        drro->drr_checksumtype = dnp->dn_checksum;
        drro->drr_compress = dnp->dn_compress;
        drro->drr_toguid = dsp->dsa_toguid;

        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));

        if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
                return (SET_ERROR(EINTR));

        /* Free anything past the end of the file. */
        if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
            (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
                return (SET_ERROR(EINTR));
        if (dsp->dsa_err != 0)
                return (SET_ERROR(EINTR));
        return (0);
}

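/*
 * Decide whether this block pointer's embedded payload may be sent as a
 * DRR_WRITE_EMBEDDED record, based on the feature flags the stream was
 * opened with.
 */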
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
        if (!BP_IS_EMBEDDED(bp))
                return (B_FALSE);

        /*
         * Compression function must be legacy, or explicitly enabled.
         */
        if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
            !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
                return (B_FALSE);

        /*
         * Embed type must be explicitly enabled.
         */
        switch (BPE_GET_ETYPE(bp)) {
        case BP_EMBEDDED_TYPE_DATA:
                if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
                        return (B_TRUE);
                break;
        default:
                return (B_FALSE);
        }
        return (B_FALSE);
}

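/*
 * BP_SPAN computes the number of bytes of data addressed below a block
 * pointer at the given indirection level.
 */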
#define BP_SPAN(dnp, level) \
        (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
        (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

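/*
 * Traversal callback that translates each block visited by
 * traverse_dataset() into the appropriate send stream record(s).
 */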
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
        dmu_sendarg_t *dsp = arg;
        dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
        int err = 0;

        if (issig(JUSTLOOKING) && issig(FORREAL))
                return (SET_ERROR(EINTR));

        if (zb->zb_object != DMU_META_DNODE_OBJECT &&
            DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
                return (0);
        } else if (zb->zb_level == ZB_ZIL_LEVEL) {
                /*
                 * If we are sending a non-snapshot (which is allowed on
                 * read-only pools), it may have a ZIL, which must be ignored.
                 */
                return (0);
        } else if (BP_IS_HOLE(bp) &&
            zb->zb_object == DMU_META_DNODE_OBJECT) {
                uint64_t span = BP_SPAN(dnp, zb->zb_level);
                uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
                err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
        } else if (BP_IS_HOLE(bp)) {
                uint64_t span = BP_SPAN(dnp, zb->zb_level);
                err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
        } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
                return (0);
        } else if (type == DMU_OT_DNODE) {
                dnode_phys_t *blk;
                int i;
                int blksz = BP_GET_LSIZE(bp);
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;

                if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
                    &aflags, zb) != 0)
                        return (SET_ERROR(EIO));

                blk = abuf->b_data;
                for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
                        uint64_t dnobj = (zb->zb_blkid <<
                            (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
                        err = dump_dnode(dsp, dnobj, blk+i);
                        if (err != 0)
                                break;
                }
                (void) arc_buf_remove_ref(abuf, &abuf);
        } else if (type == DMU_OT_SA) {
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);

                if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
                    &aflags, zb) != 0)
                        return (SET_ERROR(EIO));

                err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
        } else if (backup_do_embed(dsp, bp)) {
                /* it's an embedded level-0 block of a regular object */
                int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
                err = dump_write_embedded(dsp, zb->zb_object,
                    zb->zb_blkid * blksz, blksz, bp);
        } else { /* it's a level-0 block of a regular object */
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);

                ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT0(zb->zb_level);
                if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
                    &aflags, zb) != 0) {
                        if (zfs_send_corrupt_data) {
                                /* Send a block filled with 0x"zfs badd bloc" */
                                abuf = arc_buf_alloc(spa, blksz, &abuf,
                                    ARC_BUFC_DATA);
                                uint64_t *ptr;
                                for (ptr = abuf->b_data;
                                    (char *)ptr < (char *)abuf->b_data + blksz;
                                    ptr++)
                                        *ptr = 0x2f5baddb10c;
                        } else {
                                return (SET_ERROR(EIO));
                        }
                }

                err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
                    blksz, bp, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
        }

        ASSERT(err == 0 || err == EINTR);
        return (err);
}

/*
 * Generate the send stream for the dataset "ds".  Releases dp using the
 * specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
#ifdef illumos
    int outfd, vnode_t *vp, offset_t *off)
#else
    int outfd, struct file *fp, offset_t *off)
#endif
{
        objset_t *os;
        dmu_replay_record_t *drr;
        dmu_sendarg_t *dsp;
        int err;
        uint64_t fromtxg = 0;
        uint64_t featureflags = 0;

        err = dmu_objset_from_ds(ds, &os);
        if (err != 0) {
                dsl_pool_rele(dp, tag);
                return (err);
        }

        drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
        drr->drr_type = DRR_BEGIN;
        drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
        DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
            DMU_SUBSTREAM);

#ifdef _KERNEL
        if (dmu_objset_type(os) == DMU_OST_ZFS) {
                uint64_t version;
                if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
                        kmem_free(drr, sizeof (dmu_replay_record_t));
                        dsl_pool_rele(dp, tag);
                        return (SET_ERROR(EINVAL));
                }
                if (version >= ZPL_VERSION_SA) {
                        featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
                }
        }
#endif

        if (embedok &&
            spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
                featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
                if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                        featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
        } else {
                embedok = B_FALSE;
        }

        DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
            featureflags);

        drr->drr_u.drr_begin.drr_creation_time =
            ds->ds_phys->ds_creation_time;
        drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
        if (is_clone)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
        drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
        if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

        if (fromzb != NULL) {
                drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
                fromtxg = fromzb->zbm_creation_txg;
        }
        dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
        if (!dsl_dataset_is_snapshot(ds)) {
                (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
                    sizeof (drr->drr_u.drr_begin.drr_toname));
        }

        dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

        dsp->dsa_drr = drr;
        dsp->dsa_outfd = outfd;
        dsp->dsa_proc = curproc;
        dsp->dsa_td = curthread;
        dsp->dsa_fp = fp;
        dsp->dsa_os = os;
        dsp->dsa_off = off;
        dsp->dsa_toguid = ds->ds_phys->ds_guid;
        ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
        dsp->dsa_pending_op = PENDING_NONE;
        dsp->dsa_incremental = (fromzb != NULL);
        dsp->dsa_featureflags = featureflags;

        mutex_enter(&ds->ds_sendstream_lock);
        list_insert_head(&ds->ds_sendstreams, dsp);
        mutex_exit(&ds->ds_sendstream_lock);

        dsl_dataset_long_hold(ds, FTAG);
        dsl_pool_rele(dp, tag);

        if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
                err = dsp->dsa_err;
                goto out;
        }

        err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
            backup_cb, dsp);

        if (dsp->dsa_pending_op != PENDING_NONE)
                if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
                        err = SET_ERROR(EINTR);

        if (err != 0) {
                if (err == EINTR && dsp->dsa_err != 0)
                        err = dsp->dsa_err;
                goto out;
        }

        bzero(drr, sizeof (dmu_replay_record_t));
        drr->drr_type = DRR_END;
        drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
        drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

        if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
                err = dsp->dsa_err;
                goto out;
        }

out:
        mutex_enter(&ds->ds_sendstream_lock);
        list_remove(&ds->ds_sendstreams, dsp);
        mutex_exit(&ds->ds_sendstream_lock);

        kmem_free(drr, sizeof (dmu_replay_record_t));
        kmem_free(dsp, sizeof (dmu_sendarg_t));

        dsl_dataset_long_rele(ds, FTAG);

        return (err);
}

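/*
 * Generate a send stream for the snapshot with object number "tosnap",
 * optionally incremental from the snapshot object "fromsnap".  Both are
 * resolved within "pool".
 */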
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
#ifdef illumos
    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
#else
    boolean_t embedok, int outfd, struct file *fp, offset_t *off)
#endif
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        dsl_dataset_t *fromds = NULL;
        int err;

        err = dsl_pool_hold(pool, FTAG, &dp);
        if (err != 0)
                return (err);

        err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
        if (err != 0) {
                dsl_pool_rele(dp, FTAG);
                return (err);
        }

        if (fromsnap != 0) {
                zfs_bookmark_phys_t zb;
                boolean_t is_clone;

                err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
                if (err != 0) {
                        dsl_dataset_rele(ds, FTAG);
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
                if (!dsl_dataset_is_before(ds, fromds, 0))
                        err = SET_ERROR(EXDEV);
                zb.zbm_creation_time = fromds->ds_phys->ds_creation_time;
                zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg;
                zb.zbm_guid = fromds->ds_phys->ds_guid;
                is_clone = (fromds->ds_dir != ds->ds_dir);
                dsl_dataset_rele(fromds, FTAG);
                err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
                    outfd, fp, off);
        } else {
                err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
                    outfd, fp, off);
        }
        dsl_dataset_rele(ds, FTAG);
        return (err);
}

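/*
 * Generate a send stream from name-based arguments; "fromsnap" may name
 * a snapshot ('@') or a bookmark ('#').  On a writeable pool, a
 * non-snapshot "tosnap" is owned for the duration of the send so that
 * it cannot change.
 */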
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
#ifdef illumos
    int outfd, vnode_t *vp, offset_t *off)
#else
    int outfd, struct file *fp, offset_t *off)
#endif
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        int err;
        boolean_t owned = B_FALSE;

        if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
                return (SET_ERROR(EINVAL));

        err = dsl_pool_hold(tosnap, FTAG, &dp);
        if (err != 0)
                return (err);

        if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
                /*
                 * We are sending a filesystem or volume.  Ensure
                 * that it doesn't change by owning the dataset.
                 */
                err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
                owned = B_TRUE;
        } else {
                err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
        }
        if (err != 0) {
                dsl_pool_rele(dp, FTAG);
                return (err);
        }

        if (fromsnap != NULL) {
                zfs_bookmark_phys_t zb;
                boolean_t is_clone = B_FALSE;
                int fsnamelen = strchr(tosnap, '@') - tosnap;

                /*
                 * If the fromsnap is in a different filesystem, then
                 * mark the send stream as a clone.
                 */
                if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
                    (fromsnap[fsnamelen] != '@' &&
                    fromsnap[fsnamelen] != '#')) {
                        is_clone = B_TRUE;
                }

                if (strchr(fromsnap, '@')) {
                        dsl_dataset_t *fromds;
                        err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
                        if (err == 0) {
                                if (!dsl_dataset_is_before(ds, fromds, 0))
                                        err = SET_ERROR(EXDEV);
                                zb.zbm_creation_time =
                                    fromds->ds_phys->ds_creation_time;
                                zb.zbm_creation_txg =
                                    fromds->ds_phys->ds_creation_txg;
                                zb.zbm_guid = fromds->ds_phys->ds_guid;
                                is_clone = (ds->ds_dir != fromds->ds_dir);
                                dsl_dataset_rele(fromds, FTAG);
                        }
                } else {
                        err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
                }
                if (err != 0) {
                        dsl_dataset_rele(ds, FTAG);
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
                err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
                    outfd, fp, off);
        } else {
                err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
                    outfd, fp, off);
        }
        if (owned)
                dsl_dataset_disown(ds, FTAG);
        else
                dsl_dataset_rele(ds, FTAG);
        return (err);
}

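/*
 * Estimate the size of the send stream that would be generated for
 * "ds", optionally incremental from "fromds", without actually
 * traversing the dataset.
 */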
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        int err;
        uint64_t size;

        ASSERT(dsl_pool_config_held(dp));

        /* tosnap must be a snapshot */
        if (!dsl_dataset_is_snapshot(ds))
                return (SET_ERROR(EINVAL));

        /*
         * fromsnap must be an earlier snapshot from the same fs as tosnap,
         * or the origin's fs.
         */
        if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
                return (SET_ERROR(EXDEV));

        /* Get uncompressed size estimate of changed data. */
        if (fromds == NULL) {
                size = ds->ds_phys->ds_uncompressed_bytes;
        } else {
                uint64_t used, comp;
                err = dsl_dataset_space_written(fromds, ds,
                    &used, &comp, &size);
                if (err != 0)
                        return (err);
        }

        /*
         * Assume that space (both on-disk and in-stream) is dominated by
         * data.  We will adjust for indirect blocks and the copies property,
         * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
         * records).
         */

        /*
         * Subtract out approximate space used by indirect blocks.
         * Assume most space is used by data blocks (non-indirect, non-dnode).
         * Assume all blocks are recordsize.  Assume ditto blocks and
         * internal fragmentation counteract compression.
         *
         * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
         * block, which we observe in practice.
         */
        uint64_t recordsize;
        err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
        if (err != 0)
                return (err);
        size -= size / recordsize * sizeof (blkptr_t);

        /* Add in the space for the record associated with each block. */
        size += size / recordsize * sizeof (dmu_replay_record_t);

        *sizep = size;

        return (0);
}

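/* Arguments for the dmu_recv_begin() check/sync task pair. */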
typedef struct dmu_recv_begin_arg {
        const char *drba_origin;
        dmu_recv_cookie_t *drba_cookie;
        cred_t *drba_cred;
        uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

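/*
 * Validate that a stream may be received into the existing dataset "ds":
 * the temporary clone and target snapshot names must be free, the
 * snapshot limit must not be exceeded, and "fromguid" (if any) must
 * match a snapshot in this dataset's history.
 */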
static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
        uint64_t val;
        int error;
        dsl_pool_t *dp = ds->ds_dir->dd_pool;

        /* temporary clone name must not exist */
        error = zap_lookup(dp->dp_meta_objset,
            ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
            8, 1, &val);
        if (error != ENOENT)
                return (error == 0 ? EBUSY : error);

        /* new snapshot name must not exist */
        error = zap_lookup(dp->dp_meta_objset,
            ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
            8, 1, &val);
        if (error != ENOENT)
                return (error == 0 ? EEXIST : error);

        /*
         * Check snapshot limit before receiving.  We'll recheck at the
         * end, but might as well abort before receiving if we're already
         * over the limit.
         *
         * Note that we do not check the file system limit with
         * dsl_dir_fscount_check because the temporary %clones don't count
         * against that limit.
         */
        error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
            NULL, drba->drba_cred);
        if (error != 0)
                return (error);

        if (fromguid != 0) {
                dsl_dataset_t *snap;
                uint64_t obj = ds->ds_phys->ds_prev_snap_obj;

                /* Find snapshot in this dir that matches fromguid. */
                while (obj != 0) {
                        error = dsl_dataset_hold_obj(dp, obj, FTAG,
                            &snap);
                        if (error != 0)
                                return (SET_ERROR(ENODEV));
                        if (snap->ds_dir != ds->ds_dir) {
                                dsl_dataset_rele(snap, FTAG);
                                return (SET_ERROR(ENODEV));
                        }
                        if (snap->ds_phys->ds_guid == fromguid)
                                break;
                        obj = snap->ds_phys->ds_prev_snap_obj;
                        dsl_dataset_rele(snap, FTAG);
                }
                if (obj == 0)
                        return (SET_ERROR(ENODEV));

                if (drba->drba_cookie->drc_force) {
                        drba->drba_snapobj = obj;
                } else {
                        /*
                         * If we are not forcing, there must be no
                         * changes since fromsnap.
                         */
                        if (dsl_dataset_modified_since_snap(ds, snap)) {
                                dsl_dataset_rele(snap, FTAG);
                                return (SET_ERROR(ETXTBSY));
                        }
                        drba->drba_snapobj = ds->ds_prev->ds_object;
                }

                dsl_dataset_rele(snap, FTAG);
        } else {
                /* if full, most recent snapshot must be $ORIGIN */
                if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
                        return (SET_ERROR(ENODEV));
                drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj;
        }

        return (0);
}

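/*
 * Check half of the dmu_recv_begin() sync task: validate the stream
 * header against the pool's versions and features, and against the
 * target filesystem whether or not it already exists.
 */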
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
        dmu_recv_begin_arg_t *drba = arg;
        dsl_pool_t *dp = dmu_tx_pool(tx);
        struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
        uint64_t fromguid = drrb->drr_fromguid;
        int flags = drrb->drr_flags;
        int error;
        uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
        dsl_dataset_t *ds;
        const char *tofs = drba->drba_cookie->drc_tofs;

        /* already checked */
        ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

        if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
            DMU_COMPOUNDSTREAM ||
            drrb->drr_type >= DMU_OST_NUMTYPES ||
            ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
                return (SET_ERROR(EINVAL));

        /* Verify pool version supports SA if SA_SPILL feature set */
        if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
            spa_version(dp->dp_spa) < SPA_VERSION_SA)
                return (SET_ERROR(ENOTSUP));

        /*
         * The receiving code doesn't know how to translate a WRITE_EMBEDDED
         * record to a plain WRITE record, so the pool must have the
         * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
         * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
         */
        if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
                return (SET_ERROR(ENOTSUP));
        if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));

        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
                /* target fs already exists; recv into temp clone */

                /* Can't recv a clone into an existing fs */
                if (flags & DRR_FLAG_CLONE) {
                        dsl_dataset_rele(ds, FTAG);
                        return (SET_ERROR(EINVAL));
                }

                error = recv_begin_check_existing_impl(drba, ds, fromguid);
                dsl_dataset_rele(ds, FTAG);
        } else if (error == ENOENT) {
                /* target fs does not exist; must be a full backup or clone */
                char buf[MAXNAMELEN];

                /*
                 * If it's a non-clone incremental, we are missing the
                 * target fs, so fail the recv.
                 */
                if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
                        return (SET_ERROR(ENOENT));

                /* Open the parent of tofs */
                ASSERT3U(strlen(tofs), <, MAXNAMELEN);
                (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
                error = dsl_dataset_hold(dp, buf, FTAG, &ds);
                if (error != 0)
                        return (error);

                /*
                 * Check filesystem and snapshot limits before receiving.
                 * We'll recheck snapshot limits at the end (we create the
                 * filesystems and increment those counts during begin_sync).
                 */
                error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
                    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
                if (error != 0) {
                        dsl_dataset_rele(ds, FTAG);
                        return (error);
                }

                error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
                    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
                if (error != 0) {
                        dsl_dataset_rele(ds, FTAG);
                        return (error);
                }

                if (drba->drba_origin != NULL) {
                        dsl_dataset_t *origin;
                        error = dsl_dataset_hold(dp, drba->drba_origin,
                            FTAG, &origin);
                        if (error != 0) {
                                dsl_dataset_rele(ds, FTAG);
                                return (error);
                        }
                        if (!dsl_dataset_is_snapshot(origin)) {
                                dsl_dataset_rele(origin, FTAG);
                                dsl_dataset_rele(ds, FTAG);
                                return (SET_ERROR(EINVAL));
                        }
                        if (origin->ds_phys->ds_guid != fromguid) {
                                dsl_dataset_rele(origin, FTAG);
                                dsl_dataset_rele(ds, FTAG);
                                return (SET_ERROR(ENODEV));
                        }
                        dsl_dataset_rele(origin, FTAG);
                }
                dsl_dataset_rele(ds, FTAG);
                error = 0;
        }
        return (error);
}

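/*
 * Sync half of the dmu_recv_begin() sync task: create the temporary
 * clone (for an incremental into an existing filesystem) or the new
 * target dataset, own it, and mark it inconsistent while the receive
 * is in progress.
 */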
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
        dmu_recv_begin_arg_t *drba = arg;
        dsl_pool_t *dp = dmu_tx_pool(tx);
        struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
        const char *tofs = drba->drba_cookie->drc_tofs;
        dsl_dataset_t *ds, *newds;
        uint64_t dsobj;
        int error;
        uint64_t crflags;

        crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
            DS_FLAG_CI_DATASET : 0;

        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
                /* create temporary clone */
                dsl_dataset_t *snap = NULL;
                if (drba->drba_snapobj != 0) {
                        VERIFY0(dsl_dataset_hold_obj(dp,
                            drba->drba_snapobj, FTAG, &snap));
                }
                dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
                    snap, crflags, drba->drba_cred, tx);
                dsl_dataset_rele(snap, FTAG);
                dsl_dataset_rele(ds, FTAG);
        } else {
                dsl_dir_t *dd;
                const char *tail;
                dsl_dataset_t *origin = NULL;

                VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

                if (drba->drba_origin != NULL) {
                        VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
                            FTAG, &origin));
                }

                /* Create new dataset. */
                dsobj = dsl_dataset_create_sync(dd,
                    strrchr(tofs, '/') + 1,
                    origin, crflags, drba->drba_cred, tx);
                if (origin != NULL)
                        dsl_dataset_rele(origin, FTAG);
                dsl_dir_rele(dd, FTAG);
                drba->drba_cookie->drc_newfs = B_TRUE;
        }
        VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

        dmu_buf_will_dirty(newds->ds_dbuf, tx);
        newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

        /*
         * If we actually created a non-clone, we need to create the
         * objset in our new dataset.
         */
        if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
                (void) dmu_objset_create_impl(dp->dp_spa,
                    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
        }

        drba->drba_cookie->drc_ds = newds;

        spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
        dmu_recv_begin_arg_t drba = { 0 };
        dmu_replay_record_t *drr;

        bzero(drc, sizeof (dmu_recv_cookie_t));
        drc->drc_drrb = drrb;
        drc->drc_tosnap = tosnap;
        drc->drc_tofs = tofs;
        drc->drc_force = force;
        drc->drc_cred = CRED();

        if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                drc->drc_byteswap = B_TRUE;
        else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
                return (SET_ERROR(EINVAL));

        drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
        drr->drr_type = DRR_BEGIN;
        drr->drr_u.drr_begin = *drc->drc_drrb;
        if (drc->drc_byteswap) {
                fletcher_4_incremental_byteswap(drr,
                    sizeof (dmu_replay_record_t), &drc->drc_cksum);
        } else {
                fletcher_4_incremental_native(drr,
                    sizeof (dmu_replay_record_t), &drc->drc_cksum);
        }
        kmem_free(drr, sizeof (dmu_replay_record_t));

        if (drc->drc_byteswap) {
                drrb->drr_magic = BSWAP_64(drrb->drr_magic);
                drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
                drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
                drrb->drr_type = BSWAP_32(drrb->drr_type);
                drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
                drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
        }

        drba.drba_origin = origin;
        drba.drba_cookie = drc;
        drba.drba_cred = CRED();

        return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
            &drba, 5, ZFS_SPACE_CHECK_NORMAL));
}

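/*
 * State carried through the restore (receive) path: the input file and
 * thread, a scratch buffer for incoming records, the stream offset, and
 * the running checksum.
 */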
struct restorearg {
        int err;
        boolean_t byteswap;
        kthread_t *td;
        struct file *fp;
        char *buf;
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
        zio_cksum_t cksum;
        avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
        uint64_t        guid;
        dsl_dataset_t   *gme_ds;
        avl_node_t      avlnode;
} guid_map_entry_t;

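/* AVL comparison function ordering guid_map_entry_t nodes by GUID. */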
static int
guid_compare(const void *arg1, const void *arg2)
{
        const guid_map_entry_t *gmep1 = arg1;
        const guid_map_entry_t *gmep2 = arg2;

        if (gmep1->guid < gmep2->guid)
                return (-1);
        else if (gmep1->guid > gmep2->guid)
                return (1);
        return (0);
}

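/*
 * Destroy the GUID-to-dataset map, dropping the long hold taken on each
 * mapped dataset; intended as an onexit callback (see <sys/zfs_onexit.h>).
 */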
static void
free_guid_map_onexit(void *arg)
{
        avl_tree_t *ca = arg;
        void *cookie = NULL;
        guid_map_entry_t *gmep;

        while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
                dsl_dataset_long_rele(gmep->gme_ds, gmep);
                dsl_dataset_rele(gmep->gme_ds, gmep);
                kmem_free(gmep, sizeof (guid_map_entry_t));
        }
        avl_destroy(ca);
        kmem_free(ca, sizeof (avl_tree_t));
}

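/*
 * Read "len" bytes from the input file at "off" into "buf", returning
 * the number of bytes not read in "resid".
 */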
static int
restore_bytes(struct restorearg *ra, void *buf, int len, off_t off,
    ssize_t *resid)
{
        struct uio auio;
        struct iovec aiov;
        int error;

        aiov.iov_base = buf;
        aiov.iov_len = len;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_resid = len;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_rw = UIO_READ;
        auio.uio_offset = off;
        auio.uio_td = ra->td;
#ifdef _KERNEL
        error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
#else
        fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
        error = EOPNOTSUPP;
#endif
        *resid = auio.uio_resid;
        return (error);
}

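/*
 * Read "len" bytes (a multiple of 8) from the stream into the scratch
 * buffer, folding them into the running checksum.  Returns NULL and
 * sets ra->err on failure.
 */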
static void *
restore_read(struct restorearg *ra, int len)
{
        void *rv;
        int done = 0;

        /* some things will require 8-byte alignment, so everything must */
        ASSERT0(len % 8);

        while (done < len) {
                ssize_t resid;

                ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
                    len - done, ra->voff, &resid);

                if (resid == len - done)
                        ra->err = SET_ERROR(EINVAL);
                ra->voff += len - done - resid;
                done = len - resid;
                if (ra->err != 0)
                        return (NULL);
        }

        ASSERT3U(done, ==, len);
        rv = ra->buf;
        if (ra->byteswap)
                fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
        else
                fletcher_4_incremental_native(rv, len, &ra->cksum);
        return (rv);
}

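/*
 * Byteswap a replay record in place; used when the stream was generated
 * on a host of the opposite endianness.
 */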
1307 static void
1308 backup_byteswap(dmu_replay_record_t *drr)
1309 {
1310 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
1311 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
1312         drr->drr_type = BSWAP_32(drr->drr_type);
1313         drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
1314         switch (drr->drr_type) {
1315         case DRR_BEGIN:
1316                 DO64(drr_begin.drr_magic);
1317                 DO64(drr_begin.drr_versioninfo);
1318                 DO64(drr_begin.drr_creation_time);
1319                 DO32(drr_begin.drr_type);
1320                 DO32(drr_begin.drr_flags);
1321                 DO64(drr_begin.drr_toguid);
1322                 DO64(drr_begin.drr_fromguid);
1323                 break;
1324         case DRR_OBJECT:
1325                 DO64(drr_object.drr_object);
1326                 DO32(drr_object.drr_type);
1327                 DO32(drr_object.drr_bonustype);
1328                 DO32(drr_object.drr_blksz);
1329                 DO32(drr_object.drr_bonuslen);
1330                 DO64(drr_object.drr_toguid);
1331                 break;
1332         case DRR_FREEOBJECTS:
1333                 DO64(drr_freeobjects.drr_firstobj);
1334                 DO64(drr_freeobjects.drr_numobjs);
1335                 DO64(drr_freeobjects.drr_toguid);
1336                 break;
1337         case DRR_WRITE:
1338                 DO64(drr_write.drr_object);
1339                 DO32(drr_write.drr_type);
1340                 DO64(drr_write.drr_offset);
1341                 DO64(drr_write.drr_length);
1342                 DO64(drr_write.drr_toguid);
1343                 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
1344                 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
1345                 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
1346                 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
1347                 DO64(drr_write.drr_key.ddk_prop);
1348                 break;
1349         case DRR_WRITE_BYREF:
1350                 DO64(drr_write_byref.drr_object);
1351                 DO64(drr_write_byref.drr_offset);
1352                 DO64(drr_write_byref.drr_length);
1353                 DO64(drr_write_byref.drr_toguid);
1354                 DO64(drr_write_byref.drr_refguid);
1355                 DO64(drr_write_byref.drr_refobject);
1356                 DO64(drr_write_byref.drr_refoffset);
1357                 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
1358                 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
1359                 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
1360                 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
1361                 DO64(drr_write_byref.drr_key.ddk_prop);
1362                 break;
1363         case DRR_WRITE_EMBEDDED:
1364                 DO64(drr_write_embedded.drr_object);
1365                 DO64(drr_write_embedded.drr_offset);
1366                 DO64(drr_write_embedded.drr_length);
1367                 DO64(drr_write_embedded.drr_toguid);
1368                 DO32(drr_write_embedded.drr_lsize);
1369                 DO32(drr_write_embedded.drr_psize);
1370                 break;
1371         case DRR_FREE:
1372                 DO64(drr_free.drr_object);
1373                 DO64(drr_free.drr_offset);
1374                 DO64(drr_free.drr_length);
1375                 DO64(drr_free.drr_toguid);
1376                 break;
1377         case DRR_SPILL:
1378                 DO64(drr_spill.drr_object);
1379                 DO64(drr_spill.drr_length);
1380                 DO64(drr_spill.drr_toguid);
1381                 break;
1382         case DRR_END:
1383                 DO64(drr_end.drr_checksum.zc_word[0]);
1384                 DO64(drr_end.drr_checksum.zc_word[1]);
1385                 DO64(drr_end.drr_checksum.zc_word[2]);
1386                 DO64(drr_end.drr_checksum.zc_word[3]);
1387                 DO64(drr_end.drr_toguid);
1388                 break;
1389         }
1390 #undef DO64
1391 #undef DO32
1392 }
1393
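/*
 * Apply a DRR_OBJECT record: validate its fields, allocate the object
 * if it is currently free (dmu_object_claim()) or reshape it if it
 * already exists (dmu_object_reclaim()), then set its checksum and
 * compression properties and install the bonus buffer carried in the
 * stream, byteswapping the bonus data if necessary.
 */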
1394 static int
1395 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
1396 {
1397         int err;
1398         dmu_tx_t *tx;
1399         void *data = NULL;
1400
1401         if (drro->drr_type == DMU_OT_NONE ||
1402             !DMU_OT_IS_VALID(drro->drr_type) ||
1403             !DMU_OT_IS_VALID(drro->drr_bonustype) ||
1404             drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
1405             drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1406             P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1407             drro->drr_blksz < SPA_MINBLOCKSIZE ||
1408             drro->drr_blksz > SPA_MAXBLOCKSIZE ||
1409             drro->drr_bonuslen > DN_MAX_BONUSLEN) {
1410                 return (SET_ERROR(EINVAL));
1411         }
1412
1413         err = dmu_object_info(os, drro->drr_object, NULL);
1414
1415         if (err != 0 && err != ENOENT)
1416                 return (SET_ERROR(EINVAL));
1417
1418         if (drro->drr_bonuslen) {
1419                 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
1420                 if (ra->err != 0)
1421                         return (ra->err);
1422         }
1423
1424         if (err == ENOENT) {
1425                 /* currently free, want to be allocated */
1426                 tx = dmu_tx_create(os);
1427                 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1428                 err = dmu_tx_assign(tx, TXG_WAIT);
1429                 if (err != 0) {
1430                         dmu_tx_abort(tx);
1431                         return (err);
1432                 }
1433                 err = dmu_object_claim(os, drro->drr_object,
1434                     drro->drr_type, drro->drr_blksz,
1435                     drro->drr_bonustype, drro->drr_bonuslen, tx);
1436                 dmu_tx_commit(tx);
1437         } else {
1438                 /* currently allocated, want to be allocated */
1439                 err = dmu_object_reclaim(os, drro->drr_object,
1440                     drro->drr_type, drro->drr_blksz,
1441                     drro->drr_bonustype, drro->drr_bonuslen);
1442         }
1443         if (err != 0) {
1444                 return (SET_ERROR(EINVAL));
1445         }
1446
1447         tx = dmu_tx_create(os);
1448         dmu_tx_hold_bonus(tx, drro->drr_object);
1449         err = dmu_tx_assign(tx, TXG_WAIT);
1450         if (err != 0) {
1451                 dmu_tx_abort(tx);
1452                 return (err);
1453         }
1454
1455         dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
1456             tx);
1457         dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
1458
1459         if (data != NULL) {
1460                 dmu_buf_t *db;
1461
1462                 VERIFY0(dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
1463                 dmu_buf_will_dirty(db, tx);
1464
1465                 ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
1466                 bcopy(data, db->db_data, drro->drr_bonuslen);
1467                 if (ra->byteswap) {
1468                         dmu_object_byteswap_t byteswap =
1469                             DMU_OT_BYTESWAP(drro->drr_bonustype);
1470                         dmu_ot_byteswap[byteswap].ob_func(db->db_data,
1471                             drro->drr_bonuslen);
1472                 }
1473                 dmu_buf_rele(db, FTAG);
1474         }
1475         dmu_tx_commit(tx);
1476         return (0);
1477 }
1478
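/*
 * Apply a DRR_FREEOBJECTS record: free every allocated object in the
 * range [drr_firstobj, drr_firstobj + drr_numobjs).
 */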
1479 /* ARGSUSED */
1480 static int
1481 restore_freeobjects(struct restorearg *ra, objset_t *os,
1482     struct drr_freeobjects *drrfo)
1483 {
1484         uint64_t obj;
1485
1486         if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1487                 return (SET_ERROR(EINVAL));
1488
1489         for (obj = drrfo->drr_firstobj;
1490             obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
1491             (void) dmu_object_next(os, &obj, FALSE, 0)) {
1492                 int err;
1493
1494                 if (dmu_object_info(os, obj, NULL) != 0)
1495                         continue;
1496
1497                 err = dmu_free_long_object(os, obj);
1498                 if (err != 0)
1499                         return (err);
1500         }
1501         return (0);
1502 }
1503
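/*
 * Apply a DRR_WRITE record: read drr_length bytes of payload from the
 * stream and write them to the given object at drr_offset.
 */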
1504 static int
1505 restore_write(struct restorearg *ra, objset_t *os,
1506     struct drr_write *drrw)
1507 {
1508         dmu_tx_t *tx;
1509         void *data;
1510         int err;
1511
1512         if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
1513             !DMU_OT_IS_VALID(drrw->drr_type))
1514                 return (SET_ERROR(EINVAL));
1515
1516         data = restore_read(ra, drrw->drr_length);
1517         if (data == NULL)
1518                 return (ra->err);
1519
1520         if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
1521                 return (SET_ERROR(EINVAL));
1522
1523         tx = dmu_tx_create(os);
1524
1525         dmu_tx_hold_write(tx, drrw->drr_object,
1526             drrw->drr_offset, drrw->drr_length);
1527         err = dmu_tx_assign(tx, TXG_WAIT);
1528         if (err != 0) {
1529                 dmu_tx_abort(tx);
1530                 return (err);
1531         }
1532         if (ra->byteswap) {
1533                 dmu_object_byteswap_t byteswap =
1534                     DMU_OT_BYTESWAP(drrw->drr_type);
1535                 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
1536         }
1537         dmu_write(os, drrw->drr_object,
1538             drrw->drr_offset, drrw->drr_length, data, tx);
1539         dmu_tx_commit(tx);
1540         return (0);
1541 }
1542
1543 /*
1544  * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
1545  * streams to refer to a copy of the data that is already on the
1546  * system because it came in earlier in the stream.  This function
1547  * finds the earlier copy of the data, and uses that copy instead of
1548  * data from the stream to fulfill this write.
1549  */
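/*
 * The earlier copy is located through ra->guid_to_ds_map, which maps
 * each dataset GUID seen so far in the stream to the snapshot received
 * for it (populated by add_ds_to_guidmap()).  In effect:
 *
 *	gmep = avl_find(guid_to_ds_map, drr_refguid, ...);
 *	read the block at (drr_refobject, drr_refoffset) in gmep->gme_ds;
 *	write that data at (drr_object, drr_offset) in the target objset.
 */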
1550 static int
1551 restore_write_byref(struct restorearg *ra, objset_t *os,
1552     struct drr_write_byref *drrwbr)
1553 {
1554         dmu_tx_t *tx;
1555         int err;
1556         guid_map_entry_t gmesrch;
1557         guid_map_entry_t *gmep;
1558         avl_index_t where;
1559         objset_t *ref_os = NULL;
1560         dmu_buf_t *dbp;
1561
1562         if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
1563                 return (SET_ERROR(EINVAL));
1564
1565         /*
1566          * If the GUID of the referenced dataset is different from the
1567          * GUID of the target dataset, find the referenced dataset.
1568          */
1569         if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1570                 gmesrch.guid = drrwbr->drr_refguid;
1571                 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
1572                     &where)) == NULL) {
1573                         return (SET_ERROR(EINVAL));
1574                 }
1575                 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
1576                         return (SET_ERROR(EINVAL));
1577         } else {
1578                 ref_os = os;
1579         }
1580
1581         err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1582             drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
1583         if (err != 0)
1584                 return (err);
1585
1586         tx = dmu_tx_create(os);
1587
1588         dmu_tx_hold_write(tx, drrwbr->drr_object,
1589             drrwbr->drr_offset, drrwbr->drr_length);
1590         err = dmu_tx_assign(tx, TXG_WAIT);
1591         if (err != 0) {
1592                 dmu_tx_abort(tx);
1593                 return (err);
1594         }
1595         dmu_write(os, drrwbr->drr_object,
1596             drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1597         dmu_buf_rele(dbp, FTAG);
1598         dmu_tx_commit(tx);
1599         return (0);
1600 }
1601
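/*
 * Apply a DRR_WRITE_EMBEDDED record: the payload is small enough to be
 * stored directly in a block pointer, so it is handed to
 * dmu_write_embedded() instead of being written as an ordinary block.
 */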
1602 static int
1603 restore_write_embedded(struct restorearg *ra, objset_t *os,
1604     struct drr_write_embedded *drrwnp)
1605 {
1606         dmu_tx_t *tx;
1607         int err;
1608         void *data;
1609
1610         if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
1611                 return (SET_ERROR(EINVAL));
1612
1613         if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
1614                 return (SET_ERROR(EINVAL));
1615
1616         if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
1617                 return (SET_ERROR(EINVAL));
1618         if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
1619                 return (SET_ERROR(EINVAL));
1620
1621         data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
1622         if (data == NULL)
1623                 return (ra->err);
1624
1625         tx = dmu_tx_create(os);
1626
1627         dmu_tx_hold_write(tx, drrwnp->drr_object,
1628             drrwnp->drr_offset, drrwnp->drr_length);
1629         err = dmu_tx_assign(tx, TXG_WAIT);
1630         if (err != 0) {
1631                 dmu_tx_abort(tx);
1632                 return (err);
1633         }
1634
1635         dmu_write_embedded(os, drrwnp->drr_object,
1636             drrwnp->drr_offset, data, drrwnp->drr_etype,
1637             drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
1638             ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
1639
1640         dmu_tx_commit(tx);
1641         return (0);
1642 }
1643
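/*
 * Apply a DRR_SPILL record: copy the payload into the object's spill
 * block (overflow storage for system attributes), growing the spill
 * block first if it is smaller than drr_length.
 */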
1644 static int
1645 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
1646 {
1647         dmu_tx_t *tx;
1648         void *data;
1649         dmu_buf_t *db, *db_spill;
1650         int err;
1651
1652         if (drrs->drr_length < SPA_MINBLOCKSIZE ||
1653             drrs->drr_length > SPA_MAXBLOCKSIZE)
1654                 return (SET_ERROR(EINVAL));
1655
1656         data = restore_read(ra, drrs->drr_length);
1657         if (data == NULL)
1658                 return (ra->err);
1659
1660         if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
1661                 return (SET_ERROR(EINVAL));
1662
1663         VERIFY0(dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
1664         if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
1665                 dmu_buf_rele(db, FTAG);
1666                 return (err);
1667         }
1668
1669         tx = dmu_tx_create(os);
1670
1671         dmu_tx_hold_spill(tx, db->db_object);
1672
1673         err = dmu_tx_assign(tx, TXG_WAIT);
1674         if (err != 0) {
1675                 dmu_buf_rele(db, FTAG);
1676                 dmu_buf_rele(db_spill, FTAG);
1677                 dmu_tx_abort(tx);
1678                 return (err);
1679         }
1680         dmu_buf_will_dirty(db_spill, tx);
1681
1682         if (db_spill->db_size < drrs->drr_length)
1683                 VERIFY0(dbuf_spill_set_blksz(db_spill,
1684                     drrs->drr_length, tx));
1685         bcopy(data, db_spill->db_data, drrs->drr_length);
1686
1687         dmu_buf_rele(db, FTAG);
1688         dmu_buf_rele(db_spill, FTAG);
1689
1690         dmu_tx_commit(tx);
1691         return (0);
1692 }
1693
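/*
 * Apply a DRR_FREE record: punch a hole of drr_length bytes at
 * drr_offset in the given object.  A drr_length of -1ULL frees
 * everything from drr_offset to the end of the object.
 */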
1694 /* ARGSUSED */
1695 static int
1696 restore_free(struct restorearg *ra, objset_t *os,
1697     struct drr_free *drrf)
1698 {
1699         int err;
1700
1701         if (drrf->drr_length != -1ULL &&
1702             drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1703                 return (SET_ERROR(EINVAL));
1704
1705         if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1706                 return (SET_ERROR(EINVAL));
1707
1708         err = dmu_free_long_range(os, drrf->drr_object,
1709             drrf->drr_offset, drrf->drr_length);
1710         return (err);
1711 }
1712
1713 /* Used to destroy the drc_ds on error: disown it and destroy the dataset. */
1714 static void
1715 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
1716 {
1717         char name[MAXNAMELEN];
1718         dsl_dataset_name(drc->drc_ds, name);
1719         dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
1720         (void) dsl_destroy_head(name);
1721 }
1722
1723 /*
1724  * NB: callers *must* call dmu_recv_end() if this succeeds.
1725  */
1726 int
1727 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
1728     int cleanup_fd, uint64_t *action_handlep)
1729 {
1730         struct restorearg ra = { 0 };
1731         dmu_replay_record_t *drr;
1732         objset_t *os;
1733         zio_cksum_t pcksum;
1734         int featureflags;
1735
1736         ra.byteswap = drc->drc_byteswap;
1737         ra.cksum = drc->drc_cksum;
1738         ra.td = curthread;
1739         ra.fp = fp;
1740         ra.voff = *voffp;
1741         ra.bufsize = 1<<20;
1742         ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
1743
1744         /* these were verified in dmu_recv_begin */
1745         ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
1746             DMU_SUBSTREAM);
1747         ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
1748
1749         /*
1750          * Open the objset we are modifying.
1751          */
1752         VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
1753
1754         ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
1755
1756         featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
1757
1758         /* If this stream is dedup'ed, set up the AVL tree for guid mapping. */
1759         if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
1760                 minor_t minor;
1761
1762                 if (cleanup_fd == -1) {
1763                         ra.err = SET_ERROR(EBADF);
1764                         goto out;
1765                 }
1766                 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
1767                 if (ra.err != 0) {
1768                         cleanup_fd = -1;
1769                         goto out;
1770                 }
1771
1772                 if (*action_handlep == 0) {
1773                         ra.guid_to_ds_map =
1774                             kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
1775                         avl_create(ra.guid_to_ds_map, guid_compare,
1776                             sizeof (guid_map_entry_t),
1777                             offsetof(guid_map_entry_t, avlnode));
1778                         ra.err = zfs_onexit_add_cb(minor,
1779                             free_guid_map_onexit, ra.guid_to_ds_map,
1780                             action_handlep);
1781                         if (ra.err != 0)
1782                                 goto out;
1783                 } else {
1784                         ra.err = zfs_onexit_cb_data(minor, *action_handlep,
1785                             (void **)&ra.guid_to_ds_map);
1786                         if (ra.err != 0)
1787                                 goto out;
1788                 }
1789
1790                 drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
1791         }
1792
1793         /*
1794          * Read records and process them.
1795          */
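        /*
         * Each pass reads one record header; the restore_*() handlers
         * pull in their own payloads with restore_read(), which also
         * folds the bytes into ra.cksum.  pcksum therefore trails by
         * one record, holding the checksum of everything before the
         * record most recently read, which is exactly what a DRR_END
         * record's stored checksum covers.
         */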
1796         pcksum = ra.cksum;
1797         while (ra.err == 0 &&
1798             NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
1799                 if (issig(JUSTLOOKING) && issig(FORREAL)) {
1800                         ra.err = SET_ERROR(EINTR);
1801                         goto out;
1802                 }
1803
1804                 if (ra.byteswap)
1805                         backup_byteswap(drr);
1806
1807                 switch (drr->drr_type) {
1808                 case DRR_OBJECT:
1809                 {
1810                         /*
1811                          * We need to make a copy of the record header,
1812                          * because restore_{object,write} may need to call
1813                          * restore_read(), which will invalidate drr.
1814                          */
1815                         struct drr_object drro = drr->drr_u.drr_object;
1816                         ra.err = restore_object(&ra, os, &drro);
1817                         break;
1818                 }
1819                 case DRR_FREEOBJECTS:
1820                 {
1821                         struct drr_freeobjects drrfo =
1822                             drr->drr_u.drr_freeobjects;
1823                         ra.err = restore_freeobjects(&ra, os, &drrfo);
1824                         break;
1825                 }
1826                 case DRR_WRITE:
1827                 {
1828                         struct drr_write drrw = drr->drr_u.drr_write;
1829                         ra.err = restore_write(&ra, os, &drrw);
1830                         break;
1831                 }
1832                 case DRR_WRITE_BYREF:
1833                 {
1834                         struct drr_write_byref drrwbr =
1835                             drr->drr_u.drr_write_byref;
1836                         ra.err = restore_write_byref(&ra, os, &drrwbr);
1837                         break;
1838                 }
1839                 case DRR_WRITE_EMBEDDED:
1840                 {
1841                         struct drr_write_embedded drrwe =
1842                             drr->drr_u.drr_write_embedded;
1843                         ra.err = restore_write_embedded(&ra, os, &drrwe);
1844                         break;
1845                 }
1846                 case DRR_FREE:
1847                 {
1848                         struct drr_free drrf = drr->drr_u.drr_free;
1849                         ra.err = restore_free(&ra, os, &drrf);
1850                         break;
1851                 }
1852                 case DRR_END:
1853                 {
1854                         struct drr_end drre = drr->drr_u.drr_end;
1855                         /*
1856                          * We compare against the *previous* checksum
1857                          * value, because the stored checksum is of
1858                          * everything before the DRR_END record.
1859                          */
1860                         if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
1861                                 ra.err = SET_ERROR(ECKSUM);
1862                         goto out;
1863                 }
1864                 case DRR_SPILL:
1865                 {
1866                         struct drr_spill drrs = drr->drr_u.drr_spill;
1867                         ra.err = restore_spill(&ra, os, &drrs);
1868                         break;
1869                 }
1870                 default:
1871                         ra.err = SET_ERROR(EINVAL);
1872                         goto out;
1873                 }
1874                 pcksum = ra.cksum;
1875         }
1876         ASSERT(ra.err != 0);
1877
1878 out:
1879         if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
1880                 zfs_onexit_fd_rele(cleanup_fd);
1881
1882         if (ra.err != 0) {
1883                 /*
1884                  * Destroy what we created, so we don't leave it in the
1885                  * inconsistent restoring state (DS_FLAG_INCONSISTENT).
1886                  */
1887                 dmu_recv_cleanup_ds(drc);
1888         }
1889
1890         kmem_free(ra.buf, ra.bufsize);
1891         *voffp = ra.voff;
1892         return (ra.err);
1893 }
1894
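/*
 * Check half of the dsl_sync_task that completes a receive.  For a
 * receive into an existing filesystem this verifies that snapshots
 * following the origin can be destroyed (when drc_force is set), that
 * the received clone can be swapped with the existing head, and that
 * the new snapshot can be created; for a brand-new filesystem only the
 * snapshot creation is checked.
 */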
1895 static int
1896 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
1897 {
1898         dmu_recv_cookie_t *drc = arg;
1899         dsl_pool_t *dp = dmu_tx_pool(tx);
1900         int error;
1901
1902         ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
1903
1904         if (!drc->drc_newfs) {
1905                 dsl_dataset_t *origin_head;
1906
1907                 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
1908                 if (error != 0)
1909                         return (error);
1910                 if (drc->drc_force) {
1911                         /*
1912                          * We will destroy any snapshots in tofs (i.e. before
1913                          * origin_head) that are after the origin (which is
1914                          * the snap before drc_ds, because drc_ds cannot
1915                          * have any snaps of its own).
1916                          */
1917                         uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
1918                         while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
1919                                 dsl_dataset_t *snap;
1920                                 error = dsl_dataset_hold_obj(dp, obj, FTAG,
1921                                     &snap);
1922                                 if (error != 0)
1923                                         return (error);
1924                                 if (snap->ds_dir != origin_head->ds_dir)
1925                                         error = SET_ERROR(EINVAL);
1926                                 if (error == 0)  {
1927                                         error = dsl_destroy_snapshot_check_impl(
1928                                             snap, B_FALSE);
1929                                 }
1930                                 obj = snap->ds_phys->ds_prev_snap_obj;
1931                                 dsl_dataset_rele(snap, FTAG);
1932                                 if (error != 0)
1933                                         return (error);
1934                         }
1935                 }
1936                 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
1937                     origin_head, drc->drc_force, drc->drc_owner, tx);
1938                 if (error != 0) {
1939                         dsl_dataset_rele(origin_head, FTAG);
1940                         return (error);
1941                 }
1942                 error = dsl_dataset_snapshot_check_impl(origin_head,
1943                     drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
1944                 dsl_dataset_rele(origin_head, FTAG);
1945                 if (error != 0)
1946                         return (error);
1947
1948                 error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
1949         } else {
1950                 error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
1951                     drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
1952         }
1953         return (error);
1954 }
1955
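/*
 * Sync half of the receive-completion task: perform the destroys,
 * clone swap, and snapshot creation validated above, clear
 * DS_FLAG_INCONSISTENT, and record the new snapshot's object number in
 * drc_newsnapobj for the dedup guid map.
 */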
1956 static void
1957 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
1958 {
1959         dmu_recv_cookie_t *drc = arg;
1960         dsl_pool_t *dp = dmu_tx_pool(tx);
1961
1962         spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
1963             tx, "snap=%s", drc->drc_tosnap);
1964
1965         if (!drc->drc_newfs) {
1966                 dsl_dataset_t *origin_head;
1967
1968                 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
1969                     &origin_head));
1970
1971                 if (drc->drc_force) {
1972                         /*
1973                          * Destroy any snapshots of drc_tofs (origin_head)
1974                          * after the origin (the snap before drc_ds).
1975                          */
1976                         uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
1977                         while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
1978                                 dsl_dataset_t *snap;
1979                                 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
1980                                     &snap));
1981                                 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
1982                                 obj = snap->ds_phys->ds_prev_snap_obj;
1983                                 dsl_destroy_snapshot_sync_impl(snap,
1984                                     B_FALSE, tx);
1985                                 dsl_dataset_rele(snap, FTAG);
1986                         }
1987                 }
1988                 VERIFY3P(drc->drc_ds->ds_prev, ==,
1989                     origin_head->ds_prev);
1990
1991                 dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
1992                     origin_head, tx);
1993                 dsl_dataset_snapshot_sync_impl(origin_head,
1994                     drc->drc_tosnap, tx);
1995
1996                 /* set snapshot's creation time and guid */
1997                 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
1998                 origin_head->ds_prev->ds_phys->ds_creation_time =
1999                     drc->drc_drrb->drr_creation_time;
2000                 origin_head->ds_prev->ds_phys->ds_guid =
2001                     drc->drc_drrb->drr_toguid;
2002                 origin_head->ds_prev->ds_phys->ds_flags &=
2003                     ~DS_FLAG_INCONSISTENT;
2004
2005                 dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2006                 origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
2007
2008                 dsl_dataset_rele(origin_head, FTAG);
2009                 dsl_destroy_head_sync_impl(drc->drc_ds, tx);
2010
2011                 if (drc->drc_owner != NULL)
2012                         VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
2013         } else {
2014                 dsl_dataset_t *ds = drc->drc_ds;
2015
2016                 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
2017
2018                 /* set snapshot's creation time and guid */
2019                 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2020                 ds->ds_prev->ds_phys->ds_creation_time =
2021                     drc->drc_drrb->drr_creation_time;
2022                 ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
2023                 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
2024
2025                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
2026                 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
2027         }
2028         drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
2029         /*
2030          * Release the hold from dmu_recv_begin.  This must be done before
2031          * we return to open context, so that when we free the dataset's dnode,
2032          * we can evict its bonus buffer.
2033          */
2034         dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
2035         drc->drc_ds = NULL;
2036 }
2037
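/*
 * Insert the just-received snapshot into the dedup guid map, keyed by
 * its GUID, taking a long hold so the dataset remains available for
 * later DRR_WRITE_BYREF lookups.  The hold is released by the
 * free_guid_map_onexit() callback registered in dmu_recv_stream().
 */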
2038 static int
2039 add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
2040 {
2041         dsl_pool_t *dp;
2042         dsl_dataset_t *snapds;
2043         guid_map_entry_t *gmep;
2044         int err;
2045
2046         ASSERT(guid_map != NULL);
2047
2048         err = dsl_pool_hold(name, FTAG, &dp);
2049         if (err != 0)
2050                 return (err);
2051         gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
2052         err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
2053         if (err == 0) {
2054                 gmep->guid = snapds->ds_phys->ds_guid;
2055                 gmep->gme_ds = snapds;
2056                 avl_add(guid_map, gmep);
2057                 dsl_dataset_long_hold(snapds, gmep);
2058         } else
2059                 kmem_free(gmep, sizeof (*gmep));
2060
2061         dsl_pool_rele(dp, FTAG);
2062         return (err);
2063 }
2064
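/* Estimated blocks modified by dmu_recv_end_sync(); for dsl_sync_task(). */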
2065 static int dmu_recv_end_modified_blocks = 3;
2066
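/*
 * Complete a receive into an existing filesystem: run the check/sync
 * task above, which swaps the received clone with the existing head
 * and then destroys the clone.
 */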
2067 static int
2068 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
2069 {
2070         int error;
2071         char name[MAXNAMELEN];
2072
2073 #ifdef _KERNEL
2074         /*
2075          * We will be destroying the ds; make sure its origin is unmounted if
2076          * necessary.
2077          */
2078         dsl_dataset_name(drc->drc_ds, name);
2079         zfs_destroy_unmount_origin(name);
2080 #endif
2081
2082         error = dsl_sync_task(drc->drc_tofs,
2083             dmu_recv_end_check, dmu_recv_end_sync, drc,
2084             dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
2085
2086         if (error != 0)
2087                 dmu_recv_cleanup_ds(drc);
2088         return (error);
2089 }
2090
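/*
 * Complete a receive that created a new filesystem; on success, add
 * the new snapshot to the dedup guid map if this stream is dedup'ed.
 */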
2091 static int
2092 dmu_recv_new_end(dmu_recv_cookie_t *drc)
2093 {
2094         int error;
2095
2096         error = dsl_sync_task(drc->drc_tofs,
2097             dmu_recv_end_check, dmu_recv_end_sync, drc,
2098             dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
2099
2100         if (error != 0) {
2101                 dmu_recv_cleanup_ds(drc);
2102         } else if (drc->drc_guid_to_ds_map != NULL) {
2103                 (void) add_ds_to_guidmap(drc->drc_tofs,
2104                     drc->drc_guid_to_ds_map,
2105                     drc->drc_newsnapobj);
2106         }
2107         return (error);
2108 }
2109
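/*
 * Finish a receive started by dmu_recv_begin() and fed by
 * dmu_recv_stream(), dispatching on whether the target filesystem
 * already existed.
 */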
2110 int
2111 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
2112 {
2113         drc->drc_owner = owner;
2114
2115         if (drc->drc_newfs)
2116                 return (dmu_recv_new_end(drc));
2117         else
2118                 return (dmu_recv_existing_end(drc));
2119 }
2120
2121 /*
2122  * Return TRUE if this objset is currently being received into.
2123  */
2124 boolean_t
2125 dmu_objset_is_receiving(objset_t *os)
2126 {
2127         return (os->os_dsl_dataset != NULL &&
2128             os->os_dsl_dataset->ds_owner == dmu_recv_tag);
2129 }