]> CyberLeo.Net >> Repos - FreeBSD/releng/9.0.git/blob - sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
Copy stable/9 to releng/9.0 as part of the FreeBSD 9.0-RELEASE release
[FreeBSD/releng/9.0.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dmu_send.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26  */
27
28 #include <sys/dmu.h>
29 #include <sys/dmu_impl.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dbuf.h>
32 #include <sys/dnode.h>
33 #include <sys/zfs_context.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/dmu_traverse.h>
36 #include <sys/dsl_dataset.h>
37 #include <sys/dsl_dir.h>
38 #include <sys/dsl_prop.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/dsl_synctask.h>
41 #include <sys/zfs_ioctl.h>
42 #include <sys/zap.h>
43 #include <sys/zio_checksum.h>
44 #include <sys/zfs_znode.h>
45 #include <zfs_fletcher.h>
46 #include <sys/avl.h>
47 #include <sys/ddt.h>
48 #include <sys/zfs_onexit.h>
49
50 static char *dmu_recv_tag = "dmu_recv_tag";
51
52 /*
53  * The list of data whose inclusion in a send stream can be pending from
54  * one call to backup_cb to another.  Multiple calls to dump_free() and
55  * dump_freeobjects() can be aggregated into a single DRR_FREE or
56  * DRR_FREEOBJECTS replay record.
57  */
58 typedef enum {
59         PENDING_NONE,
60         PENDING_FREE,
61         PENDING_FREEOBJECTS
62 } pendop_t;
63
64 struct backuparg {
65         dmu_replay_record_t *drr;
66         kthread_t *td;
67         struct file *fp;
68         offset_t *off;
69         objset_t *os;
70         zio_cksum_t zc;
71         uint64_t toguid;
72         int err;
73         pendop_t pending_op;
74 };
75
/*
 * Write len bytes from buf to the send stream's output file, first
 * folding the data into the stream's running fletcher-4 checksum.
 * len must be a multiple of 8 (replay records are 8-byte aligned).
 * Advances *ba->off by len, caches any write error in ba->err, and
 * returns that error (0 on success).
 */
static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	struct uio auio;
	struct iovec aiov;
	ASSERT3U(len % 8, ==, 0);

	/* The stream checksum covers every byte written, in order. */
	fletcher_4_incremental_native(buf, len, &ba->zc);
	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	/* offset -1: presumably fo_write uses the file's own offset (flags
	 * are 0, so FOF_OFFSET is not set) -- TODO confirm */
	auio.uio_offset = (off_t)-1;
	auio.uio_td = ba->td;
#ifdef _KERNEL
	if (ba->fp->f_type == DTYPE_VNODE)
		bwillwrite();
	ba->err = fo_write(ba->fp, &auio, ba->td->td_ucred, 0, ba->td);
#else
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	ba->err = EOPNOTSUPP;
#endif
	/* NOTE(review): the offset is advanced even if the write failed. */
	*ba->off += len;
	return (ba->err);
}
104
/*
 * Queue a DRR_FREE record covering [offset, offset + length) of the
 * given object.  A free that directly extends the currently pending
 * DRR_FREE record is aggregated into it rather than written out.  A
 * length of -1ULL means "free from offset to the end of the object"
 * and is flushed immediately.  Returns 0, or EINTR if a stream write
 * failed.
 */
static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(ba->drr->drr_u.drr_free);

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records.
	 */
	if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	if (ba->pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = ba->toguid;
	if (length == -1ULL) {
		/* "free to end of object" can't be aggregated; flush now */
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		ba->pending_op = PENDING_FREE;
	}

	return (0);
}
164
/*
 * Emit a DRR_WRITE record describing one level-0 data block, followed
 * by the block's contents.  The block pointer supplies the checksum
 * type and DDT key (lsize/psize/compression/checksum) used by
 * dedup-aware receive.  Returns 0, or EINTR if a stream write failed.
 */
static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(ba->drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = ba->toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	/* only dedup-capable checksums can be used to match blocks on recv */
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(ba, data, blksz) != 0)
		return (EINTR);
	return (0);
}
205
206 static int
207 dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
208 {
209         struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
210
211         if (ba->pending_op != PENDING_NONE) {
212                 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
213                         return (EINTR);
214                 ba->pending_op = PENDING_NONE;
215         }
216
217         /* write a SPILL record */
218         bzero(ba->drr, sizeof (dmu_replay_record_t));
219         ba->drr->drr_type = DRR_SPILL;
220         drrs->drr_object = object;
221         drrs->drr_length = blksz;
222         drrs->drr_toguid = ba->toguid;
223
224         if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
225                 return (EINTR);
226         if (dump_bytes(ba, data, blksz))
227                 return (EINTR);
228         return (0);
229 }
230
/*
 * Queue a DRR_FREEOBJECTS record covering numobjs object numbers
 * starting at firstobj.  A range that directly extends the pending
 * DRR_FREEOBJECTS record is aggregated into it.  The record is left
 * pending (not written) so later calls can keep extending it.
 * Returns 0, or EINTR if a stream write failed.
 */
static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records.
	 */
	if (ba->pending_op != PENDING_NONE &&
	    ba->pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	if (ba->pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = ba->toguid;

	ba->pending_op = PENDING_FREEOBJECTS;

	return (0);
}
277
/*
 * Emit a DRR_OBJECT record for the given dnode, followed by its bonus
 * buffer (rounded up to 8 bytes), then a "free to end of object"
 * DRR_FREE so the receiver truncates anything past dn_maxblkid.  A
 * NULL or unallocated dnode is translated into a one-object
 * DRR_FREEOBJECTS instead.  Returns 0, or EINTR on write failure.
 */
static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(ba->drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	/* data block size is stored in 512-byte sectors; convert to bytes */
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = ba->toguid;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}
318
/*
 * Number of bytes of object data spanned by one block pointer at the
 * given indirection level of dnode dnp's block tree (level 0 is a data
 * block; each higher level multiplies by the block pointers per
 * indirect block).
 */
#define BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
322
/*
 * Callback invoked by traverse_dataset() for each block pointer visited
 * in pre-order.  Translates the traversal into replay records: holes in
 * the meta-dnode become DRR_FREEOBJECTS, holes in regular objects become
 * DRR_FREE, dnode blocks are expanded into per-dnode DRR_OBJECT records,
 * SA spill blocks become DRR_SPILL, and ordinary level-0 data blocks
 * become DRR_WRITE.  Indirect blocks and the objset block itself emit
 * nothing.  Returns 0, EINTR on a pending signal or write failure, or
 * EIO if a block could not be read.
 */
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct backuparg *ba = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	/* allow the user to interrupt a long-running send */
	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		/* a hole in the meta-dnode: that whole dnode range is free */
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		/* a hole in a regular object: emit a free for its span */
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		/* indirect blocks and the objset block carry no stream data */
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		/* emit one DRR_OBJECT per dnode packed into this block */
		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
398
399 int
400 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
401     struct file *fp, offset_t *off)
402 {
403         dsl_dataset_t *ds = tosnap->os_dsl_dataset;
404         dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
405         dmu_replay_record_t *drr;
406         struct backuparg ba;
407         int err;
408         uint64_t fromtxg = 0;
409
410         /* tosnap must be a snapshot */
411         if (ds->ds_phys->ds_next_snap_obj == 0)
412                 return (EINVAL);
413
414         /* fromsnap must be an earlier snapshot from the same fs as tosnap */
415         if (fromds && (ds->ds_dir != fromds->ds_dir ||
416             fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
417                 return (EXDEV);
418
419         if (fromorigin) {
420                 dsl_pool_t *dp = ds->ds_dir->dd_pool;
421
422                 if (fromsnap)
423                         return (EINVAL);
424
425                 if (dsl_dir_is_clone(ds->ds_dir)) {
426                         rw_enter(&dp->dp_config_rwlock, RW_READER);
427                         err = dsl_dataset_hold_obj(dp,
428                             ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
429                         rw_exit(&dp->dp_config_rwlock);
430                         if (err)
431                                 return (err);
432                 } else {
433                         fromorigin = B_FALSE;
434                 }
435         }
436
437
438         drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
439         drr->drr_type = DRR_BEGIN;
440         drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
441         DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
442             DMU_SUBSTREAM);
443
444 #ifdef _KERNEL
445         if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
446                 uint64_t version;
447                 if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0)
448                         return (EINVAL);
449                 if (version == ZPL_VERSION_SA) {
450                         DMU_SET_FEATUREFLAGS(
451                             drr->drr_u.drr_begin.drr_versioninfo,
452                             DMU_BACKUP_FEATURE_SA_SPILL);
453                 }
454         }
455 #endif
456
457         drr->drr_u.drr_begin.drr_creation_time =
458             ds->ds_phys->ds_creation_time;
459         drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
460         if (fromorigin)
461                 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
462         drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
463         if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
464                 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
465
466         if (fromds)
467                 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
468         dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
469
470         if (fromds)
471                 fromtxg = fromds->ds_phys->ds_creation_txg;
472         if (fromorigin)
473                 dsl_dataset_rele(fromds, FTAG);
474
475         ba.drr = drr;
476         ba.td = curthread;
477         ba.fp = fp;
478         ba.os = tosnap;
479         ba.off = off;
480         ba.toguid = ds->ds_phys->ds_guid;
481         ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
482         ba.pending_op = PENDING_NONE;
483
484         if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
485                 kmem_free(drr, sizeof (dmu_replay_record_t));
486                 return (ba.err);
487         }
488
489         err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
490             backup_cb, &ba);
491
492         if (ba.pending_op != PENDING_NONE)
493                 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
494                         err = EINTR;
495
496         if (err) {
497                 if (err == EINTR && ba.err)
498                         err = ba.err;
499                 kmem_free(drr, sizeof (dmu_replay_record_t));
500                 return (err);
501         }
502
503         bzero(drr, sizeof (dmu_replay_record_t));
504         drr->drr_type = DRR_END;
505         drr->drr_u.drr_end.drr_checksum = ba.zc;
506         drr->drr_u.drr_end.drr_toguid = ba.toguid;
507
508         if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
509                 kmem_free(drr, sizeof (dmu_replay_record_t));
510                 return (ba.err);
511         }
512
513         kmem_free(drr, sizeof (dmu_replay_record_t));
514
515         return (0);
516 }
517
/*
 * Arguments passed from dmu_recv_begin() to the recv_*_check/sync
 * sync tasks that create (or clone) the dataset being received into.
 */
struct recvbeginsyncarg {
	const char *tofs;		/* target filesystem name */
	const char *tosnap;		/* snapshot name to be created */
	dsl_dataset_t *origin;		/* origin snapshot for clone recv */
	uint64_t fromguid;		/* guid of incremental source snap */
	dmu_objset_type_t type;		/* objset type from the BEGIN record */
	void *tag;
	boolean_t force;		/* allow rollback of modified fs */
	uint64_t dsflags;		/* extra DS_FLAG_* for the new ds */
	char clonelastname[MAXNAMELEN];	/* temp clone component ("%snap") */
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;			/* credentials for dataset creation */
};
531
532 /* ARGSUSED */
533 static int
534 recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
535 {
536         dsl_dir_t *dd = arg1;
537         struct recvbeginsyncarg *rbsa = arg2;
538         objset_t *mos = dd->dd_pool->dp_meta_objset;
539         uint64_t val;
540         int err;
541
542         err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
543             strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
544
545         if (err != ENOENT)
546                 return (err ? err : EEXIST);
547
548         if (rbsa->origin) {
549                 /* make sure it's a snap in the same pool */
550                 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
551                         return (EXDEV);
552                 if (!dsl_dataset_is_snapshot(rbsa->origin))
553                         return (EINVAL);
554                 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
555                         return (ENODEV);
556         }
557
558         return (0);
559 }
560
/*
 * Sync-task function paired with recv_new_check(): creates the new
 * dataset (marked DS_FLAG_INCONSISTENT until the receive completes),
 * takes ownership of it with dmu_recv_tag, and returns it to the
 * caller via rbsa->ds.
 */
static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	/* a non-clone receive needs an objset created in the new dataset */
	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
	    dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}
583
/*
 * Sync-task check function for receiving into an existing filesystem
 * (via a temporary clone).  Verifies that: the fs is unmodified since
 * its last snapshot (unless force), the target snapshot name is free,
 * an incremental stream's fromguid matches the most recent snapshot or
 * an unchanged earlier one, a full stream targets a fs with no real
 * snapshots, and the temporary clone name is free.
 */
/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			/* walk back through the snapshot chain */
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				/*
				 * any data born after this snapshot means
				 * there are changes since the fromguid snap
				 */
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}
654
/*
 * Sync-task function paired with recv_existing_check(): creates the
 * temporary clone of the target fs's latest snapshot, takes ownership
 * of it with dmu_recv_tag, and returns it via rbsa->ds.  The receive
 * lands in the clone, which is later swapped into place.
 */
/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
	    dp->dp_spa, tx, "dataset = %lld", dsobj);
}
685
686 static boolean_t
687 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
688 {
689         int featureflags;
690
691         featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
692
693         /* Verify pool version supports SA if SA_SPILL feature set */
694         return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
695             (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
696 }
697
698 /*
699  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
700  * succeeds; otherwise we will leak the holds on the datasets.
701  */
/*
 * Begin a zfs receive: validate the stream's BEGIN record (drrb),
 * byte-swapping its fields if the stream was written on an
 * opposite-endian host, then run the appropriate sync task to create
 * the dataset to receive into -- a temporary clone when tofs already
 * exists, or a brand-new fs/clone when it does not.  On success the
 * datasets (held/owned with dmu_recv_tag) are returned in *drc.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	/* the magic number tells us the stream's byte order */
	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	/* only plain substreams of a known objset type are accepted here */
	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap" */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}
825
/*
 * State carried through the restore (receive) of a single stream:
 * the source of stream bytes, a staging buffer, the running record
 * checksum, and sticky error status.
 */
struct restorearg {
	int err;		/* first error encountered; stops the stream */
	int byteswap;		/* TRUE if stream is opposite-endian */
	kthread_t *td;		/* receiving thread (FreeBSD read path) */
	struct file *fp;	/* file the stream is read from */
	char *buf;		/* staging buffer for record payloads */
	uint64_t voff;		/* current byte offset into the stream */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;	/* running fletcher-4 of all stream bytes */
	avl_tree_t *guid_to_ds_map;	/* dedup streams: guid -> snapshot */
};
837
/*
 * Node of the dedup guid->dataset map: associates a snapshot guid
 * (as recorded in the send stream) with a held dsl_dataset_t so that
 * later DRR_WRITE_BYREF records can find the referenced data.
 */
typedef struct guid_map_entry {
	uint64_t	guid;		/* snapshot guid from the stream */
	dsl_dataset_t	*gme_ds;	/* held snapshot for that guid */
	avl_node_t	avlnode;	/* linkage in guid_to_ds_map tree */
} guid_map_entry_t;
843
844 static int
845 guid_compare(const void *arg1, const void *arg2)
846 {
847         const guid_map_entry_t *gmep1 = arg1;
848         const guid_map_entry_t *gmep2 = arg2;
849
850         if (gmep1->guid < gmep2->guid)
851                 return (-1);
852         else if (gmep1->guid > gmep2->guid)
853                 return (1);
854         return (0);
855 }
856
857 static void
858 free_guid_map_onexit(void *arg)
859 {
860         avl_tree_t *ca = arg;
861         void *cookie = NULL;
862         guid_map_entry_t *gmep;
863
864         while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
865                 dsl_dataset_rele(gmep->gme_ds, ca);
866                 kmem_free(gmep, sizeof (guid_map_entry_t));
867         }
868         avl_destroy(ca);
869         kmem_free(ca, sizeof (avl_tree_t));
870 }
871
/*
 * Read up to "len" bytes of the stream at offset "off" into "buf"
 * via the FreeBSD file read path.  *resid is set to the number of
 * bytes NOT transferred (0 on a full read).  Returns an errno value.
 */
static int
restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* destination is kernel memory */
	auio.uio_rw = UIO_READ;
	auio.uio_offset = off;
	auio.uio_td = ra->td;
#ifdef _KERNEL
	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
#else
	/* userland build: stream restore through a file is not supported */
	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
	error = EOPNOTSUPP;
#endif
	*resid = auio.uio_resid;
	return (error);
}
897
/*
 * Read exactly "len" bytes of the stream into ra->buf, looping over
 * short reads, and fold them into the running fletcher-4 checksum.
 * Returns a pointer to the data (ra->buf), or NULL with ra->err set
 * on error or premature EOF.  len must be a multiple of 8.
 */
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
		    len - done, ra->voff, &resid);

		/* no progress at all means the stream ended early */
		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	/* account for these bytes in the running stream checksum */
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}
929
/*
 * Byteswap a replay record header in place, for streams generated on
 * an opposite-endian machine.  Only the fields belonging to the record
 * type in drr_type are swapped; unknown record types are left alone
 * (the caller will then reject them with EINVAL).
 */
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}
1009
/*
 * Handle a DRR_OBJECT record: create the described object if it does
 * not exist (or reclaim it if it does), then apply the record's
 * checksum and compression properties and install its bonus buffer,
 * byteswapping the bonus contents if the stream is opposite-endian.
 */
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	/* validate every field of the (untrusted) stream record */
	if (drro->drr_type == DMU_OT_NONE ||
	    drro->drr_type >= DMU_OT_NUMTYPES ||
	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	/* the bonus payload follows the record, padded to 8 bytes */
	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	/* second tx: set properties and fill in the bonus buffer */
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}
1092
/* ARGSUSED */
/*
 * Handle a DRR_FREEOBJECTS record: free every allocated object in the
 * range [drr_firstobj, drr_firstobj + drr_numobjs).
 */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	/* reject a range whose end wraps around */
	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		/* skip holes; only allocated objects need freeing */
		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}
1117
/*
 * Handle a DRR_WRITE record: read the data payload from the stream
 * and write it into the given object at the given offset, swapping
 * the payload to native byte order first if necessary.
 */
static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	/* validate the (untrusted) record before consuming the payload */
	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	/* the object must have been created by an earlier DRR_OBJECT */
	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}
1153
1154 /*
1155  * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
1156  * streams to refer to a copy of the data that is already on the
1157  * system because it came in earlier in the stream.  This function
1158  * finds the earlier copy of the data, and uses that copy instead of
1159  * data from the stream to fulfill this write.
1160  */
1161 static int
1162 restore_write_byref(struct restorearg *ra, objset_t *os,
1163     struct drr_write_byref *drrwbr)
1164 {
1165         dmu_tx_t *tx;
1166         int err;
1167         guid_map_entry_t gmesrch;
1168         guid_map_entry_t *gmep;
1169         avl_index_t     where;
1170         objset_t *ref_os = NULL;
1171         dmu_buf_t *dbp;
1172
1173         if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
1174                 return (EINVAL);
1175
1176         /*
1177          * If the GUID of the referenced dataset is different from the
1178          * GUID of the target dataset, find the referenced dataset.
1179          */
1180         if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1181                 gmesrch.guid = drrwbr->drr_refguid;
1182                 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
1183                     &where)) == NULL) {
1184                         return (EINVAL);
1185                 }
1186                 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
1187                         return (EINVAL);
1188         } else {
1189                 ref_os = os;
1190         }
1191
1192         if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1193             drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
1194                 return (err);
1195
1196         tx = dmu_tx_create(os);
1197
1198         dmu_tx_hold_write(tx, drrwbr->drr_object,
1199             drrwbr->drr_offset, drrwbr->drr_length);
1200         err = dmu_tx_assign(tx, TXG_WAIT);
1201         if (err) {
1202                 dmu_tx_abort(tx);
1203                 return (err);
1204         }
1205         dmu_write(os, drrwbr->drr_object,
1206             drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1207         dmu_buf_rele(dbp, FTAG);
1208         dmu_tx_commit(tx);
1209         return (0);
1210 }
1211
/*
 * Handle a DRR_SPILL record: read the spill-block payload from the
 * stream and install it as the object's spill block, growing the
 * spill block first if the incoming data will not fit.
 */
static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	/* validate the (untrusted) record's length */
	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	/* the object must already exist */
	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		/* drop both holds before aborting */
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	/* grow the spill block if the incoming data won't fit */
	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}
1261
1262 /* ARGSUSED */
1263 static int
1264 restore_free(struct restorearg *ra, objset_t *os,
1265     struct drr_free *drrf)
1266 {
1267         int err;
1268
1269         if (drrf->drr_length != -1ULL &&
1270             drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1271                 return (EINVAL);
1272
1273         if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1274                 return (EINVAL);
1275
1276         err = dmu_free_long_range(os, drrf->drr_object,
1277             drrf->drr_offset, drrf->drr_length);
1278         return (err);
1279 }
1280
1281 /*
1282  * NB: callers *must* call dmu_recv_end() if this succeeds.
1283  */
1284 int
1285 dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
1286     int cleanup_fd, uint64_t *action_handlep)
1287 {
1288         struct restorearg ra = { 0 };
1289         dmu_replay_record_t *drr;
1290         objset_t *os;
1291         zio_cksum_t pcksum;
1292         int featureflags;
1293
1294         if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
1295                 ra.byteswap = TRUE;
1296
1297         {
1298                 /* compute checksum of drr_begin record */
1299                 dmu_replay_record_t *drr;
1300                 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
1301
1302                 drr->drr_type = DRR_BEGIN;
1303                 drr->drr_u.drr_begin = *drc->drc_drrb;
1304                 if (ra.byteswap) {
1305                         fletcher_4_incremental_byteswap(drr,
1306                             sizeof (dmu_replay_record_t), &ra.cksum);
1307                 } else {
1308                         fletcher_4_incremental_native(drr,
1309                             sizeof (dmu_replay_record_t), &ra.cksum);
1310                 }
1311                 kmem_free(drr, sizeof (dmu_replay_record_t));
1312         }
1313
1314         if (ra.byteswap) {
1315                 struct drr_begin *drrb = drc->drc_drrb;
1316                 drrb->drr_magic = BSWAP_64(drrb->drr_magic);
1317                 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
1318                 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
1319                 drrb->drr_type = BSWAP_32(drrb->drr_type);
1320                 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
1321                 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
1322         }
1323
1324         ra.td = curthread;
1325         ra.fp = fp;
1326         ra.voff = *voffp;
1327         ra.bufsize = 1<<20;
1328         ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
1329
1330         /* these were verified in dmu_recv_begin */
1331         ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
1332             DMU_SUBSTREAM);
1333         ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
1334
1335         /*
1336          * Open the objset we are modifying.
1337          */
1338         VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
1339
1340         ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
1341
1342         featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
1343
1344         /* if this stream is dedup'ed, set up the avl tree for guid mapping */
1345         if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
1346                 minor_t minor;
1347
1348                 if (cleanup_fd == -1) {
1349                         ra.err = EBADF;
1350                         goto out;
1351                 }
1352                 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
1353                 if (ra.err) {
1354                         cleanup_fd = -1;
1355                         goto out;
1356                 }
1357
1358                 if (*action_handlep == 0) {
1359                         ra.guid_to_ds_map =
1360                             kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
1361                         avl_create(ra.guid_to_ds_map, guid_compare,
1362                             sizeof (guid_map_entry_t),
1363                             offsetof(guid_map_entry_t, avlnode));
1364                         ra.err = zfs_onexit_add_cb(minor,
1365                             free_guid_map_onexit, ra.guid_to_ds_map,
1366                             action_handlep);
1367                         if (ra.err)
1368                                 goto out;
1369                 } else {
1370                         ra.err = zfs_onexit_cb_data(minor, *action_handlep,
1371                             (void **)&ra.guid_to_ds_map);
1372                         if (ra.err)
1373                                 goto out;
1374                 }
1375
1376                 drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
1377         }
1378
1379         /*
1380          * Read records and process them.
1381          */
1382         pcksum = ra.cksum;
1383         while (ra.err == 0 &&
1384             NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
1385                 if (issig(JUSTLOOKING) && issig(FORREAL)) {
1386                         ra.err = EINTR;
1387                         goto out;
1388                 }
1389
1390                 if (ra.byteswap)
1391                         backup_byteswap(drr);
1392
1393                 switch (drr->drr_type) {
1394                 case DRR_OBJECT:
1395                 {
1396                         /*
1397                          * We need to make a copy of the record header,
1398                          * because restore_{object,write} may need to
1399                          * restore_read(), which will invalidate drr.
1400                          */
1401                         struct drr_object drro = drr->drr_u.drr_object;
1402                         ra.err = restore_object(&ra, os, &drro);
1403                         break;
1404                 }
1405                 case DRR_FREEOBJECTS:
1406                 {
1407                         struct drr_freeobjects drrfo =
1408                             drr->drr_u.drr_freeobjects;
1409                         ra.err = restore_freeobjects(&ra, os, &drrfo);
1410                         break;
1411                 }
1412                 case DRR_WRITE:
1413                 {
1414                         struct drr_write drrw = drr->drr_u.drr_write;
1415                         ra.err = restore_write(&ra, os, &drrw);
1416                         break;
1417                 }
1418                 case DRR_WRITE_BYREF:
1419                 {
1420                         struct drr_write_byref drrwbr =
1421                             drr->drr_u.drr_write_byref;
1422                         ra.err = restore_write_byref(&ra, os, &drrwbr);
1423                         break;
1424                 }
1425                 case DRR_FREE:
1426                 {
1427                         struct drr_free drrf = drr->drr_u.drr_free;
1428                         ra.err = restore_free(&ra, os, &drrf);
1429                         break;
1430                 }
1431                 case DRR_END:
1432                 {
1433                         struct drr_end drre = drr->drr_u.drr_end;
1434                         /*
1435                          * We compare against the *previous* checksum
1436                          * value, because the stored checksum is of
1437                          * everything before the DRR_END record.
1438                          */
1439                         if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
1440                                 ra.err = ECKSUM;
1441                         goto out;
1442                 }
1443                 case DRR_SPILL:
1444                 {
1445                         struct drr_spill drrs = drr->drr_u.drr_spill;
1446                         ra.err = restore_spill(&ra, os, &drrs);
1447                         break;
1448                 }
1449                 default:
1450                         ra.err = EINVAL;
1451                         goto out;
1452                 }
1453                 pcksum = ra.cksum;
1454         }
1455         ASSERT(ra.err != 0);
1456
1457 out:
1458         if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
1459                 zfs_onexit_fd_rele(cleanup_fd);
1460
1461         if (ra.err != 0) {
1462                 /*
1463                  * destroy what we created, so we don't leave it in the
1464                  * inconsistent restoring state.
1465                  */
1466                 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
1467
1468                 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
1469                     B_FALSE);
1470                 if (drc->drc_real_ds != drc->drc_logical_ds) {
1471                         mutex_exit(&drc->drc_logical_ds->ds_recvlock);
1472                         dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
1473                 }
1474         }
1475
1476         kmem_free(ra.buf, ra.bufsize);
1477         *voffp = ra.voff;
1478         return (ra.err);
1479 }
1480
/*
 * Arguments for the recv_end_{check,sync} sync task: the name of the
 * snapshot to create plus the creation time and guid recorded in the
 * stream's begin record.
 */
struct recvendsyncarg {
	char *tosnap;		/* snapshot name to create */
	uint64_t creation_time;	/* from drr_begin.drr_creation_time */
	uint64_t toguid;	/* from drr_begin.drr_toguid */
};
1486
/*
 * Sync-task check function for dmu_recv_end(): verify that the final
 * snapshot can be created.
 */
static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}
1495
/*
 * Sync-task function for dmu_recv_end(): create the final snapshot,
 * stamp it with the creation time and guid from the stream's begin
 * record, and clear the "inconsistent" flag on both the snapshot and
 * the live dataset now that the receive is complete.
 */
static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}
1513
/*
 * Add the dataset's most recent snapshot to the dedup guid->dataset
 * map, taking a hold on it (released later by free_guid_map_onexit()).
 * Returns 0 or the error from dsl_dataset_hold_obj().
 */
static int
add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	/* the hold is tagged with the map itself, matching the release */
	err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
	if (err == 0) {
		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	}

	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
1537
/*
 * Finish a receive into an existing filesystem: swap the contents of
 * the temporary clone we received into with the target filesystem,
 * snapshot the result, and destroy the temporary clone.  On failure
 * the swap is undone before cleanup.
 */
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
		    drc->drc_force);
		if (err)
			goto out;
	} else {
		/* someone else owns the target fs; clean up and bail */
		mutex_exit(&ds->ds_recvlock);
		dsl_dataset_rele(ds, dmu_recv_tag);
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	mutex_exit(&ds->ds_recvlock);
	/* remember the new snapshot for later dedup'ed streams */
	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
	dsl_dataset_disown(ds, dmu_recv_tag);
	(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	return (err);
}
1584
/*
 * Finish a receive into a newly-created filesystem or clone: snapshot
 * the dataset we received into.  On failure the new dataset is
 * destroyed; on success the hold from dmu_recv_begin() is released.
 */
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* clean up the fs we just recv'd into */
		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
	} else {
		/* remember the new snapshot for later dedup'ed streams */
		if (drc->drc_guid_to_ds_map != NULL)
			(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
		/* release the hold from dmu_recv_begin */
		dsl_dataset_disown(ds, dmu_recv_tag);
	}
	return (err);
}
1616
1617 int
1618 dmu_recv_end(dmu_recv_cookie_t *drc)
1619 {
1620         if (drc->drc_logical_ds != drc->drc_real_ds)
1621                 return (dmu_recv_existing_end(drc));
1622         else
1623                 return (dmu_recv_new_end(drc));
1624 }