1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009-2012 Spectra Logic Corporation
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions, and the following disclaimer,
12  *    without modification.
13  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
14  *    substantially similar to the "NO WARRANTY" disclaimer below
15  *    ("Disclaimer") and any redistribution must be conditioned upon
16  *    including a substantially similar Disclaimer requirement for further
17  *    binary redistribution.
18  *
19  * NO WARRANTY
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGES.
31  *
32  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
33  *          Ken Merry           (Spectra Logic Corporation)
34  */
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 /**
39  * \file blkback.c
40  *
41  * \brief Device driver supporting the vending of block storage from
42  *        a FreeBSD domain to other domains.
43  */
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49
50 #include <sys/bio.h>
51 #include <sys/bus.h>
52 #include <sys/conf.h>
53 #include <sys/devicestat.h>
54 #include <sys/disk.h>
55 #include <sys/fcntl.h>
56 #include <sys/filedesc.h>
57 #include <sys/kdb.h>
58 #include <sys/module.h>
59 #include <sys/namei.h>
60 #include <sys/proc.h>
61 #include <sys/rman.h>
62 #include <sys/taskqueue.h>
63 #include <sys/types.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/sysctl.h>
67 #include <sys/bitstring.h>
68 #include <sys/sdt.h>
69
70 #include <geom/geom.h>
71
72 #include <machine/_inttypes.h>
73
74 #include <vm/vm.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_kern.h>
77
78 #include <xen/xen-os.h>
79 #include <xen/blkif.h>
80 #include <xen/gnttab.h>
81 #include <xen/xen_intr.h>
82
83 #include <xen/interface/event_channel.h>
84 #include <xen/interface/grant_table.h>
85
86 #include <xen/xenbus/xenbusvar.h>
87
88 /*--------------------------- Compile-time Tunables --------------------------*/
89 /**
90  * The maximum number of shared memory ring pages we will allow in a
91  * negotiated block-front/back communication channel.  Allow enough
92  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
93  */
94 #define XBB_MAX_RING_PAGES              32
95
96 /**
97  * The maximum number of outstanding request blocks (request headers plus
98  * additional segment blocks) we will allow in a negotiated block-front/back
99  * communication channel.
100  */
101 #define XBB_MAX_REQUESTS                                        \
102         __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
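
/*
 * Illustrative figure: with 4 KiB pages, the 32 ring pages above give
 * room for roughly a thousand outstanding request slots.
 * __CONST_RING_SIZE() divides the usable ring bytes by the size of a
 * request/response union and rounds the result down to a power of two,
 * which works out to 1024 entries for the standard 64-bit blkif layout.
 */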
103
104 /**
105  * \brief Define to force all I/O to be performed on memory owned by the
106  *        backend device, with a copy-in/out to the remote domain's memory.
107  *
108  * \note  This option is currently required when this driver's domain is
109  *        operating in HVM mode on a system using an IOMMU.
110  *
111  * This driver uses Xen's grant table API to gain access to the memory of
112  * the remote domains it serves.  When our domain is operating in PV mode,
113  * the grant table mechanism directly updates our domain's page table entries
114  * to point to the physical pages of the remote domain.  This scheme guarantees
115  * that blkback and the backing devices it uses can safely perform DMA
116  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
117  * ensure that our domain cannot DMA to pages owned by another domain.  As
118  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
119  * table API.  For this reason, in HVM mode, we must bounce all requests into
120  * memory that is mapped into our domain at domain startup and thus has
121  * valid IOMMU mappings.
122  */
123 #define XBB_USE_BOUNCE_BUFFERS
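
/*
 * Illustrative sketch (guarded out, not part of the driver): the copy-in
 * that bounce buffering implies for a front-end WRITE.  Data is copied
 * from the grant-mapped KVA into the domain-local bounce pages before the
 * backend I/O is issued; the read direction is handled on completion in
 * xbb_bio_done() below.  The reqlist fields used here (kva, bounce,
 * nr_segments) are defined later in this file; the helper name is
 * hypothetical.
 */
#if 0
static void
example_bounce_in_write(struct xbb_xen_reqlist *reqlist)
{

	/* Copy the front-end's data into memory we own and can DMA from. */
	memcpy(reqlist->bounce, reqlist->kva,
	       (size_t)reqlist->nr_segments * PAGE_SIZE);
}
#endif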
124
125 /**
126  * \brief Define to enable rudimentary request logging to the console.
127  */
128 #undef XBB_DEBUG
129
130 /*---------------------------------- Macros ----------------------------------*/
131 /**
132  * Custom malloc type for all driver allocations.
133  */
134 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
135
136 #ifdef XBB_DEBUG
137 #define DPRINTF(fmt, args...)                                   \
138     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
139 #else
140 #define DPRINTF(fmt, args...) do {} while(0)
141 #endif
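
/*
 * Example use (illustrative; the arguments are placeholders).  When
 * XBB_DEBUG is not defined the statement compiles away entirely.
 */
#if 0
	DPRINTF("mapped %u of %u segments\n", seg_idx, nseg);
#endif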
142
143 /**
144  * The maximum mapped region size per request we will allow in a negotiated
145  * block-front/back communication channel.
146  */
147 #define XBB_MAX_REQUEST_SIZE                                    \
148         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
149
150 /**
151  * The maximum number of segments (within a request header and accompanying
152  * segment blocks) per request we will allow in a negotiated block-front/back
153  * communication channel.
154  */
155 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
156         (MIN(UIO_MAXIOV,                                        \
157              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
158                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
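
/*
 * Worked example of the two macros above (illustrative; assumes
 * PAGE_SIZE = 4 KiB, BLKIF_MAX_SEGMENTS_PER_REQUEST = 11, and a MAXPHYS
 * of at least 44 KiB -- the actual values come from the platform and
 * Xen headers):
 *
 *   XBB_MAX_REQUEST_SIZE         = MIN(MAXPHYS, 11 * 4 KiB) = 44 KiB
 *   XBB_MAX_SEGMENTS_PER_REQUEST = MIN(UIO_MAXIOV, MIN(11, 11 + 1)) = 11
 *
 * The "+ 1" term allows for a maximum-sized transfer that does not start
 * on a page boundary and therefore touches one extra page.
 */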
159
160 /**
161  * The maximum number of ring pages that we can allow per request list.
162  * We limit this to the maximum number of segments per request, because
163  * that is already a reasonable number of segments to aggregate.  This
164  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
165  * because that would leave situations where we can't dispatch even one
166  * large request.
167  */
168 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
169
170 /*--------------------------- Forward Declarations ---------------------------*/
171 struct xbb_softc;
172 struct xbb_xen_req;
173
174 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
175                               ...) __attribute__((format(printf, 3, 4)));
176 static int  xbb_shutdown(struct xbb_softc *xbb);
177
178 /*------------------------------ Data Structures -----------------------------*/
179
180 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
181
182 typedef enum {
183         XBB_REQLIST_NONE        = 0x00,
184         XBB_REQLIST_MAPPED      = 0x01
185 } xbb_reqlist_flags;
186
187 struct xbb_xen_reqlist {
188         /**
189          * Back reference to the parent block back instance for this
190          * request.  Used during bio_done handling.
191          */
192         struct xbb_softc        *xbb;
193
194         /**
195          * BLKIF_OP code for this request.
196          */
197         int                      operation;
198
199         /**
200          * Set to BLKIF_RSP_* to indicate request status.
201          *
202          * This field allows an error status to be recorded even if the
203          * delivery of this status must be deferred.  Deferred reporting
204          * is necessary, for example, when an error is detected during
205          * completion processing of one bio when other bios for this
206          * request are still outstanding.
207          */
208         int                      status;
209
210         /**
211          * Number of 512 byte sectors not transferred.
212          */
213         int                      residual_512b_sectors;
214
215         /**
216          * Starting sector number of the first request in the list.
217          */
218         off_t                    starting_sector_number;
219
220         /**
221          * If we're going to coalesce, the next contiguous sector would be
222          * this one.
223          */
224         off_t                    next_contig_sector;
225
226         /**
227          * Number of child requests in the list.
228          */
229         int                      num_children;
230
231         /**
232          * Number of I/O requests still pending on the backend.
233          */
234         int                      pendcnt;
235
236         /**
237          * Total number of segments for requests in the list.
238          */
239         int                      nr_segments;
240
241         /**
242          * Flags for this particular request list.
243          */
244         xbb_reqlist_flags        flags;
245
246         /**
247          * Kernel virtual address space reserved for this request
248          * list structure and used to map the remote domain's pages for
249          * this I/O into our domain's address space.
250          */
251         uint8_t                 *kva;
252
253         /**
254          * Base, pseudo-physical address, corresponding to the start
255          * of this request's kva region.
256          */
257         uint64_t                 gnt_base;
258
259
260 #ifdef XBB_USE_BOUNCE_BUFFERS
261         /**
262          * Pre-allocated domain local memory used to proxy remote
263          * domain memory during I/O operations.
264          */
265         uint8_t                 *bounce;
266 #endif
267
268         /**
269          * Array of grant handles (one per page) used to map this request.
270          */
271         grant_handle_t          *gnt_handles;
272
273         /**
274          * Device statistics request ordering type (ordered or simple).
275          */
276         devstat_tag_type         ds_tag_type;
277
278         /**
279          * Device statistics request type (read, write, no_data).
280          */
281         devstat_trans_flags      ds_trans_type;
282
283         /**
284          * The start time for this request.
285          */
286         struct bintime           ds_t0;
287
288         /**
289          * Linked list of contiguous requests with the same operation type.
290          */
291         struct xbb_xen_req_list  contig_req_list;
292
293         /**
294          * Linked list links used to aggregate idle requests in the
295          * request list free pool (xbb->reqlist_free_stailq) and pending
296          * requests waiting for execution (xbb->reqlist_pending_stailq).
297          */
298         STAILQ_ENTRY(xbb_xen_reqlist) links;
299 };
300
301 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
302
303 /**
304  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
305  */
306 struct xbb_xen_req {
307         /**
308          * Linked list links used to aggregate requests into a reqlist
309          * and to store them in the request free pool.
310          */
311         STAILQ_ENTRY(xbb_xen_req) links;
312
313         /**
314          * The remote domain's identifier for this I/O request.
315          */
316         uint64_t                  id;
317
318         /**
319          * The number of pages currently mapped for this request.
320          */
321         int                       nr_pages;
322
323         /**
324          * The number of 512 byte sectors comprising this request.
325          */
326         int                       nr_512b_sectors;
327
328         /**
329          * BLKIF_OP code for this request.
330          */
331         int                       operation;
332
333         /**
334          * Storage used for non-native ring requests.
335          */
336         blkif_request_t          ring_req_storage;
337
338         /**
339          * Pointer to the Xen request in the ring.
340          */
341         blkif_request_t         *ring_req;
342
343         /**
344          * Consumer index for this request.
345          */
346         RING_IDX                 req_ring_idx;
347
348         /**
349          * The start time for this request.
350          */
351         struct bintime           ds_t0;
352
353         /**
354          * Pointer back to our parent request list.
355          */
356         struct xbb_xen_reqlist  *reqlist;
357 };
358 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
359
360 /**
361  * \brief Configuration data for the shared memory request ring
362  *        used to communicate with the front-end client of this
363  *        driver.
364  */
365 struct xbb_ring_config {
366         /** KVA address where ring memory is mapped. */
367         vm_offset_t     va;
368
369         /** The pseudo-physical address where ring memory is mapped.*/
370         uint64_t        gnt_addr;
371
372         /**
373          * Grant table handles, one per-ring page, returned by the
374          * hypervisor upon mapping of the ring and required to
375          * unmap it when a connection is torn down.
376          */
377         grant_handle_t  handle[XBB_MAX_RING_PAGES];
378
379         /**
380          * The device bus address returned by the hypervisor when
381          * mapping the ring and required to unmap it when a connection
382          * is torn down.
383          */
384         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
385
386         /** The number of ring pages mapped for the current connection. */
387         u_int           ring_pages;
388
389         /**
390          * The grant references, one per-ring page, supplied by the
391          * front-end, allowing us to reference the ring pages in the
392          * front-end's domain and to map these pages into our own domain.
393          */
394         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
395
396         /** The interrupt-driven event channel used to signal ring events. */
397         evtchn_port_t   evtchn;
398 };
399
400 /**
401  * Per-instance connection state flags.
402  */
403 typedef enum
404 {
405         /**
406          * The front-end requested a read-only mount of the
407          * back-end device/file.
408          */
409         XBBF_READ_ONLY         = 0x01,
410
411         /** Communication with the front-end has been established. */
412         XBBF_RING_CONNECTED    = 0x02,
413
414         /**
415          * Front-end requests exist in the ring and are waiting for
416          * xbb_xen_req objects to free up.
417          */
418         XBBF_RESOURCE_SHORTAGE = 0x04,
419
420         /** Connection teardown in progress. */
421         XBBF_SHUTDOWN          = 0x08,
422
423         /** A thread is already performing shutdown processing. */
424         XBBF_IN_SHUTDOWN       = 0x10
425 } xbb_flag_t;
426
427 /** Backend device type.  */
428 typedef enum {
429         /** Backend type unknown. */
430         XBB_TYPE_NONE           = 0x00,
431
432         /**
433          * Backend type disk (access via cdev switch
434          * strategy routine).
435          */
436         XBB_TYPE_DISK           = 0x01,
437
438         /** Backend type file (access via vnode operations). */
439         XBB_TYPE_FILE           = 0x02
440 } xbb_type;
441
442 /**
443  * \brief Structure used to memoize information about a per-request
444  *        scatter-gather list.
445  *
446  * The chief benefit of using this data structure is it avoids having
447  * to reparse the possibly discontiguous S/G list in the original
448  * request.  Due to the way that the mapping of the memory backing an
449  * I/O transaction is handled by Xen, a second pass is unavoidable.
450  * At least this way the second walk is a simple array traversal.
451  *
452  * \note A single Scatter/Gather element in the block interface covers
453  *       at most 1 machine page.  In this context a sector (blkif
454  *       nomenclature, not what I'd choose) is a 512b aligned unit
455  *       of mapping within the machine page referenced by an S/G
456  *       element.
457  */
458 struct xbb_sg {
459         /** The number of 512b data chunks mapped in this S/G element. */
460         int16_t nsect;
461
462         /**
463          * The index (0 based) of the first 512b data chunk mapped
464          * in this S/G element.
465          */
466         uint8_t first_sect;
467
468         /**
469          * The index (0 based) of the last 512b data chunk mapped
470          * in this S/G element.
471          */
472         uint8_t last_sect;
473 };
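
/*
 * Example (illustrative): an S/G element with first_sect = 2 and
 * last_sect = 5 maps nsect = 5 - 2 + 1 = 4 sectors, i.e. 2 KiB of data
 * beginning 1 KiB (2 * 512 bytes) into the machine page referenced by
 * that element.
 */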
474
475 /**
476  * Character device backend specific configuration data.
477  */
478 struct xbb_dev_data {
479         /** Cdev used for device backend access.  */
480         struct cdev   *cdev;
481
482         /** Cdev switch used for device backend access.  */
483         struct cdevsw *csw;
484
485         /** Used to hold a reference on opened cdev backend devices. */
486         int            dev_ref;
487 };
488
489 /**
490  * File backend specific configuration data.
491  */
492 struct xbb_file_data {
493         /** Credentials to use for vnode backed (file based) I/O. */
494         struct ucred   *cred;
495
496         /**
497          * \brief Array of io vectors used to process file based I/O.
498          *
499          * Only a single file based request is outstanding per-xbb instance,
500          * so we only need one of these.
501          */
502         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
503 #ifdef XBB_USE_BOUNCE_BUFFERS
504
505         /**
506          * \brief Array of io vectors used to handle bouncing of file reads.
507          *
508          * Vnode operations are free to modify uio data during their
509          * execution.  In the case of a read with bounce buffering active,
510          * we need some of the data from the original uio in order to
511          * bounce-out the read data.  This array serves as the temporary
512          * storage for this saved data.
513          */
514         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
515
516         /**
517          * \brief Array of memoized bounce buffer kva offsets used
518          *        in the file based backend.
519          *
520          * Due to the way that the mapping of the memory backing an
521          * I/O transaction is handled by Xen, a second pass through
522          * the request sg elements is unavoidable. We memoize the computed
523          * bounce address here to reduce the cost of the second walk.
524          */
525         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
526 #endif /* XBB_USE_BOUNCE_BUFFERS */
527 };
528
529 /**
530  * Collection of backend type specific data.
531  */
532 union xbb_backend_data {
533         struct xbb_dev_data  dev;
534         struct xbb_file_data file;
535 };
536
537 /**
538  * Function signature of backend specific I/O handlers.
539  */
540 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
541                               struct xbb_xen_reqlist *reqlist, int operation,
542                               int flags);
543
544 /**
545  * Per-instance configuration data.
546  */
547 struct xbb_softc {
548
549         /**
550          * Task-queue used to process I/O requests.
551          */
552         struct taskqueue         *io_taskqueue;
553
554         /**
555          * Single "run the request queue" task enqueued
556          * on io_taskqueue.
557          */
558         struct task               io_task;
559
560         /** Device type for this instance. */
561         xbb_type                  device_type;
562
563         /** NewBus device corresponding to this instance. */
564         device_t                  dev;
565
566         /** Backend specific dispatch routine for this instance. */
567         xbb_dispatch_t            dispatch_io;
568
569         /** The number of requests outstanding on the backend device/file. */
570         int                       active_request_count;
571
572         /** Free pool of request tracking structures. */
573         struct xbb_xen_req_list   request_free_stailq;
574
575         /** Array, sized at connection time, of request tracking structures. */
576         struct xbb_xen_req       *requests;
577
578         /** Free pool of request list structures. */
579         struct xbb_xen_reqlist_list reqlist_free_stailq;
580
581         /** List of pending request lists awaiting execution. */
582         struct xbb_xen_reqlist_list reqlist_pending_stailq;
583
584         /** Array, sized at connection time, of request list structures. */
585         struct xbb_xen_reqlist   *request_lists;
586
587         /**
588          * Global pool of kva used for mapping remote domain ring
589          * and I/O transaction data.
590          */
591         vm_offset_t               kva;
592
593         /** Pseudo-physical address corresponding to kva. */
594         uint64_t                  gnt_base_addr;
595
596         /** The size of the global kva pool. */
597         int                       kva_size;
598
599         /** The size of the KVA area used for request lists. */
600         int                       reqlist_kva_size;
601
602         /** The number of pages of KVA used for request lists */
603         int                       reqlist_kva_pages;
604
605         /** Bitmap of free KVA pages */
606         bitstr_t                 *kva_free;
607
608         /**
609          * \brief Cached value of the front-end's domain id.
610          * 
611          * This value is used once for each mapped page in
612          * a transaction.  We cache it to avoid incurring the
613          * cost of an ivar access every time this is needed.
614          */
615         domid_t                   otherend_id;
616
617         /**
618          * \brief The blkif protocol abi in effect.
619          *
620          * There are situations where the back and front ends can
621          * have a different, native abi (e.g. intel x86_64 and
622          * 32bit x86 domains on the same machine).  The back-end
623          * always accommodates the front-end's native abi.  That
624          * value is pulled from the XenStore and recorded here.
625          */
626         int                       abi;
627
628         /**
629          * \brief The maximum number of requests and request lists allowed
630          *        to be in flight at a time.
631          *
632          * This value is negotiated via the XenStore.
633          */
634         u_int                     max_requests;
635
636         /**
637          * \brief The maximum number of segments (1 page per segment)
638          *        that can be mapped by a request.
639          *
640          * This value is negotiated via the XenStore.
641          */
642         u_int                     max_request_segments;
643
644         /**
645          * \brief Maximum number of segments per request list.
646          *
647          * This value is derived from and will generally be larger than
648          * max_request_segments.
649          */
650         u_int                     max_reqlist_segments;
651
652         /**
653          * The maximum size of any request to this back-end
654          * device.
655          *
656          * This value is negotiated via the XenStore.
657          */
658         u_int                     max_request_size;
659
660         /**
661          * The maximum size of any request list.  This is derived directly
662          * from max_reqlist_segments.
663          */
664         u_int                     max_reqlist_size;
665
666         /** Various configuration and state bit flags. */
667         xbb_flag_t                flags;
668
669         /** Ring mapping and interrupt configuration data. */
670         struct xbb_ring_config    ring_config;
671
672         /** Runtime, cross-abi safe, structures for ring access. */
673         blkif_back_rings_t        rings;
674
675         /** IRQ mapping for the communication ring event channel. */
676         xen_intr_handle_t         xen_intr_handle;
677
678         /**
679          * \brief Backend access mode flags (e.g. write, or read-only).
680          *
681          * This value is passed to us by the front-end via the XenStore.
682          */
683         char                     *dev_mode;
684
685         /**
686          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
687          *
688          * This value is passed to us by the front-end via the XenStore.
689          * Currently unused.
690          */
691         char                     *dev_type;
692
693         /**
694          * \brief Backend device/file identifier.
695          *
696          * This value is passed to us by the front-end via the XenStore.
697          * We expect this to be a POSIX path indicating the file or
698          * device to open.
699          */
700         char                     *dev_name;
701
702         /**
703          * Vnode corresponding to the backend device node or file
704          * we are accessing.
705          */
706         struct vnode             *vn;
707
708         union xbb_backend_data    backend;
709
710         /** The native sector size of the backend. */
711         u_int                     sector_size;
712
713         /** log2 of sector_size.  */
714         u_int                     sector_size_shift;
715
716         /** Size in bytes of the backend device or file.  */
717         off_t                     media_size;
718
719         /**
720          * \brief media_size expressed in terms of the backend native
721          *        sector size.
722          *
723          * (e.g. xbb->media_size >> xbb->sector_size_shift).
724          */
725         uint64_t                  media_num_sectors;
726
727         /**
728          * \brief Array of memoized scatter gather data computed during the
729          *        conversion of blkif ring requests to internal xbb_xen_req
730          *        structures.
731          *
732          * Ring processing is serialized so we only need one of these.
733          */
734         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
735
736         /**
737          * Temporary grant table map used in xbb_dispatch_io().  When
738          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
739          * stack could cause a stack overflow.
740          */
741         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
742
743         /** Mutex protecting per-instance data. */
744         struct mtx                lock;
745
746         /**
747          * Resource representing allocated physical address space
748          * associated with our per-instance kva region.
749          */
750         struct resource          *pseudo_phys_res;
751
752         /** Resource id for allocated physical address space. */
753         int                       pseudo_phys_res_id;
754
755         /**
756          * I/O statistics from BlockBack dispatch down.  These are
757          * coalesced requests, and we start them right before execution.
758          */
759         struct devstat           *xbb_stats;
760
761         /**
762          * I/O statistics coming into BlockBack.  These are the requests as
763          * we get them from BlockFront.  They are started as soon as we
764          * receive a request, and completed when the I/O is complete.
765          */
766         struct devstat           *xbb_stats_in;
767
768         /** Disable sending flush to the backend */
769         int                       disable_flush;
770
771         /** Send a real flush for every N flush requests */
772         int                       flush_interval;
773
774         /** Count of flush requests in the interval */
775         int                       flush_count;
776
777         /** Don't coalesce requests if this is set */
778         int                       no_coalesce_reqs;
779
780         /** Number of requests we have received */
781         uint64_t                  reqs_received;
782
783         /** Number of requests we have completed */
784         uint64_t                  reqs_completed;
785
786         /** Number of requests we queued but not pushed */
787         uint64_t                  reqs_queued_for_completion;
788
789         /** Number of requests we completed with an error status */
790         uint64_t                  reqs_completed_with_error;
791
792         /** How many forced dispatches (i.e. without coalescing) have happened */
793         uint64_t                  forced_dispatch;
794
795         /** How many normal dispatches have happened */
796         uint64_t                  normal_dispatch;
797
798         /** How many total dispatches have happened */
799         uint64_t                  total_dispatch;
800
801         /** How many times we have run out of KVA */
802         uint64_t                  kva_shortages;
803
804         /** How many times we have run out of request structures */
805         uint64_t                  request_shortages;
806
807         /** Watch to wait for hotplug script execution */
808         struct xs_watch           hotplug_watch;
809 };
810
811 /*---------------------------- Request Processing ----------------------------*/
812 /**
813  * Allocate an internal transaction tracking structure from the free pool.
814  *
815  * \param xbb  Per-instance xbb configuration structure.
816  *
817  * \return  On success, a pointer to the allocated xbb_xen_req structure.
818  *          Otherwise NULL.
819  */
820 static inline struct xbb_xen_req *
821 xbb_get_req(struct xbb_softc *xbb)
822 {
823         struct xbb_xen_req *req;
824
825         req = NULL;
826
827         mtx_assert(&xbb->lock, MA_OWNED);
828
829         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
830                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
831                 xbb->active_request_count++;
832         }
833
834         return (req);
835 }
836
837 /**
838  * Return an allocated transaction tracking structure to the free pool.
839  *
840  * \param xbb  Per-instance xbb configuration structure.
841  * \param req  The request structure to free.
842  */
843 static inline void
844 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
845 {
846         mtx_assert(&xbb->lock, MA_OWNED);
847
848         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
849         xbb->active_request_count--;
850
851         KASSERT(xbb->active_request_count >= 0,
852                 ("xbb_release_req: negative active count"));
853 }
854
855 /**
856  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
857  *
858  * \param xbb       Per-instance xbb configuration structure.
859  * \param req_list  The list of requests to free.
860  * \param nreqs     The number of items in the list.
861  */
862 static inline void
863 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
864                  int nreqs)
865 {
866         mtx_assert(&xbb->lock, MA_OWNED);
867
868         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
869         xbb->active_request_count -= nreqs;
870
871         KASSERT(xbb->active_request_count >= 0,
872                 ("xbb_release_reqs: negative active count"));
873 }
874
875 /**
876  * Given a page index and 512b sector offset within that page,
877  * calculate an offset into a request's kva region.
878  *
879  * \param reqlist The request structure whose kva region will be accessed.
880  * \param pagenr  The page index used to compute the kva offset.
881  * \param sector  The 512b sector index used to compute the page relative
882  *                kva offset.
883  *
884  * \return  The computed global KVA offset.
885  */
886 static inline uint8_t *
887 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
888 {
889         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
890 }
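
/*
 * Example (illustrative): xbb_reqlist_vaddr(reqlist, 2, 3) with 4 KiB
 * pages returns reqlist->kva + 2 * 4096 + 3 * 512 = reqlist->kva + 9728,
 * the address of the fourth 512b sector within the third mapped page of
 * the request list's KVA region.
 */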
891
892 #ifdef XBB_USE_BOUNCE_BUFFERS
893 /**
894  * Given a page index and 512b sector offset within that page,
895  * calculate an offset into a request's local bounce memory region.
896  *
897  * \param reqlist The request structure whose bounce region will be accessed.
898  * \param pagenr  The page index used to compute the bounce offset.
899  * \param sector  The 512b sector index used to compute the page relative
900  *                bounce offset.
901  *
902  * \return  The computed global bounce buffer address.
903  */
904 static inline uint8_t *
905 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
906 {
907         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
908 }
909 #endif
910
911 /**
912  * Given a page number and 512b sector offset within that page,
913  * calculate an offset into the request's memory region that the
914  * underlying backend device/file should use for I/O.
915  *
916  * \param reqlist The request structure whose I/O region will be accessed.
917  * \param pagenr  The page index used to compute the I/O offset.
918  * \param sector  The 512b sector index used to compute the page relative
919  *                I/O offset.
920  *
921  * \return  The computed global I/O address.
922  *
923  * Depending on configuration, this will either be a local bounce buffer
924  * or a pointer to the memory mapped in from the front-end domain for
925  * this request.
926  */
927 static inline uint8_t *
928 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
929 {
930 #ifdef XBB_USE_BOUNCE_BUFFERS
931         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
932 #else
933         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
934 #endif
935 }
936
937 /**
938  * Given a page index and 512b sector offset within that page, calculate
939  * an offset into the local pseudo-physical address space used to map a
940  * front-end's request data into a request.
941  *
942  * \param reqlist The request list structure whose pseudo-physical region
943  *                will be accessed.
944  * \param pagenr  The page index used to compute the pseudo-physical offset.
945  * \param sector  The 512b sector index used to compute the page relative
946  *                pseudo-physical offset.
947  *
948  * \return  The computed global pseudo-physical address.
949  *
950  * Depending on configuration, this will either be a local bounce buffer
951  * or a pointer to the memory mapped in from the front-end domain for
952  * this request.
953  */
954 static inline uintptr_t
955 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
956 {
957         struct xbb_softc *xbb;
958
959         xbb = reqlist->xbb;
960
961         return ((uintptr_t)(xbb->gnt_base_addr +
962                 (uintptr_t)(reqlist->kva - xbb->kva) +
963                 (PAGE_SIZE * pagenr) + (sector << 9)));
964 }
965
966 /**
967  * Get Kernel Virtual Address space for mapping requests.
968  *
969  * \param xbb       Per-instance xbb configuration structure.
970  * \param nr_pages  Number of pages needed.
973  *
974  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
975  *
976  * Note:  This should be unnecessary once we have either chaining or
977  * scatter/gather support for struct bio.  At that point we'll be able to
978  * put multiple addresses and lengths in one bio/bio chain and won't need
979  * to map everything into one virtual segment.
980  */
981 static uint8_t *
982 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
983 {
984         int first_clear;
985         int num_clear;
986         uint8_t *free_kva;
987         int      i;
988
989         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
990
991         first_clear = 0;
992         free_kva = NULL;
993
994         mtx_lock(&xbb->lock);
995
996         /*
997          * Look for the first available page.  If there are none, we're done.
998          */
999         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
1000
1001         if (first_clear == -1)
1002                 goto bailout;
1003
1004         /*
1005          * Starting at the first available page, look for consecutive free
1006          * pages that will satisfy the user's request.
1007          */
1008         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1009                 /*
1010                  * If this is true, the page is used, so we have to reset
1011                  * the number of clear pages and the first clear page
1012                  * (since it pointed to a region with an insufficient number
1013                  * of clear pages).
1014                  */
1015                 if (bit_test(xbb->kva_free, i)) {
1016                         num_clear = 0;
1017                         first_clear = -1;
1018                         continue;
1019                 }
1020
1021                 if (first_clear == -1)
1022                         first_clear = i;
1023
1024                 /*
1025                  * If this is true, we've found a large enough free region
1026                  * to satisfy the request.
1027                  */
1028                 if (++num_clear == nr_pages) {
1029
1030                         bit_nset(xbb->kva_free, first_clear,
1031                                  first_clear + nr_pages - 1);
1032
1033                         free_kva = xbb->kva +
1034                                 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE);
1035
1036                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1037                                 free_kva + (nr_pages * PAGE_SIZE) <=
1038                                 (uint8_t *)xbb->ring_config.va,
1039                                 ("Free KVA %p len %d out of range, "
1040                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1041                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1042                                  (uintmax_t)xbb->ring_config.va));
1043                         break;
1044                 }
1045         }
1046
1047 bailout:
1048
1049         if (free_kva == NULL) {
1050                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1051                 xbb->kva_shortages++;
1052         }
1053
1054         mtx_unlock(&xbb->lock);
1055
1056         return (free_kva);
1057 }
1058
1059 /**
1060  * Free allocated KVA.
1061  *
1062  * \param xbb       Per-instance xbb configuration structure.
1063  * \param kva_ptr   Pointer to allocated KVA region.  
1064  * \param nr_pages  Number of pages in the KVA region.
1065  */
1066 static void
1067 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1068 {
1069         intptr_t start_page;
1070
1071         mtx_assert(&xbb->lock, MA_OWNED);
1072
1073         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1074         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1075
1076 }
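
/*
 * Illustrative usage sketch (guarded out, not driver code) showing how
 * xbb_get_kva() and xbb_free_kva() pair up.  Note the asymmetry:
 * xbb_get_kva() takes the instance lock itself, while xbb_free_kva()
 * asserts that the caller already holds it, as xbb_release_reqlist()
 * does below.  The function name is hypothetical.
 */
#if 0
static void
example_kva_cycle(struct xbb_softc *xbb, int nr_pages)
{
	uint8_t *kva;

	kva = xbb_get_kva(xbb, nr_pages);
	if (kva == NULL)
		return;	/* Shortage flagged; the taskqueue will retry. */

	/* ... map grants into [kva, kva + nr_pages * PAGE_SIZE) ... */

	mtx_lock(&xbb->lock);
	xbb_free_kva(xbb, kva, nr_pages);
	mtx_unlock(&xbb->lock);
}
#endif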
1077
1078 /**
1079  * Unmap the front-end pages associated with this I/O request.
1080  *
1081  * \param reqlist  The request list structure to unmap.
1082  */
1083 static void
1084 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1085 {
1086         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1087         u_int                         i;
1088         u_int                         invcount;
1089         int                           error;
1090
1091         invcount = 0;
1092         for (i = 0; i < reqlist->nr_segments; i++) {
1093
1094                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1095                         continue;
1096
1097                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1098                 unmap[invcount].dev_bus_addr = 0;
1099                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1100                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1101                 invcount++;
1102         }
1103
1104         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1105                                           unmap, invcount);
1106         KASSERT(error == 0, ("Grant table operation failed"));
1107 }
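
/*
 * Illustrative sketch (guarded out, not driver code) of the map
 * operation that the unmap above reverses.  The real setup is done in
 * xbb_dispatch_io(), driven by the grant references carried in the ring
 * request; "gref" here stands in for that per-segment reference and the
 * function name is hypothetical.
 */
#if 0
static void
example_map_one_segment(struct xbb_xen_reqlist *reqlist, u_int seg,
			grant_ref_t gref, int readonly)
{
	struct gnttab_map_grant_ref map;

	map.host_addr = xbb_get_gntaddr(reqlist, seg, /*sector*/0);
	map.flags     = GNTMAP_host_map;
	if (readonly != 0)
		map.flags |= GNTMAP_readonly;
	map.ref       = gref;
	map.dom       = reqlist->xbb->otherend_id;

	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1) == 0 &&
	    map.status == GNTST_okay)
		reqlist->gnt_handles[seg] = map.handle;
	else
		reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
}
#endif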
1108
1109 /**
1110  * Allocate an internal transaction tracking structure from the free pool.
1111  *
1112  * \param xbb  Per-instance xbb configuration structure.
1113  *
1114  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1115  *          Otherwise NULL.
1116  */
1117 static inline struct xbb_xen_reqlist *
1118 xbb_get_reqlist(struct xbb_softc *xbb)
1119 {
1120         struct xbb_xen_reqlist *reqlist;
1121
1122         reqlist = NULL;
1123
1124         mtx_assert(&xbb->lock, MA_OWNED);
1125
1126         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1127
1128                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1129                 reqlist->flags = XBB_REQLIST_NONE;
1130                 reqlist->kva = NULL;
1131                 reqlist->status = BLKIF_RSP_OKAY;
1132                 reqlist->residual_512b_sectors = 0;
1133                 reqlist->num_children = 0;
1134                 reqlist->nr_segments = 0;
1135                 STAILQ_INIT(&reqlist->contig_req_list);
1136         }
1137
1138         return (reqlist);
1139 }
1140
1141 /**
1142  * Return an allocated transaction tracking structure to the free pool.
1143  *
1144  * \param xbb        Per-instance xbb configuration structure.
1145  * \param reqlist    The request list structure to free.
1146  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
1147  *                   during a resource shortage condition.
1148  */
1149 static inline void
1150 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1151                     int wakeup)
1152 {
1153
1154         mtx_assert(&xbb->lock, MA_OWNED);
1155
1156         if (wakeup) {
1157                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1158                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1159         }
1160
1161         if (reqlist->kva != NULL)
1162                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1163
1164         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1165
1166         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1167
1168         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1169                 /*
1170                  * Shutdown is in progress.  See if we can
1171                  * progress further now that one more request
1172                  * has completed and been returned to the
1173                  * free pool.
1174                  */
1175                 xbb_shutdown(xbb);
1176         }
1177
1178         if (wakeup != 0)
1179                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1180 }
1181
1182 /**
1183  * Request resources and do basic request setup.
1184  *
1185  * \param xbb          Per-instance xbb configuration structure.
1186  * \param reqlist      Pointer to reqlist pointer.
1187  * \param ring_req     Pointer to a block ring request.
1188  * \param ring_idx     The ring index of this request.
1189  *
1190  * \return  0 for success, non-zero for failure.
1191  */
1192 static int
1193 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1194                   blkif_request_t *ring_req, RING_IDX ring_idx)
1195 {
1196         struct xbb_xen_reqlist *nreqlist;
1197         struct xbb_xen_req     *nreq;
1198
1199         nreqlist = NULL;
1200         nreq     = NULL;
1201
1202         mtx_lock(&xbb->lock);
1203
1204         /*
1205          * We don't allow new resources to be allocated if we're in the
1206          * process of shutting down.
1207          */
1208         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1209                 mtx_unlock(&xbb->lock);
1210                 return (1);
1211         }
1212
1213         /*
1214          * Allocate a reqlist if the caller doesn't have one already.
1215          */
1216         if (*reqlist == NULL) {
1217                 nreqlist = xbb_get_reqlist(xbb);
1218                 if (nreqlist == NULL)
1219                         goto bailout_error;
1220         }
1221
1222         /* We always allocate a request. */
1223         nreq = xbb_get_req(xbb);
1224         if (nreq == NULL)
1225                 goto bailout_error;
1226
1227         mtx_unlock(&xbb->lock);
1228
1229         if (*reqlist == NULL) {
1230                 *reqlist = nreqlist;
1231                 nreqlist->operation = ring_req->operation;
1232                 nreqlist->starting_sector_number = ring_req->sector_number;
1233                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1234                                    links);
1235         }
1236
1237         nreq->reqlist = *reqlist;
1238         nreq->req_ring_idx = ring_idx;
1239         nreq->id = ring_req->id;
1240         nreq->operation = ring_req->operation;
1241
1242         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1243                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1244                 nreq->ring_req = &nreq->ring_req_storage;
1245         } else {
1246                 nreq->ring_req = ring_req;
1247         }
1248
1249         binuptime(&nreq->ds_t0);
1250         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1251         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1252         (*reqlist)->num_children++;
1253         (*reqlist)->nr_segments += ring_req->nr_segments;
1254
1255         return (0);
1256
1257 bailout_error:
1258
1259         /*
1260          * We're out of resources, so set the shortage flag.  The next time
1261          * a request is released, we'll try waking up the work thread to
1262          * see if we can allocate more resources.
1263          */
1264         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1265         xbb->request_shortages++;
1266
1267         if (nreq != NULL)
1268                 xbb_release_req(xbb, nreq);
1269
1270         if (nreqlist != NULL)
1271                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1272
1273         mtx_unlock(&xbb->lock);
1274
1275         return (1);
1276 }
1277
1278 /**
1279  * Create and queue a response to a blkif request.
1280  * 
1281  * \param xbb     Per-instance xbb configuration structure.
1282  * \param req     The request structure to which to respond.
1283  * \param status  The status code to report.  See BLKIF_RSP_*
1284  *                in sys/xen/interface/io/blkif.h.
1285  */
1286 static void
1287 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1288 {
1289         blkif_response_t *resp;
1290
1291         /*
1292          * The mutex is required here, and should be held across this call
1293          * until after the subsequent call to xbb_push_responses().  This
1294          * is to guarantee that another context won't queue responses and
1295          * push them while we're active.
1296          *
1297          * That could lead to the other end being notified of responses
1298          * before the resources have been freed on this end.  The other end
1299          * would then be able to queue additional I/O, and we may run out
1300          * of resources because we haven't freed them all yet.
1301          */
1302         mtx_assert(&xbb->lock, MA_OWNED);
1303
1304         /*
1305          * Place on the response ring for the relevant domain.
1306          * For now, only the spacing between entries is different
1307          * in the different ABIs, not the response entry layout.
1308          */
1309         switch (xbb->abi) {
1310         case BLKIF_PROTOCOL_NATIVE:
1311                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1312                                          xbb->rings.native.rsp_prod_pvt);
1313                 break;
1314         case BLKIF_PROTOCOL_X86_32:
1315                 resp = (blkif_response_t *)
1316                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1317                                       xbb->rings.x86_32.rsp_prod_pvt);
1318                 break;
1319         case BLKIF_PROTOCOL_X86_64:
1320                 resp = (blkif_response_t *)
1321                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1322                                       xbb->rings.x86_64.rsp_prod_pvt);
1323                 break;
1324         default:
1325                 panic("Unexpected blkif protocol ABI.");
1326         }
1327
1328         resp->id        = req->id;
1329         resp->operation = req->operation;
1330         resp->status    = status;
1331
1332         if (status != BLKIF_RSP_OKAY)
1333                 xbb->reqs_completed_with_error++;
1334
1335         xbb->rings.common.rsp_prod_pvt++;
1336
1337         xbb->reqs_queued_for_completion++;
1338
1339 }
1340
1341 /**
1342  * Send queued responses to blkif requests.
1343  * 
1344  * \param xbb            Per-instance xbb configuration structure.
1345  * \param run_taskqueue  Flag that is set to 1 if the taskqueue
1346  *                       should be run, 0 if it does not need to be run.
1347  * \param notify         Flag that is set to 1 if the other end should be
1348  *                       notified via irq, 0 if the other end should not be
1349  *                       notified.
1350  */
1351 static void
1352 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
1353 {
1354         int more_to_do;
1355
1356         /*
1357          * The mutex is required here.
1358          */
1359         mtx_assert(&xbb->lock, MA_OWNED);
1360
1361         more_to_do = 0;
1362
1363         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
1364
1365         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1366
1367                 /*
1368                  * Tail check for pending requests. Allows frontend to avoid
1369                  * notifications if requests are already in flight (lower
1370                  * overheads and promotes batching).
1371                  */
1372                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1373         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1374
1375                 more_to_do = 1;
1376         }
1377
1378         xbb->reqs_completed += xbb->reqs_queued_for_completion;
1379         xbb->reqs_queued_for_completion = 0;
1380
1381         *run_taskqueue = more_to_do;
1382 }
1383
1384 /**
1385  * Complete a request list.
1386  *
1387  * \param xbb        Per-instance xbb configuration structure.
1388  * \param reqlist    Allocated internal request list structure.
1389  */
1390 static void
1391 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1392 {
1393         struct xbb_xen_req *nreq;
1394         off_t               sectors_sent;
1395         int                 notify, run_taskqueue;
1396
1397         sectors_sent = 0;
1398
1399         if (reqlist->flags & XBB_REQLIST_MAPPED)
1400                 xbb_unmap_reqlist(reqlist);
1401
1402         mtx_lock(&xbb->lock);
1403
1404         /*
1405          * All I/O is done, send the response. A lock is not necessary
1406          * to protect the request list, because all requests have
1407          * completed.  Therefore this is the only context accessing this
1408          * reqlist right now.  However, in order to make sure that no one
1409          * else queues responses onto the queue or pushes them to the other
1410          * side while we're active, we need to hold the lock across the
1411          * calls to xbb_queue_response() and xbb_push_responses().
1412          */
1413         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1414                 off_t cur_sectors_sent;
1415
1416                 /* Put this response on the ring, but don't push yet */
1417                 xbb_queue_response(xbb, nreq, reqlist->status);
1418
1419                 /* We don't report bytes sent if there is an error. */
1420                 if (reqlist->status == BLKIF_RSP_OKAY)
1421                         cur_sectors_sent = nreq->nr_512b_sectors;
1422                 else
1423                         cur_sectors_sent = 0;
1424
1425                 sectors_sent += cur_sectors_sent;
1426
1427                 devstat_end_transaction(xbb->xbb_stats_in,
1428                                         /*bytes*/cur_sectors_sent << 9,
1429                                         reqlist->ds_tag_type,
1430                                         reqlist->ds_trans_type,
1431                                         /*now*/NULL,
1432                                         /*then*/&nreq->ds_t0);
1433         }
1434
1435         /*
1436          * Take out any sectors not sent.  If we wind up negative (which
1437          * might happen if an error is reported as well as a residual), just
1438          * report 0 sectors sent.
1439          */
1440         sectors_sent -= reqlist->residual_512b_sectors;
1441         if (sectors_sent < 0)
1442                 sectors_sent = 0;
1443
1444         devstat_end_transaction(xbb->xbb_stats,
1445                                 /*bytes*/ sectors_sent << 9,
1446                                 reqlist->ds_tag_type,
1447                                 reqlist->ds_trans_type,
1448                                 /*now*/NULL,
1449                                 /*then*/&reqlist->ds_t0);
1450
1451         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1452
1453         xbb_push_responses(xbb, &run_taskqueue, &notify);
1454
1455         mtx_unlock(&xbb->lock);
1456
1457         if (run_taskqueue)
1458                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1459
1460         if (notify)
1461                 xen_intr_signal(xbb->xen_intr_handle);
1462 }
1463
1464 /**
1465  * Completion handler for buffer I/O requests issued by the device
1466  * backend driver.
1467  *
1468  * \param bio  The buffer I/O request on which to perform completion
1469  *             processing.
1470  */
1471 static void
1472 xbb_bio_done(struct bio *bio)
1473 {
1474         struct xbb_softc       *xbb;
1475         struct xbb_xen_reqlist *reqlist;
1476
1477         reqlist = bio->bio_caller1;
1478         xbb     = reqlist->xbb;
1479
1480         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1481
1482         /*
1483          * This is a bit imprecise.  With aggregated I/O a single
1484          * request list can contain multiple front-end requests and
1485  * multiple bios may point to a single request.  By carefully
1486          * walking the request list, we could map residuals and errors
1487          * back to the original front-end request, but the interface
1488          * isn't sufficiently rich for us to properly report the error.
1489          * So, we just treat the entire request list as having failed if an
1490          * error occurs on any part.  And, if an error occurs, we treat
1491          * the amount of data transferred as 0.
1492          *
1493          * For residuals, we report it on the overall aggregated device,
1494          * but not on the individual requests, since we don't currently
1495          * do the work to determine which front-end request to which the
1496          * residual applies.
1497          */
1498         if (bio->bio_error) {
1499                 DPRINTF("BIO returned error %d for operation on device %s\n",
1500                         bio->bio_error, xbb->dev_name);
1501                 reqlist->status = BLKIF_RSP_ERROR;
1502
1503                 if (bio->bio_error == ENXIO
1504                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1505
1506                         /*
1507                          * Backend device has disappeared.  Signal the
1508                          * front-end that we (the device proxy) want to
1509                          * go away.
1510                          */
1511                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1512                 }
1513         }
1514
1515 #ifdef XBB_USE_BOUNCE_BUFFERS
1516         if (bio->bio_cmd == BIO_READ) {
1517                 vm_offset_t kva_offset;
1518
1519                 kva_offset = (vm_offset_t)bio->bio_data
1520                            - (vm_offset_t)reqlist->bounce;
1521                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1522                        bio->bio_data, bio->bio_bcount);
1523         }
1524 #endif /* XBB_USE_BOUNCE_BUFFERS */
1525
1526         /*
1527          * Decrement the pending count for the request list.  When we're
1528          * done with the requests, send status back for all of them.
1529          */
1530         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1531                 xbb_complete_reqlist(xbb, reqlist);
1532
1533         g_destroy_bio(bio);
1534 }
1535
1536 /**
1537  * Parse a blkif request into an internal request structure and send
1538  * it to the backend for processing.
1539  *
1540  * \param xbb       Per-instance xbb configuration structure.
1541  * \param reqlist   Allocated internal request list structure.
1542  *
1543  * \return          On success, 0.  For resource shortages, non-zero.
1544  *  
1545  * This routine performs the aspects of request parsing common to all
1546  * backends: compiling an internal request structure, parsing the S/G
1547  * list and any secondary ring requests in which its segments may reside,
1548  * and mapping the front-end's I/O pages into our domain.
1549  */
1550 static int
1551 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1552 {
1553         struct xbb_sg                *xbb_sg;
1554         struct gnttab_map_grant_ref  *map;
1555         struct blkif_request_segment *sg;
1556         struct blkif_request_segment *last_block_sg;
1557         struct xbb_xen_req           *nreq;
1558         u_int                         nseg;
1559         u_int                         seg_idx;
1560         u_int                         block_segs;
1561         int                           nr_sects;
1562         int                           total_sects;
1563         int                           operation;
1564         uint8_t                       bio_flags;
1565         int                           error;
1566
1567         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1568         bio_flags            = 0;
1569         total_sects          = 0;
1570         nr_sects             = 0;
1571
1572         /*
1573          * First determine whether we have enough free KVA to satisfy this
1574          * request list.  If not, tell xbb_run_queue() so it can go to
1575          * sleep until we have more KVA.
1576          */
1577         reqlist->kva = NULL;
1578         if (reqlist->nr_segments != 0) {
1579                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1580                 if (reqlist->kva == NULL) {
1581                         /*
1582                          * If we're out of KVA, return ENOMEM.
1583                          */
1584                         return (ENOMEM);
1585                 }
1586         }
1587
1588         binuptime(&reqlist->ds_t0);
1589         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1590
1591         switch (reqlist->operation) {
1592         case BLKIF_OP_WRITE_BARRIER:
1593                 bio_flags       |= BIO_ORDERED;
1594                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1595                 /* FALLTHROUGH */
1596         case BLKIF_OP_WRITE:
1597                 operation = BIO_WRITE;
1598                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1599                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1600                         DPRINTF("Attempt to write to read only device %s\n",
1601                                 xbb->dev_name);
1602                         reqlist->status = BLKIF_RSP_ERROR;
1603                         goto send_response;
1604                 }
1605                 break;
1606         case BLKIF_OP_READ:
1607                 operation = BIO_READ;
1608                 reqlist->ds_trans_type = DEVSTAT_READ;
1609                 break;
1610         case BLKIF_OP_FLUSH_DISKCACHE:
1611                 /*
1612                  * If this is true, the user has requested that we disable
1613                  * flush support.  So we just complete the requests
1614                  * successfully.
1615                  */
1616                 if (xbb->disable_flush != 0) {
1617                         goto send_response;
1618                 }
1619
1620                 /*
1621                  * The user has requested that we only send a real flush
1622                  * for every N flush requests.  So keep count, and either
1623                  * complete the request immediately or queue it for the
1624                  * backend.
1625                  */
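                     /*
                      * (e.g. with flush_interval == 3, two of every three
                      * flush requests are acknowledged immediately and only
                      * the third is passed down to the backing store.)
                      */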
1626                 if (xbb->flush_interval != 0) {
1627                         if (++(xbb->flush_count) < xbb->flush_interval) {
1628                                 goto send_response;
1629                         } else
1630                                 xbb->flush_count = 0;
1631                 }
1632
1633                 operation = BIO_FLUSH;
1634                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1635                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1636                 goto do_dispatch;
1637                 /*NOTREACHED*/
1638         default:
1639                 DPRINTF("error: unknown block io operation [%d]\n",
1640                         reqlist->operation);
1641                 reqlist->status = BLKIF_RSP_ERROR;
1642                 goto send_response;
1643         }
1644
1645         reqlist->xbb  = xbb;
1646         xbb_sg        = xbb->xbb_sgs;
1647         map           = xbb->maps;
1648         seg_idx       = 0;
1649
1650         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1651                 blkif_request_t         *ring_req;
1652                 RING_IDX                 req_ring_idx;
1653                 u_int                    req_seg_idx;
1654
1655                 ring_req              = nreq->ring_req;
1656                 req_ring_idx          = nreq->req_ring_idx;
1657                 nr_sects              = 0;
1658                 nseg                  = ring_req->nr_segments;
1659                 nreq->nr_pages        = nseg;
1660                 nreq->nr_512b_sectors = 0;
1661                 req_seg_idx           = 0;
1662                 sg                    = NULL;
1663
1664                 /* Check that number of segments is sane. */
1665                 if (__predict_false(nseg == 0)
1666                  || __predict_false(nseg > xbb->max_request_segments)) {
1667                         DPRINTF("Bad number of segments in request (%d)\n",
1668                                 nseg);
1669                         reqlist->status = BLKIF_RSP_ERROR;
1670                         goto send_response;
1671                 }
1672
1673                 block_segs    = nseg;
1674                 sg            = ring_req->seg;
1675                 last_block_sg = sg + block_segs;
1676
1677                 while (sg < last_block_sg) {
1678                         KASSERT(seg_idx <
1679                                 XBB_MAX_SEGMENTS_PER_REQLIST,
1680                                 ("seg_idx %d is too large, max "
1681                                 "segs %d\n", seg_idx,
1682                                 XBB_MAX_SEGMENTS_PER_REQLIST));
1683
1684                         xbb_sg->first_sect = sg->first_sect;
1685                         xbb_sg->last_sect  = sg->last_sect;
1686                         xbb_sg->nsect =
1687                             (int8_t)(sg->last_sect -
1688                             sg->first_sect + 1);
1689
1690                         if ((sg->last_sect >= (PAGE_SIZE >> 9))
1691                          || (xbb_sg->nsect <= 0)) {
1692                                 reqlist->status = BLKIF_RSP_ERROR;
1693                                 goto send_response;
1694                         }
1695
1696                         nr_sects += xbb_sg->nsect;
1697                         map->host_addr = xbb_get_gntaddr(reqlist,
1698                                                 seg_idx, /*sector*/0);
1699                         KASSERT(map->host_addr + PAGE_SIZE <=
1700                                 xbb->ring_config.gnt_addr,
1701                                 ("Host address %#jx len %d overlaps "
1702                                  "ring address %#jx\n",
1703                                 (uintmax_t)map->host_addr, PAGE_SIZE,
1704                                 (uintmax_t)xbb->ring_config.gnt_addr));
1705
1706                         map->flags     = GNTMAP_host_map;
1707                         map->ref       = sg->gref;
1708                         map->dom       = xbb->otherend_id;
1709                         if (operation == BIO_WRITE)
1710                                 map->flags |= GNTMAP_readonly;
1711                         sg++;
1712                         map++;
1713                         xbb_sg++;
1714                         seg_idx++;
1715                         req_seg_idx++;
1716                 }
1717
1718                 /* Convert to the disk's sector size */
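                     /*
                      * (e.g. with a 4096 byte backing sector,
                      * sector_size_shift is 12, so eight 512b front-end
                      * sectors become one backing-store sector.)
                      */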
1719                 nreq->nr_512b_sectors = nr_sects;
1720                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1721                 total_sects += nr_sects;
1722
1723                 if ((nreq->nr_512b_sectors &
1724                     ((xbb->sector_size >> 9) - 1)) != 0) {
1725                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1726                                       "a multiple of the backing store sector "
1727                                       "size (%d)\n", __func__,
1728                                       nreq->nr_512b_sectors << 9,
1729                                       xbb->sector_size);
1730                         reqlist->status = BLKIF_RSP_ERROR;
1731                         goto send_response;
1732                 }
1733         }
1734
1735         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1736                                           xbb->maps, reqlist->nr_segments);
1737         if (error != 0)
1738                 panic("Grant table operation failed (%d)", error);
1739
1740         reqlist->flags |= XBB_REQLIST_MAPPED;
1741
1742         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1743              seg_idx++, map++){
1744
1745                 if (__predict_false(map->status != 0)) {
1746                         DPRINTF("invalid buffer -- could not remap "
1747                                 "it (%d)\n", map->status);
1748                         DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
1749                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1750                                 map->host_addr, map->flags, map->ref,
1751                                 map->dom);
1752                         reqlist->status = BLKIF_RSP_ERROR;
1753                         goto send_response;
1754                 }
1755
1756                 reqlist->gnt_handles[seg_idx] = map->handle;
1757         }
1758         if (reqlist->starting_sector_number + total_sects >
1759             xbb->media_num_sectors) {
1760
1761                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1762                         "extends past end of device %s\n",
1763                         operation == BIO_READ ? "read" : "write",
1764                         reqlist->starting_sector_number,
1765                         reqlist->starting_sector_number + total_sects,
1766                         xbb->dev_name); 
1767                 reqlist->status = BLKIF_RSP_ERROR;
1768                 goto send_response;
1769         }
1770
1771 do_dispatch:
1772
1773         error = xbb->dispatch_io(xbb,
1774                                  reqlist,
1775                                  operation,
1776                                  bio_flags);
1777
1778         if (error != 0) {
1779                 reqlist->status = BLKIF_RSP_ERROR;
1780                 goto send_response;
1781         }
1782
1783         return (0);
1784
1785 send_response:
1786
1787         xbb_complete_reqlist(xbb, reqlist);
1788
1789         return (0);
1790 }
1791
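/**
 * Compute the size of a front-end request by summing the 512 byte
 * sector spans of its segments.
 *
 * \param ring_req  The front-end request to size.
 *
 * \return  The request size in 512 byte sectors.
 */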
1792 static __inline int
1793 xbb_count_sects(blkif_request_t *ring_req)
1794 {
1795         int i;
1796         int cur_size = 0;
1797
1798         for (i = 0; i < ring_req->nr_segments; i++) {
1799                 int nsect;
1800
1801                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1802                         ring_req->seg[i].first_sect + 1);
1803                 if (nsect <= 0)
1804                         break;
1805
1806                 cur_size += nsect;
1807         }
1808
1809         return (cur_size);
1810 }
1811
1812 /**
1813  * Process incoming requests from the shared communication ring in response
1814  * to a signal on the ring's event channel.
1815  *
1816  * \param context  Callback argument registered during task initialization -
1817  *                 the xbb_softc for this instance.
1818  * \param pending  The number of taskqueue_enqueue events that have
1819  *                 occurred since this handler was last run.
1820  */
1821 static void
1822 xbb_run_queue(void *context, int pending)
1823 {
1824         struct xbb_softc       *xbb;
1825         blkif_back_rings_t     *rings;
1826         RING_IDX                rp;
1827         uint64_t                cur_sector;
1828         int                     cur_operation;
1829         struct xbb_xen_reqlist *reqlist;
1830
1831
1832         xbb   = (struct xbb_softc *)context;
1833         rings = &xbb->rings;
1834
1835         /*
1836          * Work gather and dispatch loop.  Note that we have a bias here
1837          * towards gathering I/O sent by blockfront.  We first gather up
1838          * everything in the ring, as long as we have resources.  Then we
1839          * dispatch one request, and then attempt to gather up any
1840          * additional requests that have come in while we were dispatching
1841          * the request.
1842          *
1843          * This allows us to get a clearer picture (via devstat) of how
1844          * many requests blockfront is queueing to us at any given time.
1845          */
1846         for (;;) {
1847                 int retval;
1848
1849                 /*
1850                  * Initialize reqlist to the last element in the pending
1851                  * queue, if there is one.  This allows us to add more
1852                  * requests to that request list, if we have room.
1853                  */
1854                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1855                                       xbb_xen_reqlist, links);
1856                 if (reqlist != NULL) {
1857                         cur_sector = reqlist->next_contig_sector;
1858                         cur_operation = reqlist->operation;
1859                 } else {
1860                         cur_operation = 0;
1861                         cur_sector    = 0;
1862                 }
1863
1864                 /*
1865                  * Cache req_prod to avoid accessing a cache line shared
1866                  * with the frontend.
1867                  */
1868                 rp = rings->common.sring->req_prod;
1869
1870                 /* Ensure we see queued requests up to 'rp'. */
1871                 rmb();
1872
1873                 /**
1874                  * Run so long as there is work to consume and the generation
1875                  * of a response will not overflow the ring.
1876                  *
1877                  * @note There's a 1 to 1 relationship between requests and
1878                  *       responses, so an overflow should never occur.  This
1879                  *       test is to protect our domain from digesting bogus
1880                  *       data.  Shouldn't we log this?
1881                  */
1882                 while (rings->common.req_cons != rp
1883                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1884                                                   rings->common.req_cons) == 0){
1885                         blkif_request_t         ring_req_storage;
1886                         blkif_request_t        *ring_req;
1887                         int                     cur_size;
1888
1889                         switch (xbb->abi) {
1890                         case BLKIF_PROTOCOL_NATIVE:
1891                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1892                                     rings->common.req_cons);
1893                                 break;
1894                         case BLKIF_PROTOCOL_X86_32:
1895                         {
1896                                 struct blkif_x86_32_request *ring_req32;
1897
1898                                 ring_req32 = RING_GET_REQUEST(
1899                                     &xbb->rings.x86_32, rings->common.req_cons);
1900                                 blkif_get_x86_32_req(&ring_req_storage,
1901                                                      ring_req32);
1902                                 ring_req = &ring_req_storage;
1903                                 break;
1904                         }
1905                         case BLKIF_PROTOCOL_X86_64:
1906                         {
1907                                 struct blkif_x86_64_request *ring_req64;
1908
1909                                 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1910                                     rings->common.req_cons);
1911                                 blkif_get_x86_64_req(&ring_req_storage,
1912                                                      ring_req64);
1913                                 ring_req = &ring_req_storage;
1914                                 break;
1915                         }
1916                         default:
1917                                 panic("Unexpected blkif protocol ABI.");
1918                                 /* NOTREACHED */
1919                         } 
1920
1921                         /*
1922                          * Check for situations that would require closing
1923                          * off this I/O for further coalescing:
1924                          *  - Coalescing is turned off.
1925                          *  - Current I/O is out of sequence with the previous
1926                          *    I/O.
1927                          *  - Coalesced I/O would be too large.
1928                          */
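                             /*
                              * If none of these conditions applies, the
                              * new ring request is added to the existing
                              * reqlist by xbb_get_resources() below.
                              */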
1929                         if ((reqlist != NULL)
1930                          && ((xbb->no_coalesce_reqs != 0)
1931                           || ((xbb->no_coalesce_reqs == 0)
1932                            && ((ring_req->sector_number != cur_sector)
1933                             || (ring_req->operation != cur_operation)
1934                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1935                                  xbb->max_reqlist_segments))))) {
1936                                 reqlist = NULL;
1937                         }
1938
1939                         /*
1940                          * Grab and check for all resources in one shot.
1941                          * If we can't get all of the resources we need,
1942                          * the shortage is noted and the thread will get
1943                          * woken up when more resources are available.
1944                          */
1945                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1946                                                    xbb->rings.common.req_cons);
1947
1948                         if (retval != 0) {
1949                                 /*
1950                                  * Resource shortage has been recorded.
1951                                  * We'll be scheduled to run once a request
1952                                  * object frees up due to a completion.
1953                                  */
1954                                 break;
1955                         }
1956
1957                         /*
1958                          * Signify that we can overwrite this request with
1959                          * a response by incrementing our consumer index.
1960                          * The response won't be generated until after
1961                          * we've already consumed all necessary data out
1962                          * of the version of the request in the ring buffer
1963                          * (for native mode).  We must update the consumer
1964                          * index before issuing back-end I/O so there is
1965                          * no possibility that it will complete and a
1966                          * response be generated before we make room in 
1967                          * the queue for that response.
1968                          */
1969                         xbb->rings.common.req_cons++;
1970                         xbb->reqs_received++;
1971
1972                         cur_size = xbb_count_sects(ring_req);
1973                         cur_sector = ring_req->sector_number + cur_size;
1974                         reqlist->next_contig_sector = cur_sector;
1975                         cur_operation = ring_req->operation;
1976                 }
1977
1978                 /* Check for I/O to dispatch */
1979                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1980                 if (reqlist == NULL) {
1981                         /*
1982                          * We're out of work to do, put the task queue to
1983                          * sleep.
1984                          */
1985                         break;
1986                 }
1987
1988                 /*
1989                  * Grab the first request off the queue and attempt
1990                  * to dispatch it.
1991                  */
1992                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1993
1994                 retval = xbb_dispatch_io(xbb, reqlist);
1995                 if (retval != 0) {
1996                         /*
1997                          * xbb_dispatch_io() returns non-zero only when
1998                          * there is a resource shortage.  If that's the
1999                          * case, re-queue this request on the head of the
2000                          * queue, and go to sleep until we have more
2001                          * resources.
2002                          */
2003                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
2004                                            reqlist, links);
2005                         break;
2006                 } else {
2007                         /*
2008                          * If we still have anything on the queue after
2009                          * removing the head entry, that is because we
2010                          * met one of the criteria to create a new
2011                          * request list (outlined above), and we'll call
2012                          * that a forced dispatch for statistical purposes.
2013                          *
2014                          * Otherwise, if there is only one element on the
2015                          * queue, we coalesced everything available on
2016                          * the ring and we'll call that a normal dispatch.
2017                          */
2018                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2019
2020                         if (reqlist != NULL)
2021                                 xbb->forced_dispatch++;
2022                         else
2023                                 xbb->normal_dispatch++;
2024
2025                         xbb->total_dispatch++;
2026                 }
2027         }
2028 }
2029
2030 /**
2031  * Interrupt handler bound to the shared ring's event channel.
2032  *
2033  * \param arg  Callback argument registered during event channel
2034  *             binding - the xbb_softc for this instance.
2035  */
2036 static int
2037 xbb_filter(void *arg)
2038 {
2039         struct xbb_softc *xbb;
2040
2041         /* Defer to taskqueue thread. */
2042         xbb = (struct xbb_softc *)arg;
2043         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
2044
2045         return (FILTER_HANDLED);
2046 }
2047
2048 SDT_PROVIDER_DEFINE(xbb);
2049 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
2050 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
2051                   "uint64_t");
2052 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
2053                   "uint64_t", "uint64_t");
2054
2055 /*----------------------------- Backend Handlers -----------------------------*/
2056 /**
2057  * Backend handler for character device access.
2058  *
2059  * \param xbb        Per-instance xbb configuration structure.
2060  * \param reqlist    Allocated internal request list structure.
2061  * \param operation  BIO_* I/O operation code.
2062  * \param bio_flags  Additional bio_flag data to pass to any generated
2063  *                   bios (e.g. BIO_ORDERED).
2064  *
2065  * \return  0 for success, errno codes for failure.
2066  */
2067 static int
2068 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2069                  int operation, int bio_flags)
2070 {
2071         struct xbb_dev_data *dev_data;
2072         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2073         off_t                bio_offset;
2074         struct bio          *bio;
2075         struct xbb_sg       *xbb_sg;
2076         u_int                nbio;
2077         u_int                bio_idx;
2078         u_int                nseg;
2079         u_int                seg_idx;
2080         int                  error;
2081
2082         dev_data   = &xbb->backend.dev;
2083         bio_offset = (off_t)reqlist->starting_sector_number
2084                    << xbb->sector_size_shift;
2085         error      = 0;
2086         nbio       = 0;
2087         bio_idx    = 0;
2088
2089         if (operation == BIO_FLUSH) {
2090                 bio = g_new_bio();
2091                 if (__predict_false(bio == NULL)) {
2092                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2093                         error = ENOMEM;
2094                         return (error);
2095                 }
2096
2097                 bio->bio_cmd     = BIO_FLUSH;
2098                 bio->bio_flags  |= BIO_ORDERED;
2099                 bio->bio_dev     = dev_data->cdev;
2100                 bio->bio_offset  = 0;
2101                 bio->bio_data    = 0;
2102                 bio->bio_done    = xbb_bio_done;
2103                 bio->bio_caller1 = reqlist;
2104                 bio->bio_pblkno  = 0;
2105
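                     /*
                      * A flush is issued as a single bio, so the request
                      * list completes on its one xbb_bio_done() callback.
                      */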
2106                 reqlist->pendcnt = 1;
2107
2108                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2109                            device_get_unit(xbb->dev));
2110
2111                 (*dev_data->csw->d_strategy)(bio);
2112
2113                 return (0);
2114         }
2115
2116         xbb_sg = xbb->xbb_sgs;
2117         bio    = NULL;
2118         nseg = reqlist->nr_segments;
2119
2120         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2121
2122                 /*
2123                  * KVA will not be contiguous, so any additional
2124                  * I/O will need to be represented in a new bio.
2125                  */
2126                 if ((bio != NULL)
2127                  && (xbb_sg->first_sect != 0)) {
2128                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2129                                 printf("%s: Discontiguous I/O request "
2130                                        "from domain %d ends on "
2131                                        "non-sector boundary\n",
2132                                        __func__, xbb->otherend_id);
2133                                 error = EINVAL;
2134                                 goto fail_free_bios;
2135                         }
2136                         bio = NULL;
2137                 }
2138
2139                 if (bio == NULL) {
2140                         /*
2141                          * Make sure that the start of this bio is
2142                          * aligned to a device sector.
2143                          */
2144                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2145                                 printf("%s: Misaligned I/O request "
2146                                        "from domain %d\n", __func__,
2147                                        xbb->otherend_id);
2148                                 error = EINVAL;
2149                                 goto fail_free_bios;
2150                         }
2151
2152                         bio = bios[nbio++] = g_new_bio();
2153                         if (__predict_false(bio == NULL)) {
2154                                 error = ENOMEM;
2155                                 goto fail_free_bios;
2156                         }
2157                         bio->bio_cmd     = operation;
2158                         bio->bio_flags  |= bio_flags;
2159                         bio->bio_dev     = dev_data->cdev;
2160                         bio->bio_offset  = bio_offset;
2161                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2162                                                 xbb_sg->first_sect);
2163                         bio->bio_done    = xbb_bio_done;
2164                         bio->bio_caller1 = reqlist;
2165                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2166                 }
2167
2168                 bio->bio_length += xbb_sg->nsect << 9;
2169                 bio->bio_bcount  = bio->bio_length;
2170                 bio_offset      += xbb_sg->nsect << 9;
2171
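                     /*
                      * (PAGE_SIZE - 512) >> 9 is the index of the last
                      * 512b sector in a page (7 for 4096 byte pages).
                      */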
2172                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2173
2174                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2175                                 printf("%s: Discontiguous I/O request "
2176                                        "from domain %d ends on "
2177                                        "non-sector boundary\n",
2178                                        __func__, xbb->otherend_id);
2179                                 error = EINVAL;
2180                                 goto fail_free_bios;
2181                         }
2182                         /*
2183                          * KVA will not be contiguous, so any additional
2184                          * I/O will need to be represented in a new bio.
2185                          */
2186                         bio = NULL;
2187                 }
2188         }
2189
2190         reqlist->pendcnt = nbio;
2191
2192         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2193         {
2194 #ifdef XBB_USE_BOUNCE_BUFFERS
2195                 vm_offset_t kva_offset;
2196
2197                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2198                            - (vm_offset_t)reqlist->bounce;
2199                 if (operation == BIO_WRITE) {
2200                         memcpy(bios[bio_idx]->bio_data,
2201                                (uint8_t *)reqlist->kva + kva_offset,
2202                                bios[bio_idx]->bio_bcount);
2203                 }
2204 #endif
2205                 if (operation == BIO_READ) {
2206                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2207                                    device_get_unit(xbb->dev),
2208                                    bios[bio_idx]->bio_offset,
2209                                    bios[bio_idx]->bio_length);
2210                 } else if (operation == BIO_WRITE) {
2211                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2212                                    device_get_unit(xbb->dev),
2213                                    bios[bio_idx]->bio_offset,
2214                                    bios[bio_idx]->bio_length);
2215                 }
2216                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2217         }
2218
2219         return (error);
2220
2221 fail_free_bios:
2222         for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2223                 g_destroy_bio(bios[bio_idx]);
2224         
2225         return (error);
2226 }
2227
2228 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
2229 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
2230                   "uint64_t");
2231 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
2232                   "uint64_t", "uint64_t");
2233
2234 /**
2235  * Backend handler for file access.
2236  *
2237  * \param xbb        Per-instance xbb configuration structure.
2238  * \param reqlist    Allocated internal request list.
2239  * \param operation  BIO_* I/O operation code.
2240  * \param flags      Additional bio_flag data to pass to any generated bios
2241  *                   (e.g. BIO_ORDERED).
2242  *
2243  * \return  0 for success, errno codes for failure.
2244  */
2245 static int
2246 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2247                   int operation, int flags)
2248 {
2249         struct xbb_file_data *file_data;
2250         u_int                 seg_idx;
2251         u_int                 nseg;
2252         struct uio            xuio;
2253         struct xbb_sg        *xbb_sg;
2254         struct iovec         *xiovec;
2255 #ifdef XBB_USE_BOUNCE_BUFFERS
2256         void                **p_vaddr;
2257         int                   saved_uio_iovcnt;
2258 #endif /* XBB_USE_BOUNCE_BUFFERS */
2259         int                   error;
2260
2261         file_data = &xbb->backend.file;
2262         error = 0;
2263         bzero(&xuio, sizeof(xuio));
2264
2265         switch (operation) {
2266         case BIO_READ:
2267                 xuio.uio_rw = UIO_READ;
2268                 break;
2269         case BIO_WRITE:
2270                 xuio.uio_rw = UIO_WRITE;
2271                 break;
2272         case BIO_FLUSH: {
2273                 struct mount *mountpoint;
2274
2275                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2276                            device_get_unit(xbb->dev));
2277
2278                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2279
2280                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2281                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2282                 VOP_UNLOCK(xbb->vn, 0);
2283
2284                 vn_finished_write(mountpoint);
2285
2286                 goto bailout_send_response;
2287                 /* NOTREACHED */
2288         }
2289         default:
2290                 panic("invalid operation %d", operation);
2291                 /* NOTREACHED */
2292         }
2293         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2294                         << xbb->sector_size_shift;
2295         xuio.uio_segflg = UIO_SYSSPACE;
2296         xuio.uio_iov = file_data->xiovecs;
2297         xuio.uio_iovcnt = 0;
2298         xbb_sg = xbb->xbb_sgs;
2299         nseg = reqlist->nr_segments;
2300
2301         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2302
2303                 /*
2304                  * If the first sector is not 0, the KVA will not be
2305                  * contiguous with the previous segment, so we need
2306                  * to start a new iovec.
2307                  */
2308                 if (xbb_sg->first_sect != 0)
2309                         xiovec = NULL;
2310
2311                 if (xiovec == NULL) {
2312                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2313                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2314                             seg_idx, xbb_sg->first_sect);
2315 #ifdef XBB_USE_BOUNCE_BUFFERS
2316                         /*
2317                          * Store the address of the incoming
2318                          * buffer at this particular offset
2319                          * as well, so we can do the copy
2320                          * later without having to do more
2321                          * work to recalculate this address.
2322                          */
2323                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2324                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2325                             xbb_sg->first_sect);
2326 #endif /* XBB_USE_BOUNCE_BUFFERS */
2327                         xiovec->iov_len = 0;
2328                         xuio.uio_iovcnt++;
2329                 }
2330
2331                 xiovec->iov_len += xbb_sg->nsect << 9;
2332
2333                 xuio.uio_resid += xbb_sg->nsect << 9;
2334
2335                 /*
2336                  * If the last sector is not the full page
2337                  * size count, the next segment will not be
2338                  * contiguous in KVA and we need a new iovec.
2339                  */
2340                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2341                         xiovec = NULL;
2342         }
2343
2344         xuio.uio_td = curthread;
2345
2346 #ifdef XBB_USE_BOUNCE_BUFFERS
2347         saved_uio_iovcnt = xuio.uio_iovcnt;
2348
2349         if (operation == BIO_WRITE) {
2350                 /* Copy the write data to the local buffer. */
2351                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2352                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2353                      seg_idx++, xiovec++, p_vaddr++) {
2354
2355                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2356                 }
2357         } else {
2358                 /*
2359                  * We only need to save off the iovecs in the case of a
2360                  * read, because the copy for the read happens after the
2361                  * VOP_READ().  (The uio will get modified in that call
2362                  * sequence.)
2363                  */
2364                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2365                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2366         }
2367 #endif /* XBB_USE_BOUNCE_BUFFERS */
2368
2369         switch (operation) {
2370         case BIO_READ:
2371
2372                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2373                            device_get_unit(xbb->dev), xuio.uio_offset,
2374                            xuio.uio_resid);
2375
2376                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2377
2378                 /*
2379                  * UFS pays attention to IO_DIRECT for reads.  If the
2380                  * DIRECTIO option is configured into the kernel, it calls
2381                  * ffs_rawread().  But that only works for single-segment
2382                  * uios with user space addresses.  In our case, with a
2383                  * kernel uio, it still reads into the buffer cache, but it
2384                  * will just try to release the buffer from the cache later
2385                  * on in ffs_read().
2386                  *
2387                  * ZFS does not pay attention to IO_DIRECT for reads.
2388                  *
2389                  * UFS does not pay attention to IO_SYNC for reads.
2390                  *
2391                  * ZFS pays attention to IO_SYNC (which translates into the
2392                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2393                  * attempts to sync the file before reading.
2394                  *
2395                  * So, to attempt to provide some barrier semantics in the
2396                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2397                  */
2398                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2399                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2400
2401                 VOP_UNLOCK(xbb->vn, 0);
2402                 break;
2403         case BIO_WRITE: {
2404                 struct mount *mountpoint;
2405
2406                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2407                            device_get_unit(xbb->dev), xuio.uio_offset,
2408                            xuio.uio_resid);
2409
2410                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2411
2412                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2413
2414                 /*
2415                  * UFS pays attention to IO_DIRECT for writes.  The write
2416                  * is done asynchronously.  (Normally the write would just
2417                  * get put into the cache.)
2418                  *
2419                  * UFS pays attention to IO_SYNC for writes.  It will
2420                  * attempt to write the buffer out synchronously if that
2421                  * flag is set.
2422                  *
2423                  * ZFS does not pay attention to IO_DIRECT for writes.
2424                  *
2425                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2426                  * for writes.  It will flush the transaction from the
2427                  * cache before returning.
2428                  *
2429                  * So if we've got the BIO_ORDERED flag set, we want
2430                  * IO_SYNC in either the UFS or ZFS case.
2431                  */
2432                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2433                                   IO_SYNC : 0, file_data->cred);
2434                 VOP_UNLOCK(xbb->vn, 0);
2435
2436                 vn_finished_write(mountpoint);
2437
2438                 break;
2439         }
2440         default:
2441                 panic("invalid operation %d", operation);
2442                 /* NOTREACHED */
2443         }
2444
2445 #ifdef XBB_USE_BOUNCE_BUFFERS
2446         /* We only need to copy here for read operations */
2447         if (operation == BIO_READ) {
2448
2449                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2450                      xiovec = file_data->saved_xiovecs;
2451                      seg_idx < saved_uio_iovcnt; seg_idx++,
2452                      xiovec++, p_vaddr++) {
2453
2454                         /*
2455                          * Note that we have to use the copy of the 
2456                          * io vector we made above.  uiomove() modifies
2457                          * the uio and its referenced vector as uiomove
2458                          * performs the copy, so we can't rely on any
2459                          * state from the original uio.
2460                          */
2461                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2462                 }
2463         }
2464 #endif /* XBB_USE_BOUNCE_BUFFERS */
2465
2466 bailout_send_response:
2467
2468         if (error != 0)
2469                 reqlist->status = BLKIF_RSP_ERROR;
2470
2471         xbb_complete_reqlist(xbb, reqlist);
2472
2473         return (0);
2474 }
2475
2476 /*--------------------------- Backend Configuration --------------------------*/
2477 /**
2478  * Close and clean up any backend device/file-specific state for this
2479  * block back instance.
2480  *
2481  * \param xbb  Per-instance xbb configuration structure.
2482  */
2483 static void
2484 xbb_close_backend(struct xbb_softc *xbb)
2485 {
2486         DROP_GIANT();
2487         DPRINTF("closing dev=%s\n", xbb->dev_name);
2488         if (xbb->vn) {
2489                 int flags = FREAD;
2490
2491                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2492                         flags |= FWRITE;
2493
2494                 switch (xbb->device_type) {
2495                 case XBB_TYPE_DISK:
2496                         if (xbb->backend.dev.csw) {
2497                                 dev_relthread(xbb->backend.dev.cdev,
2498                                               xbb->backend.dev.dev_ref);
2499                                 xbb->backend.dev.csw  = NULL;
2500                                 xbb->backend.dev.cdev = NULL;
2501                         }
2502                         break;
2503                 case XBB_TYPE_FILE:
2504                         break;
2505                 case XBB_TYPE_NONE:
2506                 default:
2507                         panic("Unexpected backend type.");
2508                         break;
2509                 }
2510
2511                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2512                 xbb->vn = NULL;
2513
2514                 switch (xbb->device_type) {
2515                 case XBB_TYPE_DISK:
2516                         break;
2517                 case XBB_TYPE_FILE:
2518                         if (xbb->backend.file.cred != NULL) {
2519                                 crfree(xbb->backend.file.cred);
2520                                 xbb->backend.file.cred = NULL;
2521                         }
2522                         break;
2523                 case XBB_TYPE_NONE:
2524                 default:
2525                         panic("Unexpected backend type.");
2526                         break;
2527                 }
2528         }
2529         PICKUP_GIANT();
2530 }
2531
2532 /**
2533  * Open a character device to be used for backend I/O.
2534  *
2535  * \param xbb  Per-instance xbb configuration structure.
2536  *
2537  * \return  0 for success, errno codes for failure.
2538  */
2539 static int
2540 xbb_open_dev(struct xbb_softc *xbb)
2541 {
2542         struct vattr   vattr;
2543         struct cdev   *dev;
2544         struct cdevsw *devsw;
2545         int            error;
2546
2547         xbb->device_type = XBB_TYPE_DISK;
2548         xbb->dispatch_io = xbb_dispatch_dev;
2549         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2550         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2551                                              &xbb->backend.dev.dev_ref);
2552         if (xbb->backend.dev.csw == NULL)
2553                 panic("Unable to retrieve device switch");
2554
2555         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2556         if (error) {
2557                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2558                                  "vnode attributes for device %s",
2559                                  xbb->dev_name);
2560                 return (error);
2561         }
2562
2563
2564         dev = xbb->vn->v_rdev;
2565         devsw = dev->si_devsw;
2566         if (!devsw->d_ioctl) {
2567                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2568                                  "device %s!", xbb->dev_name);
2569                 return (ENODEV);
2570         }
2571
2572         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2573                                (caddr_t)&xbb->sector_size, FREAD,
2574                                curthread);
2575         if (error) {
2576                 xenbus_dev_fatal(xbb->dev, error,
2577                                  "error calling ioctl DIOCGSECTORSIZE "
2578                                  "for device %s", xbb->dev_name);
2579                 return (error);
2580         }
2581
2582         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2583                                (caddr_t)&xbb->media_size, FREAD,
2584                                curthread);
2585         if (error) {
2586                 xenbus_dev_fatal(xbb->dev, error,
2587                                  "error calling ioctl DIOCGMEDIASIZE "
2588                                  "for device %s", xbb->dev_name);
2589                 return (error);
2590         }
2591
2592         return (0);
2593 }
2594
2595 /**
2596  * Open a file to be used for backend I/O.
2597  *
2598  * \param xbb  Per-instance xbb configuration structure.
2599  *
2600  * \return  0 for success, errno codes for failure.
2601  */
2602 static int
2603 xbb_open_file(struct xbb_softc *xbb)
2604 {
2605         struct xbb_file_data *file_data;
2606         struct vattr          vattr;
2607         int                   error;
2608
2609         file_data = &xbb->backend.file;
2610         xbb->device_type = XBB_TYPE_FILE;
2611         xbb->dispatch_io = xbb_dispatch_file;
2612         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2613         if (error != 0) {
2614                 xenbus_dev_fatal(xbb->dev, error,
2615                                  "error calling VOP_GETATTR() "
2616                                  "for file %s", xbb->dev_name);
2617                 return (error);
2618         }
2619
2620         /*
2621          * Verify that we have the ability to upgrade to exclusive
2622          * access on this file so we can trap errors at open instead
2623          * of reporting them during first access.
2624          */
2625         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2626                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2627                 if (xbb->vn->v_iflag & VI_DOOMED) {
2628                         error = EBADF;
2629                         xenbus_dev_fatal(xbb->dev, error,
2630                                          "error locking file %s",
2631                                          xbb->dev_name);
2632
2633                         return (error);
2634                 }
2635         }
2636
2637         file_data->cred = crhold(curthread->td_ucred);
2638         xbb->media_size = vattr.va_size;
2639
2640         /*
2641          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2642          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2643          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2644          * may not work with other OSes as well.  So just export a sector
2645          * size of 512 bytes, which should work with any OS or
2646          * application.  Since our backing is a file, any block size will
2647          * work fine for the backing store.
2648          */
2649 #if 0
2650         xbb->sector_size = vattr.va_blocksize;
2651 #endif
2652         xbb->sector_size = 512;
2653
2654         /*
2655          * Sanity check.  The media size has to be at least one
2656          * sector long.
2657          */
2658         if (xbb->media_size < xbb->sector_size) {
2659                 error = EINVAL;
2660                 xenbus_dev_fatal(xbb->dev, error,
2661                                  "file %s size %ju < block size %u",
2662                                  xbb->dev_name,
2663                                  (uintmax_t)xbb->media_size,
2664                                  xbb->sector_size);
2665         }
2666         return (error);
2667 }
2668
2669 /**
2670  * Open the backend provider for this connection.
2671  *
2672  * \param xbb  Per-instance xbb configuration structure.
2673  *
2674  * \return  0 for success, errno codes for failure.
2675  */
2676 static int
2677 xbb_open_backend(struct xbb_softc *xbb)
2678 {
2679         struct nameidata nd;
2680         int              flags;
2681         int              error;
2682
2683         flags = FREAD;
2684         error = 0;
2685
2686         DPRINTF("opening dev=%s\n", xbb->dev_name);
2687
2688         if (rootvnode == NULL) {
2689                 xenbus_dev_fatal(xbb->dev, ENOENT,
2690                                  "Root file system not mounted");
2691                 return (ENOENT);
2692         }
2693
2694         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2695                 flags |= FWRITE;
2696
2697         pwd_ensure_dirs();
2698
2699  again:
2700         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2701         error = vn_open(&nd, &flags, 0, NULL);
2702         if (error) {
2703                 /*
2704                  * This is the only reasonable guess we can make as far as
2705                  * path if the user doesn't give us a fully qualified path.
2706                  * If they want to specify a file, they need to specify the
2707                  * full path.
2708                  */
2709                 if (xbb->dev_name[0] != '/') {
2710                         char *dev_path = "/dev/";
2711                         char *dev_name;
2712
2713                         /* Try adding device path at beginning of name */
2714                         dev_name = malloc(strlen(xbb->dev_name)
2715                                         + strlen(dev_path) + 1,
2716                                           M_XENBLOCKBACK, M_NOWAIT);
2717                         if (dev_name) {
2718                                 sprintf(dev_name, "%s%s", dev_path,
2719                                         xbb->dev_name);
2720                                 free(xbb->dev_name, M_XENBLOCKBACK);
2721                                 xbb->dev_name = dev_name;
2722                                 goto again;
2723                         }
2724                 }
2725                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2726                                  xbb->dev_name);
2727                 return (error);
2728         }
2729
2730         NDFREE(&nd, NDF_ONLY_PNBUF);
2731                 
2732         xbb->vn = nd.ni_vp;
2733
2734         /* We only support disks and files. */
2735         if (vn_isdisk(xbb->vn, &error)) {
2736                 error = xbb_open_dev(xbb);
2737         } else if (xbb->vn->v_type == VREG) {
2738                 error = xbb_open_file(xbb);
2739         } else {
2740                 error = EINVAL;
2741                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2742                                  "or file", xbb->dev_name);
2743         }
2744         VOP_UNLOCK(xbb->vn, 0);
2745
2746         if (error != 0) {
2747                 xbb_close_backend(xbb);
2748                 return (error);
2749         }
2750
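             /*
              * fls() returns the 1-based index of the highest set bit, so
              * for a power-of-two sector size this yields log2(sector_size)
              * (9 for 512 byte sectors, 12 for 4096 byte sectors).
              */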
2751         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2752         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2753
2754         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2755                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2756                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2757
2758         return (0);
2759 }
2760
2761 /*------------------------ Inter-Domain Communication ------------------------*/
2762 /**
2763  * Free dynamically allocated KVA or pseudo-physical address allocations.
2764  *
2765  * \param xbb  Per-instance xbb configuration structure.
2766  */
2767 static void
2768 xbb_free_communication_mem(struct xbb_softc *xbb)
2769 {
2770         if (xbb->kva != 0) {
2771                 if (xbb->pseudo_phys_res != NULL) {
2772                         xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
2773                             xbb->pseudo_phys_res);
2774                         xbb->pseudo_phys_res = NULL;
2775                 }
2776         }
2777         xbb->kva = 0;
2778         xbb->gnt_base_addr = 0;
2779         if (xbb->kva_free != NULL) {
2780                 free(xbb->kva_free, M_XENBLOCKBACK);
2781                 xbb->kva_free = NULL;
2782         }
2783 }
2784
2785 /**
2786  * Clean up all inter-domain communication mechanisms.
2787  *
2788  * \param xbb  Per-instance xbb configuration structure.
2789  */
2790 static int
2791 xbb_disconnect(struct xbb_softc *xbb)
2792 {
2793         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2794         struct gnttab_unmap_grant_ref *op;
2795         u_int                          ring_idx;
2796         int                            error;
2797
2798         DPRINTF("\n");
2799
2800         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2801                 return (0);
2802
2803         xen_intr_unbind(&xbb->xen_intr_handle);
2804
2805         mtx_unlock(&xbb->lock);
2806         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2807         mtx_lock(&xbb->lock);
2808
2809         /*
2810          * No new interrupts can generate work, but we must wait
2811          * for all currently active requests to drain.
2812          */
2813         if (xbb->active_request_count != 0)
2814                 return (EAGAIN);
2815         
2816         for (ring_idx = 0, op = ops;
2817              ring_idx < xbb->ring_config.ring_pages;
2818              ring_idx++, op++) {
2819
2820                 op->host_addr    = xbb->ring_config.gnt_addr
2821                                  + (ring_idx * PAGE_SIZE);
2822                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2823                 op->handle       = xbb->ring_config.handle[ring_idx];
2824         }
2825
2826         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2827                                           xbb->ring_config.ring_pages);
2828         if (error != 0)
2829                 panic("Grant table op failed (%d)", error);
2830
2831         xbb_free_communication_mem(xbb);
2832
2833         if (xbb->requests != NULL) {
2834                 free(xbb->requests, M_XENBLOCKBACK);
2835                 xbb->requests = NULL;
2836         }
2837
2838         if (xbb->request_lists != NULL) {
2839                 struct xbb_xen_reqlist *reqlist;
2840                 int i;
2841
2842                 /* There is one request list for every allocated request. */
2843                 for (i = 0, reqlist = xbb->request_lists;
2844                      i < xbb->max_requests; i++, reqlist++){
2845 #ifdef XBB_USE_BOUNCE_BUFFERS
2846                         if (reqlist->bounce != NULL) {
2847                                 free(reqlist->bounce, M_XENBLOCKBACK);
2848                                 reqlist->bounce = NULL;
2849                         }
2850 #endif
2851                         if (reqlist->gnt_handles != NULL) {
2852                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2853                                 reqlist->gnt_handles = NULL;
2854                         }
2855                 }
2856                 free(xbb->request_lists, M_XENBLOCKBACK);
2857                 xbb->request_lists = NULL;
2858         }
2859
2860         xbb->flags &= ~XBBF_RING_CONNECTED;
2861         return (0);
2862 }
2863
2864 /**
2865  * Map shared memory ring into domain local address space, initialize
2866  * ring control structures, and bind an interrupt to the event channel
2867  * used to notify us of ring changes.
2868  *
2869  * \param xbb  Per-instance xbb configuration structure.
2870  */
2871 static int
2872 xbb_connect_ring(struct xbb_softc *xbb)
2873 {
2874         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2875         struct gnttab_map_grant_ref *gnt;
2876         u_int                        ring_idx;
2877         int                          error;
2878
2879         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2880                 return (0);
2881
2882         /*
2883          * KVA for our ring is at the tail of the region of KVA allocated
2884          * by xbb_alloc_communication_mem().
2885          */
2886         xbb->ring_config.va = xbb->kva
2887                             + (xbb->kva_size
2888                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2889         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2890                                   + (xbb->kva_size
2891                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2892
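        /*
         * Build one map operation for each ring page grant reference
         * collected from the front end.
         */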
2893         for (ring_idx = 0, gnt = gnts;
2894              ring_idx < xbb->ring_config.ring_pages;
2895              ring_idx++, gnt++) {
2896
2897                 gnt->host_addr = xbb->ring_config.gnt_addr
2898                                + (ring_idx * PAGE_SIZE);
2899                 gnt->flags     = GNTMAP_host_map;
2900                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2901                 gnt->dom       = xbb->otherend_id;
2902         }
2903
2904         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2905                                           xbb->ring_config.ring_pages);
2906         if (error)
2907                 panic("blkback: Ring page grant table op failed (%d)", error);
2908
2909         for (ring_idx = 0, gnt = gnts;
2910              ring_idx < xbb->ring_config.ring_pages;
2911              ring_idx++, gnt++) {
2912                 if (gnt->status != 0) {
2913                         xbb->ring_config.va = 0;
2914                         xenbus_dev_fatal(xbb->dev, EACCES,
2915                                          "Ring shared page mapping failed. "
2916                                          "Status %d.", gnt->status);
2917                         return (EACCES);
2918                 }
2919                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2920                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2921         }
2922
2923         /* Initialize the ring based on ABI. */
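        /*
         * All ABIs share the same ring pages mapped above; they differ
         * only in the layout of the request/response structures, so each
         * case simply overlays its own BACK_RING view on the same memory.
         */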
2924         switch (xbb->abi) {
2925         case BLKIF_PROTOCOL_NATIVE:
2926         {
2927                 blkif_sring_t *sring;
2928                 sring = (blkif_sring_t *)xbb->ring_config.va;
2929                 BACK_RING_INIT(&xbb->rings.native, sring,
2930                                xbb->ring_config.ring_pages * PAGE_SIZE);
2931                 break;
2932         }
2933         case BLKIF_PROTOCOL_X86_32:
2934         {
2935                 blkif_x86_32_sring_t *sring_x86_32;
2936                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2937                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2938                                xbb->ring_config.ring_pages * PAGE_SIZE);
2939                 break;
2940         }
2941         case BLKIF_PROTOCOL_X86_64:
2942         {
2943                 blkif_x86_64_sring_t *sring_x86_64;
2944                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2945                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2946                                xbb->ring_config.ring_pages * PAGE_SIZE);
2947                 break;
2948         }
2949         default:
2950                 panic("Unexpected blkif protocol ABI.");
2951         }
2952
2953         xbb->flags |= XBBF_RING_CONNECTED;
2954
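        /*
         * Bind the event channel published by the front end so ring
         * notifications are delivered to xbb_filter().
         */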
2955         error = xen_intr_bind_remote_port(xbb->dev,
2956                                           xbb->otherend_id,
2957                                           xbb->ring_config.evtchn,
2958                                           xbb_filter,
2959                                           /*ithread_handler*/NULL,
2960                                           /*arg*/xbb,
2961                                           INTR_TYPE_BIO | INTR_MPSAFE,
2962                                           &xbb->xen_intr_handle);
2963         if (error) {
2964                 (void)xbb_disconnect(xbb);
2965                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2966                 return (error);
2967         }
2968
2969         DPRINTF("rings connected!\n");
2970
2971         return 0;
2972 }
2973
2974 /**
2975  * Size KVA and pseudo-physical address allocations based on negotiated
2976  * values for the size and number of I/O requests, and the size of our
2977  * communication ring.
2978  *
2979  * \param xbb  Per-instance xbb configuration structure.
2980  *
2981  * These address spaces are used to dynamically map pages in the
2982  * front-end's domain into our own.
2983  */
2984 static int
2985 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2986 {
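        /*
         * Sizing sketch: one page of KVA for every segment of every
         * possible in-flight request, plus the shared ring pages, which
         * are mapped at the tail of the region (see xbb_connect_ring()).
         */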
2987         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2988         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2989         xbb->kva_size = xbb->reqlist_kva_size +
2990                         (xbb->ring_config.ring_pages * PAGE_SIZE);
2991
2992         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
2993         if (xbb->kva_free == NULL)
2994                 return (ENOMEM);
2995
2996         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
2997                 device_get_nameunit(xbb->dev), xbb->kva_size,
2998                 xbb->reqlist_kva_size);
2999         /*
3000          * Reserve a range of pseudo physical memory that we can map
3001          * into kva.  These pages will only be backed by machine
3002          * pages ("real memory") during the lifetime of front-end requests
3003          * via grant table operations.
3004          */
3005         xbb->pseudo_phys_res_id = 0;
3006         xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
3007             xbb->kva_size);
3008         if (xbb->pseudo_phys_res == NULL) {
3009                 xbb->kva = 0;
3010                 return (ENOMEM);
3011         }
3012         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3013         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3014
3015         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3016                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3017                 (uintmax_t)xbb->gnt_base_addr); 
3018         return (0);
3019 }
3020
3021 /**
3022  * Collect front-end information from the XenStore.
3023  *
3024  * \param xbb  Per-instance xbb configuration structure.
3025  */
3026 static int
3027 xbb_collect_frontend_info(struct xbb_softc *xbb)
3028 {
3029         char        protocol_abi[64];
3030         const char *otherend_path;
3031         int         error;
3032         u_int       ring_idx;
3033         u_int       ring_page_order;
3034         size_t      ring_size;
3035
3036         otherend_path = xenbus_get_otherend_path(xbb->dev);
3037
3038         /*
3039          * Protocol defaults that remain valid even if all negotiation fails.
3040          */
3041         xbb->ring_config.ring_pages = 1;
3042         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_REQUEST;
3043         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
3044
3045         /*
3046          * Mandatory data (used in all versions of the protocol) first.
3047          */
3048         error = xs_scanf(XST_NIL, otherend_path,
3049                          "event-channel", NULL, "%" PRIu32,
3050                          &xbb->ring_config.evtchn);
3051         if (error != 0) {
3052                 xenbus_dev_fatal(xbb->dev, error,
3053                                  "Unable to retrieve event-channel information "
3054                                  "from frontend %s.  Unable to connect.",
3055                                  xenbus_get_otherend_path(xbb->dev));
3056                 return (error);
3057         }
3058
3059         /*
3060          * These fields are initialized to legacy protocol defaults
3061          * so we only need to fail if reading the updated value succeeds
3062          * and the new value is outside of its allowed range.
3063          *
3064          * \note xs_gather() returns on the first encountered error, so
3065          *       we must use independent calls in order to guarantee
3066          *       we don't miss information in a sparsely populated front-end
3067          *       tree.
3068          *
3069          * \note xs_scanf() does not update variables for unmatched
3070          *       fields.
3071          */
3072         ring_page_order = 0;
3073         xbb->max_requests = 32;
3074
3075         (void)xs_scanf(XST_NIL, otherend_path,
3076                        "ring-page-order", NULL, "%u",
3077                        &ring_page_order);
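        /*
         * ring-page-order is a power-of-two exponent; e.g. an order of 2
         * requests a 4 page shared ring.
         */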
3078         xbb->ring_config.ring_pages = 1 << ring_page_order;
3079         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3080         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3081
3082         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3083                 xenbus_dev_fatal(xbb->dev, EINVAL,
3084                                  "Front-end specified ring-pages of %u "
3085                                  "exceeds backend limit of %u.  "
3086                                  "Unable to connect.",
3087                                  xbb->ring_config.ring_pages,
3088                                  XBB_MAX_RING_PAGES);
3089                 return (EINVAL);
3090         }
3091
3092         if (xbb->ring_config.ring_pages == 1) {
3093                 error = xs_gather(XST_NIL, otherend_path,
3094                                   "ring-ref", "%" PRIu32,
3095                                   &xbb->ring_config.ring_ref[0],
3096                                   NULL);
3097                 if (error != 0) {
3098                         xenbus_dev_fatal(xbb->dev, error,
3099                                          "Unable to retrieve ring information "
3100                                          "from frontend %s.  Unable to "
3101                                          "connect.",
3102                                          xenbus_get_otherend_path(xbb->dev));
3103                         return (error);
3104                 }
3105         } else {
3106                 /* Multi-page ring format. */
3107                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3108                      ring_idx++) {
3109                         char ring_ref_name[] = "ring-refXX";
3110
3111                         snprintf(ring_ref_name, sizeof(ring_ref_name),
3112                                  "ring-ref%u", ring_idx);
3113                         error = xs_scanf(XST_NIL, otherend_path,
3114                                          ring_ref_name, NULL, "%" PRIu32,
3115                                          &xbb->ring_config.ring_ref[ring_idx]);
3116                         if (error != 0) {
3117                                 xenbus_dev_fatal(xbb->dev, error,
3118                                                  "Failed to retriev grant "
3119                                                  "reference for page %u of "
3120                                                  "shared ring.  Unable "
3121                                                  "to connect.", ring_idx);
3122                                 return (error);
3123                         }
3124                 }
3125         }
3126
3127         error = xs_gather(XST_NIL, otherend_path,
3128                           "protocol", "%63s", protocol_abi,
3129                           NULL); 
3130         if (error != 0
3131          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3132                 /*
3133                  * Assume native if the frontend has not
3134                  * published ABI data, or what it has published
3135                  * matches our own ABI.
3136                  */
3137                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3138         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3139
3140                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3141         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3142
3143                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3144         } else {
3145
3146                 xenbus_dev_fatal(xbb->dev, EINVAL,
3147                                  "Unknown protocol ABI (%s) published by "
3148                                  "frontend.  Unable to connect.", protocol_abi);
3149                 return (EINVAL);
3150         }
3151         return (0);
3152 }
3153
3154 /**
3155  * Allocate per-request data structures given request size and number
3156  * information negotiated with the front-end.
3157  *
3158  * \param xbb  Per-instance xbb configuration structure.
3159  */
3160 static int
3161 xbb_alloc_requests(struct xbb_softc *xbb)
3162 {
3163         struct xbb_xen_req *req;
3164         struct xbb_xen_req *last_req;
3165
3166         /*
3167          * Allocate request bookkeeping data structures.
3168          */
3169         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3170                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3171         if (xbb->requests == NULL) {
3172                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3173                                   "Unable to allocate request structures");
3174                 return (ENOMEM);
3175         }
3176
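        /* Seed the free list with every request structure allocated above. */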
3177         req      = xbb->requests;
3178         last_req = &xbb->requests[xbb->max_requests - 1];
3179         STAILQ_INIT(&xbb->request_free_stailq);
3180         while (req <= last_req) {
3181                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3182                 req++;
3183         }
3184         return (0);
3185 }
3186
3187 static int
3188 xbb_alloc_request_lists(struct xbb_softc *xbb)
3189 {
3190         struct xbb_xen_reqlist *reqlist;
3191         int                     i;
3192
3193         /*
3194          * If no requests can be merged, we need one request list per
3195          * in-flight request.
3196          */
3197         xbb->request_lists = malloc(xbb->max_requests *
3198                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3199         if (xbb->request_lists == NULL) {
3200                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3201                                   "Unable to allocate request list structures");
3202                 return (ENOMEM);
3203         }
3204
3205         STAILQ_INIT(&xbb->reqlist_free_stailq);
3206         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3207         for (i = 0; i < xbb->max_requests; i++) {
3208                 int seg;
3209
3210                 reqlist      = &xbb->request_lists[i];
3211
3212                 reqlist->xbb = xbb;
3213
3214 #ifdef XBB_USE_BOUNCE_BUFFERS
3215                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3216                                          M_XENBLOCKBACK, M_NOWAIT);
3217                 if (reqlist->bounce == NULL) {
3218                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3219                                          "Unable to allocate request "
3220                                          "bounce buffers");
3221                         return (ENOMEM);
3222                 }
3223 #endif /* XBB_USE_BOUNCE_BUFFERS */
3224
3225                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3226                                               sizeof(*reqlist->gnt_handles),
3227                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3228                 if (reqlist->gnt_handles == NULL) {
3229                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3230                                           "Unable to allocate request "
3231                                           "grant references");
3232                         return (ENOMEM);
3233                 }
3234
3235                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3236                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3237
3238                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3239         }
3240         return (0);
3241 }
3242
3243 /**
3244  * Supply information about the physical device to the frontend
3245  * via XenBus.
3246  *
3247  * \param xbb  Per-instance xbb configuration structure.
3248  */
3249 static int
3250 xbb_publish_backend_info(struct xbb_softc *xbb)
3251 {
3252         struct xs_transaction xst;
3253         const char           *our_path;
3254         const char           *leaf;
3255         int                   error;
3256
3257         our_path = xenbus_get_node(xbb->dev);
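        /*
         * Retry the entire set of writes whenever ending the transaction
         * reports EAGAIN (a conflicting XenStore transaction).
         */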
3258         while (1) {
3259                 error = xs_transaction_start(&xst);
3260                 if (error != 0) {
3261                         xenbus_dev_fatal(xbb->dev, error,
3262                                          "Error publishing backend info "
3263                                          "(start transaction)");
3264                         return (error);
3265                 }
3266
3267                 leaf = "sectors";
3268                 error = xs_printf(xst, our_path, leaf,
3269                                   "%"PRIu64, xbb->media_num_sectors);
3270                 if (error != 0)
3271                         break;
3272
3273                 /* XXX Support all VBD attributes here. */
3274                 leaf = "info";
3275                 error = xs_printf(xst, our_path, leaf, "%u",
3276                                   xbb->flags & XBBF_READ_ONLY
3277                                 ? VDISK_READONLY : 0);
3278                 if (error != 0)
3279                         break;
3280
3281                 leaf = "sector-size";
3282                 error = xs_printf(xst, our_path, leaf, "%u",
3283                                   xbb->sector_size);
3284                 if (error != 0)
3285                         break;
3286
3287                 error = xs_transaction_end(xst, 0);
3288                 if (error == 0) {
3289                         return (0);
3290                 } else if (error != EAGAIN) {
3291                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3292                         return (error);
3293                 }
3294         }
3295
3296         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3297                         our_path, leaf);
3298         xs_transaction_end(xst, 1);
3299         return (error);
3300 }
3301
3302 /**
3303  * Connect to our blkfront peer now that it has completed publishing
3304  * its configuration into the XenStore.
3305  *
3306  * \param xbb  Per-instance xbb configuration structure.
3307  */
3308 static void
3309 xbb_connect(struct xbb_softc *xbb)
3310 {
3311         int error;
3312
3313         if (xenbus_get_state(xbb->dev) != XenbusStateInitialised)
3314                 return;
3315
3316         if (xbb_collect_frontend_info(xbb) != 0)
3317                 return;
3318
3319         xbb->flags &= ~XBBF_SHUTDOWN;
3320
3321         /*
3322          * We limit the maximum number of reqlist segments to the maximum
3323          * number of segments in the ring, or our absolute maximum,
3324          * whichever is smaller.
3325          */
3326         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3327                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3328
3329         /*
3330          * The maximum size is simply a function of the number of segments
3331          * we can handle.
3332          */
3333         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3334
3335         /* Allocate resources whose size depends on front-end configuration. */
3336         error = xbb_alloc_communication_mem(xbb);
3337         if (error != 0) {
3338                 xenbus_dev_fatal(xbb->dev, error,
3339                                  "Unable to allocate communication memory");
3340                 return;
3341         }
3342
3343         error = xbb_alloc_requests(xbb);
3344         if (error != 0) {
3345                 /* Specific errors are reported by xbb_alloc_requests(). */
3346                 return;
3347         }
3348
3349         error = xbb_alloc_request_lists(xbb);
3350         if (error != 0) {
3351                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3352                 return;
3353         }
3354
3355         /*
3356          * Connect communication channel.
3357          */
3358         error = xbb_connect_ring(xbb);
3359         if (error != 0) {
3360                 /* Specific errors are reported by xbb_connect_ring(). */
3361                 return;
3362         }
3363         
3364         if (xbb_publish_backend_info(xbb) != 0) {
3365                 /*
3366                  * If we can't publish our data, we cannot participate
3367                  * in this connection, and waiting for a front-end state
3368                  * change will not help the situation.
3369                  */
3370                 (void)xbb_disconnect(xbb);
3371                 return;
3372         }
3373
3374         /* Ready for I/O. */
3375         xenbus_set_state(xbb->dev, XenbusStateConnected);
3376 }
3377
3378 /*-------------------------- Device Teardown Support -------------------------*/
3379 /**
3380  * Perform device shutdown functions.
3381  *
3382  * \param xbb  Per-instance xbb configuration structure.
3383  *
3384  * Mark this instance as shutting down, wait for any active I/O on the
3385  * backend device/file to drain, disconnect from the front-end, and notify
3386  * any waiters (e.g. a thread invoking our detach method) that detach can
3387  * now proceed.
3388  */
3389 static int
3390 xbb_shutdown(struct xbb_softc *xbb)
3391 {
3392         XenbusState frontState;
3393         int         error;
3394
3395         DPRINTF("\n");
3396
3397         /*
3398          * Due to the need to drop our mutex during some
3399          * xenbus operations, it is possible for two threads
3400          * to attempt to close out shutdown processing at
3401          * the same time.  Tell the caller that hits this
3402          * race to try again later.
3403          */
3404         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3405                 return (EAGAIN);
3406
3407         xbb->flags |= XBBF_IN_SHUTDOWN;
3408         mtx_unlock(&xbb->lock);
3409
3410         if (xbb->hotplug_watch.node != NULL) {
3411                 xs_unregister_watch(&xbb->hotplug_watch);
3412                 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
3413                 xbb->hotplug_watch.node = NULL;
3414         }
3415
3416         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3417                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3418
3419         frontState = xenbus_get_otherend_state(xbb->dev);
3420         mtx_lock(&xbb->lock);
3421         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3422
3423         /* Wait for the frontend to disconnect (if it's connected). */
3424         if (frontState == XenbusStateConnected)
3425                 return (EAGAIN);
3426
3427         DPRINTF("\n");
3428
3429         /* Indicate shutdown is in progress. */
3430         xbb->flags |= XBBF_SHUTDOWN;
3431
3432         /* Disconnect from the front-end. */
3433         error = xbb_disconnect(xbb);
3434         if (error != 0) {
3435                 /*
3436                  * Requests still outstanding.  We'll be called again
3437                  * once they complete.
3438                  */
3439                 KASSERT(error == EAGAIN,
3440                         ("%s: Unexpected xbb_disconnect() failure %d",
3441                          __func__, error));
3442
3443                 return (error);
3444         }
3445
3446         DPRINTF("\n");
3447
3448         /* Indicate to xbb_detach() that it is safe to proceed. */
3449         wakeup(xbb);
3450
3451         return (0);
3452 }
3453
3454 /**
3455  * Report an attach time error to the console and Xen, and cleanup
3456  * this instance by forcing immediate detach processing.
3457  *
3458  * \param xbb  Per-instance xbb configuration structure.
3459  * \param err  Errno describing the error.
3460  * \param fmt  Printf style format and arguments
3461  */
3462 static void
3463 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3464 {
3465         va_list ap;
3466         va_list ap_hotplug;
3467
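        /*
         * The argument list is consumed twice (once for the XenStore
         * hotplug-error node and once by xenbus_dev_vfatal()), so work
         * from a copy.
         */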
3468         va_start(ap, fmt);
3469         va_copy(ap_hotplug, ap);
3470         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3471                   "hotplug-error", fmt, ap_hotplug);
3472         va_end(ap_hotplug);
3473         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3474                   "hotplug-status", "error");
3475
3476         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3477         va_end(ap);
3478
3479         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3480                   "online", "0");
3481         mtx_lock(&xbb->lock);
3482         xbb_shutdown(xbb);
3483         mtx_unlock(&xbb->lock);
3484 }
3485
3486 /*---------------------------- NewBus Entrypoints ----------------------------*/
3487 /**
3488  * Inspect a XenBus device and claim it if it is of the appropriate type.
3489  * 
3490  * \param dev  NewBus device object representing a candidate XenBus device.
3491  *
3492  * \return  0 for success, errno codes for failure.
3493  */
3494 static int
3495 xbb_probe(device_t dev)
3496 {
3497 {
3498         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3499                 device_set_desc(dev, "Backend Virtual Block Device");
3500                 device_quiet(dev);
3501                 return (0);
3502         }
3503
3504         return (ENXIO);
3505 }
3506
3507 /**
3508  * Setup sysctl variables to control various Block Back parameters.
3509  *
3510  * \param xbb  Xen Block Back softc.
3511  *
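 * The variables appear under the device's sysctl tree, e.g.
 * dev.xbbd.0.disable_flush for unit 0.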
3512  */
3513 static void
3514 xbb_setup_sysctl(struct xbb_softc *xbb)
3515 {
3516         struct sysctl_ctx_list *sysctl_ctx = NULL;
3517         struct sysctl_oid      *sysctl_tree = NULL;
3518         
3519         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3520         if (sysctl_ctx == NULL)
3521                 return;
3522
3523         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3524         if (sysctl_tree == NULL)
3525                 return;
3526
3527         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3528                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3529                        "fake the flush command");
3530
3531         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3532                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3533                        "send a real flush for N flush requests");
3534
3535         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3536                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3537                        "Don't coalesce contiguous requests");
3538
3539         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3540                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3541                          "how many I/O requests we have received");
3542
3543         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3544                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3545                          "how many I/O requests have been completed");
3546
3547         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3548                          "reqs_queued_for_completion", CTLFLAG_RW,
3549                          &xbb->reqs_queued_for_completion,
3550                          "how many I/O requests queued but not yet pushed");
3551
3552         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3553                          "reqs_completed_with_error", CTLFLAG_RW,
3554                          &xbb->reqs_completed_with_error,
3555                          "how many I/O requests completed with error status");
3556
3557         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3558                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3559                          "how many I/O dispatches were forced");
3560
3561         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3562                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3563                          "how many I/O dispatches were normal");
3564
3565         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3566                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3567                          "total number of I/O dispatches");
3568
3569         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3570                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3571                          "how many times we have run out of KVA");
3572
3573         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3574                          "request_shortages", CTLFLAG_RW,
3575                          &xbb->request_shortages,
3576                          "how many times we have run out of requests");
3577
3578         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3579                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3580                         "maximum outstanding requests (negotiated)");
3581
3582         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3583                         "max_request_segments", CTLFLAG_RD,
3584                         &xbb->max_request_segments, 0,
3585                         "maximum number of pages per requests (negotiated)");
3586
3587         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3588                         "max_request_size", CTLFLAG_RD,
3589                         &xbb->max_request_size, 0,
3590                         "maximum size in bytes of a request (negotiated)");
3591
3592         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3593                         "ring_pages", CTLFLAG_RD,
3594                         &xbb->ring_config.ring_pages, 0,
3595                         "communication channel pages (negotiated)");
3596 }
3597
3598 static void
3599 xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len)
3600 {
3601         device_t                 dev;
3602         struct xbb_softc        *xbb;
3603         int                      error;
3604
3605         dev = (device_t) watch->callback_data;
3606         xbb = device_get_softc(dev);
3607
3608         error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path",
3609             NULL, &xbb->dev_name, NULL);
3610         if (error != 0)
3611                 return;
3612
3613         xs_unregister_watch(watch);
3614         free(watch->node, M_XENBLOCKBACK);
3615         watch->node = NULL;
3616
3617         /* Collect physical device information. */
3618         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3619                           "device-type", NULL, &xbb->dev_type,
3620                           NULL);
3621         if (error != 0)
3622                 xbb->dev_type = NULL;
3623
3624         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3625                           "mode", NULL, &xbb->dev_mode,
3626                           NULL);
3627         if (error != 0) {
3628                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3629                                   xenbus_get_node(dev));
3630                 return;
3631         }
3632
3633         /* Parse fopen style mode flags. */
3634         if (strchr(xbb->dev_mode, 'w') == NULL)
3635                 xbb->flags |= XBBF_READ_ONLY;
3636
3637         /*
3638          * Verify the physical device is present and can support
3639          * the desired I/O mode.
3640          */
3641         error = xbb_open_backend(xbb);
3642         if (error != 0) {
3643                 xbb_attach_failed(xbb, error, "Unable to open %s",
3644                                   xbb->dev_name);
3645                 return;
3646         }
3647
3648         /* Use devstat(9) for recording statistics. */
3649         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3650                                            xbb->sector_size,
3651                                            DEVSTAT_ALL_SUPPORTED,
3652                                            DEVSTAT_TYPE_DIRECT
3653                                          | DEVSTAT_TYPE_IF_OTHER,
3654                                            DEVSTAT_PRIORITY_OTHER);
3655
3656         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3657                                               xbb->sector_size,
3658                                               DEVSTAT_ALL_SUPPORTED,
3659                                               DEVSTAT_TYPE_DIRECT
3660                                             | DEVSTAT_TYPE_IF_OTHER,
3661                                               DEVSTAT_PRIORITY_OTHER);
3662         /*
3663          * Setup sysctl variables.
3664          */
3665         xbb_setup_sysctl(xbb);
3666
3667         /*
3668          * Create a taskqueue for doing work that must occur from a
3669          * thread context.
3670          */
3671         xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
3672                                                   M_NOWAIT,
3673                                                   taskqueue_thread_enqueue,
3674                                                   /*context*/&xbb->io_taskqueue);
3675         if (xbb->io_taskqueue == NULL) {
3676                 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3677                 return;
3678         }
3679
3680         taskqueue_start_threads(&xbb->io_taskqueue,
3681                                 /*num threads*/1,
3682                                 /*priority*/PWAIT,
3683                                 /*thread name*/
3684                                 "%s taskq", device_get_nameunit(dev));
3685
3686         /* Update hot-plug status to satisfy xend. */
3687         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3688                           "hotplug-status", "connected");
3689         if (error) {
3690                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3691                                   xenbus_get_node(xbb->dev));
3692                 return;
3693         }
3694
3695         /* Tell the front end that we are ready to connect. */
3696         xenbus_set_state(dev, XenbusStateInitialised);
3697 }
3698
3699 /**
3700  * Attach to a XenBus device that has been claimed by our probe routine.
3701  *
3702  * \param dev  NewBus device object representing this Xen Block Back instance.
3703  *
3704  * \return  0 for success, errno codes for failure.
3705  */
3706 static int
3707 xbb_attach(device_t dev)
3708 {
3709         struct xbb_softc        *xbb;
3710         int                      error;
3711         u_int                    max_ring_page_order;
3712         struct sbuf             *watch_path;
3713
3714         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3715
3716         /*
3717          * Basic initialization.
3718          * After this block it is safe to call xbb_detach()
3719          * to clean up any allocated data for this instance.
3720          */
3721         xbb = device_get_softc(dev);
3722         xbb->dev = dev;
3723         xbb->otherend_id = xenbus_get_otherend_id(dev);
3724         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3725         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3726
3727         /*
3728          * Publish protocol capabilities for consumption by the
3729          * front-end.
3730          */
3731         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3732                           "feature-barrier", "1");
3733         if (error) {
3734                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3735                                   xenbus_get_node(xbb->dev));
3736                 return (error);
3737         }
3738
3739         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3740                           "feature-flush-cache", "1");
3741         if (error) {
3742                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3743                                   xenbus_get_node(xbb->dev));
3744                 return (error);
3745         }
3746
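        /*
         * Advertise the largest ring we support as a power-of-two page
         * order; e.g. a 32 page maximum is published as order 5.
         */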
3747         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3748         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3749                           "max-ring-page-order", "%u", max_ring_page_order);
3750         if (error) {
3751                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3752                                   xenbus_get_node(xbb->dev));
3753                 return (error);
3754         }
3755
3756         /*
3757          * We need to wait for hotplug script execution before
3758          * moving forward.
3759          */
3760         watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path");
3761         xbb->hotplug_watch.callback_data = (uintptr_t)dev;
3762         xbb->hotplug_watch.callback = xbb_attach_disk;
3763         KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup"));
3764         xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK);
3765         sbuf_delete(watch_path);
3766         error = xs_register_watch(&xbb->hotplug_watch);
3767         if (error != 0) {
3768                 xbb_attach_failed(xbb, error, "failed to create watch on %s",
3769                     xbb->hotplug_watch.node);
3770                 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
3771                 return (error);
3772         }
3773
3774         /* Tell the toolstack blkback has attached. */
3775         xenbus_set_state(dev, XenbusStateInitWait);
3776
3777         return (0);
3778 }
3779
3780 /**
3781  * Detach from a block back device instance.
3782  *
3783  * \param dev  NewBus device object representing this Xen Block Back instance.
3784  *
3785  * \return  0 for success, errno codes for failure.
3786  * 
3787  * \note A block back device may be detached at any time in its life-cycle,
3788  *       including part way through the attach process.  For this reason,
3789  *       initialization order and the initialization state checks in this
3790  *       routine must be carefully coupled so that attach time failures
3791  *       are gracefully handled.
3792  */
3793 static int
3794 xbb_detach(device_t dev)
3795 {
3796         struct xbb_softc *xbb;
3797
3798         DPRINTF("\n");
3799
3800         xbb = device_get_softc(dev);
3801         mtx_lock(&xbb->lock);
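        /*
         * xbb_shutdown() returns EAGAIN while requests are still
         * draining; it wakes us once teardown can complete.
         */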
3802         while (xbb_shutdown(xbb) == EAGAIN) {
3803                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3804                        "xbb_shutdown", 0);
3805         }
3806         mtx_unlock(&xbb->lock);
3807
3808         DPRINTF("\n");
3809
3810         if (xbb->io_taskqueue != NULL)
3811                 taskqueue_free(xbb->io_taskqueue);
3812
3813         if (xbb->xbb_stats != NULL)
3814                 devstat_remove_entry(xbb->xbb_stats);
3815
3816         if (xbb->xbb_stats_in != NULL)
3817                 devstat_remove_entry(xbb->xbb_stats_in);
3818
3819         xbb_close_backend(xbb);
3820
3821         if (xbb->dev_mode != NULL) {
3822                 free(xbb->dev_mode, M_XENSTORE);
3823                 xbb->dev_mode = NULL;
3824         }
3825
3826         if (xbb->dev_type != NULL) {
3827                 free(xbb->dev_type, M_XENSTORE);
3828                 xbb->dev_type = NULL;
3829         }
3830
3831         if (xbb->dev_name != NULL) {
3832                 free(xbb->dev_name, M_XENSTORE);
3833                 xbb->dev_name = NULL;
3834         }
3835
3836         mtx_destroy(&xbb->lock);
3837         return (0);
3838 }
3839
3840 /**
3841  * Prepare this block back device for suspension of this VM.
3842  * 
3843  * \param dev  NewBus device object representing this Xen Block Back instance.
3844  *
3845  * \return  0 for success, errno codes for failure.
3846  */
3847 static int
3848 xbb_suspend(device_t dev)
3849 {
3850 #ifdef NOT_YET
3851         struct xbb_softc *sc = device_get_softc(dev);
3852
3853         /* Prevent new requests being issued until we fix things up. */
3854         mtx_lock(&sc->xb_io_lock);
3855         sc->connected = BLKIF_STATE_SUSPENDED;
3856         mtx_unlock(&sc->xb_io_lock);
3857 #endif
3858
3859         return (0);
3860 }
3861
3862 /**
3863  * Perform any processing required to recover from a suspended state.
3864  * 
3865  * \param dev  NewBus device object representing this Xen Block Back instance.
3866  *
3867  * \return  0 for success, errno codes for failure.
3868  */
3869 static int
3870 xbb_resume(device_t dev)
3871 {
3872         return (0);
3873 }
3874
3875 /**
3876  * Handle state changes expressed via the XenStore by our front-end peer.
3877  *
3878  * \param dev             NewBus device object representing this Xen
3879  *                        Block Back instance.
3880  * \param frontend_state  The new state of the front-end.
3881  *
3883  */
3884 static void
3885 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3886 {
3887         struct xbb_softc *xbb = device_get_softc(dev);
3888
3889         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3890                 xenbus_strstate(frontend_state),
3891                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3892
3893         switch (frontend_state) {
3894         case XenbusStateInitialising:
3895                 break;
3896         case XenbusStateInitialised:
3897         case XenbusStateConnected:
3898                 xbb_connect(xbb);
3899                 break;
3900         case XenbusStateClosing:
3901         case XenbusStateClosed:
3902                 mtx_lock(&xbb->lock);
3903                 xbb_shutdown(xbb);
3904                 mtx_unlock(&xbb->lock);
3905                 if (frontend_state == XenbusStateClosed)
3906                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3907                 break;
3908         default:
3909                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3910                                  frontend_state);
3911                 break;
3912         }
3913 }
3914
3915 /*---------------------------- NewBus Registration ---------------------------*/
3916 static device_method_t xbb_methods[] = {
3917         /* Device interface */
3918         DEVMETHOD(device_probe,         xbb_probe),
3919         DEVMETHOD(device_attach,        xbb_attach),
3920         DEVMETHOD(device_detach,        xbb_detach),
3921         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3922         DEVMETHOD(device_suspend,       xbb_suspend),
3923         DEVMETHOD(device_resume,        xbb_resume),
3924
3925         /* Xenbus interface */
3926         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3927
3928         { 0, 0 }
3929 };
3930
3931 static driver_t xbb_driver = {
3932         "xbbd",
3933         xbb_methods,
3934         sizeof(struct xbb_softc),
3935 };
3936 devclass_t xbb_devclass;
3937
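/*
 * Attach beneath the XenBus back-end bus so that "vbd" device nodes
 * created by the toolstack are offered to xbb_probe() above.
 */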
3938 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);