1 /*-
2  * Copyright (c) 2009-2011 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  *          Ken Merry           (Spectra Logic Corporation)
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 /**
37  * \file blkback.c
38  *
39  * \brief Device driver supporting the vending of block storage from
40  *        a FreeBSD domain to other domains.
41  */
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47
48 #include <sys/bio.h>
49 #include <sys/bus.h>
50 #include <sys/conf.h>
51 #include <sys/devicestat.h>
52 #include <sys/disk.h>
53 #include <sys/fcntl.h>
54 #include <sys/filedesc.h>
55 #include <sys/kdb.h>
56 #include <sys/module.h>
57 #include <sys/namei.h>
58 #include <sys/proc.h>
59 #include <sys/rman.h>
60 #include <sys/taskqueue.h>
61 #include <sys/types.h>
62 #include <sys/vnode.h>
63 #include <sys/mount.h>
64 #include <sys/sysctl.h>
65 #include <sys/bitstring.h>
66 #include <sys/sdt.h>
67
68 #include <geom/geom.h>
69
70 #include <machine/_inttypes.h>
71
72 #include <vm/vm.h>
73 #include <vm/vm_extern.h>
74 #include <vm/vm_kern.h>
75
76 #include <xen/xen-os.h>
77 #include <xen/blkif.h>
78 #include <xen/gnttab.h>
79 #include <xen/xen_intr.h>
80
81 #include <xen/interface/event_channel.h>
82 #include <xen/interface/grant_table.h>
83
84 #include <xen/xenbus/xenbusvar.h>
85
86 /*--------------------------- Compile-time Tunables --------------------------*/
87 /**
88  * The maximum number of outstanding request blocks (request headers plus
89  * additional segment blocks) we will allow in a negotiated block-front/back
90  * communication channel.
91  */
92 #define XBB_MAX_REQUESTS        256
93
94 /**
95  * \brief Define to force all I/O to be performed on memory owned by the
96  *        backend device, with a copy-in/out to the remote domain's memory.
97  *
98  * \note  This option is currently required when this driver's domain is
99  *        operating in HVM mode on a system using an IOMMU.
100  *
101  * This driver uses Xen's grant table API to gain access to the memory of
102  * the remote domains it serves.  When our domain is operating in PV mode,
103  * the grant table mechanism directly updates our domain's page table entries
104  * to point to the physical pages of the remote domain.  This scheme guarantees
105  * that blkback and the backing devices it uses can safely perform DMA
106  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
107  * ensure that our domain cannot DMA to pages owned by another domain.  As
108  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
109  * table API.  For this reason, in HVM mode, we must bounce all requests into
110  * memory that is mapped into our domain at domain startup and thus has
111  * valid IOMMU mappings.
112  */
113 #define XBB_USE_BOUNCE_BUFFERS
114
115 /**
116  * \brief Define to enable rudimentary request logging to the console.
117  */
118 #undef XBB_DEBUG
119
120 /*---------------------------------- Macros ----------------------------------*/
121 /**
122  * Custom malloc type for all driver allocations.
123  */
124 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
125
126 #ifdef XBB_DEBUG
127 #define DPRINTF(fmt, args...)                                   \
128     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
129 #else
130 #define DPRINTF(fmt, args...) do {} while(0)
131 #endif
132
133 /**
134  * The maximum mapped region size per request we will allow in a negotiated
135  * block-front/back communication channel.
136  */
137 #define XBB_MAX_REQUEST_SIZE                                    \
138         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
139
140 /**
141  * The maximum number of segments (within a request header and accompanying
142  * segment blocks) per request we will allow in a negotiated block-front/back
143  * communication channel.
144  */
145 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
146         (MIN(UIO_MAXIOV,                                        \
147              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
148                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
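
/*
 * Illustrative arithmetic, assuming typical values of a 4 KiB PAGE_SIZE,
 * a 128 KiB MAXPHYS, and a BLKIF_MAX_SEGMENTS_PER_REQUEST of 11 (verify
 * these against the headers on your platform): XBB_MAX_REQUEST_SIZE
 * evaluates to MIN(131072, 11 * 4096) = 45056 bytes, and, with UIO_MAXIOV
 * comfortably larger than either term, XBB_MAX_SEGMENTS_PER_REQUEST
 * evaluates to MIN(11, (45056 / 4096) + 1) = 11.
 */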
149
150 /**
151  * The maximum number of shared memory ring pages we will allow in a
152  * negotiated block-front/back communication channel.  Allow enough
153  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
154  */
155 #define XBB_MAX_RING_PAGES                                                  \
156         BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
157                        * XBB_MAX_REQUESTS)
158 /**
159  * The maximum number of ring pages that we can allow per request list.
160  * We limit this to the maximum number of segments per request, because
161  * that is already a reasonable number of segments to aggregate.  This
162  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
163  * because that would leave situations where we can't dispatch even one
164  * large request.
165  */
166 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
167
168 /*--------------------------- Forward Declarations ---------------------------*/
169 struct xbb_softc;
170 struct xbb_xen_req;
171
172 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
173                               ...) __attribute__((format(printf, 3, 4)));
174 static int  xbb_shutdown(struct xbb_softc *xbb);
175 static int  xbb_detach(device_t dev);
176
177 /*------------------------------ Data Structures -----------------------------*/
178
179 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
180
181 typedef enum {
182         XBB_REQLIST_NONE        = 0x00,
183         XBB_REQLIST_MAPPED      = 0x01
184 } xbb_reqlist_flags;
185
186 struct xbb_xen_reqlist {
187         /**
188          * Back reference to the parent block back instance for this
189          * request.  Used during bio_done handling.
190          */
191         struct xbb_softc        *xbb;
192
193         /**
194          * BLKIF_OP code for this request.
195          */
196         int                      operation;
197
198         /**
199          * Set to BLKIF_RSP_* to indicate request status.
200          *
201          * This field allows an error status to be recorded even if the
202          * delivery of this status must be deferred.  Deferred reporting
203          * is necessary, for example, when an error is detected during
204          * completion processing of one bio when other bios for this
205          * request are still outstanding.
206          */
207         int                      status;
208
209         /**
210          * Number of 512 byte sectors not transferred.
211          */
212         int                      residual_512b_sectors;
213
214         /**
215          * Starting sector number of the first request in the list.
216          */
217         off_t                    starting_sector_number;
218
219         /**
220          * If we're going to coalesce, the next contiguous sector would be
221          * this one.
222          */
223         off_t                    next_contig_sector;
224
225         /**
226          * Number of child requests in the list.
227          */
228         int                      num_children;
229
230         /**
231          * Number of I/O requests still pending on the backend.
232          */
233         int                      pendcnt;
234
235         /**
236          * Total number of segments for requests in the list.
237          */
238         int                      nr_segments;
239
240         /**
241          * Flags for this particular request list.
242          */
243         xbb_reqlist_flags        flags;
244
245         /**
246          * Kernel virtual address space reserved for this request
247          * list structure and used to map the remote domain's pages for
248          * this I/O into our domain's address space.
249          */
250         uint8_t                 *kva;
251
252         /**
253          * Base pseudo-physical address corresponding to the start
254          * of this request's kva region.
255          */
256         uint64_t                 gnt_base;
257
258
259 #ifdef XBB_USE_BOUNCE_BUFFERS
260         /**
261          * Pre-allocated domain local memory used to proxy remote
262          * domain memory during I/O operations.
263          */
264         uint8_t                 *bounce;
265 #endif
266
267         /**
268          * Array of grant handles (one per page) used to map this request.
269          */
270         grant_handle_t          *gnt_handles;
271
272         /**
273          * Device statistics request ordering type (ordered or simple).
274          */
275         devstat_tag_type         ds_tag_type;
276
277         /**
278          * Device statistics request type (read, write, no_data).
279          */
280         devstat_trans_flags      ds_trans_type;
281
282         /**
283          * The start time for this request.
284          */
285         struct bintime           ds_t0;
286
287         /**
288          * Linked list of contiguous requests with the same operation type.
289          */
290         struct xbb_xen_req_list  contig_req_list;
291
292         /**
293          * Linked list links used to aggregate idle requests in the
294          * request list free pool (xbb->reqlist_free_stailq) and pending
295          * requests waiting for execution (xbb->reqlist_pending_stailq).
296          */
297         STAILQ_ENTRY(xbb_xen_reqlist) links;
298 };
299
300 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
301
302 /**
303  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
304  */
305 struct xbb_xen_req {
306         /**
307          * Linked list links used to aggregate requests into a reqlist
308          * and to store them in the request free pool.
309          */
310         STAILQ_ENTRY(xbb_xen_req) links;
311
312         /**
313          * The remote domain's identifier for this I/O request.
314          */
315         uint64_t                  id;
316
317         /**
318          * The number of pages currently mapped for this request.
319          */
320         int                       nr_pages;
321
322         /**
323          * The number of 512 byte sectors comprising this request.
324          */
325         int                       nr_512b_sectors;
326
327         /**
328          * BLKIF_OP code for this request.
329          */
330         int                       operation;
331
332         /**
333          * Storage used for non-native ring requests.
334          */
335         blkif_request_t          ring_req_storage;
336
337         /**
338          * Pointer to the Xen request in the ring.
339          */
340         blkif_request_t         *ring_req;
341
342         /**
343          * Consumer index for this request.
344          */
345         RING_IDX                 req_ring_idx;
346
347         /**
348          * The start time for this request.
349          */
350         struct bintime           ds_t0;
351
352         /**
353          * Pointer back to our parent request list.
354          */
355         struct xbb_xen_reqlist  *reqlist;
356 };
357 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
358
359 /**
360  * \brief Configuration data for the shared memory request ring
361  *        used to communicate with the front-end client of this
362  *        driver.
363  */
364 struct xbb_ring_config {
365         /** KVA address where ring memory is mapped. */
366         vm_offset_t     va;
367
368         /** The pseudo-physical address where ring memory is mapped.*/
369         uint64_t        gnt_addr;
370
371         /**
372          * Grant table handles, one per-ring page, returned by the
373          * hypervisor upon mapping of the ring and required to
374          * unmap it when a connection is torn down.
375          */
376         grant_handle_t  handle[XBB_MAX_RING_PAGES];
377
378         /**
379          * The device bus address returned by the hypervisor when
380          * mapping the ring and required to unmap it when a connection
381          * is torn down.
382          */
383         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
384
385         /** The number of ring pages mapped for the current connection. */
386         u_int           ring_pages;
387
388         /**
389          * The grant references, one per-ring page, supplied by the
390          * front-end, allowing us to reference the ring pages in the
391          * front-end's domain and to map these pages into our own domain.
392          */
393         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
394
395         /** The interrupt-driven event channel used to signal ring events. */
396         evtchn_port_t   evtchn;
397 };
398
399 /**
400  * Per-instance connection state flags.
401  */
402 typedef enum
403 {
404         /**
405          * The front-end requested a read-only mount of the
406          * back-end device/file.
407          */
408         XBBF_READ_ONLY         = 0x01,
409
410         /** Communication with the front-end has been established. */
411         XBBF_RING_CONNECTED    = 0x02,
412
413         /**
414          * Front-end requests exist in the ring and are waiting for
415          * xbb_xen_req objects to free up.
416          */
417         XBBF_RESOURCE_SHORTAGE = 0x04,
418
419         /** Connection teardown in progress. */
420         XBBF_SHUTDOWN          = 0x08,
421
422         /** A thread is already performing shutdown processing. */
423         XBBF_IN_SHUTDOWN       = 0x10
424 } xbb_flag_t;
425
426 /** Backend device type.  */
427 typedef enum {
428         /** Backend type unknown. */
429         XBB_TYPE_NONE           = 0x00,
430
431         /**
432          * Backend type disk (access via cdev switch
433          * strategy routine).
434          */
435         XBB_TYPE_DISK           = 0x01,
436
437         /** Backend type file (access via vnode operations). */
438         XBB_TYPE_FILE           = 0x02
439 } xbb_type;
440
441 /**
442  * \brief Structure used to memoize information about a per-request
443  *        scatter-gather list.
444  *
445  * The chief benefit of using this data structure is it avoids having
446  * to reparse the possibly discontiguous S/G list in the original
447  * request.  Due to the way that the mapping of the memory backing an
448  * I/O transaction is handled by Xen, a second pass is unavoidable.
449  * At least this way the second walk is a simple array traversal.
450  *
451  * \note A single Scatter/Gather element in the block interface covers
452  *       at most 1 machine page.  In this context a sector (blkif
453  *       nomenclature, not what I'd choose) is a 512b aligned unit
454  *       of mapping within the machine page referenced by an S/G
455  *       element.
456  */
457 struct xbb_sg {
458         /** The number of 512b data chunks mapped in this S/G element. */
459         int16_t nsect;
460
461         /**
462          * The index (0 based) of the first 512b data chunk mapped
463          * in this S/G element.
464          */
465         uint8_t first_sect;
466
467         /**
468          * The index (0 based) of the last 512b data chunk mapped
469          * in this S/G element.
470          */
471         uint8_t last_sect;
472 };
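
/*
 * Worked example (illustrative, assuming a 4 KiB page, i.e. eight 512b
 * sectors per page): an S/G element covering bytes 1024 through 4095 of
 * its page has first_sect = 2, last_sect = 7, and nsect = 6.
 */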
473
474 /**
475  * Character device backend specific configuration data.
476  */
477 struct xbb_dev_data {
478         /** Cdev used for device backend access.  */
479         struct cdev   *cdev;
480
481         /** Cdev switch used for device backend access.  */
482         struct cdevsw *csw;
483
484         /** Used to hold a reference on opened cdev backend devices. */
485         int            dev_ref;
486 };
487
488 /**
489  * File backend specific configuration data.
490  */
491 struct xbb_file_data {
492         /** Credentials to use for vnode backed (file based) I/O. */
493         struct ucred   *cred;
494
495         /**
496          * \brief Array of io vectors used to process file based I/O.
497          *
498          * Only a single file based request is outstanding per-xbb instance,
499          * so we only need one of these.
500          */
501         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
502 #ifdef XBB_USE_BOUNCE_BUFFERS
503
504         /**
505          * \brief Array of io vectors used to handle bouncing of file reads.
506          *
507          * Vnode operations are free to modify uio data during their
508          * execution.  In the case of a read with bounce buffering active,
509          * we need some of the data from the original uio in order to
510          * bounce-out the read data.  This array serves as the temporary
511          * storage for this saved data.
512          */
513         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
514
515         /**
516          * \brief Array of memoized bounce buffer kva offsets used
517          *        in the file based backend.
518          *
519          * Due to the way that the mapping of the memory backing an
520          * I/O transaction is handled by Xen, a second pass through
521          * the request sg elements is unavoidable. We memoize the computed
522          * bounce address here to reduce the cost of the second walk.
523          */
524         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
525 #endif /* XBB_USE_BOUNCE_BUFFERS */
526 };
527
528 /**
529  * Collection of backend type specific data.
530  */
531 union xbb_backend_data {
532         struct xbb_dev_data  dev;
533         struct xbb_file_data file;
534 };
535
536 /**
537  * Function signature of backend specific I/O handlers.
538  */
539 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
540                               struct xbb_xen_reqlist *reqlist, int operation,
541                               int flags);
542
543 /**
544  * Per-instance configuration data.
545  */
546 struct xbb_softc {
547
548         /**
549          * Task-queue used to process I/O requests.
550          */
551         struct taskqueue         *io_taskqueue;
552
553         /**
554          * Single "run the request queue" task enqueued
555          * on io_taskqueue.
556          */
557         struct task               io_task;
558
559         /** Device type for this instance. */
560         xbb_type                  device_type;
561
562         /** NewBus device corresponding to this instance. */
563         device_t                  dev;
564
565         /** Backend specific dispatch routine for this instance. */
566         xbb_dispatch_t            dispatch_io;
567
568         /** The number of requests outstanding on the backend device/file. */
569         int                       active_request_count;
570
571         /** Free pool of request tracking structures. */
572         struct xbb_xen_req_list   request_free_stailq;
573
574         /** Array, sized at connection time, of request tracking structures. */
575         struct xbb_xen_req       *requests;
576
577         /** Free pool of request list structures. */
578         struct xbb_xen_reqlist_list reqlist_free_stailq;
579
580         /** List of pending request lists awaiting execution. */
581         struct xbb_xen_reqlist_list reqlist_pending_stailq;
582
583         /** Array, sized at connection time, of request list structures. */
584         struct xbb_xen_reqlist   *request_lists;
585
586         /**
587          * Global pool of kva used for mapping remote domain ring
588          * and I/O transaction data.
589          */
590         vm_offset_t               kva;
591
592         /** Pseudo-physical address corresponding to kva. */
593         uint64_t                  gnt_base_addr;
594
595         /** The size of the global kva pool. */
596         int                       kva_size;
597
598         /** The size of the KVA area used for request lists. */
599         int                       reqlist_kva_size;
600
601         /** The number of pages of KVA used for request lists */
602         int                       reqlist_kva_pages;
603
604         /** Bitmap of free KVA pages */
605         bitstr_t                 *kva_free;
606
607         /**
608          * \brief Cached value of the front-end's domain id.
609          * 
610          * This value is used once for each mapped page in
611          * a transaction.  We cache it to avoid incurring the
612          * cost of an ivar access every time this is needed.
613          */
614         domid_t                   otherend_id;
615
616         /**
617          * \brief The blkif protocol abi in effect.
618          *
619          * There are situations where the back and front ends can
620          * have a different, native abi (e.g. intel x86_64 and
621          * 32bit x86 domains on the same machine).  The back-end
622          * always accommodates the front-end's native abi.  That
623          * value is pulled from the XenStore and recorded here.
624          */
625         int                       abi;
626
627         /**
628          * \brief The maximum number of requests and request lists allowed
629          *        to be in flight at a time.
630          *
631          * This value is negotiated via the XenStore.
632          */
633         u_int                     max_requests;
634
635         /**
636          * \brief The maximum number of segments (1 page per segment)
637          *        that can be mapped by a request.
638          *
639          * This value is negotiated via the XenStore.
640          */
641         u_int                     max_request_segments;
642
643         /**
644          * \brief Maximum number of segments per request list.
645          *
646          * This value is derived from and will generally be larger than
647          * max_request_segments.
648          */
649         u_int                     max_reqlist_segments;
650
651         /**
652          * The maximum size of any request to this back-end
653          * device.
654          *
655          * This value is negotiated via the XenStore.
656          */
657         u_int                     max_request_size;
658
659         /**
660          * The maximum size of any request list.  This is derived directly
661          * from max_reqlist_segments.
662          */
663         u_int                     max_reqlist_size;
664
665         /** Various configuration and state bit flags. */
666         xbb_flag_t                flags;
667
668         /** Ring mapping and interrupt configuration data. */
669         struct xbb_ring_config    ring_config;
670
671         /** Runtime, cross-ABI-safe structures for ring access. */
672         blkif_back_rings_t        rings;
673
674         /** IRQ mapping for the communication ring event channel. */
675         xen_intr_handle_t         xen_intr_handle;
676
677         /**
678          * \brief Backend access mode flags (e.g. write, or read-only).
679          *
680          * This value is passed to us by the front-end via the XenStore.
681          */
682         char                     *dev_mode;
683
684         /**
685          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
686          *
687          * This value is passed to us by the front-end via the XenStore.
688          * Currently unused.
689          */
690         char                     *dev_type;
691
692         /**
693          * \brief Backend device/file identifier.
694          *
695          * This value is passed to us by the front-end via the XenStore.
696          * We expect this to be a POSIX path indicating the file or
697          * device to open.
698          */
699         char                     *dev_name;
700
701         /**
702          * Vnode corresponding to the backend device node or file
703          * we are accessing.
704          */
705         struct vnode             *vn;
706
707         union xbb_backend_data    backend;
708
709         /** The native sector size of the backend. */
710         u_int                     sector_size;
711
712         /** log2 of sector_size.  */
713         u_int                     sector_size_shift;
714
715         /** Size in bytes of the backend device or file.  */
716         off_t                     media_size;
717
718         /**
719          * \brief media_size expressed in terms of the backend native
720          *        sector size.
721          *
722          * (e.g. xbb->media_size >> xbb->sector_size_shift).
723          */
724         uint64_t                  media_num_sectors;
725
726         /**
727          * \brief Array of memoized scatter gather data computed during the
728          *        conversion of blkif ring requests to internal xbb_xen_req
729          *        structures.
730          *
731          * Ring processing is serialized so we only need one of these.
732          */
733         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
734
735         /**
736          * Temporary grant table map used in xbb_dispatch_io().  When
737          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
738          * stack could cause a stack overflow.
739          */
740         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
741
742         /** Mutex protecting per-instance data. */
743         struct mtx                lock;
744
745 #ifdef XENHVM
746         /**
747          * Resource representing allocated physical address space
748          * associated with our per-instance kva region.
749          */
750         struct resource          *pseudo_phys_res;
751
752         /** Resource id for allocated physical address space. */
753         int                       pseudo_phys_res_id;
754 #endif
755
756         /**
757          * I/O statistics from BlockBack dispatch down.  These are
758          * coalesced requests, and we start them right before execution.
759          */
760         struct devstat           *xbb_stats;
761
762         /**
763          * I/O statistics coming into BlockBack.  These are the requests as
764          * we get them from BlockFront.  They are started as soon as we
765          * receive a request, and completed when the I/O is complete.
766          */
767         struct devstat           *xbb_stats_in;
768
769         /** Disable sending flush to the backend */
770         int                       disable_flush;
771
772         /** Send a real flush for every N flush requests */
773         int                       flush_interval;
774
775         /** Count of flush requests in the interval */
776         int                       flush_count;
777
778         /** Don't coalesce requests if this is set */
779         int                       no_coalesce_reqs;
780
781         /** Number of requests we have received */
782         uint64_t                  reqs_received;
783
784         /** Number of requests we have completed. */
785         uint64_t                  reqs_completed;
786
787         /** How many forced dispatches (i.e. without coalescing) have happened */
788         uint64_t                  forced_dispatch;
789
790         /** How many normal dispatches have happened */
791         uint64_t                  normal_dispatch;
792
793         /** How many total dispatches have happened */
794         uint64_t                  total_dispatch;
795
796         /** How many times we have run out of KVA */
797         uint64_t                  kva_shortages;
798
799         /** How many times we have run out of request structures */
800         uint64_t                  request_shortages;
801 };
802
803 /*---------------------------- Request Processing ----------------------------*/
804 /**
805  * Allocate an internal transaction tracking structure from the free pool.
806  *
807  * \param xbb  Per-instance xbb configuration structure.
808  *
809  * \return  On success, a pointer to the allocated xbb_xen_req structure.
810  *          Otherwise NULL.
811  */
812 static inline struct xbb_xen_req *
813 xbb_get_req(struct xbb_softc *xbb)
814 {
815         struct xbb_xen_req *req;
816
817         req = NULL;
818
819         mtx_assert(&xbb->lock, MA_OWNED);
820
821         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
822                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
823                 xbb->active_request_count++;
824         }
825
826         return (req);
827 }
828
829 /**
830  * Return an allocated transaction tracking structure to the free pool.
831  *
832  * \param xbb  Per-instance xbb configuration structure.
833  * \param req  The request structure to free.
834  */
835 static inline void
836 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
837 {
838         mtx_assert(&xbb->lock, MA_OWNED);
839
840         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
841         xbb->active_request_count--;
842
843         KASSERT(xbb->active_request_count >= 0,
844                 ("xbb_release_req: negative active count"));
845 }
846
847 /**
848  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
849  *
850  * \param xbb       Per-instance xbb configuration structure.
851  * \param req_list  The list of requests to free.
852  * \param nreqs     The number of items in the list.
853  */
854 static inline void
855 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
856                  int nreqs)
857 {
858         mtx_assert(&xbb->lock, MA_OWNED);
859
860         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
861         xbb->active_request_count -= nreqs;
862
863         KASSERT(xbb->active_request_count >= 0,
864                 ("xbb_release_reqs: negative active count"));
865 }
866
867 /**
868  * Given a page index and 512b sector offset within that page,
869  * calculate an offset into a request's kva region.
870  *
871  * \param reqlist The request structure whose kva region will be accessed.
872  * \param pagenr  The page index used to compute the kva offset.
873  * \param sector  The 512b sector index used to compute the page relative
874  *                kva offset.
875  *
876  * \return  The computed global KVA address.
877  */
878 static inline uint8_t *
879 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
880 {
881         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
882 }
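
/*
 * For example (illustrative only, assuming 4 KiB pages): pagenr = 2 and
 * sector = 3 yield reqlist->kva + (2 * 4096) + (3 * 512), an offset of
 * 9728 bytes into the request list's kva region.
 */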
883
884 #ifdef XBB_USE_BOUNCE_BUFFERS
885 /**
886  * Given a page index and 512b sector offset within that page,
887  * calculate an offset into a request's local bounce memory region.
888  *
889  * \param reqlist The request structure whose bounce region will be accessed.
890  * \param pagenr  The page index used to compute the bounce offset.
891  * \param sector  The 512b sector index used to compute the page relative
892  *                bounce offset.
893  *
894  * \return  The computed global bounce buffer address.
895  */
896 static inline uint8_t *
897 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
898 {
899         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
900 }
901 #endif
902
903 /**
904  * Given a page number and 512b sector offset within that page,
905  * calculate an offset into the request's memory region that the
906  * underlying backend device/file should use for I/O.
907  *
908  * \param reqlist The request structure whose I/O region will be accessed.
909  * \param pagenr  The page index used to compute the I/O offset.
910  * \param sector  The 512b sector index used to compute the page relative
911  *                I/O offset.
912  *
913  * \return  The computed global I/O address.
914  *
915  * Depending on configuration, this will either be a local bounce buffer
916  * or a pointer to the memory mapped in from the front-end domain for
917  * this request.
918  */
919 static inline uint8_t *
920 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
921 {
922 #ifdef XBB_USE_BOUNCE_BUFFERS
923         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
924 #else
925         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
926 #endif
927 }
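
/*
 * Note on the bounce buffer case: when XBB_USE_BOUNCE_BUFFERS is defined,
 * backend I/O targets domain-local bounce memory rather than the pages
 * mapped in from the front-end, so read data must be copied back into the
 * mapped kva region on completion (see xbb_bio_done()) and write data must
 * be copied into the bounce region before the I/O is dispatched.
 */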
928
929 /**
930  * Given a page index and 512b sector offset within that page, calculate
931  * an offset into the local pseudo-physical address space used to map a
932  * front-end's request data into a request.
933  *
934  * \param reqlist The request list structure whose pseudo-physical region
935  *                will be accessed.
936  * \param pagenr  The page index used to compute the pseudo-physical offset.
937  * \param sector  The 512b sector index used to compute the page relative
938  *                pseudo-physical offset.
939  *
940  * \return  The computed global pseudo-physical address.
941  *
942  * This address always falls within the pseudo-physical region that backs
943  * the request list's kva mapping; unlike xbb_reqlist_ioaddr(), it does not
944  * depend on whether bounce buffers are in use.
945  */
946 static inline uintptr_t
947 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
948 {
949         struct xbb_softc *xbb;
950
951         xbb = reqlist->xbb;
952
953         return ((uintptr_t)(xbb->gnt_base_addr +
954                 (uintptr_t)(reqlist->kva - xbb->kva) +
955                 (PAGE_SIZE * pagenr) + (sector << 9)));
956 }
957
958 /**
959  * Get Kernel Virtual Address space for mapping requests.
960  *
961  * \param xbb         Per-instance xbb configuration structure.
962  * \param nr_pages    Number of pages needed.
965  *
966  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
967  *
968  * Note:  This should be unnecessary once we have either chaining or
969  * scatter/gather support for struct bio.  At that point we'll be able to
970  * put multiple addresses and lengths in one bio/bio chain and won't need
971  * to map everything into one virtual segment.
972  */
973 static uint8_t *
974 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
975 {
976         intptr_t first_clear;
977         intptr_t num_clear;
978         uint8_t *free_kva;
979         int      i;
980
981         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
982
983         first_clear = 0;
984         free_kva = NULL;
985
986         mtx_lock(&xbb->lock);
987
988         /*
989          * Look for the first available page.  If there are none, we're done.
990          */
991         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
992
993         if (first_clear == -1)
994                 goto bailout;
995
996         /*
997          * Starting at the first available page, look for consecutive free
998          * pages that will satisfy the user's request.
999          */
1000         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1001                 /*
1002                  * If this is true, the page is used, so we have to reset
1003                  * the number of clear pages and the first clear page
1004                  * (since it pointed to a region with an insufficient number
1005                  * of clear pages).
1006                  */
1007                 if (bit_test(xbb->kva_free, i)) {
1008                         num_clear = 0;
1009                         first_clear = -1;
1010                         continue;
1011                 }
1012
1013                 if (first_clear == -1)
1014                         first_clear = i;
1015
1016                 /*
1017                  * If this is true, we've found a large enough free region
1018                  * to satisfy the request.
1019                  */
1020                 if (++num_clear == nr_pages) {
1021
1022                         bit_nset(xbb->kva_free, first_clear,
1023                                  first_clear + nr_pages - 1);
1024
1025                         free_kva = xbb->kva +
1026                                 (uint8_t *)(first_clear * PAGE_SIZE);
1027
1028                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1029                                 free_kva + (nr_pages * PAGE_SIZE) <=
1030                                 (uint8_t *)xbb->ring_config.va,
1031                                 ("Free KVA %p len %d out of range, "
1032                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1033                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1034                                  (uintmax_t)xbb->ring_config.va));
1035                         break;
1036                 }
1037         }
1038
1039 bailout:
1040
1041         if (free_kva == NULL) {
1042                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1043                 xbb->kva_shortages++;
1044         }
1045
1046         mtx_unlock(&xbb->lock);
1047
1048         return (free_kva);
1049 }
1050
1051 /**
1052  * Free allocated KVA.
1053  *
1054  * \param xbb       Per-instance xbb configuration structure.
1055  * \param kva_ptr   Pointer to allocated KVA region.  
1056  * \param nr_pages  Number of pages in the KVA region.
1057  */
1058 static void
1059 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1060 {
1061         intptr_t start_page;
1062
1063         mtx_assert(&xbb->lock, MA_OWNED);
1064
1065         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1066         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1067
1068 }
1069
1070 /**
1071  * Unmap the front-end pages associated with this I/O request list.
1072  *
1073  * \param reqlist  The request list structure to unmap.
1074  */
1075 static void
1076 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1077 {
1078         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1079         u_int                         i;
1080         u_int                         invcount;
1081         int                           error;
1082
1083         invcount = 0;
1084         for (i = 0; i < reqlist->nr_segments; i++) {
1085
1086                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1087                         continue;
1088
1089                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1090                 unmap[invcount].dev_bus_addr = 0;
1091                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1092                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1093                 invcount++;
1094         }
1095
1096         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1097                                           unmap, invcount);
1098         KASSERT(error == 0, ("Grant table operation failed"));
1099 }
1100
1101 /**
1102  * Allocate an internal request list tracking structure from the free pool.
1103  *
1104  * \param xbb  Per-instance xbb configuration structure.
1105  *
1106  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1107  *          Otherwise NULL.
1108  */
1109 static inline struct xbb_xen_reqlist *
1110 xbb_get_reqlist(struct xbb_softc *xbb)
1111 {
1112         struct xbb_xen_reqlist *reqlist;
1113
1114         reqlist = NULL;
1115
1116         mtx_assert(&xbb->lock, MA_OWNED);
1117
1118         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1119
1120                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1121                 reqlist->flags = XBB_REQLIST_NONE;
1122                 reqlist->kva = NULL;
1123                 reqlist->status = BLKIF_RSP_OKAY;
1124                 reqlist->residual_512b_sectors = 0;
1125                 reqlist->num_children = 0;
1126                 reqlist->nr_segments = 0;
1127                 STAILQ_INIT(&reqlist->contig_req_list);
1128         }
1129
1130         return (reqlist);
1131 }
1132
1133 /**
1134  * Return an allocated transaction tracking structure to the free pool.
1135  *
1136  * \param xbb        Per-instance xbb configuration structure.
1137  * \param reqlist    The request list structure to free.
1138  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
1139  *                   during a resource shortage condition.
1140  */
1141 static inline void
1142 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1143                     int wakeup)
1144 {
1145
1146         mtx_lock(&xbb->lock);
1147
1148         if (wakeup) {
1149                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1150                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1151         }
1152
1153         if (reqlist->kva != NULL)
1154                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1155
1156         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1157
1158         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1159
1160         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1161                 /*
1162                  * Shutdown is in progress.  See if we can
1163                  * progress further now that one more request
1164                  * has completed and been returned to the
1165                  * free pool.
1166                  */
1167                 xbb_shutdown(xbb);
1168         }
1169
1170         mtx_unlock(&xbb->lock);
1171
1172         if (wakeup != 0)
1173                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1174 }
1175
1176 /**
1177  * Request resources and do basic request setup.
1178  *
1179  * \param xbb          Per-instance xbb configuration structure.
1180  * \param reqlist      Pointer to reqlist pointer.
1181  * \param ring_req     Pointer to a block ring request.
1182  * \param ring_idx     The ring index of this request.
1183  *
1184  * \return  0 for success, non-zero for failure.
1185  */
1186 static int
1187 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1188                   blkif_request_t *ring_req, RING_IDX ring_idx)
1189 {
1190         struct xbb_xen_reqlist *nreqlist;
1191         struct xbb_xen_req     *nreq;
1192
1193         nreqlist = NULL;
1194         nreq     = NULL;
1195
1196         mtx_lock(&xbb->lock);
1197
1198         /*
1199          * We don't allow new resources to be allocated if we're in the
1200          * process of shutting down.
1201          */
1202         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1203                 mtx_unlock(&xbb->lock);
1204                 return (1);
1205         }
1206
1207         /*
1208          * Allocate a reqlist if the caller doesn't have one already.
1209          */
1210         if (*reqlist == NULL) {
1211                 nreqlist = xbb_get_reqlist(xbb);
1212                 if (nreqlist == NULL)
1213                         goto bailout_error;
1214         }
1215
1216         /* We always allocate a request. */
1217         nreq = xbb_get_req(xbb);
1218         if (nreq == NULL)
1219                 goto bailout_error;
1220
1221         mtx_unlock(&xbb->lock);
1222
1223         if (*reqlist == NULL) {
1224                 *reqlist = nreqlist;
1225                 nreqlist->operation = ring_req->operation;
1226                 nreqlist->starting_sector_number = ring_req->sector_number;
1227                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1228                                    links);
1229         }
1230
1231         nreq->reqlist = *reqlist;
1232         nreq->req_ring_idx = ring_idx;
1233         nreq->id = ring_req->id;
1234         nreq->operation = ring_req->operation;
1235
1236         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1237                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1238                 nreq->ring_req = &nreq->ring_req_storage;
1239         } else {
1240                 nreq->ring_req = ring_req;
1241         }
1242
1243         binuptime(&nreq->ds_t0);
1244         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1245         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1246         (*reqlist)->num_children++;
1247         (*reqlist)->nr_segments += ring_req->nr_segments;
1248
1249         return (0);
1250
1251 bailout_error:
1252
1253         /*
1254          * We're out of resources, so set the shortage flag.  The next time
1255          * a request is released, we'll try waking up the work thread to
1256          * see if we can allocate more resources.
1257          */
1258         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1259         xbb->request_shortages++;
1260
1261         if (nreq != NULL)
1262                 xbb_release_req(xbb, nreq);
1263
1264         mtx_unlock(&xbb->lock);
1265
1266         if (nreqlist != NULL)
1267                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1268
1269         return (1);
1270 }
1271
1272 /**
1273  * Create and transmit a response to a blkif request.
1274  * 
1275  * \param xbb     Per-instance xbb configuration structure.
1276  * \param req     The request structure to which to respond.
1277  * \param status  The status code to report.  See BLKIF_RSP_*
1278  *                in sys/xen/interface/io/blkif.h.
1279  */
1280 static void
1281 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1282 {
1283         blkif_response_t *resp;
1284         int               more_to_do;
1285         int               notify;
1286
1287         more_to_do = 0;
1288
1289         /*
1290          * Place on the response ring for the relevant domain.
1291          * For now, only the spacing between entries is different
1292          * in the different ABIs, not the response entry layout.
1293          */
1294         mtx_lock(&xbb->lock);
1295         switch (xbb->abi) {
1296         case BLKIF_PROTOCOL_NATIVE:
1297                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1298                                          xbb->rings.native.rsp_prod_pvt);
1299                 break;
1300         case BLKIF_PROTOCOL_X86_32:
1301                 resp = (blkif_response_t *)
1302                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1303                                       xbb->rings.x86_32.rsp_prod_pvt);
1304                 break;
1305         case BLKIF_PROTOCOL_X86_64:
1306                 resp = (blkif_response_t *)
1307                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1308                                       xbb->rings.x86_64.rsp_prod_pvt);
1309                 break;
1310         default:
1311                 panic("Unexpected blkif protocol ABI.");
1312         }
1313
1314         resp->id        = req->id;
1315         resp->operation = req->operation;
1316         resp->status    = status;
1317
1318         xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1319         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1320
1321         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1322
1323                 /*
1324                  * Tail check for pending requests. Allows frontend to avoid
1325                  * notifications if requests are already in flight (lower
1326                  * overheads and promotes batching).
1327                  */
1328                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1329         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1330
1331                 more_to_do = 1;
1332         }
1333
1334         xbb->reqs_completed++;
1335
1336         mtx_unlock(&xbb->lock);
1337
1338         if (more_to_do)
1339                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1340
1341         if (notify)
1342                 xen_intr_signal(xbb->xen_intr_handle);
1343 }
1344
1345 /**
1346  * Complete a request list.
1347  *
1348  * \param xbb        Per-instance xbb configuration structure.
1349  * \param reqlist    Allocated internal request list structure.
1350  */
1351 static void
1352 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1353 {
1354         struct xbb_xen_req *nreq;
1355         off_t               sectors_sent;
1356
1357         sectors_sent = 0;
1358
1359         if (reqlist->flags & XBB_REQLIST_MAPPED)
1360                 xbb_unmap_reqlist(reqlist);
1361
1362         /*
1363          * All I/O is done, send the response.  A lock should not be
1364          * necessary here because the request list is complete, and
1365          * therefore this is the only context accessing this request
1366          * right now.  The functions we call do their own locking if
1367          * necessary.
1368          */
1369         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1370                 off_t cur_sectors_sent;
1371
1372                 xbb_send_response(xbb, nreq, reqlist->status);
1373
1374                 /* We don't report bytes sent if there is an error. */
1375                 if (reqlist->status == BLKIF_RSP_OKAY)
1376                         cur_sectors_sent = nreq->nr_512b_sectors;
1377                 else
1378                         cur_sectors_sent = 0;
1379
1380                 sectors_sent += cur_sectors_sent;
1381
1382                 devstat_end_transaction(xbb->xbb_stats_in,
1383                                         /*bytes*/cur_sectors_sent << 9,
1384                                         reqlist->ds_tag_type,
1385                                         reqlist->ds_trans_type,
1386                                         /*now*/NULL,
1387                                         /*then*/&nreq->ds_t0);
1388         }
1389
1390         /*
1391          * Take out any sectors not sent.  If we wind up negative (which
1392          * might happen if an error is reported as well as a residual), just
1393          * report 0 sectors sent.
1394          */
1395         sectors_sent -= reqlist->residual_512b_sectors;
1396         if (sectors_sent < 0)
1397                 sectors_sent = 0;
1398
1399         devstat_end_transaction(xbb->xbb_stats,
1400                                 /*bytes*/ sectors_sent << 9,
1401                                 reqlist->ds_tag_type,
1402                                 reqlist->ds_trans_type,
1403                                 /*now*/NULL,
1404                                 /*then*/&reqlist->ds_t0);
1405
1406         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1407 }
1408
1409 /**
1410  * Completion handler for buffer I/O requests issued by the device
1411  * backend driver.
1412  *
1413  * \param bio  The buffer I/O request on which to perform completion
1414  *             processing.
1415  */
1416 static void
1417 xbb_bio_done(struct bio *bio)
1418 {
1419         struct xbb_softc       *xbb;
1420         struct xbb_xen_reqlist *reqlist;
1421
1422         reqlist = bio->bio_caller1;
1423         xbb     = reqlist->xbb;
1424
1425         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1426
1427         /*
1428          * This is a bit imprecise.  With aggregated I/O a single
1429          * request list can contain multiple front-end requests and
1430          * multiple bios may point to a single request.  By carefully
1431          * walking the request list, we could map residuals and errors
1432          * back to the original front-end request, but the interface
1433          * isn't sufficiently rich for us to properly report the error.
1434          * So, we just treat the entire request list as having failed if an
1435          * error occurs on any part.  And, if an error occurs, we treat
1436          * the amount of data transferred as 0.
1437          *
1438          * For residuals, we report it on the overall aggregated device,
1439          * but not on the individual requests, since we don't currently
1440          * do the work to determine which front-end request to which the
1441          * residual applies.
1442          */
1443         if (bio->bio_error) {
1444                 DPRINTF("BIO returned error %d for operation on device %s\n",
1445                         bio->bio_error, xbb->dev_name);
1446                 reqlist->status = BLKIF_RSP_ERROR;
1447
1448                 if (bio->bio_error == ENXIO
1449                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1450
1451                         /*
1452                          * Backend device has disappeared.  Signal the
1453                          * front-end that we (the device proxy) want to
1454                          * go away.
1455                          */
1456                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1457                 }
1458         }
1459
1460 #ifdef XBB_USE_BOUNCE_BUFFERS
1461         if (bio->bio_cmd == BIO_READ) {
1462                 vm_offset_t kva_offset;
1463
1464                 kva_offset = (vm_offset_t)bio->bio_data
1465                            - (vm_offset_t)reqlist->bounce;
1466                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1467                        bio->bio_data, bio->bio_bcount);
1468         }
1469 #endif /* XBB_USE_BOUNCE_BUFFERS */
1470
1471         /*
1472          * Decrement the pending count for the request list.  When we're
1473          * done with the requests, send status back for all of them.
1474          */
1475         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1476                 xbb_complete_reqlist(xbb, reqlist);
1477
1478         g_destroy_bio(bio);
1479 }
1480
1481 /**
1482  * Parse a blkif request into an internal request structure and send
1483  * it to the backend for processing.
1484  *
1485  * \param xbb       Per-instance xbb configuration structure.
1486  * \param reqlist   Allocated internal request list structure.
1487  *
1488  * \return          On success, 0.  For resource shortages, non-zero.
1489  *  
1490  * This routine performs the backend common aspects of request parsing
1491  * including compiling an internal request structure, parsing the S/G
1492  * list and any secondary ring requests in which they may reside, and
1493  * the mapping of front-end I/O pages into our domain.
1494  */
1495 static int
1496 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1497 {
1498         struct xbb_sg                *xbb_sg;
1499         struct gnttab_map_grant_ref  *map;
1500         struct blkif_request_segment *sg;
1501         struct blkif_request_segment *last_block_sg;
1502         struct xbb_xen_req           *nreq;
1503         u_int                         nseg;
1504         u_int                         seg_idx;
1505         u_int                         block_segs;
1506         int                           nr_sects;
1507         int                           total_sects;
1508         int                           operation;
1509         uint8_t                       bio_flags;
1510         int                           error;
1511
1512         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1513         bio_flags            = 0;
1514         total_sects          = 0;
1515         nr_sects             = 0;
1516
1517         /*
1518          * First determine whether we have enough free KVA to satisfy this
1519          * request list.  If not, tell xbb_run_queue() so it can go to
1520          * sleep until we have more KVA.
1521          */
1522         reqlist->kva = NULL;
1523         if (reqlist->nr_segments != 0) {
1524                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1525                 if (reqlist->kva == NULL) {
1526                         /*
1527                          * If we're out of KVA, return ENOMEM.
1528                          */
1529                         return (ENOMEM);
1530                 }
1531         }
1532
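             /*
              * Start devstat accounting for this request list now that we
              * know we have the KVA needed to process it.
              */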
1533         binuptime(&reqlist->ds_t0);
1534         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1535
1536         switch (reqlist->operation) {
1537         case BLKIF_OP_WRITE_BARRIER:
1538                 bio_flags       |= BIO_ORDERED;
1539                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1540                 /* FALLTHROUGH */
1541         case BLKIF_OP_WRITE:
1542                 operation = BIO_WRITE;
1543                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1544                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1545                         DPRINTF("Attempt to write to read only device %s\n",
1546                                 xbb->dev_name);
1547                         reqlist->status = BLKIF_RSP_ERROR;
1548                         goto send_response;
1549                 }
1550                 break;
1551         case BLKIF_OP_READ:
1552                 operation = BIO_READ;
1553                 reqlist->ds_trans_type = DEVSTAT_READ;
1554                 break;
1555         case BLKIF_OP_FLUSH_DISKCACHE:
1556                 /*
1557                  * If the user has requested that we disable flush
1558                  * support, just complete the request successfully
1559                  * without doing any work.
1560                  */
1561                 if (xbb->disable_flush != 0) {
1562                         goto send_response;
1563                 }
1564
1565                 /*
1566                  * The user has requested that we only send a real flush
1567                  * for every N flush requests.  So keep count, and either
1568                  * complete the request immediately or queue it for the
1569                  * backend.
1570                  */
1571                 if (xbb->flush_interval != 0) {
1572                         if (++(xbb->flush_count) < xbb->flush_interval) {
1573                                 goto send_response;
1574                         } else
1575                                 xbb->flush_count = 0;
1576                 }
1577
1578                 operation = BIO_FLUSH;
1579                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1580                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1581                 goto do_dispatch;
1582                 /*NOTREACHED*/
1583         default:
1584                 DPRINTF("error: unknown block io operation [%d]\n",
1585                         reqlist->operation);
1586                 reqlist->status = BLKIF_RSP_ERROR;
1587                 goto send_response;
1588         }
1589
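             /*
              * Walk each front-end request in this list, validating its S/G
              * entries and building a grant map entry for every page it
              * references.
              */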
1590         reqlist->xbb  = xbb;
1591         xbb_sg        = xbb->xbb_sgs;
1592         map           = xbb->maps;
1593         seg_idx       = 0;
1594
1595         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1596                 blkif_request_t         *ring_req;
1597                 RING_IDX                 req_ring_idx;
1598                 u_int                    req_seg_idx;
1599
1600                 ring_req              = nreq->ring_req;
1601                 req_ring_idx          = nreq->req_ring_idx;
1602                 nr_sects              = 0;
1603                 nseg                  = ring_req->nr_segments;
1604                 nreq->nr_pages        = nseg;
1605                 nreq->nr_512b_sectors = 0;
1606                 req_seg_idx           = 0;
1607                 sg                    = NULL;
1608
1609                 /* Check that number of segments is sane. */
1610                 if (__predict_false(nseg == 0)
1611                  || __predict_false(nseg > xbb->max_request_segments)) {
1612                         DPRINTF("Bad number of segments in request (%d)\n",
1613                                 nseg);
1614                         reqlist->status = BLKIF_RSP_ERROR;
1615                         goto send_response;
1616                 }
1617
1618                 block_segs    = MIN(nreq->nr_pages,
1619                                     BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1620                 sg            = ring_req->seg;
1621                 last_block_sg = sg + block_segs;
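                     /*
                      * Process this request's S/G entries, which may span the
                      * header block and any additional segment blocks on the
                      * ring.
                      */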
1622                 while (1) {
1623
1624                         while (sg < last_block_sg) {
1625                                 KASSERT(seg_idx <
1626                                         XBB_MAX_SEGMENTS_PER_REQLIST,
1627                                         ("seg_idx %d is too large, max "
1628                                         "segs %d\n", seg_idx,
1629                                         XBB_MAX_SEGMENTS_PER_REQLIST));
1630                         
1631                                 xbb_sg->first_sect = sg->first_sect;
1632                                 xbb_sg->last_sect  = sg->last_sect;
1633                                 xbb_sg->nsect =
1634                                     (int8_t)(sg->last_sect -
1635                                     sg->first_sect + 1);
1636
1637                                 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1638                                  || (xbb_sg->nsect <= 0)) {
1639                                         reqlist->status = BLKIF_RSP_ERROR;
1640                                         goto send_response;
1641                                 }
1642
1643                                 nr_sects += xbb_sg->nsect;
1644                                 map->host_addr = xbb_get_gntaddr(reqlist,
1645                                                         seg_idx, /*sector*/0);
1646                                 KASSERT(map->host_addr + PAGE_SIZE <=
1647                                         xbb->ring_config.gnt_addr,
1648                                         ("Host address %#jx len %d overlaps "
1649                                          "ring address %#jx\n",
1650                                         (uintmax_t)map->host_addr, PAGE_SIZE,
1651                                         (uintmax_t)xbb->ring_config.gnt_addr));
1652                                         
1653                                 map->flags     = GNTMAP_host_map;
1654                                 map->ref       = sg->gref;
1655                                 map->dom       = xbb->otherend_id;
1656                                 if (operation == BIO_WRITE)
1657                                         map->flags |= GNTMAP_readonly;
1658                                 sg++;
1659                                 map++;
1660                                 xbb_sg++;
1661                                 seg_idx++;
1662                                 req_seg_idx++;
1663                         }
1664
1665                         block_segs = MIN(nseg - req_seg_idx,
1666                                          BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1667                         if (block_segs == 0)
1668                                 break;
1669
1670                         /*
1671                          * Fetch the next request block full of SG elements.
1672                          * For now, only the spacing between entries is
1673                          * different in the different ABIs, not the sg entry
1674                          * layout.
1675                          */
1676                         req_ring_idx++;
1677                         switch (xbb->abi) {
1678                         case BLKIF_PROTOCOL_NATIVE:
1679                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
1680                                                            req_ring_idx);
1681                                 break;
1682                         case BLKIF_PROTOCOL_X86_32:
1683                         {
1684                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
1685                                                            req_ring_idx);
1686                                 break;
1687                         }
1688                         case BLKIF_PROTOCOL_X86_64:
1689                         {
1690                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
1691                                                            req_ring_idx);
1692                                 break;
1693                         }
1694                         default:
1695                                 panic("Unexpected blkif protocol ABI.");
1696                                 /* NOTREACHED */
1697                         } 
1698                         last_block_sg = sg + block_segs;
1699                 }
1700
1701                 /* Convert to the disk's sector size */
1702                 nreq->nr_512b_sectors = nr_sects;
1703                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1704                 total_sects += nr_sects;
1705
1706                 if ((nreq->nr_512b_sectors &
1707                     ((xbb->sector_size >> 9) - 1)) != 0) {
1708                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1709                                       "a multiple of the backing store sector "
1710                                       "size (%d)\n", __func__,
1711                                       nreq->nr_512b_sectors << 9,
1712                                       xbb->sector_size);
1713                         reqlist->status = BLKIF_RSP_ERROR;
1714                         goto send_response;
1715                 }
1716         }
1717
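             /*
              * Map all of the front-end pages for this request list into
              * our address space with a single batched grant table
              * hypercall.
              */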
1718         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1719                                           xbb->maps, reqlist->nr_segments);
1720         if (error != 0)
1721                 panic("Grant table operation failed (%d)", error);
1722
1723         reqlist->flags |= XBB_REQLIST_MAPPED;
1724
1725         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1726              seg_idx++, map++){
1727
1728                 if (__predict_false(map->status != 0)) {
1729                         DPRINTF("invalid buffer -- could not remap "
1730                                 "it (%d)\n", map->status);
1731                         DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1732                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1733                                 map->host_addr, map->flags, map->ref,
1734                                 map->dom);
1735                         reqlist->status = BLKIF_RSP_ERROR;
1736                         goto send_response;
1737                 }
1738
1739                 reqlist->gnt_handles[seg_idx] = map->handle;
1740         }
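             /*
              * Reject any I/O that extends past the end of the backing
              * store.
              */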
1741         if (reqlist->starting_sector_number + total_sects >
1742             xbb->media_num_sectors) {
1743
1744                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1745                         "extends past end of device %s\n",
1746                         operation == BIO_READ ? "read" : "write",
1747                         reqlist->starting_sector_number,
1748                         reqlist->starting_sector_number + total_sects,
1749                         xbb->dev_name); 
1750                 reqlist->status = BLKIF_RSP_ERROR;
1751                 goto send_response;
1752         }
1753
1754 do_dispatch:
1755
1756         error = xbb->dispatch_io(xbb,
1757                                  reqlist,
1758                                  operation,
1759                                  bio_flags);
1760
1761         if (error != 0) {
1762                 reqlist->status = BLKIF_RSP_ERROR;
1763                 goto send_response;
1764         }
1765
1766         return (0);
1767
1768 send_response:
1769
1770         xbb_complete_reqlist(xbb, reqlist);
1771
1772         return (0);
1773 }
1774
1775 static __inline int
1776 xbb_count_sects(blkif_request_t *ring_req)
1777 {
1778         int i;
1779         int cur_size = 0;
1780
1781         for (i = 0; i < ring_req->nr_segments; i++) {
1782                 int nsect;
1783
1784                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1785                         ring_req->seg[i].first_sect + 1);
1786                 if (nsect <= 0)
1787                         break;
1788
1789                 cur_size += nsect;
1790         }
1791
1792         return (cur_size);
1793 }
1794
1795 /**
1796  * Process incoming requests from the shared communication ring in response
1797  * to a signal on the ring's event channel.
1798  *
1799  * \param context  Callback argument registered during task initialization -
1800  *                 the xbb_softc for this instance.
1801  * \param pending  The number of taskqueue_enqueue events that have
1802  *                 occurred since this handler was last run.
1803  */
1804 static void
1805 xbb_run_queue(void *context, int pending)
1806 {
1807         struct xbb_softc       *xbb;
1808         blkif_back_rings_t     *rings;
1809         RING_IDX                rp;
1810         uint64_t                cur_sector;
1811         int                     cur_operation;
1812         struct xbb_xen_reqlist *reqlist;
1813
1814
1815         xbb   = (struct xbb_softc *)context;
1816         rings = &xbb->rings;
1817
1818         /*
1819          * Work gather and dispatch loop.  Note that we have a bias here
1820          * towards gathering I/O sent by blockfront.  We first gather up
1821          * everything in the ring, as long as we have resources.  Then we
1822          * dispatch one request, and then attempt to gather up any
1823          * additional requests that have come in while we were dispatching
1824          * the request.
1825          *
1826          * This allows us to get a clearer picture (via devstat) of how
1827          * many requests blockfront is queueing to us at any given time.
1828          */
1829         for (;;) {
1830                 int retval;
1831
1832                 /*
1833                  * Initialize reqlist to the last element in the pending
1834                  * queue, if there is one.  This allows us to add more
1835                  * requests to that request list, if we have room.
1836                  */
1837                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1838                                       xbb_xen_reqlist, links);
1839                 if (reqlist != NULL) {
1840                         cur_sector = reqlist->next_contig_sector;
1841                         cur_operation = reqlist->operation;
1842                 } else {
1843                         cur_operation = 0;
1844                         cur_sector    = 0;
1845                 }
1846
1847                 /*
1848                  * Cache req_prod to avoid accessing a cache line shared
1849                  * with the frontend.
1850                  */
1851                 rp = rings->common.sring->req_prod;
1852
1853                 /* Ensure we see queued requests up to 'rp'. */
1854                 rmb();
1855
1856                 /**
1857                  * Run so long as there is work to consume and the generation
1858                  * of a response will not overflow the ring.
1859                  *
1860                  * @note There's a 1 to 1 relationship between requests and
1861                  *       responses, so an overflow should never occur.  This
1862                  *       test is to protect our domain from digesting bogus
1863                  *       data.  Shouldn't we log this?
1864                  */
1865                 while (rings->common.req_cons != rp
1866                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1867                                                   rings->common.req_cons) == 0){
1868                         blkif_request_t         ring_req_storage;
1869                         blkif_request_t        *ring_req;
1870                         int                     cur_size;
1871
1872                         switch (xbb->abi) {
1873                         case BLKIF_PROTOCOL_NATIVE:
1874                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1875                                     rings->common.req_cons);
1876                                 break;
1877                         case BLKIF_PROTOCOL_X86_32:
1878                         {
1879                                 struct blkif_x86_32_request *ring_req32;
1880
1881                                 ring_req32 = RING_GET_REQUEST(
1882                                     &xbb->rings.x86_32, rings->common.req_cons);
1883                                 blkif_get_x86_32_req(&ring_req_storage,
1884                                                      ring_req32);
1885                                 ring_req = &ring_req_storage;
1886                                 break;
1887                         }
1888                         case BLKIF_PROTOCOL_X86_64:
1889                         {
1890                                 struct blkif_x86_64_request *ring_req64;
1891
1892                                 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1893                                     rings->common.req_cons);
1894                                 blkif_get_x86_64_req(&ring_req_storage,
1895                                                      ring_req64);
1896                                 ring_req = &ring_req_storage;
1897                                 break;
1898                         }
1899                         default:
1900                                 panic("Unexpected blkif protocol ABI.");
1901                                 /* NOTREACHED */
1902                         } 
1903
1904                         /*
1905                          * Check for situations that would require closing
1906                          * off this I/O for further coalescing:
1907                          *  - Coalescing is turned off.
1908                          *  - Current I/O is out of sequence with the previous
1909                          *    I/O.
1910                          *  - Coalesced I/O would be too large.
1911                          */
1912                         if ((reqlist != NULL)
1913                          && ((xbb->no_coalesce_reqs != 0)
1914                           || ((xbb->no_coalesce_reqs == 0)
1915                            && ((ring_req->sector_number != cur_sector)
1916                             || (ring_req->operation != cur_operation)
1917                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1918                                  xbb->max_reqlist_segments))))) {
1919                                 reqlist = NULL;
1920                         }
1921
1922                         /*
1923                          * Grab and check for all resources in one shot.
1924                          * If we can't get all of the resources we need,
1925                          * the shortage is noted and the thread will get
1926                          * woken up when more resources are available.
1927                          */
1928                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1929                                                    xbb->rings.common.req_cons);
1930
1931                         if (retval != 0) {
1932                                 /*
1933                                  * Resource shortage has been recorded.
1934                                  * We'll be scheduled to run once a request
1935                                  * object frees up due to a completion.
1936                                  */
1937                                 break;
1938                         }
1939
1940                         /*
1941                          * Signify that we can overwrite this request with
1942                          * a response by incrementing our consumer index.
1943                          * The response won't be generated until after
1944                          * we've already consumed all necessary data out
1945                          * of the version of the request in the ring buffer
1946                          * (for native mode).  We must update the consumer
1947                          * index before issuing back-end I/O so there is
1948                          * no possibility that it will complete and a
1949                          * response be generated before we make room in 
1950                          * the queue for that response.
1951                          */
1952                         xbb->rings.common.req_cons +=
1953                             BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1954                         xbb->reqs_received++;
1955
1956                         cur_size = xbb_count_sects(ring_req);
1957                         cur_sector = ring_req->sector_number + cur_size;
1958                         reqlist->next_contig_sector = cur_sector;
1959                         cur_operation = ring_req->operation;
1960                 }
1961
1962                 /* Check for I/O to dispatch */
1963                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1964                 if (reqlist == NULL) {
1965                         /*
1966                          * We're out of work to do, put the task queue to
1967                          * sleep.
1968                          */
1969                         break;
1970                 }
1971
1972                 /*
1973                  * Grab the first request off the queue and attempt
1974                  * to dispatch it.
1975                  */
1976                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1977
1978                 retval = xbb_dispatch_io(xbb, reqlist);
1979                 if (retval != 0) {
1980                         /*
1981                          * xbb_dispatch_io() returns non-zero only when
1982                          * there is a resource shortage.  If that's the
1983                          * case, re-queue this request on the head of the
1984                          * queue, and go to sleep until we have more
1985                          * resources.
1986                          */
1987                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1988                                            reqlist, links);
1989                         break;
1990                 } else {
1991                         /*
1992                          * If we still have anything on the queue after
1993                          * removing the head entry, that is because we
1994                          * met one of the criteria to create a new
1995                          * request list (outlined above), and we'll call
1996                          * that a forced dispatch for statistical purposes.
1997                          *
1998                          * Otherwise, if there is only one element on the
1999                          * queue, we coalesced everything available on
2000                          * the ring and we'll call that a normal dispatch.
2001                          */
2002                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2003
2004                         if (reqlist != NULL)
2005                                 xbb->forced_dispatch++;
2006                         else
2007                                 xbb->normal_dispatch++;
2008
2009                         xbb->total_dispatch++;
2010                 }
2011         }
2012 }
2013
2014 /**
2015  * Interrupt handler bound to the shared ring's event channel.
2016  *
2017  * \param arg  Callback argument registered during event channel
2018  *             binding - the xbb_softc for this instance.
2019  */
2020 static int
2021 xbb_filter(void *arg)
2022 {
2023         struct xbb_softc *xbb;
2024
2025         /* Defer to taskqueue thread. */
2026         xbb = (struct xbb_softc *)arg;
2027         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
2028
2029         return (FILTER_HANDLED);
2030 }
2031
2032 SDT_PROVIDER_DEFINE(xbb);
2033 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
2034 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
2035                   "uint64_t");
2036 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
2037                   "uint64_t", "uint64_t");
2038
2039 /*----------------------------- Backend Handlers -----------------------------*/
2040 /**
2041  * Backend handler for character device access.
2042  *
2043  * \param xbb        Per-instance xbb configuration structure.
2044  * \param reqlist    Allocated internal request list structure.
2045  * \param operation  BIO_* I/O operation code.
2046  * \param bio_flags  Additional bio_flag data to pass to any generated
2047  *                   bios (e.g. BIO_ORDERED).
2048  *
2049  * \return  0 for success, errno codes for failure.
2050  */
2051 static int
2052 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2053                  int operation, int bio_flags)
2054 {
2055         struct xbb_dev_data *dev_data;
2056         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2057         off_t                bio_offset;
2058         struct bio          *bio;
2059         struct xbb_sg       *xbb_sg;
2060         u_int                nbio;
2061         u_int                bio_idx;
2062         u_int                nseg;
2063         u_int                seg_idx;
2064         int                  error;
2065
2066         dev_data   = &xbb->backend.dev;
2067         bio_offset = (off_t)reqlist->starting_sector_number
2068                    << xbb->sector_size_shift;
2069         error      = 0;
2070         nbio       = 0;
2071         bio_idx    = 0;
2072
2073         if (operation == BIO_FLUSH) {
2074                 bio = g_new_bio();
2075                 if (__predict_false(bio == NULL)) {
2076                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2077                         error = ENOMEM;
2078                         return (error);
2079                 }
2080
2081                 bio->bio_cmd     = BIO_FLUSH;
2082                 bio->bio_flags  |= BIO_ORDERED;
2083                 bio->bio_dev     = dev_data->cdev;
2084                 bio->bio_offset  = 0;
2085                 bio->bio_data    = 0;
2086                 bio->bio_done    = xbb_bio_done;
2087                 bio->bio_caller1 = reqlist;
2088                 bio->bio_pblkno  = 0;
2089
2090                 reqlist->pendcnt = 1;
2091
2092                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2093                            device_get_unit(xbb->dev));
2094
2095                 (*dev_data->csw->d_strategy)(bio);
2096
2097                 return (0);
2098         }
2099
2100         xbb_sg = xbb->xbb_sgs;
2101         bio    = NULL;
2102         nseg = reqlist->nr_segments;
2103
2104         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2105
2106                 /*
2107                  * KVA will not be contiguous, so any additional
2108                  * I/O will need to be represented in a new bio.
2109                  */
2110                 if ((bio != NULL)
2111                  && (xbb_sg->first_sect != 0)) {
2112                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2113                                 printf("%s: Discontiguous I/O request "
2114                                        "from domain %d ends on "
2115                                        "non-sector boundary\n",
2116                                        __func__, xbb->otherend_id);
2117                                 error = EINVAL;
2118                                 goto fail_free_bios;
2119                         }
2120                         bio = NULL;
2121                 }
2122
2123                 if (bio == NULL) {
2124                         /*
2125                          * Make sure that the start of this bio is
2126                          * aligned to a device sector.
2127                          */
2128                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2129                                 printf("%s: Misaligned I/O request "
2130                                        "from domain %d\n", __func__,
2131                                        xbb->otherend_id);
2132                                 error = EINVAL;
2133                                 goto fail_free_bios;
2134                         }
2135
2136                         bio = bios[nbio++] = g_new_bio();
2137                         if (__predict_false(bio == NULL)) {
2138                                 error = ENOMEM;
2139                                 goto fail_free_bios;
2140                         }
2141                         bio->bio_cmd     = operation;
2142                         bio->bio_flags  |= bio_flags;
2143                         bio->bio_dev     = dev_data->cdev;
2144                         bio->bio_offset  = bio_offset;
2145                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2146                                                 xbb_sg->first_sect);
2147                         bio->bio_done    = xbb_bio_done;
2148                         bio->bio_caller1 = reqlist;
2149                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2150                 }
2151
2152                 bio->bio_length += xbb_sg->nsect << 9;
2153                 bio->bio_bcount  = bio->bio_length;
2154                 bio_offset      += xbb_sg->nsect << 9;
2155
2156                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2157
2158                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2159                                 printf("%s: Discontiguous I/O request "
2160                                        "from domain %d ends on "
2161                                        "non-sector boundary\n",
2162                                        __func__, xbb->otherend_id);
2163                                 error = EINVAL;
2164                                 goto fail_free_bios;
2165                         }
2166                         /*
2167                          * KVA will not be contiguous, so any additional
2168                          * I/O will need to be represented in a new bio.
2169                          */
2170                         bio = NULL;
2171                 }
2172         }
2173
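             /*
              * Record the number of outstanding bios before dispatching
              * them; completion of the last bio triggers the response to
              * the front-end.
              */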
2174         reqlist->pendcnt = nbio;
2175
2176         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2177         {
2178 #ifdef XBB_USE_BOUNCE_BUFFERS
2179                 vm_offset_t kva_offset;
2180
2181                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2182                            - (vm_offset_t)reqlist->bounce;
2183                 if (operation == BIO_WRITE) {
2184                         memcpy(bios[bio_idx]->bio_data,
2185                                (uint8_t *)reqlist->kva + kva_offset,
2186                                bios[bio_idx]->bio_bcount);
2187                 }
2188 #endif
2189                 if (operation == BIO_READ) {
2190                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2191                                    device_get_unit(xbb->dev),
2192                                    bios[bio_idx]->bio_offset,
2193                                    bios[bio_idx]->bio_length);
2194                 } else if (operation == BIO_WRITE) {
2195                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2196                                    device_get_unit(xbb->dev),
2197                                    bios[bio_idx]->bio_offset,
2198                                    bios[bio_idx]->bio_length);
2199                 }
2200                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2201         }
2202
2203         return (error);
2204
2205 fail_free_bios:
2206         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2207                 if (bios[bio_idx] != NULL)
2208                         g_destroy_bio(bios[bio_idx]);
2209         return (error);
2210 }
2211
2212 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
2213 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
2214                   "uint64_t");
2215 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
2216                   "uint64_t", "uint64_t");
2217
2218 /**
2219  * Backend handler for file access.
2220  *
2221  * \param xbb        Per-instance xbb configuration structure.
2222  * \param reqlist    Allocated internal request list.
2223  * \param operation  BIO_* I/O operation code.
2224  * \param flags      Additional bio_flag data to pass to any generated bios
2225  *                   (e.g. BIO_ORDERED).
2226  *
2227  * \return  0 for success, errno codes for failure.
2228  */
2229 static int
2230 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2231                   int operation, int flags)
2232 {
2233         struct xbb_file_data *file_data;
2234         u_int                 seg_idx;
2235         u_int                 nseg;
2236         off_t                 sectors_sent;
2237         struct uio            xuio;
2238         struct xbb_sg        *xbb_sg;
2239         struct iovec         *xiovec;
2240 #ifdef XBB_USE_BOUNCE_BUFFERS
2241         void                **p_vaddr;
2242         int                   saved_uio_iovcnt;
2243 #endif /* XBB_USE_BOUNCE_BUFFERS */
2244         int                   error;
2245
2246         file_data = &xbb->backend.file;
2247         sectors_sent = 0;
2248         error = 0;
2249         bzero(&xuio, sizeof(xuio));
2250
2251         switch (operation) {
2252         case BIO_READ:
2253                 xuio.uio_rw = UIO_READ;
2254                 break;
2255         case BIO_WRITE:
2256                 xuio.uio_rw = UIO_WRITE;
2257                 break;
2258         case BIO_FLUSH: {
2259                 struct mount *mountpoint;
2260
2261                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2262                            device_get_unit(xbb->dev));
2263
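                     /*
                      * Sync the backing vnode to stable storage.  The
                      * vn_start_write()/vn_finished_write() pair keeps us
                      * coordinated with any filesystem suspension.
                      */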
2264                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2265
2266                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2267                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2268                 VOP_UNLOCK(xbb->vn, 0);
2269
2270                 vn_finished_write(mountpoint);
2271
2272                 goto bailout_send_response;
2273                 /* NOTREACHED */
2274         }
2275         default:
2276                 panic("invalid operation %d", operation);
2277                 /* NOTREACHED */
2278         }
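             /*
              * Build a kernel uio describing the mapped segments, starting
              * a new iovec whenever the KVA becomes discontiguous.
              */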
2279         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2280                         << xbb->sector_size_shift;
2281         xuio.uio_segflg = UIO_SYSSPACE;
2282         xuio.uio_iov = file_data->xiovecs;
2283         xuio.uio_iovcnt = 0;
2284         xbb_sg = xbb->xbb_sgs;
2285         nseg = reqlist->nr_segments;
2286
2287         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2288
2289                 /*
2290                  * If the first sector is not 0, the KVA will
2291                  * not be contiguous and we'll need to go on
2292                  * to another segment.
2293                  */
2294                 if (xbb_sg->first_sect != 0)
2295                         xiovec = NULL;
2296
2297                 if (xiovec == NULL) {
2298                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2299                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2300                             seg_idx, xbb_sg->first_sect);
2301 #ifdef XBB_USE_BOUNCE_BUFFERS
2302                         /*
2303                          * Store the address of the incoming
2304                          * buffer at this particular offset
2305                          * as well, so we can do the copy
2306                          * later without having to do more
2307                          * work to recalculate this address.
2308                          */
2309                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2310                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2311                             xbb_sg->first_sect);
2312 #endif /* XBB_USE_BOUNCE_BUFFERS */
2313                         xiovec->iov_len = 0;
2314                         xuio.uio_iovcnt++;
2315                 }
2316
2317                 xiovec->iov_len += xbb_sg->nsect << 9;
2318
2319                 xuio.uio_resid += xbb_sg->nsect << 9;
2320
2321                 /*
2322                  * If the last sector is not the full page
2323                  * size count, the next segment will not be
2324                  * contiguous in KVA and we need a new iovec.
2325                  */
2326                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2327                         xiovec = NULL;
2328         }
2329
2330         xuio.uio_td = curthread;
2331
2332 #ifdef XBB_USE_BOUNCE_BUFFERS
2333         saved_uio_iovcnt = xuio.uio_iovcnt;
2334
2335         if (operation == BIO_WRITE) {
2336                 /* Copy the write data to the local buffer. */
2337                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2338                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2339                      seg_idx++, xiovec++, p_vaddr++) {
2340
2341                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2342                 }
2343         } else {
2344                 /*
2345                  * We only need to save off the iovecs in the case of a
2346                  * read, because the copy for the read happens after the
2347                  * VOP_READ().  (The uio will get modified in that call
2348                  * sequence.)
2349                  */
2350                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2351                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2352         }
2353 #endif /* XBB_USE_BOUNCE_BUFFERS */
2354
2355         switch (operation) {
2356         case BIO_READ:
2357
2358                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2359                            device_get_unit(xbb->dev), xuio.uio_offset,
2360                            xuio.uio_resid);
2361
2362                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2363
2364                 /*
2365                  * UFS pays attention to IO_DIRECT for reads.  If the
2366                  * DIRECTIO option is configured into the kernel, it calls
2367                  * ffs_rawread().  But that only works for single-segment
2368                  * uios with user space addresses.  In our case, with a
2369                  * kernel uio, it still reads into the buffer cache, but it
2370                  * will just try to release the buffer from the cache later
2371                  * on in ffs_read().
2372                  *
2373                  * ZFS does not pay attention to IO_DIRECT for reads.
2374                  *
2375                  * UFS does not pay attention to IO_SYNC for reads.
2376                  *
2377                  * ZFS pays attention to IO_SYNC (which translates into the
2378                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2379                  * attempts to sync the file before reading.
2380                  *
2381                  * So, to attempt to provide some barrier semantics in the
2382                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2383                  */
2384                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2385                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2386
2387                 VOP_UNLOCK(xbb->vn, 0);
2388                 break;
2389         case BIO_WRITE: {
2390                 struct mount *mountpoint;
2391
2392                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2393                            device_get_unit(xbb->dev), xuio.uio_offset,
2394                            xuio.uio_resid);
2395
2396                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2397
2398                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2399
2400                 /*
2401                  * UFS pays attention to IO_DIRECT for writes.  The write
2402                  * is done asynchronously.  (Normally the write would just
2403                  * get put into the cache.)
2404                  *
2405                  * UFS pays attention to IO_SYNC for writes.  It will
2406                  * attempt to write the buffer out synchronously if that
2407                  * flag is set.
2408                  *
2409                  * ZFS does not pay attention to IO_DIRECT for writes.
2410                  *
2411                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2412                  * for writes.  It will flush the transaction from the
2413                  * cache before returning.
2414                  *
2415                  * So if we've got the BIO_ORDERED flag set, we want
2416                  * IO_SYNC in either the UFS or ZFS case.
2417                  */
2418                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2419                                   IO_SYNC : 0, file_data->cred);
2420                 VOP_UNLOCK(xbb->vn, 0);
2421
2422                 vn_finished_write(mountpoint);
2423
2424                 break;
2425         }
2426         default:
2427                 panic("invalid operation %d", operation);
2428                 /* NOTREACHED */
2429         }
2430
2431 #ifdef XBB_USE_BOUNCE_BUFFERS
2432         /* We only need to copy here for read operations */
2433         if (operation == BIO_READ) {
2434
2435                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2436                      xiovec = file_data->saved_xiovecs;
2437                      seg_idx < saved_uio_iovcnt; seg_idx++,
2438                      xiovec++, p_vaddr++) {
2439
2440                         /*
2441                          * Note that we have to use the copy of the 
2442                          * io vector we made above.  uiomove() modifies
2443                          * the uio and its referenced vector as uiomove
2444                          * performs the copy, so we can't rely on any
2445                          * state from the original uio.
2446                          */
2447                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2448                 }
2449         }
2450 #endif /* XBB_USE_BOUNCE_BUFFERS */
2451
2452 bailout_send_response:
2453
2454         if (error != 0)
2455                 reqlist->status = BLKIF_RSP_ERROR;
2456
2457         xbb_complete_reqlist(xbb, reqlist);
2458
2459         return (0);
2460 }
2461
2462 /*--------------------------- Backend Configuration --------------------------*/
2463 /**
2464  * Close and cleanup any backend device/file specific state for this
2465  * block back instance. 
2466  *
2467  * \param xbb  Per-instance xbb configuration structure.
2468  */
2469 static void
2470 xbb_close_backend(struct xbb_softc *xbb)
2471 {
2472         DROP_GIANT();
2473         DPRINTF("closing dev=%s\n", xbb->dev_name);
2474         if (xbb->vn) {
2475                 int flags = FREAD;
2476
2477                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2478                         flags |= FWRITE;
2479
2480                 switch (xbb->device_type) {
2481                 case XBB_TYPE_DISK:
2482                         if (xbb->backend.dev.csw) {
2483                                 dev_relthread(xbb->backend.dev.cdev,
2484                                               xbb->backend.dev.dev_ref);
2485                                 xbb->backend.dev.csw  = NULL;
2486                                 xbb->backend.dev.cdev = NULL;
2487                         }
2488                         break;
2489                 case XBB_TYPE_FILE:
2490                         break;
2491                 case XBB_TYPE_NONE:
2492                 default:
2493                         panic("Unexpected backend type.");
2494                         break;
2495                 }
2496
2497                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2498                 xbb->vn = NULL;
2499
2500                 switch (xbb->device_type) {
2501                 case XBB_TYPE_DISK:
2502                         break;
2503                 case XBB_TYPE_FILE:
2504                         if (xbb->backend.file.cred != NULL) {
2505                                 crfree(xbb->backend.file.cred);
2506                                 xbb->backend.file.cred = NULL;
2507                         }
2508                         break;
2509                 case XBB_TYPE_NONE:
2510                 default:
2511                         panic("Unexpected backend type.");
2512                         break;
2513                 }
2514         }
2515         PICKUP_GIANT();
2516 }
2517
2518 /**
2519  * Open a character device to be used for backend I/O.
2520  *
2521  * \param xbb  Per-instance xbb configuration structure.
2522  *
2523  * \return  0 for success, errno codes for failure.
2524  */
2525 static int
2526 xbb_open_dev(struct xbb_softc *xbb)
2527 {
2528         struct vattr   vattr;
2529         struct cdev   *dev;
2530         struct cdevsw *devsw;
2531         int            error;
2532
2533         xbb->device_type = XBB_TYPE_DISK;
2534         xbb->dispatch_io = xbb_dispatch_dev;
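             /*
              * Hold a thread reference on the backing cdev so that its
              * cdevsw cannot be torn down while we are dispatching I/O
              * to it.
              */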
2535         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2536         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2537                                              &xbb->backend.dev.dev_ref);
2538         if (xbb->backend.dev.csw == NULL)
2539                 panic("Unable to retrieve device switch");
2540
2541         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2542         if (error) {
2543                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2544                                  "vnode attributes for device %s",
2545                                  xbb->dev_name);
2546                 return (error);
2547         }
2548
2549
2550         dev = xbb->vn->v_rdev;
2551         devsw = dev->si_devsw;
2552         if (!devsw->d_ioctl) {
2553                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2554                                  "device %s!", xbb->dev_name);
2555                 return (ENODEV);
2556         }
2557
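             /*
              * Query the device's sector size and total media size through
              * its ioctl interface.
              */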
2558         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2559                                (caddr_t)&xbb->sector_size, FREAD,
2560                                curthread);
2561         if (error) {
2562                 xenbus_dev_fatal(xbb->dev, error,
2563                                  "error calling ioctl DIOCGSECTORSIZE "
2564                                  "for device %s", xbb->dev_name);
2565                 return (error);
2566         }
2567
2568         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2569                                (caddr_t)&xbb->media_size, FREAD,
2570                                curthread);
2571         if (error) {
2572                 xenbus_dev_fatal(xbb->dev, error,
2573                                  "error calling ioctl DIOCGMEDIASIZE "
2574                                  "for device %s", xbb->dev_name);
2575                 return (error);
2576         }
2577
2578         return (0);
2579 }
2580
2581 /**
2582  * Open a file to be used for backend I/O.
2583  *
2584  * \param xbb  Per-instance xbb configuration structure.
2585  *
2586  * \return  0 for success, errno codes for failure.
2587  */
2588 static int
2589 xbb_open_file(struct xbb_softc *xbb)
2590 {
2591         struct xbb_file_data *file_data;
2592         struct vattr          vattr;
2593         int                   error;
2594
2595         file_data = &xbb->backend.file;
2596         xbb->device_type = XBB_TYPE_FILE;
2597         xbb->dispatch_io = xbb_dispatch_file;
2598         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2599         if (error != 0) {
2600                 xenbus_dev_fatal(xbb->dev, error,
2601                                  "error calling VOP_GETATTR()"
2602                                  "for file %s", xbb->dev_name);
2603                 return (error);
2604         }
2605
2606         /*
2607          * Verify that we have the ability to upgrade to exclusive
2608          * access on this file so we can trap errors at open instead
2609          * of reporting them during first access.
2610          */
2611         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2612                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2613                 if (xbb->vn->v_iflag & VI_DOOMED) {
2614                         error = EBADF;
2615                         xenbus_dev_fatal(xbb->dev, error,
2616                                          "error locking file %s",
2617                                          xbb->dev_name);
2618
2619                         return (error);
2620                 }
2621         }
2622
2623         file_data->cred = crhold(curthread->td_ucred);
2624         xbb->media_size = vattr.va_size;
2625
2626         /*
2627          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2628          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2629          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2630          * may not work with other OSes as well.  So just export a sector
2631          * size of 512 bytes, which should work with any OS or
2632          * application.  Since our backing is a file, any block size will
2633          * work fine for the backing store.
2634          */
2635 #if 0
2636         xbb->sector_size = vattr.va_blocksize;
2637 #endif
2638         xbb->sector_size = 512;
2639
2640         /*
2641          * Sanity check.  The media size has to be at least one
2642          * sector long.
2643          */
2644         if (xbb->media_size < xbb->sector_size) {
2645                 error = EINVAL;
2646                 xenbus_dev_fatal(xbb->dev, error,
2647                                  "file %s size %ju < block size %u",
2648                                  xbb->dev_name,
2649                                  (uintmax_t)xbb->media_size,
2650                                  xbb->sector_size);
2651         }
2652         return (error);
2653 }
2654
2655 /**
2656  * Open the backend provider for this connection.
2657  *
2658  * \param xbb  Per-instance xbb configuration structure.
2659  *
2660  * \return  0 for success, errno codes for failure.
2661  */
2662 static int
2663 xbb_open_backend(struct xbb_softc *xbb)
2664 {
2665         struct nameidata nd;
2666         int              flags;
2667         int              error;
2668
2669         flags = FREAD;
2670         error = 0;
2671
2672         DPRINTF("opening dev=%s\n", xbb->dev_name);
2673
2674         if (rootvnode == NULL) {
2675                 xenbus_dev_fatal(xbb->dev, ENOENT,
2676                                  "Root file system not mounted");
2677                 return (ENOENT);
2678         }
2679
2680         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2681                 flags |= FWRITE;
2682
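             /*
              * This thread may not yet have current, root, or jail
              * directories; point any missing ones at the root vnode so
              * the path lookup below can resolve.
              */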
2683         if (!curthread->td_proc->p_fd->fd_cdir) {
2684                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2685                 VREF(rootvnode);
2686         }
2687         if (!curthread->td_proc->p_fd->fd_rdir) {
2688                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2689                 VREF(rootvnode);
2690         }
2691         if (!curthread->td_proc->p_fd->fd_jdir) {
2692                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2693                 VREF(rootvnode);
2694         }
2695
2696  again:
2697         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2698         error = vn_open(&nd, &flags, 0, NULL);
2699         if (error) {
2700                 /*
2701                  * This is the only reasonable guess we can make for the path
2702                  * if the user doesn't give us a fully qualified one.
2703                  * If they want to specify a file, they need to specify the
2704                  * full path.
2705                  */
2706                 if (xbb->dev_name[0] != '/') {
2707                         char *dev_path = "/dev/";
2708                         char *dev_name;
2709
2710                         /* Try adding device path at beginning of name */
2711                         dev_name = malloc(strlen(xbb->dev_name)
2712                                         + strlen(dev_path) + 1,
2713                                           M_XENBLOCKBACK, M_NOWAIT);
2714                         if (dev_name) {
2715                                 sprintf(dev_name, "%s%s", dev_path,
2716                                         xbb->dev_name);
2717                                 free(xbb->dev_name, M_XENBLOCKBACK);
2718                                 xbb->dev_name = dev_name;
2719                                 goto again;
2720                         }
2721                 }
2722                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2723                                  xbb->dev_name);
2724                 return (error);
2725         }
2726
2727         NDFREE(&nd, NDF_ONLY_PNBUF);
2728                 
2729         xbb->vn = nd.ni_vp;
2730
2731         /* We only support disks and files. */
2732         if (vn_isdisk(xbb->vn, &error)) {
2733                 error = xbb_open_dev(xbb);
2734         } else if (xbb->vn->v_type == VREG) {
2735                 error = xbb_open_file(xbb);
2736         } else {
2737                 error = EINVAL;
2738                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2739                                  "or file", xbb->dev_name);
2740         }
2741         VOP_UNLOCK(xbb->vn, 0);
2742
2743         if (error != 0) {
2744                 xbb_close_backend(xbb);
2745                 return (error);
2746         }
2747
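             /*
              * Derive the sector size shift (log2 of the power-of-two
              * sector size) and the media size in sectors for later block
              * arithmetic.
              */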
2748         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2749         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2750
2751         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2752                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2753                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2754
2755         return (0);
2756 }
2757
2758 /*------------------------ Inter-Domain Communication ------------------------*/
2759 /**
2760  * Free dynamically allocated KVA or pseudo-physical address allocations.
2761  *
2762  * \param xbb  Per-instance xbb configuration structure.
2763  */
2764 static void
2765 xbb_free_communication_mem(struct xbb_softc *xbb)
2766 {
2767         if (xbb->kva != 0) {
2768 #ifndef XENHVM
2769                 kva_free(xbb->kva, xbb->kva_size);
2770 #else
2771                 if (xbb->pseudo_phys_res != NULL) {
2772                         bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2773                                              xbb->pseudo_phys_res_id,
2774                                              xbb->pseudo_phys_res);
2775                         xbb->pseudo_phys_res = NULL;
2776                 }
2777 #endif
2778         }
2779         xbb->kva = 0;
2780         xbb->gnt_base_addr = 0;
2781         if (xbb->kva_free != NULL) {
2782                 free(xbb->kva_free, M_XENBLOCKBACK);
2783                 xbb->kva_free = NULL;
2784         }
2785 }
2786
2787 /**
2788  * Cleanup all inter-domain communication mechanisms.
2789  *
2790  * \param xbb  Per-instance xbb configuration structure.
2791  */
2792 static int
2793 xbb_disconnect(struct xbb_softc *xbb)
2794 {
2795         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2796         struct gnttab_unmap_grant_ref *op;
2797         u_int                          ring_idx;
2798         int                            error;
2799
2800         DPRINTF("\n");
2801
2802         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2803                 return (0);
2804
2805         xen_intr_unbind(&xbb->xen_intr_handle);
2806
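        /*
         * Drop our lock while draining the taskqueue; the queued I/O
         * task also acquires xbb->lock, so draining it while the lock
         * is held could deadlock.
         */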
2807         mtx_unlock(&xbb->lock);
2808         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2809         mtx_lock(&xbb->lock);
2810
2811         /*
2812          * No new interrupts can generate work, but we must wait
2813          * for all currently active requests to drain.
2814          */
2815         if (xbb->active_request_count != 0)
2816                 return (EAGAIN);
2817         
2818         for (ring_idx = 0, op = ops;
2819              ring_idx < xbb->ring_config.ring_pages;
2820              ring_idx++, op++) {
2821
2822                 op->host_addr    = xbb->ring_config.gnt_addr
2823                                  + (ring_idx * PAGE_SIZE);
2824                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2825                 op->handle       = xbb->ring_config.handle[ring_idx];
2826         }
2827
2828         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2829                                           xbb->ring_config.ring_pages);
2830         if (error != 0)
2831                 panic("Grant table op failed (%d)", error);
2832
2833         xbb_free_communication_mem(xbb);
2834
2835         if (xbb->requests != NULL) {
2836                 free(xbb->requests, M_XENBLOCKBACK);
2837                 xbb->requests = NULL;
2838         }
2839
2840         if (xbb->request_lists != NULL) {
2841                 struct xbb_xen_reqlist *reqlist;
2842                 int i;
2843
2844                         /* There is one request list for every allocated request. */
2845                 for (i = 0, reqlist = xbb->request_lists;
2846                      i < xbb->max_requests; i++, reqlist++){
2847 #ifdef XBB_USE_BOUNCE_BUFFERS
2848                         if (reqlist->bounce != NULL) {
2849                                 free(reqlist->bounce, M_XENBLOCKBACK);
2850                                 reqlist->bounce = NULL;
2851                         }
2852 #endif
2853                         if (reqlist->gnt_handles != NULL) {
2854                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2855                                 reqlist->gnt_handles = NULL;
2856                         }
2857                 }
2858                 free(xbb->request_lists, M_XENBLOCKBACK);
2859                 xbb->request_lists = NULL;
2860         }
2861
2862         xbb->flags &= ~XBBF_RING_CONNECTED;
2863         return (0);
2864 }
2865
2866 /**
2867  * Map shared memory ring into domain local address space, initialize
2868  * ring control structures, and bind an interrupt to the event channel
2869  * used to notify us of ring changes.
2870  *
2871  * \param xbb  Per-instance xbb configuration structure.
2872  */
2873 static int
2874 xbb_connect_ring(struct xbb_softc *xbb)
2875 {
2876         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2877         struct gnttab_map_grant_ref *gnt;
2878         u_int                        ring_idx;
2879         int                          error;
2880
2881         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2882                 return (0);
2883
2884         /*
2885          * KVA for our ring is at the tail of the region of KVA allocated
2886          * by xbb_alloc_communication_mem().
2887          */
2888         xbb->ring_config.va = xbb->kva
2889                             + (xbb->kva_size
2890                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2891         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2892                                   + (xbb->kva_size
2893                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2894
2895         for (ring_idx = 0, gnt = gnts;
2896              ring_idx < xbb->ring_config.ring_pages;
2897              ring_idx++, gnt++) {
2898
2899                 gnt->host_addr = xbb->ring_config.gnt_addr
2900                                + (ring_idx * PAGE_SIZE);
2901                 gnt->flags     = GNTMAP_host_map;
2902                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2903                 gnt->dom       = xbb->otherend_id;
2904         }
2905
2906         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2907                                           xbb->ring_config.ring_pages);
2908         if (error)
2909                 panic("blkback: Ring page grant table op failed (%d)", error);
2910
2911         for (ring_idx = 0, gnt = gnts;
2912              ring_idx < xbb->ring_config.ring_pages;
2913              ring_idx++, gnt++) {
2914                 if (gnt->status != 0) {
2915                         xbb->ring_config.va = 0;
2916                         xenbus_dev_fatal(xbb->dev, EACCES,
2917                                          "Ring shared page mapping failed. "
2918                                          "Status %d.", gnt->status);
2919                         return (EACCES);
2920                 }
2921                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2922                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2923         }
2924
2925         /* Initialize the ring based on ABI. */
2926         switch (xbb->abi) {
2927         case BLKIF_PROTOCOL_NATIVE:
2928         {
2929                 blkif_sring_t *sring;
2930                 sring = (blkif_sring_t *)xbb->ring_config.va;
2931                 BACK_RING_INIT(&xbb->rings.native, sring,
2932                                xbb->ring_config.ring_pages * PAGE_SIZE);
2933                 break;
2934         }
2935         case BLKIF_PROTOCOL_X86_32:
2936         {
2937                 blkif_x86_32_sring_t *sring_x86_32;
2938                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2939                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2940                                xbb->ring_config.ring_pages * PAGE_SIZE);
2941                 break;
2942         }
2943         case BLKIF_PROTOCOL_X86_64:
2944         {
2945                 blkif_x86_64_sring_t *sring_x86_64;
2946                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2947                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2948                                xbb->ring_config.ring_pages * PAGE_SIZE);
2949                 break;
2950         }
2951         default:
2952                 panic("Unexpected blkif protocol ABI.");
2953         }
2954
2955         xbb->flags |= XBBF_RING_CONNECTED;
2956
2957         error = xen_intr_bind_remote_port(xbb->dev,
2958                                           xbb->otherend_id,
2959                                           xbb->ring_config.evtchn,
2960                                           xbb_filter,
2961                                           /*ithread_handler*/NULL,
2962                                           /*arg*/xbb,
2963                                           INTR_TYPE_BIO | INTR_MPSAFE,
2964                                           &xbb->xen_intr_handle);
2965         if (error) {
2966                 (void)xbb_disconnect(xbb);
2967                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2968                 return (error);
2969         }
2970
2971         DPRINTF("rings connected!\n");
2972
2973         return 0;
2974 }
2975
2976 /* Needed to make bit_alloc() macro work */
2977 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK,      \
2978                                    M_NOWAIT|M_ZERO)
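/*
 * bit_alloc() in <sys/bitstring.h> is expressed in terms of calloc(),
 * which the kernel does not provide; mapping it onto malloc(9) with
 * M_ZERO gives an equivalently zeroed allocation for the kva_free bitmap.
 */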
2979
2980 /**
2981  * Size KVA and pseudo-physical address allocations based on negotiated
2982  * values for the size and number of I/O requests, and the size of our
2983  * communication ring.
2984  *
2985  * \param xbb  Per-instance xbb configuration structure.
2986  *
2987  * These address spaces are used to dynamically map pages in the
2988  * front-end's domain into our own.
2989  */
2990 static int
2991 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2992 {
2993         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2994         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2995         xbb->kva_size = xbb->reqlist_kva_size +
2996                         (xbb->ring_config.ring_pages * PAGE_SIZE);
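        /*
         * Resulting KVA layout (the ring pages are mapped at the tail by
         * xbb_connect_ring()):
         *
         *   kva                                          kva + kva_size
         *   |<----- reqlist_kva_size ----->|<- ring_pages * PAGE_SIZE ->|
         *   |   I/O segment mapping area   |     shared ring pages      |
         */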
2997
2998         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
2999         if (xbb->kva_free == NULL)
3000                 return (ENOMEM);
3001
3002         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3003                 device_get_nameunit(xbb->dev), xbb->kva_size,
3004                 xbb->reqlist_kva_size);
3005 #ifndef XENHVM
3006         xbb->kva = kva_alloc(xbb->kva_size);
3007         if (xbb->kva == 0)
3008                 return (ENOMEM);
3009         xbb->gnt_base_addr = xbb->kva;
3010 #else /* XENHVM */
3011         /*
3012          * Reserve a range of pseudo physical memory that we can map
3013          * into kva.  These pages will only be backed by machine
3014          * pages ("real memory") during the lifetime of front-end requests
3015          * via grant table operations.
3016          */
3017         xbb->pseudo_phys_res_id = 0;
3018         xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3019                                                   &xbb->pseudo_phys_res_id,
3020                                                   0, ~0, xbb->kva_size,
3021                                                   RF_ACTIVE);
3022         if (xbb->pseudo_phys_res == NULL) {
3023                 xbb->kva = 0;
3024                 return (ENOMEM);
3025         }
3026         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3027         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3028 #endif /* XENHVM */
3029
3030         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3031                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3032                 (uintmax_t)xbb->gnt_base_addr); 
3033         return (0);
3034 }
3035
3036 /**
3037  * Collect front-end information from the XenStore.
3038  *
3039  * \param xbb  Per-instance xbb configuration structure.
3040  */
3041 static int
3042 xbb_collect_frontend_info(struct xbb_softc *xbb)
3043 {
3044         char        protocol_abi[64];
3045         const char *otherend_path;
3046         int         error;
3047         u_int       ring_idx;
3048         u_int       ring_page_order;
3049         size_t      ring_size;
3050
3051         otherend_path = xenbus_get_otherend_path(xbb->dev);
3052
3053         /*
3054          * Protocol defaults valid even if all negotiation fails.
3055          */
3056         xbb->ring_config.ring_pages = 1;
3057         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3058         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
3059
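        /*
         * The keys read below live in the front-end's XenStore tree
         * (values here are purely illustrative):
         *
         *   <otherend>/event-channel                = "11"
         *   <otherend>/ring-ref (or ring-ref0..N)   = grant references
         *   <otherend>/ring-page-order, num-ring-pages
         *   <otherend>/max-requests, max-request-segments, max-request-size
         *   <otherend>/protocol                     = ABI string (optional)
         */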
3060         /*
3061          * Mandatory data (used in all versions of the protocol) first.
3062          */
3063         error = xs_scanf(XST_NIL, otherend_path,
3064                          "event-channel", NULL, "%" PRIu32,
3065                          &xbb->ring_config.evtchn);
3066         if (error != 0) {
3067                 xenbus_dev_fatal(xbb->dev, error,
3068                                  "Unable to retrieve event-channel information "
3069                                  "from frontend %s.  Unable to connect.",
3070                                  xenbus_get_otherend_path(xbb->dev));
3071                 return (error);
3072         }
3073
3074         /*
3075          * These fields are initialized to legacy protocol defaults
3076          * so we only need to fail if reading the updated value succeeds
3077          * and the new value is outside of its allowed range.
3078          *
3079          * \note xs_gather() returns on the first encountered error, so
3080          *       we must use independent calls in order to guarantee
3081          *       we don't miss information in a sparsely populated front-end
3082          *       tree.
3083          *
3084          * \note xs_scanf() does not update variables for unmatched
3085          *       fields.
3086          */
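        /*
         * Multi-page ring negotiation: front-ends may publish either
         * "ring-page-order" or the page count itself as "num-ring-pages";
         * when the latter is present it overrides the value derived from
         * the order.
         */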
3087         ring_page_order = 0;
3088         (void)xs_scanf(XST_NIL, otherend_path,
3089                        "ring-page-order", NULL, "%u",
3090                        &ring_page_order);
3091         xbb->ring_config.ring_pages = 1 << ring_page_order;
3092         (void)xs_scanf(XST_NIL, otherend_path,
3093                        "num-ring-pages", NULL, "%u",
3094                        &xbb->ring_config.ring_pages);
3095         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3096         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3097
3098         (void)xs_scanf(XST_NIL, otherend_path,
3099                        "max-requests", NULL, "%u",
3100                        &xbb->max_requests);
3101
3102         (void)xs_scanf(XST_NIL, otherend_path,
3103                        "max-request-segments", NULL, "%u",
3104                        &xbb->max_request_segments);
3105
3106         (void)xs_scanf(XST_NIL, otherend_path,
3107                        "max-request-size", NULL, "%u",
3108                        &xbb->max_request_size);
3109
3110         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3111                 xenbus_dev_fatal(xbb->dev, EINVAL,
3112                                  "Front-end specified ring-pages of %u "
3113                                  "exceeds backend limit of %zu.  "
3114                                  "Unable to connect.",
3115                                  xbb->ring_config.ring_pages,
3116                                  XBB_MAX_RING_PAGES);
3117                 return (EINVAL);
3118         } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3119                 xenbus_dev_fatal(xbb->dev, EINVAL,
3120                                  "Front-end specified max_requests of %u "
3121                                  "exceeds backend limit of %u.  "
3122                                  "Unable to connect.",
3123                                  xbb->max_requests,
3124                                  XBB_MAX_REQUESTS);
3125                 return (EINVAL);
3126         } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3127                 xenbus_dev_fatal(xbb->dev, EINVAL,
3128                                  "Front-end specified max_request_segments "
3129                                  "of %u exceeds backend limit of %u.  "
3130                                  "Unable to connect.",
3131                                  xbb->max_request_segments,
3132                                  XBB_MAX_SEGMENTS_PER_REQUEST);
3133                 return (EINVAL);
3134         } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3135                 xenbus_dev_fatal(xbb->dev, EINVAL,
3136                                  "Front-end specified max_request_size "
3137                                  "of %u exceeds backend limit of %u.  "
3138                                  "Unable to connect.",
3139                                  xbb->max_request_size,
3140                                  XBB_MAX_REQUEST_SIZE);
3141                 return (EINVAL);
3142         }
3143
3144         if (xbb->ring_config.ring_pages == 1) {
3145                 error = xs_gather(XST_NIL, otherend_path,
3146                                   "ring-ref", "%" PRIu32,
3147                                   &xbb->ring_config.ring_ref[0],
3148                                   NULL);
3149                 if (error != 0) {
3150                         xenbus_dev_fatal(xbb->dev, error,
3151                                          "Unable to retrieve ring information "
3152                                          "from frontend %s.  Unable to "
3153                                          "connect.",
3154                                          xenbus_get_otherend_path(xbb->dev));
3155                         return (error);
3156                 }
3157         } else {
3158                 /* Multi-page ring format. */
3159                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3160                      ring_idx++) {
3161                         char ring_ref_name[] = "ring-refXX";
3162
3163                         snprintf(ring_ref_name, sizeof(ring_ref_name),
3164                                  "ring-ref%u", ring_idx);
3165                         error = xs_scanf(XST_NIL, otherend_path,
3166                                          ring_ref_name, NULL, "%" PRIu32,
3167                                          &xbb->ring_config.ring_ref[ring_idx]);
3168                         if (error != 0) {
3169                                 xenbus_dev_fatal(xbb->dev, error,
3170                                                  "Failed to retrieve grant "
3171                                                  "reference for page %u of "
3172                                                  "shared ring.  Unable "
3173                                                  "to connect.", ring_idx);
3174                                 return (error);
3175                         }
3176                 }
3177         }
3178
3179         error = xs_gather(XST_NIL, otherend_path,
3180                           "protocol", "%63s", protocol_abi,
3181                           NULL); 
3182         if (error != 0
3183          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3184                 /*
3185                  * Assume native if the front-end has not
3186                  * published ABI data, or if what it published
3187                  * matches our native ABI.
3188                  */
3189                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3190         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3191
3192                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3193         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3194
3195                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3196         } else {
3197
3198                 xenbus_dev_fatal(xbb->dev, EINVAL,
3199                                  "Unknown protocol ABI (%s) published by "
3200                                  "frontend.  Unable to connect.", protocol_abi);
3201                 return (EINVAL);
3202         }
3203         return (0);
3204 }
3205
3206 /**
3207  * Allocate per-request data structures given request size and number
3208  * information negotiated with the front-end.
3209  *
3210  * \param xbb  Per-instance xbb configuration structure.
3211  */
3212 static int
3213 xbb_alloc_requests(struct xbb_softc *xbb)
3214 {
3215         struct xbb_xen_req *req;
3216         struct xbb_xen_req *last_req;
3217
3218         /*
3219          * Allocate request bookkeeping data structures.
3220          */
3221         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3222                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3223         if (xbb->requests == NULL) {
3224                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3225                                   "Unable to allocate request structures");
3226                 return (ENOMEM);
3227         }
3228
3229         req      = xbb->requests;
3230         last_req = &xbb->requests[xbb->max_requests - 1];
3231         STAILQ_INIT(&xbb->request_free_stailq);
3232         while (req <= last_req) {
3233                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3234                 req++;
3235         }
3236         return (0);
3237 }
3238
3239 static int
3240 xbb_alloc_request_lists(struct xbb_softc *xbb)
3241 {
3242         struct xbb_xen_reqlist *reqlist;
3243         int                     i;
3244
3245         /*
3246          * If no requests can be merged, we need one request list per
3247          * in-flight request.
3248          */
3249         xbb->request_lists = malloc(xbb->max_requests *
3250                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3251         if (xbb->request_lists == NULL) {
3252                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3253                                   "Unable to allocate request list structures");
3254                 return (ENOMEM);
3255         }
3256
3257         STAILQ_INIT(&xbb->reqlist_free_stailq);
3258         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3259         for (i = 0; i < xbb->max_requests; i++) {
3260                 int seg;
3261
3262                 reqlist      = &xbb->request_lists[i];
3263
3264                 reqlist->xbb = xbb;
3265
3266 #ifdef XBB_USE_BOUNCE_BUFFERS
3267                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3268                                          M_XENBLOCKBACK, M_NOWAIT);
3269                 if (reqlist->bounce == NULL) {
3270                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3271                                          "Unable to allocate request "
3272                                          "bounce buffers");
3273                         return (ENOMEM);
3274                 }
3275 #endif /* XBB_USE_BOUNCE_BUFFERS */
3276
3277                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3278                                               sizeof(*reqlist->gnt_handles),
3279                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3280                 if (reqlist->gnt_handles == NULL) {
3281                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3282                                           "Unable to allocate request "
3283                                           "grant references");
3284                         return (ENOMEM);
3285                 }
3286
3287                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3288                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3289
3290                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3291         }
3292         return (0);
3293 }
3294
3295 /**
3296  * Supply information about the physical device to the frontend
3297  * via XenBus.
3298  *
3299  * \param xbb  Per-instance xbb configuration structure.
3300  */
3301 static int
3302 xbb_publish_backend_info(struct xbb_softc *xbb)
3303 {
3304         struct xs_transaction xst;
3305         const char           *our_path;
3306         const char           *leaf;
3307         int                   error;
3308
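        /*
         * The transaction below publishes, with illustrative values:
         *
         *   <our_path>/sectors     = "20971520"
         *   <our_path>/info        = "0" (or VDISK_READONLY)
         *   <our_path>/sector-size = "512"
         *
         * and restarts from the top whenever xs_transaction_end()
         * returns EAGAIN.
         */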
3309         our_path = xenbus_get_node(xbb->dev);
3310         while (1) {
3311                 error = xs_transaction_start(&xst);
3312                 if (error != 0) {
3313                         xenbus_dev_fatal(xbb->dev, error,
3314                                          "Error publishing backend info "
3315                                          "(start transaction)");
3316                         return (error);
3317                 }
3318
3319                 leaf = "sectors";
3320                 error = xs_printf(xst, our_path, leaf,
3321                                   "%"PRIu64, xbb->media_num_sectors);
3322                 if (error != 0)
3323                         break;
3324
3325                 /* XXX Support all VBD attributes here. */
3326                 leaf = "info";
3327                 error = xs_printf(xst, our_path, leaf, "%u",
3328                                   xbb->flags & XBBF_READ_ONLY
3329                                 ? VDISK_READONLY : 0);
3330                 if (error != 0)
3331                         break;
3332
3333                 leaf = "sector-size";
3334                 error = xs_printf(xst, our_path, leaf, "%u",
3335                                   xbb->sector_size);
3336                 if (error != 0)
3337                         break;
3338
3339                 error = xs_transaction_end(xst, 0);
3340                 if (error == 0) {
3341                         return (0);
3342                 } else if (error != EAGAIN) {
3343                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3344                         return (error);
3345                 }
3346         }
3347
3348         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3349                         our_path, leaf);
3350         xs_transaction_end(xst, 1);
3351         return (error);
3352 }
3353
3354 /**
3355  * Connect to our blkfront peer now that it has completed publishing
3356  * its configuration into the XenStore.
3357  *
3358  * \param xbb  Per-instance xbb configuration structure.
3359  */
3360 static void
3361 xbb_connect(struct xbb_softc *xbb)
3362 {
3363         int error;
3364
3365         if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3366                 return;
3367
3368         if (xbb_collect_frontend_info(xbb) != 0)
3369                 return;
3370
3371         xbb->flags &= ~XBBF_SHUTDOWN;
3372
3373         /*
3374          * We limit the maximum number of reqlist segments to the maximum
3375          * number of segments in the ring, or our absolute maximum,
3376          * whichever is smaller.
3377          */
3378         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3379                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
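        /*
         * For example, a front-end that negotiated 11 segments per
         * request and 32 requests would allow 11 * 32 = 352 segments
         * here, unless XBB_MAX_SEGMENTS_PER_REQLIST is smaller.
         */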
3380
3381         /*
3382          * The maximum size is simply a function of the number of segments
3383          * we can handle.
3384          */
3385         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3386
3387         /* Allocate resources whose size depends on front-end configuration. */
3388         error = xbb_alloc_communication_mem(xbb);
3389         if (error != 0) {
3390                 xenbus_dev_fatal(xbb->dev, error,
3391                                  "Unable to allocate communication memory");
3392                 return;
3393         }
3394
3395         error = xbb_alloc_requests(xbb);
3396         if (error != 0) {
3397                 /* Specific errors are reported by xbb_alloc_requests(). */
3398                 return;
3399         }
3400
3401         error = xbb_alloc_request_lists(xbb);
3402         if (error != 0) {
3403                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3404                 return;
3405         }
3406
3407         /*
3408          * Connect communication channel.
3409          */
3410         error = xbb_connect_ring(xbb);
3411         if (error != 0) {
3412                 /* Specific errors are reported by xbb_connect_ring(). */
3413                 return;
3414         }
3415         
3416         if (xbb_publish_backend_info(xbb) != 0) {
3417                 /*
3418                  * If we can't publish our data, we cannot participate
3419                  * in this connection, and waiting for a front-end state
3420                  * change will not help the situation.
3421                  */
3422                 (void)xbb_disconnect(xbb);
3423                 return;
3424         }
3425
3426         /* Ready for I/O. */
3427         xenbus_set_state(xbb->dev, XenbusStateConnected);
3428 }
3429
3430 /*-------------------------- Device Teardown Support -------------------------*/
3431 /**
3432  * Perform device shutdown functions.
3433  *
3434  * \param xbb  Per-instance xbb configuration structure.
3435  *
3436  * Mark this instance as shutting down, wait for any active I/O on the
3437  * backend device/file to drain, disconnect from the front-end, and notify
3438  * any waiters (e.g. a thread invoking our detach method) that detach can
3439  * now proceed.
3440  */
3441 static int
3442 xbb_shutdown(struct xbb_softc *xbb)
3443 {
3444         XenbusState frontState;
3445         int         error;
3446
3447         DPRINTF("\n");
3448
3449         /*
3450          * Due to the need to drop our mutex during some
3451          * xenbus operations, it is possible for two threads
3452          * to attempt to close out shutdown processing at
3453          * the same time.  Tell the caller that hits this
3454          * race to try again later.
3455          */
3456         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3457                 return (EAGAIN);
3458
3459         xbb->flags |= XBBF_IN_SHUTDOWN;
3460         mtx_unlock(&xbb->lock);
3461
3462         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3463                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3464
3465         frontState = xenbus_get_otherend_state(xbb->dev);
3466         mtx_lock(&xbb->lock);
3467         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3468
3469         /* The front can submit I/O until entering the closed state. */
3470         if (frontState < XenbusStateClosed)
3471                 return (EAGAIN);
3472
3473         DPRINTF("\n");
3474
3475         /* Indicate shutdown is in progress. */
3476         xbb->flags |= XBBF_SHUTDOWN;
3477
3478         /* Disconnect from the front-end. */
3479         error = xbb_disconnect(xbb);
3480         if (error != 0) {
3481                 /*
3482                  * Requests still outstanding.  We'll be called again
3483                  * once they complete.
3484                  */
3485                 KASSERT(error == EAGAIN,
3486                         ("%s: Unexpected xbb_disconnect() failure %d",
3487                          __func__, error));
3488
3489                 return (error);
3490         }
3491
3492         DPRINTF("\n");
3493
3494         /* Indicate to xbb_detach() that it is safe to proceed. */
3495         wakeup(xbb);
3496
3497         return (0);
3498 }
3499
3500 /**
3501  * Report an attach time error to the console and Xen, and cleanup
3502  * this instance by forcing immediate detach processing.
3503  *
3504  * \param xbb  Per-instance xbb configuration structure.
3505  * \param err  Errno describing the error.
3506  * \param fmt  Printf style format and arguments
3507  */
3508 static void
3509 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3510 {
3511         va_list ap;
3512         va_list ap_hotplug;
3513
3514         va_start(ap, fmt);
3515         va_copy(ap_hotplug, ap);
3516         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3517                   "hotplug-error", fmt, ap_hotplug);
3518         va_end(ap_hotplug);
3519         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3520                   "hotplug-status", "error");
3521
3522         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3523         va_end(ap);
3524
3525         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3526                   "online", "0");
3527         xbb_detach(xbb->dev);
3528 }
3529
3530 /*---------------------------- NewBus Entrypoints ----------------------------*/
3531 /**
3532  * Inspect a XenBus device and claim it if is of the appropriate type.
3533  * Inspect a XenBus device and claim it if it is of the appropriate type.
3534  * \param dev  NewBus device object representing a candidate XenBus device.
3535  *
3536  * \return  0 for success, errno codes for failure.
3537  */
3538 static int
3539 xbb_probe(device_t dev)
3540 {
3541  
3542         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3543                 device_set_desc(dev, "Backend Virtual Block Device");
3544                 device_quiet(dev);
3545                 return (0);
3546         }
3547
3548         return (ENXIO);
3549 }
3550
3551 /**
3552  * Setup sysctl variables to control various Block Back parameters.
3553  * Set up sysctl variables to control various Block Back parameters.
3554  * \param xbb  Xen Block Back softc.
3555  *
3556  */
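 * The variables are attached to this device's sysctl tree (e.g.
 * dev.xbbd.0 for unit 0) and can be examined or changed with sysctl(8).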
3557 static void
3558 xbb_setup_sysctl(struct xbb_softc *xbb)
3559 {
3560         struct sysctl_ctx_list *sysctl_ctx = NULL;
3561         struct sysctl_oid      *sysctl_tree = NULL;
3562         
3563         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3564         if (sysctl_ctx == NULL)
3565                 return;
3566
3567         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3568         if (sysctl_tree == NULL)
3569                 return;
3570
3571         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3572                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3573                        "fake the flush command");
3574
3575         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3576                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3577                        "send a real flush for N flush requests");
3578
3579         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3580                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3581                        "Don't coalesce contiguous requests");
3582
3583         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3584                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3585                          "how many I/O requests we have received");
3586
3587         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3588                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3589                          "how many I/O requests have been completed");
3590
3591         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3592                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3593                          "how many I/O dispatches were forced");
3594
3595         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3596                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3597                          "how many I/O dispatches were normal");
3598
3599         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3600                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3601                          "total number of I/O dispatches");
3602
3603         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3604                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3605                          "how many times we have run out of KVA");
3606
3607         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3608                          "request_shortages", CTLFLAG_RW,
3609                          &xbb->request_shortages,
3610                          "how many times we have run out of requests");
3611
3612         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3613                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3614                         "maximum outstanding requests (negotiated)");
3615
3616         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3617                         "max_request_segments", CTLFLAG_RD,
3618                         &xbb->max_request_segments, 0,
3619                         "maximum number of pages per request (negotiated)");
3620
3621         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3622                         "max_request_size", CTLFLAG_RD,
3623                         &xbb->max_request_size, 0,
3624                         "maximum size in bytes of a request (negotiated)");
3625
3626         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3627                         "ring_pages", CTLFLAG_RD,
3628                         &xbb->ring_config.ring_pages, 0,
3629                         "communication channel pages (negotiated)");
3630 }
3631
3632 /**
3633  * Attach to a XenBus device that has been claimed by our probe routine.
3634  *
3635  * \param dev  NewBus device object representing this Xen Block Back instance.
3636  *
3637  * \return  0 for success, errno codes for failure.
3638  */
3639 static int
3640 xbb_attach(device_t dev)
3641 {
3642         struct xbb_softc        *xbb;
3643         int                      error;
3644         u_int                    max_ring_page_order;
3645
3646         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3647
3648         /*
3649          * Basic initialization.
3650          * After this block it is safe to call xbb_detach()
3651          * to clean up any allocated data for this instance.
3652          */
3653         xbb = device_get_softc(dev);
3654         xbb->dev = dev;
3655         xbb->otherend_id = xenbus_get_otherend_id(dev);
3656         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3657         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3658
3659         /*
3660          * Publish protocol capabilities for consumption by the
3661          * front-end.
3662          */
3663         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3664                           "feature-barrier", "1");
3665         if (error) {
3666                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3667                                   xenbus_get_node(xbb->dev));
3668                 return (error);
3669         }
3670
3671         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3672                           "feature-flush-cache", "1");
3673         if (error) {
3674                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3675                                   xenbus_get_node(xbb->dev));
3676                 return (error);
3677         }
3678
3679         /*
3680          * Amazon EC2 client compatibility.  Their front-ends refer to
3681          * max-ring-pages instead of max-ring-page-order.
3682          */
3683         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3684                           "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3685         if (error) {
3686                 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3687                                   xenbus_get_node(xbb->dev));
3688                 return (error);
3689         }
3690
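        /*
         * The same limit expressed as a power-of-two order; for example,
         * if XBB_MAX_RING_PAGES were 32, flsl(32) - 1 == 5.
         */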
3691         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3692         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3693                           "max-ring-page-order", "%u", max_ring_page_order);
3694         if (error) {
3695                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3696                                   xenbus_get_node(xbb->dev));
3697                 return (error);
3698         }
3699
3700         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3701                           "max-requests", "%u", XBB_MAX_REQUESTS);
3702         if (error) {
3703                 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3704                                   xenbus_get_node(xbb->dev));
3705                 return (error);
3706         }
3707
3708         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3709                           "max-request-segments", "%u",
3710                           XBB_MAX_SEGMENTS_PER_REQUEST);
3711         if (error) {
3712                 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3713                                   xenbus_get_node(xbb->dev));
3714                 return (error);
3715         }
3716
3717         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3718                           "max-request-size", "%u",
3719                           XBB_MAX_REQUEST_SIZE);
3720         if (error) {
3721                 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3722                                   xenbus_get_node(xbb->dev));
3723                 return (error);
3724         }
3725
3726         /* Collect physical device information. */
3727         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3728                           "device-type", NULL, &xbb->dev_type,
3729                           NULL);
3730         if (error != 0)
3731                 xbb->dev_type = NULL;
3732
3733         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3734                           "mode", NULL, &xbb->dev_mode,
3735                           "params", NULL, &xbb->dev_name,
3736                           NULL);
3737         if (error != 0) {
3738                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3739                                   xenbus_get_node(dev));
3740                 return (ENXIO);
3741         }
3742
3743         /* Parse fopen style mode flags. */
3744         if (strchr(xbb->dev_mode, 'w') == NULL)
3745                 xbb->flags |= XBBF_READ_ONLY;
3746
3747         /*
3748          * Verify the physical device is present and can support
3749          * the desired I/O mode.
3750          */
3751         DROP_GIANT();
3752         error = xbb_open_backend(xbb);
3753         PICKUP_GIANT();
3754         if (error != 0) {
3755                 xbb_attach_failed(xbb, error, "Unable to open %s",
3756                                   xbb->dev_name);
3757                 return (ENXIO);
3758         }
3759
3760         /* Use devstat(9) for recording statistics. */
3761         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3762                                            xbb->sector_size,
3763                                            DEVSTAT_ALL_SUPPORTED,
3764                                            DEVSTAT_TYPE_DIRECT
3765                                          | DEVSTAT_TYPE_IF_OTHER,
3766                                            DEVSTAT_PRIORITY_OTHER);
3767
3768         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3769                                               xbb->sector_size,
3770                                               DEVSTAT_ALL_SUPPORTED,
3771                                               DEVSTAT_TYPE_DIRECT
3772                                             | DEVSTAT_TYPE_IF_OTHER,
3773                                               DEVSTAT_PRIORITY_OTHER);
3774         /*
3775          * Setup sysctl variables.
3776          */
3777         xbb_setup_sysctl(xbb);
3778
3779         /*
3780          * Create a taskqueue for doing work that must occur from a
3781          * thread context.
3782          */
3783         xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
3784                                                   M_NOWAIT,
3785                                                   taskqueue_thread_enqueue,
3786                                                   /*context*/&xbb->io_taskqueue);
3787         if (xbb->io_taskqueue == NULL) {
3788                 xbb_attach_failed(xbb, error, "Unable to create taskqueue");
3789                 return (ENOMEM);
3790         }
3791
3792         taskqueue_start_threads(&xbb->io_taskqueue,
3793                                 /*num threads*/1,
3794                                 /*priority*/PWAIT,
3795                                 /*thread name*/
3796                                 "%s taskq", device_get_nameunit(dev));
3797
3798         /* Update hot-plug status to satisfy xend. */
3799         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3800                           "hotplug-status", "connected");
3801         if (error) {
3802                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3803                                   xenbus_get_node(xbb->dev));
3804                 return (error);
3805         }
3806
3807         /* Tell the front end that we are ready to connect. */
3808         xenbus_set_state(dev, XenbusStateInitWait);
3809
3810         return (0);
3811 }
3812
3813 /**
3814  * Detach from a block back device instance.
3815  *
3816  * \param dev  NewBus device object representing this Xen Block Back instance.
3817  *
3818  * \return  0 for success, errno codes for failure.
3819  * 
3820  * \note A block back device may be detached at any time in its life-cycle,
3821  *       including part way through the attach process.  For this reason,
3822  *       initialization order and the initialization state checks in this
3823  *       routine must be carefully coupled so that attach time failures
3824  *       are gracefully handled.
3825  */
3826 static int
3827 xbb_detach(device_t dev)
3828 {
3829         struct xbb_softc *xbb;
3830
3831         DPRINTF("\n");
3832
3833         xbb = device_get_softc(dev);
3834         mtx_lock(&xbb->lock);
3835         while (xbb_shutdown(xbb) == EAGAIN) {
3836                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3837                        "xbb_shutdown", 0);
3838         }
3839         mtx_unlock(&xbb->lock);
3840
3841         DPRINTF("\n");
3842
3843         if (xbb->io_taskqueue != NULL)
3844                 taskqueue_free(xbb->io_taskqueue);
3845
3846         if (xbb->xbb_stats != NULL)
3847                 devstat_remove_entry(xbb->xbb_stats);
3848
3849         if (xbb->xbb_stats_in != NULL)
3850                 devstat_remove_entry(xbb->xbb_stats_in);
3851
3852         xbb_close_backend(xbb);
3853
3854         if (xbb->dev_mode != NULL) {
3855                 free(xbb->dev_mode, M_XENSTORE);
3856                 xbb->dev_mode = NULL;
3857         }
3858
3859         if (xbb->dev_type != NULL) {
3860                 free(xbb->dev_type, M_XENSTORE);
3861                 xbb->dev_type = NULL;
3862         }
3863
3864         if (xbb->dev_name != NULL) {
3865                 free(xbb->dev_name, M_XENSTORE);
3866                 xbb->dev_name = NULL;
3867         }
3868
3869         mtx_destroy(&xbb->lock);
3870         return (0);
3871 }
3872
3873 /**
3874  * Prepare this block back device for suspension of this VM.
3875  * 
3876  * \param dev  NewBus device object representing this Xen Block Back instance.
3877  *
3878  * \return  0 for success, errno codes for failure.
3879  */
3880 static int
3881 xbb_suspend(device_t dev)
3882 {
3883 #ifdef NOT_YET
3884         struct xbb_softc *sc = device_get_softc(dev);
3885
3886         /* Prevent new requests being issued until we fix things up. */
3887         mtx_lock(&sc->xb_io_lock);
3888         sc->connected = BLKIF_STATE_SUSPENDED;
3889         mtx_unlock(&sc->xb_io_lock);
3890 #endif
3891
3892         return (0);
3893 }
3894
3895 /**
3896  * Perform any processing required to recover from a suspended state.
3897  * 
3898  * \param dev  NewBus device object representing this Xen Block Back instance.
3899  *
3900  * \return  0 for success, errno codes for failure.
3901  */
3902 static int
3903 xbb_resume(device_t dev)
3904 {
3905         return (0);
3906 }
3907
3908 /**
3909  * Handle state changes expressed via the XenStore by our front-end peer.
3910  *
3911  * \param dev             NewBus device object representing this Xen
3912  *                        Block Back instance.
3913  * \param frontend_state  The new state of the front-end.
3914  *
3916  */
3917 static void
3918 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3919 {
3920         struct xbb_softc *xbb = device_get_softc(dev);
3921
3922         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3923                 xenbus_strstate(frontend_state),
3924                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3925
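        /*
         * Initialised and Connected trigger connection setup via
         * xbb_connect(); Closing and Closed trigger xbb_shutdown(), and
         * our own state follows to Closed once the front-end is Closed.
         */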
3926         switch (frontend_state) {
3927         case XenbusStateInitialising:
3928                 break;
3929         case XenbusStateInitialised:
3930         case XenbusStateConnected:
3931                 xbb_connect(xbb);
3932                 break;
3933         case XenbusStateClosing:
3934         case XenbusStateClosed:
3935                 mtx_lock(&xbb->lock);
3936                 xbb_shutdown(xbb);
3937                 mtx_unlock(&xbb->lock);
3938                 if (frontend_state == XenbusStateClosed)
3939                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3940                 break;
3941         default:
3942                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3943                                  frontend_state);
3944                 break;
3945         }
3946 }
3947
3948 /*---------------------------- NewBus Registration ---------------------------*/
3949 static device_method_t xbb_methods[] = {
3950         /* Device interface */
3951         DEVMETHOD(device_probe,         xbb_probe),
3952         DEVMETHOD(device_attach,        xbb_attach),
3953         DEVMETHOD(device_detach,        xbb_detach),
3954         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3955         DEVMETHOD(device_suspend,       xbb_suspend),
3956         DEVMETHOD(device_resume,        xbb_resume),
3957
3958         /* Xenbus interface */
3959         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3960
3961         { 0, 0 }
3962 };
3963
3964 static driver_t xbb_driver = {
3965         "xbbd",
3966         xbb_methods,
3967         sizeof(struct xbb_softc),
3968 };
3969 devclass_t xbb_devclass;
3970
3971 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);