]> CyberLeo.Net >> Repos - FreeBSD/releng/9.2.git/blob - sys/dev/xen/blkback/blkback.c
- Copy stable/9 to releng/9.2 as part of the 9.2-RELEASE cycle.
[FreeBSD/releng/9.2.git] / sys / dev / xen / blkback / blkback.c
1 /*-
2  * Copyright (c) 2009-2011 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  *          Ken Merry           (Spectra Logic Corporation)
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 /**
37  * \file blkback.c
38  *
39  * \brief Device driver supporting the vending of block storage from
40  *        a FreeBSD domain to other domains.
41  */
42
43 #include "opt_kdtrace.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49
50 #include <sys/bio.h>
51 #include <sys/bus.h>
52 #include <sys/conf.h>
53 #include <sys/devicestat.h>
54 #include <sys/disk.h>
55 #include <sys/fcntl.h>
56 #include <sys/filedesc.h>
57 #include <sys/kdb.h>
58 #include <sys/module.h>
59 #include <sys/namei.h>
60 #include <sys/proc.h>
61 #include <sys/rman.h>
62 #include <sys/taskqueue.h>
63 #include <sys/types.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/sysctl.h>
67 #include <sys/bitstring.h>
68 #include <sys/sdt.h>
69
70 #include <geom/geom.h>
71
72 #include <machine/_inttypes.h>
73 #include <machine/xen/xen-os.h>
74
75 #include <vm/vm.h>
76 #include <vm/vm_extern.h>
77 #include <vm/vm_kern.h>
78
79 #include <xen/blkif.h>
80 #include <xen/evtchn.h>
81 #include <xen/gnttab.h>
82 #include <xen/xen_intr.h>
83
84 #include <xen/interface/event_channel.h>
85 #include <xen/interface/grant_table.h>
86
87 #include <xen/xenbus/xenbusvar.h>
88
89 /*--------------------------- Compile-time Tunables --------------------------*/
90 /**
91  * The maximum number of outstanding request blocks (request headers plus
92  * additional segment blocks) we will allow in a negotiated block-front/back
93  * communication channel.
94  */
95 #define XBB_MAX_REQUESTS        256
96
97 /**
98  * \brief Define to force all I/O to be performed on memory owned by the
99  *        backend device, with a copy-in/out to the remote domain's memory.
100  *
101  * \note  This option is currently required when this driver's domain is
102  *        operating in HVM mode on a system using an IOMMU.
103  *
104  * This driver uses Xen's grant table API to gain access to the memory of
105  * the remote domains it serves.  When our domain is operating in PV mode,
106  * the grant table mechanism directly updates our domain's page table entries
107  * to point to the physical pages of the remote domain.  This scheme guarantees
108  * that blkback and the backing devices it uses can safely perform DMA
109  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
110  * insure that our domain cannot DMA to pages owned by another domain.  As
111  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
112  * table API.  For this reason, in HVM mode, we must bounce all requests into
113  * memory that is mapped into our domain at domain startup and thus has
114  * valid IOMMU mappings.
115  */
116 #define XBB_USE_BOUNCE_BUFFERS
117
118 /**
119  * \brief Define to enable rudimentary request logging to the console.
120  */
121 #undef XBB_DEBUG
122
123 /*---------------------------------- Macros ----------------------------------*/
124 /**
125  * Custom malloc type for all driver allocations.
126  */
127 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
128
129 #ifdef XBB_DEBUG
130 #define DPRINTF(fmt, args...)                                   \
131     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
132 #else
133 #define DPRINTF(fmt, args...) do {} while(0)
134 #endif
135
136 /**
137  * The maximum mapped region size per request we will allow in a negotiated
138  * block-front/back communication channel.
139  */
140 #define XBB_MAX_REQUEST_SIZE                                    \
141         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
142
143 /**
144  * The maximum number of segments (within a request header and accompanying
145  * segment blocks) per request we will allow in a negotiated block-front/back
146  * communication channel.
147  */
148 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
149         (MIN(UIO_MAXIOV,                                        \
150              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
151                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
152
153 /**
154  * The maximum number of shared memory ring pages we will allow in a
155  * negotiated block-front/back communication channel.  Allow enough
156  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
157  */
158 #define XBB_MAX_RING_PAGES                                                  \
159         BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
160                        * XBB_MAX_REQUESTS)
161 /**
162  * The maximum number of ring pages that we can allow per request list.
163  * We limit this to the maximum number of segments per request, because
164  * that is already a reasonable number of segments to aggregate.  This
165  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
166  * because that would leave situations where we can't dispatch even one
167  * large request.
168  */
169 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
170
171 /*--------------------------- Forward Declarations ---------------------------*/
172 struct xbb_softc;
173 struct xbb_xen_req;
174
175 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
176                               ...) __attribute__((format(printf, 3, 4)));
177 static int  xbb_shutdown(struct xbb_softc *xbb);
178 static int  xbb_detach(device_t dev);
179
180 /*------------------------------ Data Structures -----------------------------*/
181
182 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
183
184 typedef enum {
185         XBB_REQLIST_NONE        = 0x00,
186         XBB_REQLIST_MAPPED      = 0x01
187 } xbb_reqlist_flags;
188
189 struct xbb_xen_reqlist {
190         /**
191          * Back reference to the parent block back instance for this
192          * request.  Used during bio_done handling.
193          */
194         struct xbb_softc        *xbb;
195
196         /**
197          * BLKIF_OP code for this request.
198          */
199         int                      operation;
200
201         /**
202          * Set to BLKIF_RSP_* to indicate request status.
203          *
204          * This field allows an error status to be recorded even if the
205          * delivery of this status must be deferred.  Deferred reporting
206          * is necessary, for example, when an error is detected during
207          * completion processing of one bio when other bios for this
208          * request are still outstanding.
209          */
210         int                      status;
211
212         /**
213          * Number of 512 byte sectors not transferred.
214          */
215         int                      residual_512b_sectors;
216
217         /**
218          * Starting sector number of the first request in the list.
219          */
220         off_t                    starting_sector_number;
221
222         /**
223          * If we're going to coalesce, the next contiguous sector would be
224          * this one.
225          */
226         off_t                    next_contig_sector;
227
228         /**
229          * Number of child requests in the list.
230          */
231         int                      num_children;
232
233         /**
234          * Number of I/O requests dispatched to the backend.
235          */
236         int                      pendcnt;
237
238         /**
239          * Total number of segments for requests in the list.
240          */
241         int                      nr_segments;
242
243         /**
244          * Flags for this particular request list.
245          */
246         xbb_reqlist_flags        flags;
247
248         /**
249          * Kernel virtual address space reserved for this request
250          * list structure and used to map the remote domain's pages for
251          * this I/O, into our domain's address space.
252          */
253         uint8_t                 *kva;
254
255         /**
256          * Base, psuedo-physical address, corresponding to the start
257          * of this request's kva region.
258          */
259         uint64_t                 gnt_base;
260
261
262 #ifdef XBB_USE_BOUNCE_BUFFERS
263         /**
264          * Pre-allocated domain local memory used to proxy remote
265          * domain memory during I/O operations.
266          */
267         uint8_t                 *bounce;
268 #endif
269
270         /**
271          * Array of grant handles (one per page) used to map this request.
272          */
273         grant_handle_t          *gnt_handles;
274
275         /**
276          * Device statistics request ordering type (ordered or simple).
277          */
278         devstat_tag_type         ds_tag_type;
279
280         /**
281          * Device statistics request type (read, write, no_data).
282          */
283         devstat_trans_flags      ds_trans_type;
284
285         /**
286          * The start time for this request.
287          */
288         struct bintime           ds_t0;
289
290         /**
291          * Linked list of contiguous requests with the same operation type.
292          */
293         struct xbb_xen_req_list  contig_req_list;
294
295         /**
296          * Linked list links used to aggregate idle requests in the
297          * request list free pool (xbb->reqlist_free_stailq) and pending
298          * requests waiting for execution (xbb->reqlist_pending_stailq).
299          */
300         STAILQ_ENTRY(xbb_xen_reqlist) links;
301 };
302
303 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
304
305 /**
306  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
307  */
308 struct xbb_xen_req {
309         /**
310          * Linked list links used to aggregate requests into a reqlist
311          * and to store them in the request free pool.
312          */
313         STAILQ_ENTRY(xbb_xen_req) links;
314
315         /**
316          * The remote domain's identifier for this I/O request.
317          */
318         uint64_t                  id;
319
320         /**
321          * The number of pages currently mapped for this request.
322          */
323         int                       nr_pages;
324
325         /**
326          * The number of 512 byte sectors comprising this requests.
327          */
328         int                       nr_512b_sectors;
329
330         /**
331          * The number of struct bio requests still outstanding for this
332          * request on the backend device.  This field is only used for  
333          * device (rather than file) backed I/O.
334          */
335         int                       pendcnt;
336
337         /**
338          * BLKIF_OP code for this request.
339          */
340         int                       operation;
341
342         /**
343          * Storage used for non-native ring requests.
344          */
345         blkif_request_t          ring_req_storage;
346
347         /**
348          * Pointer to the Xen request in the ring.
349          */
350         blkif_request_t         *ring_req;
351
352         /**
353          * Consumer index for this request.
354          */
355         RING_IDX                 req_ring_idx;
356
357         /**
358          * The start time for this request.
359          */
360         struct bintime           ds_t0;
361
362         /**
363          * Pointer back to our parent request list.
364          */
365         struct xbb_xen_reqlist  *reqlist;
366 };
367 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
368
369 /**
370  * \brief Configuration data for the shared memory request ring
371  *        used to communicate with the front-end client of this
372  *        this driver.
373  */
374 struct xbb_ring_config {
375         /** KVA address where ring memory is mapped. */
376         vm_offset_t     va;
377
378         /** The pseudo-physical address where ring memory is mapped.*/
379         uint64_t        gnt_addr;
380
381         /**
382          * Grant table handles, one per-ring page, returned by the
383          * hyperpervisor upon mapping of the ring and required to
384          * unmap it when a connection is torn down.
385          */
386         grant_handle_t  handle[XBB_MAX_RING_PAGES];
387
388         /**
389          * The device bus address returned by the hypervisor when
390          * mapping the ring and required to unmap it when a connection
391          * is torn down.
392          */
393         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
394
395         /** The number of ring pages mapped for the current connection. */
396         u_int           ring_pages;
397
398         /**
399          * The grant references, one per-ring page, supplied by the
400          * front-end, allowing us to reference the ring pages in the
401          * front-end's domain and to map these pages into our own domain.
402          */
403         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
404
405         /** The interrupt driven even channel used to signal ring events. */
406         evtchn_port_t   evtchn;
407 };
408
409 /**
410  * Per-instance connection state flags.
411  */
412 typedef enum
413 {
414         /**
415          * The front-end requested a read-only mount of the
416          * back-end device/file.
417          */
418         XBBF_READ_ONLY         = 0x01,
419
420         /** Communication with the front-end has been established. */
421         XBBF_RING_CONNECTED    = 0x02,
422
423         /**
424          * Front-end requests exist in the ring and are waiting for
425          * xbb_xen_req objects to free up.
426          */
427         XBBF_RESOURCE_SHORTAGE = 0x04,
428
429         /** Connection teardown in progress. */
430         XBBF_SHUTDOWN          = 0x08,
431
432         /** A thread is already performing shutdown processing. */
433         XBBF_IN_SHUTDOWN       = 0x10
434 } xbb_flag_t;
435
436 /** Backend device type.  */
437 typedef enum {
438         /** Backend type unknown. */
439         XBB_TYPE_NONE           = 0x00,
440
441         /**
442          * Backend type disk (access via cdev switch
443          * strategy routine).
444          */
445         XBB_TYPE_DISK           = 0x01,
446
447         /** Backend type file (access vnode operations.). */
448         XBB_TYPE_FILE           = 0x02
449 } xbb_type;
450
451 /**
452  * \brief Structure used to memoize information about a per-request
453  *        scatter-gather list.
454  *
455  * The chief benefit of using this data structure is it avoids having
456  * to reparse the possibly discontiguous S/G list in the original
457  * request.  Due to the way that the mapping of the memory backing an
458  * I/O transaction is handled by Xen, a second pass is unavoidable.
459  * At least this way the second walk is a simple array traversal.
460  *
461  * \note A single Scatter/Gather element in the block interface covers
462  *       at most 1 machine page.  In this context a sector (blkif
463  *       nomenclature, not what I'd choose) is a 512b aligned unit
464  *       of mapping within the machine page referenced by an S/G
465  *       element.
466  */
467 struct xbb_sg {
468         /** The number of 512b data chunks mapped in this S/G element. */
469         int16_t nsect;
470
471         /**
472          * The index (0 based) of the first 512b data chunk mapped
473          * in this S/G element.
474          */
475         uint8_t first_sect;
476
477         /**
478          * The index (0 based) of the last 512b data chunk mapped
479          * in this S/G element.
480          */
481         uint8_t last_sect;
482 };
483
484 /**
485  * Character device backend specific configuration data.
486  */
487 struct xbb_dev_data {
488         /** Cdev used for device backend access.  */
489         struct cdev   *cdev;
490
491         /** Cdev switch used for device backend access.  */
492         struct cdevsw *csw;
493
494         /** Used to hold a reference on opened cdev backend devices. */
495         int            dev_ref;
496 };
497
498 /**
499  * File backend specific configuration data.
500  */
501 struct xbb_file_data {
502         /** Credentials to use for vnode backed (file based) I/O. */
503         struct ucred   *cred;
504
505         /**
506          * \brief Array of io vectors used to process file based I/O.
507          *
508          * Only a single file based request is outstanding per-xbb instance,
509          * so we only need one of these.
510          */
511         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
512 #ifdef XBB_USE_BOUNCE_BUFFERS
513
514         /**
515          * \brief Array of io vectors used to handle bouncing of file reads.
516          *
517          * Vnode operations are free to modify uio data during their
518          * exectuion.  In the case of a read with bounce buffering active,
519          * we need some of the data from the original uio in order to
520          * bounce-out the read data.  This array serves as the temporary
521          * storage for this saved data.
522          */
523         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
524
525         /**
526          * \brief Array of memoized bounce buffer kva offsets used
527          *        in the file based backend.
528          *
529          * Due to the way that the mapping of the memory backing an
530          * I/O transaction is handled by Xen, a second pass through
531          * the request sg elements is unavoidable. We memoize the computed
532          * bounce address here to reduce the cost of the second walk.
533          */
534         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
535 #endif /* XBB_USE_BOUNCE_BUFFERS */
536 };
537
538 /**
539  * Collection of backend type specific data.
540  */
541 union xbb_backend_data {
542         struct xbb_dev_data  dev;
543         struct xbb_file_data file;
544 };
545
546 /**
547  * Function signature of backend specific I/O handlers.
548  */
549 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
550                               struct xbb_xen_reqlist *reqlist, int operation,
551                               int flags);
552
553 /**
554  * Per-instance configuration data.
555  */
556 struct xbb_softc {
557
558         /**
559          * Task-queue used to process I/O requests.
560          */
561         struct taskqueue         *io_taskqueue;
562
563         /**
564          * Single "run the request queue" task enqueued
565          * on io_taskqueue.
566          */
567         struct task               io_task;
568
569         /** Device type for this instance. */
570         xbb_type                  device_type;
571
572         /** NewBus device corresponding to this instance. */
573         device_t                  dev;
574
575         /** Backend specific dispatch routine for this instance. */
576         xbb_dispatch_t            dispatch_io;
577
578         /** The number of requests outstanding on the backend device/file. */
579         int                       active_request_count;
580
581         /** Free pool of request tracking structures. */
582         struct xbb_xen_req_list   request_free_stailq;
583
584         /** Array, sized at connection time, of request tracking structures. */
585         struct xbb_xen_req       *requests;
586
587         /** Free pool of request list structures. */
588         struct xbb_xen_reqlist_list reqlist_free_stailq;
589
590         /** List of pending request lists awaiting execution. */
591         struct xbb_xen_reqlist_list reqlist_pending_stailq;
592
593         /** Array, sized at connection time, of request list structures. */
594         struct xbb_xen_reqlist   *request_lists;
595
596         /**
597          * Global pool of kva used for mapping remote domain ring
598          * and I/O transaction data.
599          */
600         vm_offset_t               kva;
601
602         /** Psuedo-physical address corresponding to kva. */
603         uint64_t                  gnt_base_addr;
604
605         /** The size of the global kva pool. */
606         int                       kva_size;
607
608         /** The size of the KVA area used for request lists. */
609         int                       reqlist_kva_size;
610
611         /** The number of pages of KVA used for request lists */
612         int                       reqlist_kva_pages;
613
614         /** Bitmap of free KVA pages */
615         bitstr_t                 *kva_free;
616
617         /**
618          * \brief Cached value of the front-end's domain id.
619          * 
620          * This value is used at once for each mapped page in
621          * a transaction.  We cache it to avoid incuring the
622          * cost of an ivar access every time this is needed.
623          */
624         domid_t                   otherend_id;
625
626         /**
627          * \brief The blkif protocol abi in effect.
628          *
629          * There are situations where the back and front ends can
630          * have a different, native abi (e.g. intel x86_64 and
631          * 32bit x86 domains on the same machine).  The back-end
632          * always accomodates the front-end's native abi.  That
633          * value is pulled from the XenStore and recorded here.
634          */
635         int                       abi;
636
637         /**
638          * \brief The maximum number of requests and request lists allowed
639          *        to be in flight at a time.
640          *
641          * This value is negotiated via the XenStore.
642          */
643         u_int                     max_requests;
644
645         /**
646          * \brief The maximum number of segments (1 page per segment)
647          *        that can be mapped by a request.
648          *
649          * This value is negotiated via the XenStore.
650          */
651         u_int                     max_request_segments;
652
653         /**
654          * \brief Maximum number of segments per request list.
655          *
656          * This value is derived from and will generally be larger than
657          * max_request_segments.
658          */
659         u_int                     max_reqlist_segments;
660
661         /**
662          * The maximum size of any request to this back-end
663          * device.
664          *
665          * This value is negotiated via the XenStore.
666          */
667         u_int                     max_request_size;
668
669         /**
670          * The maximum size of any request list.  This is derived directly
671          * from max_reqlist_segments.
672          */
673         u_int                     max_reqlist_size;
674
675         /** Various configuration and state bit flags. */
676         xbb_flag_t                flags;
677
678         /** Ring mapping and interrupt configuration data. */
679         struct xbb_ring_config    ring_config;
680
681         /** Runtime, cross-abi safe, structures for ring access. */
682         blkif_back_rings_t        rings;
683
684         /** IRQ mapping for the communication ring event channel. */
685         int                       irq;
686
687         /**
688          * \brief Backend access mode flags (e.g. write, or read-only).
689          *
690          * This value is passed to us by the front-end via the XenStore.
691          */
692         char                     *dev_mode;
693
694         /**
695          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
696          *
697          * This value is passed to us by the front-end via the XenStore.
698          * Currently unused.
699          */
700         char                     *dev_type;
701
702         /**
703          * \brief Backend device/file identifier.
704          *
705          * This value is passed to us by the front-end via the XenStore.
706          * We expect this to be a POSIX path indicating the file or
707          * device to open.
708          */
709         char                     *dev_name;
710
711         /**
712          * Vnode corresponding to the backend device node or file
713          * we are acessing.
714          */
715         struct vnode             *vn;
716
717         union xbb_backend_data    backend;
718
719         /** The native sector size of the backend. */
720         u_int                     sector_size;
721
722         /** log2 of sector_size.  */
723         u_int                     sector_size_shift;
724
725         /** Size in bytes of the backend device or file.  */
726         off_t                     media_size;
727
728         /**
729          * \brief media_size expressed in terms of the backend native
730          *        sector size.
731          *
732          * (e.g. xbb->media_size >> xbb->sector_size_shift).
733          */
734         uint64_t                  media_num_sectors;
735
736         /**
737          * \brief Array of memoized scatter gather data computed during the
738          *        conversion of blkif ring requests to internal xbb_xen_req
739          *        structures.
740          *
741          * Ring processing is serialized so we only need one of these.
742          */
743         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
744
745         /**
746          * Temporary grant table map used in xbb_dispatch_io().  When
747          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
748          * stack could cause a stack overflow.
749          */
750         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
751
752         /** Mutex protecting per-instance data. */
753         struct mtx                lock;
754
755 #ifdef XENHVM
756         /**
757          * Resource representing allocated physical address space
758          * associated with our per-instance kva region.
759          */
760         struct resource          *pseudo_phys_res;
761
762         /** Resource id for allocated physical address space. */
763         int                       pseudo_phys_res_id;
764 #endif
765
766         /**
767          * I/O statistics from BlockBack dispatch down.  These are
768          * coalesced requests, and we start them right before execution.
769          */
770         struct devstat           *xbb_stats;
771
772         /**
773          * I/O statistics coming into BlockBack.  These are the requests as
774          * we get them from BlockFront.  They are started as soon as we
775          * receive a request, and completed when the I/O is complete.
776          */
777         struct devstat           *xbb_stats_in;
778
779         /** Disable sending flush to the backend */
780         int                       disable_flush;
781
782         /** Send a real flush for every N flush requests */
783         int                       flush_interval;
784
785         /** Count of flush requests in the interval */
786         int                       flush_count;
787
788         /** Don't coalesce requests if this is set */
789         int                       no_coalesce_reqs;
790
791         /** Number of requests we have received */
792         uint64_t                  reqs_received;
793
794         /** Number of requests we have completed*/
795         uint64_t                  reqs_completed;
796
797         /** How many forced dispatches (i.e. without coalescing) have happend */
798         uint64_t                  forced_dispatch;
799
800         /** How many normal dispatches have happend */
801         uint64_t                  normal_dispatch;
802
803         /** How many total dispatches have happend */
804         uint64_t                  total_dispatch;
805
806         /** How many times we have run out of KVA */
807         uint64_t                  kva_shortages;
808
809         /** How many times we have run out of request structures */
810         uint64_t                  request_shortages;
811 };
812
813 /*---------------------------- Request Processing ----------------------------*/
814 /**
815  * Allocate an internal transaction tracking structure from the free pool.
816  *
817  * \param xbb  Per-instance xbb configuration structure.
818  *
819  * \return  On success, a pointer to the allocated xbb_xen_req structure.
820  *          Otherwise NULL.
821  */
822 static inline struct xbb_xen_req *
823 xbb_get_req(struct xbb_softc *xbb)
824 {
825         struct xbb_xen_req *req;
826
827         req = NULL;
828
829         mtx_assert(&xbb->lock, MA_OWNED);
830
831         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
832                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
833                 xbb->active_request_count++;
834         }
835
836         return (req);
837 }
838
839 /**
840  * Return an allocated transaction tracking structure to the free pool.
841  *
842  * \param xbb  Per-instance xbb configuration structure.
843  * \param req  The request structure to free.
844  */
845 static inline void
846 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
847 {
848         mtx_assert(&xbb->lock, MA_OWNED);
849
850         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
851         xbb->active_request_count--;
852
853         KASSERT(xbb->active_request_count >= 0,
854                 ("xbb_release_req: negative active count"));
855 }
856
857 /**
858  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
859  *
860  * \param xbb       Per-instance xbb configuration structure.
861  * \param req_list  The list of requests to free.
862  * \param nreqs     The number of items in the list.
863  */
864 static inline void
865 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
866                  int nreqs)
867 {
868         mtx_assert(&xbb->lock, MA_OWNED);
869
870         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
871         xbb->active_request_count -= nreqs;
872
873         KASSERT(xbb->active_request_count >= 0,
874                 ("xbb_release_reqs: negative active count"));
875 }
876
877 /**
878  * Given a page index and 512b sector offset within that page,
879  * calculate an offset into a request's kva region.
880  *
881  * \param reqlist The request structure whose kva region will be accessed.
882  * \param pagenr  The page index used to compute the kva offset.
883  * \param sector  The 512b sector index used to compute the page relative
884  *                kva offset.
885  *
886  * \return  The computed global KVA offset.
887  */
888 static inline uint8_t *
889 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
890 {
891         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
892 }
893
894 #ifdef XBB_USE_BOUNCE_BUFFERS
895 /**
896  * Given a page index and 512b sector offset within that page,
897  * calculate an offset into a request's local bounce memory region.
898  *
899  * \param reqlist The request structure whose bounce region will be accessed.
900  * \param pagenr  The page index used to compute the bounce offset.
901  * \param sector  The 512b sector index used to compute the page relative
902  *                bounce offset.
903  *
904  * \return  The computed global bounce buffer address.
905  */
906 static inline uint8_t *
907 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
908 {
909         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
910 }
911 #endif
912
913 /**
914  * Given a page number and 512b sector offset within that page,
915  * calculate an offset into the request's memory region that the
916  * underlying backend device/file should use for I/O.
917  *
918  * \param reqlist The request structure whose I/O region will be accessed.
919  * \param pagenr  The page index used to compute the I/O offset.
920  * \param sector  The 512b sector index used to compute the page relative
921  *                I/O offset.
922  *
923  * \return  The computed global I/O address.
924  *
925  * Depending on configuration, this will either be a local bounce buffer
926  * or a pointer to the memory mapped in from the front-end domain for
927  * this request.
928  */
929 static inline uint8_t *
930 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
931 {
932 #ifdef XBB_USE_BOUNCE_BUFFERS
933         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
934 #else
935         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
936 #endif
937 }
938
939 /**
940  * Given a page index and 512b sector offset within that page, calculate
941  * an offset into the local psuedo-physical address space used to map a
942  * front-end's request data into a request.
943  *
944  * \param reqlist The request list structure whose pseudo-physical region
945  *                will be accessed.
946  * \param pagenr  The page index used to compute the pseudo-physical offset.
947  * \param sector  The 512b sector index used to compute the page relative
948  *                pseudo-physical offset.
949  *
950  * \return  The computed global pseudo-phsyical address.
951  *
952  * Depending on configuration, this will either be a local bounce buffer
953  * or a pointer to the memory mapped in from the front-end domain for
954  * this request.
955  */
956 static inline uintptr_t
957 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
958 {
959         struct xbb_softc *xbb;
960
961         xbb = reqlist->xbb;
962
963         return ((uintptr_t)(xbb->gnt_base_addr +
964                 (uintptr_t)(reqlist->kva - xbb->kva) +
965                 (PAGE_SIZE * pagenr) + (sector << 9)));
966 }
967
968 /**
969  * Get Kernel Virtual Address space for mapping requests.
970  *
971  * \param xbb         Per-instance xbb configuration structure.
972  * \param nr_pages    Number of pages needed.
973  * \param check_only  If set, check for free KVA but don't allocate it.
974  * \param have_lock   If set, xbb lock is already held.
975  *
976  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
977  *
978  * Note:  This should be unnecessary once we have either chaining or
979  * scatter/gather support for struct bio.  At that point we'll be able to
980  * put multiple addresses and lengths in one bio/bio chain and won't need
981  * to map everything into one virtual segment.
982  */
983 static uint8_t *
984 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
985 {
986         intptr_t first_clear;
987         intptr_t num_clear;
988         uint8_t *free_kva;
989         int      i;
990
991         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
992
993         first_clear = 0;
994         free_kva = NULL;
995
996         mtx_lock(&xbb->lock);
997
998         /*
999          * Look for the first available page.  If there are none, we're done.
1000          */
1001         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
1002
1003         if (first_clear == -1)
1004                 goto bailout;
1005
1006         /*
1007          * Starting at the first available page, look for consecutive free
1008          * pages that will satisfy the user's request.
1009          */
1010         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1011                 /*
1012                  * If this is true, the page is used, so we have to reset
1013                  * the number of clear pages and the first clear page
1014                  * (since it pointed to a region with an insufficient number
1015                  * of clear pages).
1016                  */
1017                 if (bit_test(xbb->kva_free, i)) {
1018                         num_clear = 0;
1019                         first_clear = -1;
1020                         continue;
1021                 }
1022
1023                 if (first_clear == -1)
1024                         first_clear = i;
1025
1026                 /*
1027                  * If this is true, we've found a large enough free region
1028                  * to satisfy the request.
1029                  */
1030                 if (++num_clear == nr_pages) {
1031
1032                         bit_nset(xbb->kva_free, first_clear,
1033                                  first_clear + nr_pages - 1);
1034
1035                         free_kva = xbb->kva +
1036                                 (uint8_t *)(first_clear * PAGE_SIZE);
1037
1038                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1039                                 free_kva + (nr_pages * PAGE_SIZE) <=
1040                                 (uint8_t *)xbb->ring_config.va,
1041                                 ("Free KVA %p len %d out of range, "
1042                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1043                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1044                                  (uintmax_t)xbb->ring_config.va));
1045                         break;
1046                 }
1047         }
1048
1049 bailout:
1050
1051         if (free_kva == NULL) {
1052                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1053                 xbb->kva_shortages++;
1054         }
1055
1056         mtx_unlock(&xbb->lock);
1057
1058         return (free_kva);
1059 }
1060
1061 /**
1062  * Free allocated KVA.
1063  *
1064  * \param xbb       Per-instance xbb configuration structure.
1065  * \param kva_ptr   Pointer to allocated KVA region.  
1066  * \param nr_pages  Number of pages in the KVA region.
1067  */
1068 static void
1069 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1070 {
1071         intptr_t start_page;
1072
1073         mtx_assert(&xbb->lock, MA_OWNED);
1074
1075         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1076         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1077
1078 }
1079
1080 /**
1081  * Unmap the front-end pages associated with this I/O request.
1082  *
1083  * \param req  The request structure to unmap.
1084  */
1085 static void
1086 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1087 {
1088         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1089         u_int                         i;
1090         u_int                         invcount;
1091         int                           error;
1092
1093         invcount = 0;
1094         for (i = 0; i < reqlist->nr_segments; i++) {
1095
1096                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1097                         continue;
1098
1099                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1100                 unmap[invcount].dev_bus_addr = 0;
1101                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1102                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1103                 invcount++;
1104         }
1105
1106         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1107                                           unmap, invcount);
1108         KASSERT(error == 0, ("Grant table operation failed"));
1109 }
1110
1111 /**
1112  * Allocate an internal transaction tracking structure from the free pool.
1113  *
1114  * \param xbb  Per-instance xbb configuration structure.
1115  *
1116  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1117  *          Otherwise NULL.
1118  */
1119 static inline struct xbb_xen_reqlist *
1120 xbb_get_reqlist(struct xbb_softc *xbb)
1121 {
1122         struct xbb_xen_reqlist *reqlist;
1123
1124         reqlist = NULL;
1125
1126         mtx_assert(&xbb->lock, MA_OWNED);
1127
1128         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1129
1130                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1131                 reqlist->flags = XBB_REQLIST_NONE;
1132                 reqlist->kva = NULL;
1133                 reqlist->status = BLKIF_RSP_OKAY;
1134                 reqlist->residual_512b_sectors = 0;
1135                 reqlist->num_children = 0;
1136                 reqlist->nr_segments = 0;
1137                 STAILQ_INIT(&reqlist->contig_req_list);
1138         }
1139
1140         return (reqlist);
1141 }
1142
1143 /**
1144  * Return an allocated transaction tracking structure to the free pool.
1145  *
1146  * \param xbb        Per-instance xbb configuration structure.
1147  * \param req        The request list structure to free.
1148  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
1149  *                   during a resource shortage condition.
1150  */
1151 static inline void
1152 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1153                     int wakeup)
1154 {
1155
1156         mtx_lock(&xbb->lock);
1157
1158         if (wakeup) {
1159                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1160                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1161         }
1162
1163         if (reqlist->kva != NULL)
1164                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1165
1166         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1167
1168         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1169
1170         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1171                 /*
1172                  * Shutdown is in progress.  See if we can
1173                  * progress further now that one more request
1174                  * has completed and been returned to the
1175                  * free pool.
1176                  */
1177                 xbb_shutdown(xbb);
1178         }
1179
1180         mtx_unlock(&xbb->lock);
1181
1182         if (wakeup != 0)
1183                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1184 }
1185
1186 /**
1187  * Request resources and do basic request setup.
1188  *
1189  * \param xbb          Per-instance xbb configuration structure.
1190  * \param reqlist      Pointer to reqlist pointer.
1191  * \param ring_req     Pointer to a block ring request.
1192  * \param ring_index   The ring index of this request.
1193  *
1194  * \return  0 for success, non-zero for failure.
1195  */
1196 static int
1197 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1198                   blkif_request_t *ring_req, RING_IDX ring_idx)
1199 {
1200         struct xbb_xen_reqlist *nreqlist;
1201         struct xbb_xen_req     *nreq;
1202
1203         nreqlist = NULL;
1204         nreq     = NULL;
1205
1206         mtx_lock(&xbb->lock);
1207
1208         /*
1209          * We don't allow new resources to be allocated if we're in the
1210          * process of shutting down.
1211          */
1212         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1213                 mtx_unlock(&xbb->lock);
1214                 return (1);
1215         }
1216
1217         /*
1218          * Allocate a reqlist if the caller doesn't have one already.
1219          */
1220         if (*reqlist == NULL) {
1221                 nreqlist = xbb_get_reqlist(xbb);
1222                 if (nreqlist == NULL)
1223                         goto bailout_error;
1224         }
1225
1226         /* We always allocate a request. */
1227         nreq = xbb_get_req(xbb);
1228         if (nreq == NULL)
1229                 goto bailout_error;
1230
1231         mtx_unlock(&xbb->lock);
1232
1233         if (*reqlist == NULL) {
1234                 *reqlist = nreqlist;
1235                 nreqlist->operation = ring_req->operation;
1236                 nreqlist->starting_sector_number = ring_req->sector_number;
1237                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1238                                    links);
1239         }
1240
1241         nreq->reqlist = *reqlist;
1242         nreq->req_ring_idx = ring_idx;
1243
1244         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1245                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1246                 nreq->ring_req = &nreq->ring_req_storage;
1247         } else {
1248                 nreq->ring_req = ring_req;
1249         }
1250
1251         binuptime(&nreq->ds_t0);
1252         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1253         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1254         (*reqlist)->num_children++;
1255         (*reqlist)->nr_segments += ring_req->nr_segments;
1256
1257         return (0);
1258
1259 bailout_error:
1260
1261         /*
1262          * We're out of resources, so set the shortage flag.  The next time
1263          * a request is released, we'll try waking up the work thread to
1264          * see if we can allocate more resources.
1265          */
1266         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1267         xbb->request_shortages++;
1268
1269         if (nreq != NULL)
1270                 xbb_release_req(xbb, nreq);
1271
1272         mtx_unlock(&xbb->lock);
1273
1274         if (nreqlist != NULL)
1275                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1276
1277         return (1);
1278 }
1279
1280 /**
1281  * Create and transmit a response to a blkif request.
1282  * 
1283  * \param xbb     Per-instance xbb configuration structure.
1284  * \param req     The request structure to which to respond.
1285  * \param status  The status code to report.  See BLKIF_RSP_*
1286  *                in sys/xen/interface/io/blkif.h.
1287  */
1288 static void
1289 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1290 {
1291         blkif_response_t *resp;
1292         int               more_to_do;
1293         int               notify;
1294
1295         more_to_do = 0;
1296
1297         /*
1298          * Place on the response ring for the relevant domain.
1299          * For now, only the spacing between entries is different
1300          * in the different ABIs, not the response entry layout.
1301          */
1302         mtx_lock(&xbb->lock);
1303         switch (xbb->abi) {
1304         case BLKIF_PROTOCOL_NATIVE:
1305                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1306                                          xbb->rings.native.rsp_prod_pvt);
1307                 break;
1308         case BLKIF_PROTOCOL_X86_32:
1309                 resp = (blkif_response_t *)
1310                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1311                                       xbb->rings.x86_32.rsp_prod_pvt);
1312                 break;
1313         case BLKIF_PROTOCOL_X86_64:
1314                 resp = (blkif_response_t *)
1315                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1316                                       xbb->rings.x86_64.rsp_prod_pvt);
1317                 break;
1318         default:
1319                 panic("Unexpected blkif protocol ABI.");
1320         }
1321
1322         resp->id        = req->id;
1323         resp->operation = req->operation;
1324         resp->status    = status;
1325
1326         xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1327         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1328
1329         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1330
1331                 /*
1332                  * Tail check for pending requests. Allows frontend to avoid
1333                  * notifications if requests are already in flight (lower
1334                  * overheads and promotes batching).
1335                  */
1336                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1337         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1338
1339                 more_to_do = 1;
1340         }
1341
1342         xbb->reqs_completed++;
1343
1344         mtx_unlock(&xbb->lock);
1345
1346         if (more_to_do)
1347                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1348
1349         if (notify)
1350                 notify_remote_via_irq(xbb->irq);
1351 }
1352
1353 /**
1354  * Complete a request list.
1355  *
1356  * \param xbb        Per-instance xbb configuration structure.
1357  * \param reqlist    Allocated internal request list structure.
1358  */
1359 static void
1360 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1361 {
1362         struct xbb_xen_req *nreq;
1363         off_t               sectors_sent;
1364
1365         sectors_sent = 0;
1366
1367         if (reqlist->flags & XBB_REQLIST_MAPPED)
1368                 xbb_unmap_reqlist(reqlist);
1369
1370         /*
1371          * All I/O is done, send the response.  A lock should not be
1372          * necessary here because the request list is complete, and
1373          * therefore this is the only context accessing this request
1374          * right now.  The functions we call do their own locking if
1375          * necessary.
1376          */
1377         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1378                 off_t cur_sectors_sent;
1379
1380                 xbb_send_response(xbb, nreq, reqlist->status);
1381
1382                 /* We don't report bytes sent if there is an error. */
1383                 if (reqlist->status == BLKIF_RSP_OKAY)
1384                         cur_sectors_sent = nreq->nr_512b_sectors;
1385                 else
1386                         cur_sectors_sent = 0;
1387
1388                 sectors_sent += cur_sectors_sent;
1389
1390                 devstat_end_transaction(xbb->xbb_stats_in,
1391                                         /*bytes*/cur_sectors_sent << 9,
1392                                         reqlist->ds_tag_type,
1393                                         reqlist->ds_trans_type,
1394                                         /*now*/NULL,
1395                                         /*then*/&nreq->ds_t0);
1396         }
1397
1398         /*
1399          * Take out any sectors not sent.  If we wind up negative (which
1400          * might happen if an error is reported as well as a residual), just
1401          * report 0 sectors sent.
1402          */
1403         sectors_sent -= reqlist->residual_512b_sectors;
1404         if (sectors_sent < 0)
1405                 sectors_sent = 0;
1406
1407         devstat_end_transaction(xbb->xbb_stats,
1408                                 /*bytes*/ sectors_sent << 9,
1409                                 reqlist->ds_tag_type,
1410                                 reqlist->ds_trans_type,
1411                                 /*now*/NULL,
1412                                 /*then*/&reqlist->ds_t0);
1413
1414         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1415 }
1416
1417 /**
1418  * Completion handler for buffer I/O requests issued by the device
1419  * backend driver.
1420  *
1421  * \param bio  The buffer I/O request on which to perform completion
1422  *             processing.
1423  */
1424 static void
1425 xbb_bio_done(struct bio *bio)
1426 {
1427         struct xbb_softc       *xbb;
1428         struct xbb_xen_reqlist *reqlist;
1429
1430         reqlist = bio->bio_caller1;
1431         xbb     = reqlist->xbb;
1432
1433         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1434
1435         /*
1436          * This is a bit imprecise.  With aggregated I/O a single
1437          * request list can contain multiple front-end requests and
1438          * a multiple bios may point to a single request.  By carefully
1439          * walking the request list, we could map residuals and errors
1440          * back to the original front-end request, but the interface
1441          * isn't sufficiently rich for us to properly report the error.
1442          * So, we just treat the entire request list as having failed if an
1443          * error occurs on any part.  And, if an error occurs, we treat
1444          * the amount of data transferred as 0.
1445          *
1446          * For residuals, we report it on the overall aggregated device,
1447          * but not on the individual requests, since we don't currently
1448          * do the work to determine which front-end request to which the
1449          * residual applies.
1450          */
1451         if (bio->bio_error) {
1452                 DPRINTF("BIO returned error %d for operation on device %s\n",
1453                         bio->bio_error, xbb->dev_name);
1454                 reqlist->status = BLKIF_RSP_ERROR;
1455
1456                 if (bio->bio_error == ENXIO
1457                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1458
1459                         /*
1460                          * Backend device has disappeared.  Signal the
1461                          * front-end that we (the device proxy) want to
1462                          * go away.
1463                          */
1464                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1465                 }
1466         }
1467
1468 #ifdef XBB_USE_BOUNCE_BUFFERS
1469         if (bio->bio_cmd == BIO_READ) {
1470                 vm_offset_t kva_offset;
1471
1472                 kva_offset = (vm_offset_t)bio->bio_data
1473                            - (vm_offset_t)reqlist->bounce;
1474                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1475                        bio->bio_data, bio->bio_bcount);
1476         }
1477 #endif /* XBB_USE_BOUNCE_BUFFERS */
1478
1479         /*
1480          * Decrement the pending count for the request list.  When we're
1481          * done with the requests, send status back for all of them.
1482          */
1483         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1484                 xbb_complete_reqlist(xbb, reqlist);
1485
1486         g_destroy_bio(bio);
1487 }
1488
1489 /**
1490  * Parse a blkif request into an internal request structure and send
1491  * it to the backend for processing.
1492  *
1493  * \param xbb       Per-instance xbb configuration structure.
1494  * \param reqlist   Allocated internal request list structure.
1495  *
1496  * \return          On success, 0.  For resource shortages, non-zero.
1497  *  
1498  * This routine performs the backend common aspects of request parsing
1499  * including compiling an internal request structure, parsing the S/G
1500  * list and any secondary ring requests in which they may reside, and
1501  * the mapping of front-end I/O pages into our domain.
1502  */
1503 static int
1504 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1505 {
1506         struct xbb_sg                *xbb_sg;
1507         struct gnttab_map_grant_ref  *map;
1508         struct blkif_request_segment *sg;
1509         struct blkif_request_segment *last_block_sg;
1510         struct xbb_xen_req           *nreq;
1511         u_int                         nseg;
1512         u_int                         seg_idx;
1513         u_int                         block_segs;
1514         int                           nr_sects;
1515         int                           total_sects;
1516         int                           operation;
1517         uint8_t                       bio_flags;
1518         int                           error;
1519
1520         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1521         bio_flags            = 0;
1522         total_sects          = 0;
1523         nr_sects             = 0;
1524
1525         /*
1526          * First determine whether we have enough free KVA to satisfy this
1527          * request list.  If not, tell xbb_run_queue() so it can go to
1528          * sleep until we have more KVA.
1529          */
1530         reqlist->kva = NULL;
1531         if (reqlist->nr_segments != 0) {
1532                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1533                 if (reqlist->kva == NULL) {
1534                         /*
1535                          * If we're out of KVA, return ENOMEM.
1536                          */
1537                         return (ENOMEM);
1538                 }
1539         }
1540
1541         binuptime(&reqlist->ds_t0);
1542         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1543
1544         switch (reqlist->operation) {
1545         case BLKIF_OP_WRITE_BARRIER:
1546                 bio_flags       |= BIO_ORDERED;
1547                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1548                 /* FALLTHROUGH */
1549         case BLKIF_OP_WRITE:
1550                 operation = BIO_WRITE;
1551                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1552                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1553                         DPRINTF("Attempt to write to read only device %s\n",
1554                                 xbb->dev_name);
1555                         reqlist->status = BLKIF_RSP_ERROR;
1556                         goto send_response;
1557                 }
1558                 break;
1559         case BLKIF_OP_READ:
1560                 operation = BIO_READ;
1561                 reqlist->ds_trans_type = DEVSTAT_READ;
1562                 break;
1563         case BLKIF_OP_FLUSH_DISKCACHE:
1564                 /*
1565                  * If this is true, the user has requested that we disable
1566                  * flush support.  So we just complete the requests
1567                  * successfully.
1568                  */
1569                 if (xbb->disable_flush != 0) {
1570                         goto send_response;
1571                 }
1572
1573                 /*
1574                  * The user has requested that we only send a real flush
1575                  * for every N flush requests.  So keep count, and either
1576                  * complete the request immediately or queue it for the
1577                  * backend.
1578                  */
1579                 if (xbb->flush_interval != 0) {
1580                         if (++(xbb->flush_count) < xbb->flush_interval) {
1581                                 goto send_response;
1582                         } else
1583                                 xbb->flush_count = 0;
1584                 }
1585
1586                 operation = BIO_FLUSH;
1587                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1588                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1589                 goto do_dispatch;
1590                 /*NOTREACHED*/
1591         default:
1592                 DPRINTF("error: unknown block io operation [%d]\n",
1593                         reqlist->operation);
1594                 reqlist->status = BLKIF_RSP_ERROR;
1595                 goto send_response;
1596         }
1597
1598         reqlist->xbb  = xbb;
1599         xbb_sg        = xbb->xbb_sgs;
1600         map           = xbb->maps;
1601         seg_idx       = 0;
1602
1603         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1604                 blkif_request_t         *ring_req;
1605                 RING_IDX                 req_ring_idx;
1606                 u_int                    req_seg_idx;
1607
1608                 ring_req              = nreq->ring_req;
1609                 req_ring_idx          = nreq->req_ring_idx;
1610                 nr_sects              = 0;
1611                 nseg                  = ring_req->nr_segments;
1612                 nreq->id              = ring_req->id;
1613                 nreq->nr_pages        = nseg;
1614                 nreq->nr_512b_sectors = 0;
1615                 req_seg_idx           = 0;
1616                 sg                    = NULL;
1617
1618                 /* Check that number of segments is sane. */
1619                 if (unlikely(nseg == 0)
1620                  || unlikely(nseg > xbb->max_request_segments)) {
1621                         DPRINTF("Bad number of segments in request (%d)\n",
1622                                 nseg);
1623                         reqlist->status = BLKIF_RSP_ERROR;
1624                         goto send_response;
1625                 }
1626
1627                 block_segs    = MIN(nreq->nr_pages,
1628                                     BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1629                 sg            = ring_req->seg;
1630                 last_block_sg = sg + block_segs;
1631                 while (1) {
1632
1633                         while (sg < last_block_sg) {
1634                                 KASSERT(seg_idx <
1635                                         XBB_MAX_SEGMENTS_PER_REQLIST,
1636                                         ("seg_idx %d is too large, max "
1637                                         "segs %d\n", seg_idx,
1638                                         XBB_MAX_SEGMENTS_PER_REQLIST));
1639                         
1640                                 xbb_sg->first_sect = sg->first_sect;
1641                                 xbb_sg->last_sect  = sg->last_sect;
1642                                 xbb_sg->nsect =
1643                                     (int8_t)(sg->last_sect -
1644                                     sg->first_sect + 1);
1645
1646                                 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1647                                  || (xbb_sg->nsect <= 0)) {
1648                                         reqlist->status = BLKIF_RSP_ERROR;
1649                                         goto send_response;
1650                                 }
1651
1652                                 nr_sects += xbb_sg->nsect;
1653                                 map->host_addr = xbb_get_gntaddr(reqlist,
1654                                                         seg_idx, /*sector*/0);
1655                                 KASSERT(map->host_addr + PAGE_SIZE <=
1656                                         xbb->ring_config.gnt_addr,
1657                                         ("Host address %#jx len %d overlaps "
1658                                          "ring address %#jx\n",
1659                                         (uintmax_t)map->host_addr, PAGE_SIZE,
1660                                         (uintmax_t)xbb->ring_config.gnt_addr));
1661                                         
1662                                 map->flags     = GNTMAP_host_map;
1663                                 map->ref       = sg->gref;
1664                                 map->dom       = xbb->otherend_id;
1665                                 if (operation == BIO_WRITE)
1666                                         map->flags |= GNTMAP_readonly;
1667                                 sg++;
1668                                 map++;
1669                                 xbb_sg++;
1670                                 seg_idx++;
1671                                 req_seg_idx++;
1672                         }
1673
1674                         block_segs = MIN(nseg - req_seg_idx,
1675                                          BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1676                         if (block_segs == 0)
1677                                 break;
1678
1679                         /*
1680                          * Fetch the next request block full of SG elements.
1681                          * For now, only the spacing between entries is
1682                          * different in the different ABIs, not the sg entry
1683                          * layout.
1684                          */
1685                         req_ring_idx++;
1686                         switch (xbb->abi) {
1687                         case BLKIF_PROTOCOL_NATIVE:
1688                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
1689                                                            req_ring_idx);
1690                                 break;
1691                         case BLKIF_PROTOCOL_X86_32:
1692                         {
1693                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
1694                                                            req_ring_idx);
1695                                 break;
1696                         }
1697                         case BLKIF_PROTOCOL_X86_64:
1698                         {
1699                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
1700                                                            req_ring_idx);
1701                                 break;
1702                         }
1703                         default:
1704                                 panic("Unexpected blkif protocol ABI.");
1705                                 /* NOTREACHED */
1706                         } 
1707                         last_block_sg = sg + block_segs;
1708                 }
1709
1710                 /* Convert to the disk's sector size */
1711                 nreq->nr_512b_sectors = nr_sects;
1712                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1713                 total_sects += nr_sects;
1714
1715                 if ((nreq->nr_512b_sectors &
1716                     ((xbb->sector_size >> 9) - 1)) != 0) {
1717                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1718                                       "a multiple of the backing store sector "
1719                                       "size (%d)\n", __func__,
1720                                       nreq->nr_512b_sectors << 9,
1721                                       xbb->sector_size);
1722                         reqlist->status = BLKIF_RSP_ERROR;
1723                         goto send_response;
1724                 }
1725         }
1726
1727         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1728                                           xbb->maps, reqlist->nr_segments);
1729         if (error != 0)
1730                 panic("Grant table operation failed (%d)", error);
1731
1732         reqlist->flags |= XBB_REQLIST_MAPPED;
1733
1734         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1735              seg_idx++, map++){
1736
1737                 if (unlikely(map->status != 0)) {
1738                         DPRINTF("invalid buffer -- could not remap "
1739                                 "it (%d)\n", map->status);
1740                         DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1741                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1742                                 map->host_addr, map->flags, map->ref,
1743                                 map->dom);
1744                         reqlist->status = BLKIF_RSP_ERROR;
1745                         goto send_response;
1746                 }
1747
1748                 reqlist->gnt_handles[seg_idx] = map->handle;
1749         }
1750         if (reqlist->starting_sector_number + total_sects >
1751             xbb->media_num_sectors) {
1752
1753                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1754                         "extends past end of device %s\n",
1755                         operation == BIO_READ ? "read" : "write",
1756                         reqlist->starting_sector_number,
1757                         reqlist->starting_sector_number + total_sects,
1758                         xbb->dev_name); 
1759                 reqlist->status = BLKIF_RSP_ERROR;
1760                 goto send_response;
1761         }
1762
1763 do_dispatch:
1764
1765         error = xbb->dispatch_io(xbb,
1766                                  reqlist,
1767                                  operation,
1768                                  bio_flags);
1769
1770         if (error != 0) {
1771                 reqlist->status = BLKIF_RSP_ERROR;
1772                 goto send_response;
1773         }
1774
1775         return (0);
1776
1777 send_response:
1778
1779         xbb_complete_reqlist(xbb, reqlist);
1780
1781         return (0);
1782 }
1783
1784 static __inline int
1785 xbb_count_sects(blkif_request_t *ring_req)
1786 {
1787         int i;
1788         int cur_size = 0;
1789
1790         for (i = 0; i < ring_req->nr_segments; i++) {
1791                 int nsect;
1792
1793                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1794                         ring_req->seg[i].first_sect + 1);
1795                 if (nsect <= 0)
1796                         break;
1797
1798                 cur_size += nsect;
1799         }
1800
1801         return (cur_size);
1802 }
1803
1804 /**
1805  * Process incoming requests from the shared communication ring in response
1806  * to a signal on the ring's event channel.
1807  *
1808  * \param context  Callback argument registerd during task initialization -
1809  *                 the xbb_softc for this instance.
1810  * \param pending  The number of taskqueue_enqueue events that have
1811  *                 occurred since this handler was last run.
1812  */
1813 static void
1814 xbb_run_queue(void *context, int pending)
1815 {
1816         struct xbb_softc       *xbb;
1817         blkif_back_rings_t     *rings;
1818         RING_IDX                rp;
1819         uint64_t                cur_sector;
1820         int                     cur_operation;
1821         struct xbb_xen_reqlist *reqlist;
1822
1823
1824         xbb   = (struct xbb_softc *)context;
1825         rings = &xbb->rings;
1826
1827         /*
1828          * Work gather and dispatch loop.  Note that we have a bias here
1829          * towards gathering I/O sent by blockfront.  We first gather up
1830          * everything in the ring, as long as we have resources.  Then we
1831          * dispatch one request, and then attempt to gather up any
1832          * additional requests that have come in while we were dispatching
1833          * the request.
1834          *
1835          * This allows us to get a clearer picture (via devstat) of how
1836          * many requests blockfront is queueing to us at any given time.
1837          */
1838         for (;;) {
1839                 int retval;
1840
1841                 /*
1842                  * Initialize reqlist to the last element in the pending
1843                  * queue, if there is one.  This allows us to add more
1844                  * requests to that request list, if we have room.
1845                  */
1846                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1847                                       xbb_xen_reqlist, links);
1848                 if (reqlist != NULL) {
1849                         cur_sector = reqlist->next_contig_sector;
1850                         cur_operation = reqlist->operation;
1851                 } else {
1852                         cur_operation = 0;
1853                         cur_sector    = 0;
1854                 }
1855
1856                 /*
1857                  * Cache req_prod to avoid accessing a cache line shared
1858                  * with the frontend.
1859                  */
1860                 rp = rings->common.sring->req_prod;
1861
1862                 /* Ensure we see queued requests up to 'rp'. */
1863                 rmb();
1864
1865                 /**
1866                  * Run so long as there is work to consume and the generation
1867                  * of a response will not overflow the ring.
1868                  *
1869                  * @note There's a 1 to 1 relationship between requests and
1870                  *       responses, so an overflow should never occur.  This
1871                  *       test is to protect our domain from digesting bogus
1872                  *       data.  Shouldn't we log this?
1873                  */
1874                 while (rings->common.req_cons != rp
1875                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1876                                                   rings->common.req_cons) == 0){
1877                         blkif_request_t         ring_req_storage;
1878                         blkif_request_t        *ring_req;
1879                         int                     cur_size;
1880
1881                         switch (xbb->abi) {
1882                         case BLKIF_PROTOCOL_NATIVE:
1883                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1884                                     rings->common.req_cons);
1885                                 break;
1886                         case BLKIF_PROTOCOL_X86_32:
1887                         {
1888                                 struct blkif_x86_32_request *ring_req32;
1889
1890                                 ring_req32 = RING_GET_REQUEST(
1891                                     &xbb->rings.x86_32, rings->common.req_cons);
1892                                 blkif_get_x86_32_req(&ring_req_storage,
1893                                                      ring_req32);
1894                                 ring_req = &ring_req_storage;
1895                                 break;
1896                         }
1897                         case BLKIF_PROTOCOL_X86_64:
1898                         {
1899                                 struct blkif_x86_64_request *ring_req64;
1900
1901                                 ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64,
1902                                     rings->common.req_cons);
1903                                 blkif_get_x86_64_req(&ring_req_storage,
1904                                                      ring_req64);
1905                                 ring_req = &ring_req_storage;
1906                                 break;
1907                         }
1908                         default:
1909                                 panic("Unexpected blkif protocol ABI.");
1910                                 /* NOTREACHED */
1911                         } 
1912
1913                         /*
1914                          * Check for situations that would require closing
1915                          * off this I/O for further coalescing:
1916                          *  - Coalescing is turned off.
1917                          *  - Current I/O is out of sequence with the previous
1918                          *    I/O.
1919                          *  - Coalesced I/O would be too large.
1920                          */
1921                         if ((reqlist != NULL)
1922                          && ((xbb->no_coalesce_reqs != 0)
1923                           || ((xbb->no_coalesce_reqs == 0)
1924                            && ((ring_req->sector_number != cur_sector)
1925                             || (ring_req->operation != cur_operation)
1926                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1927                                  xbb->max_reqlist_segments))))) {
1928                                 reqlist = NULL;
1929                         }
1930
1931                         /*
1932                          * Grab and check for all resources in one shot.
1933                          * If we can't get all of the resources we need,
1934                          * the shortage is noted and the thread will get
1935                          * woken up when more resources are available.
1936                          */
1937                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1938                                                    xbb->rings.common.req_cons);
1939
1940                         if (retval != 0) {
1941                                 /*
1942                                  * Resource shortage has been recorded.
1943                                  * We'll be scheduled to run once a request
1944                                  * object frees up due to a completion.
1945                                  */
1946                                 break;
1947                         }
1948
1949                         /*
1950                          * Signify that we can overwrite this request with
1951                          * a response by incrementing our consumer index.
1952                          * The response won't be generated until after
1953                          * we've already consumed all necessary data out
1954                          * of the version of the request in the ring buffer
1955                          * (for native mode).  We must update the consumer
1956                          * index  before issueing back-end I/O so there is
1957                          * no possibility that it will complete and a
1958                          * response be generated before we make room in 
1959                          * the queue for that response.
1960                          */
1961                         xbb->rings.common.req_cons +=
1962                             BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1963                         xbb->reqs_received++;
1964
1965                         cur_size = xbb_count_sects(ring_req);
1966                         cur_sector = ring_req->sector_number + cur_size;
1967                         reqlist->next_contig_sector = cur_sector;
1968                         cur_operation = ring_req->operation;
1969                 }
1970
1971                 /* Check for I/O to dispatch */
1972                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1973                 if (reqlist == NULL) {
1974                         /*
1975                          * We're out of work to do, put the task queue to
1976                          * sleep.
1977                          */
1978                         break;
1979                 }
1980
1981                 /*
1982                  * Grab the first request off the queue and attempt
1983                  * to dispatch it.
1984                  */
1985                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1986
1987                 retval = xbb_dispatch_io(xbb, reqlist);
1988                 if (retval != 0) {
1989                         /*
1990                          * xbb_dispatch_io() returns non-zero only when
1991                          * there is a resource shortage.  If that's the
1992                          * case, re-queue this request on the head of the
1993                          * queue, and go to sleep until we have more
1994                          * resources.
1995                          */
1996                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1997                                            reqlist, links);
1998                         break;
1999                 } else {
2000                         /*
2001                          * If we still have anything on the queue after
2002                          * removing the head entry, that is because we
2003                          * met one of the criteria to create a new
2004                          * request list (outlined above), and we'll call
2005                          * that a forced dispatch for statistical purposes.
2006                          *
2007                          * Otherwise, if there is only one element on the
2008                          * queue, we coalesced everything available on
2009                          * the ring and we'll call that a normal dispatch.
2010                          */
2011                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2012
2013                         if (reqlist != NULL)
2014                                 xbb->forced_dispatch++;
2015                         else
2016                                 xbb->normal_dispatch++;
2017
2018                         xbb->total_dispatch++;
2019                 }
2020         }
2021 }
2022
2023 /**
2024  * Interrupt handler bound to the shared ring's event channel.
2025  *
2026  * \param arg  Callback argument registerd during event channel
2027  *             binding - the xbb_softc for this instance.
2028  */
2029 static void
2030 xbb_intr(void *arg)
2031 {
2032         struct xbb_softc *xbb;
2033
2034         /* Defer to kernel thread. */
2035         xbb = (struct xbb_softc *)arg;
2036         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
2037 }
2038
2039 SDT_PROVIDER_DEFINE(xbb);
2040 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
2041 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
2042                   "uint64_t");
2043 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
2044                   "uint64_t", "uint64_t");
2045
2046 /*----------------------------- Backend Handlers -----------------------------*/
2047 /**
2048  * Backend handler for character device access.
2049  *
2050  * \param xbb        Per-instance xbb configuration structure.
2051  * \param reqlist    Allocated internal request list structure.
2052  * \param operation  BIO_* I/O operation code.
2053  * \param bio_flags  Additional bio_flag data to pass to any generated
2054  *                   bios (e.g. BIO_ORDERED)..
2055  *
2056  * \return  0 for success, errno codes for failure.
2057  */
2058 static int
2059 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2060                  int operation, int bio_flags)
2061 {
2062         struct xbb_dev_data *dev_data;
2063         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2064         struct xbb_xen_req  *nreq;
2065         off_t                bio_offset;
2066         struct bio          *bio;
2067         struct xbb_sg       *xbb_sg;
2068         u_int                nbio;
2069         u_int                bio_idx;
2070         u_int                nseg;
2071         u_int                seg_idx;
2072         int                  error;
2073
2074         dev_data   = &xbb->backend.dev;
2075         bio_offset = (off_t)reqlist->starting_sector_number
2076                    << xbb->sector_size_shift;
2077         error      = 0;
2078         nbio       = 0;
2079         bio_idx    = 0;
2080
2081         if (operation == BIO_FLUSH) {
2082                 nreq = STAILQ_FIRST(&reqlist->contig_req_list);
2083                 bio = g_new_bio();
2084                 if (unlikely(bio == NULL)) {
2085                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2086                         error = ENOMEM;
2087                         return (error);
2088                 }
2089
2090                 bio->bio_cmd     = BIO_FLUSH;
2091                 bio->bio_flags  |= BIO_ORDERED;
2092                 bio->bio_dev     = dev_data->cdev;
2093                 bio->bio_offset  = 0;
2094                 bio->bio_data    = 0;
2095                 bio->bio_done    = xbb_bio_done;
2096                 bio->bio_caller1 = nreq;
2097                 bio->bio_pblkno  = 0;
2098
2099                 nreq->pendcnt    = 1;
2100
2101                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2102                            device_get_unit(xbb->dev));
2103
2104                 (*dev_data->csw->d_strategy)(bio);
2105
2106                 return (0);
2107         }
2108
2109         xbb_sg = xbb->xbb_sgs;
2110         bio    = NULL;
2111         nseg = reqlist->nr_segments;
2112
2113         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2114
2115                 /*
2116                  * KVA will not be contiguous, so any additional
2117                  * I/O will need to be represented in a new bio.
2118                  */
2119                 if ((bio != NULL)
2120                  && (xbb_sg->first_sect != 0)) {
2121                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2122                                 printf("%s: Discontiguous I/O request "
2123                                        "from domain %d ends on "
2124                                        "non-sector boundary\n",
2125                                        __func__, xbb->otherend_id);
2126                                 error = EINVAL;
2127                                 goto fail_free_bios;
2128                         }
2129                         bio = NULL;
2130                 }
2131
2132                 if (bio == NULL) {
2133                         /*
2134                          * Make sure that the start of this bio is
2135                          * aligned to a device sector.
2136                          */
2137                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2138                                 printf("%s: Misaligned I/O request "
2139                                        "from domain %d\n", __func__,
2140                                        xbb->otherend_id);
2141                                 error = EINVAL;
2142                                 goto fail_free_bios;
2143                         }
2144
2145                         bio = bios[nbio++] = g_new_bio();
2146                         if (unlikely(bio == NULL)) {
2147                                 error = ENOMEM;
2148                                 goto fail_free_bios;
2149                         }
2150                         bio->bio_cmd     = operation;
2151                         bio->bio_flags  |= bio_flags;
2152                         bio->bio_dev     = dev_data->cdev;
2153                         bio->bio_offset  = bio_offset;
2154                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2155                                                 xbb_sg->first_sect);
2156                         bio->bio_done    = xbb_bio_done;
2157                         bio->bio_caller1 = reqlist;
2158                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2159                 }
2160
2161                 bio->bio_length += xbb_sg->nsect << 9;
2162                 bio->bio_bcount  = bio->bio_length;
2163                 bio_offset      += xbb_sg->nsect << 9;
2164
2165                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2166
2167                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2168                                 printf("%s: Discontiguous I/O request "
2169                                        "from domain %d ends on "
2170                                        "non-sector boundary\n",
2171                                        __func__, xbb->otherend_id);
2172                                 error = EINVAL;
2173                                 goto fail_free_bios;
2174                         }
2175                         /*
2176                          * KVA will not be contiguous, so any additional
2177                          * I/O will need to be represented in a new bio.
2178                          */
2179                         bio = NULL;
2180                 }
2181         }
2182
2183         reqlist->pendcnt = nbio;
2184
2185         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2186         {
2187 #ifdef XBB_USE_BOUNCE_BUFFERS
2188                 vm_offset_t kva_offset;
2189
2190                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2191                            - (vm_offset_t)reqlist->bounce;
2192                 if (operation == BIO_WRITE) {
2193                         memcpy(bios[bio_idx]->bio_data,
2194                                (uint8_t *)reqlist->kva + kva_offset,
2195                                bios[bio_idx]->bio_bcount);
2196                 }
2197 #endif
2198                 if (operation == BIO_READ) {
2199                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2200                                    device_get_unit(xbb->dev),
2201                                    bios[bio_idx]->bio_offset,
2202                                    bios[bio_idx]->bio_length);
2203                 } else if (operation == BIO_WRITE) {
2204                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2205                                    device_get_unit(xbb->dev),
2206                                    bios[bio_idx]->bio_offset,
2207                                    bios[bio_idx]->bio_length);
2208                 }
2209                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2210         }
2211
2212         return (error);
2213
2214 fail_free_bios:
2215         for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2216                 g_destroy_bio(bios[bio_idx]);
2217         
2218         return (error);
2219 }
2220
2221 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
2222 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
2223                   "uint64_t");
2224 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
2225                   "uint64_t", "uint64_t");
2226
2227 /**
2228  * Backend handler for file access.
2229  *
2230  * \param xbb        Per-instance xbb configuration structure.
2231  * \param reqlist    Allocated internal request list.
2232  * \param operation  BIO_* I/O operation code.
2233  * \param flags      Additional bio_flag data to pass to any generated bios
2234  *                   (e.g. BIO_ORDERED)..
2235  *
2236  * \return  0 for success, errno codes for failure.
2237  */
2238 static int
2239 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2240                   int operation, int flags)
2241 {
2242         struct xbb_file_data *file_data;
2243         u_int                 seg_idx;
2244         u_int                 nseg;
2245         off_t                 sectors_sent;
2246         struct uio            xuio;
2247         struct xbb_sg        *xbb_sg;
2248         struct iovec         *xiovec;
2249 #ifdef XBB_USE_BOUNCE_BUFFERS
2250         void                **p_vaddr;
2251         int                   saved_uio_iovcnt;
2252 #endif /* XBB_USE_BOUNCE_BUFFERS */
2253         int                   vfs_is_locked;
2254         int                   error;
2255
2256         file_data = &xbb->backend.file;
2257         sectors_sent = 0;
2258         error = 0;
2259         bzero(&xuio, sizeof(xuio));
2260
2261         switch (operation) {
2262         case BIO_READ:
2263                 xuio.uio_rw = UIO_READ;
2264                 break;
2265         case BIO_WRITE:
2266                 xuio.uio_rw = UIO_WRITE;
2267                 break;
2268         case BIO_FLUSH: {
2269                 struct mount *mountpoint;
2270
2271                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2272                            device_get_unit(xbb->dev));
2273
2274                 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2275
2276                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2277
2278                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2279                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2280                 VOP_UNLOCK(xbb->vn, 0);
2281
2282                 vn_finished_write(mountpoint);
2283
2284                 VFS_UNLOCK_GIANT(vfs_is_locked);
2285
2286                 goto bailout_send_response;
2287                 /* NOTREACHED */
2288         }
2289         default:
2290                 panic("invalid operation %d", operation);
2291                 /* NOTREACHED */
2292         }
2293         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2294                         << xbb->sector_size_shift;
2295         xuio.uio_segflg = UIO_SYSSPACE;
2296         xuio.uio_iov = file_data->xiovecs;
2297         xuio.uio_iovcnt = 0;
2298         xbb_sg = xbb->xbb_sgs;
2299         nseg = reqlist->nr_segments;
2300
2301         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2302
2303                 /*
2304                  * If the first sector is not 0, the KVA will
2305                  * not be contiguous and we'll need to go on
2306                  * to another segment.
2307                  */
2308                 if (xbb_sg->first_sect != 0)
2309                         xiovec = NULL;
2310
2311                 if (xiovec == NULL) {
2312                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2313                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2314                             seg_idx, xbb_sg->first_sect);
2315 #ifdef XBB_USE_BOUNCE_BUFFERS
2316                         /*
2317                          * Store the address of the incoming
2318                          * buffer at this particular offset
2319                          * as well, so we can do the copy
2320                          * later without having to do more
2321                          * work to recalculate this address.
2322                          */
2323                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2324                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2325                             xbb_sg->first_sect);
2326 #endif /* XBB_USE_BOUNCE_BUFFERS */
2327                         xiovec->iov_len = 0;
2328                         xuio.uio_iovcnt++;
2329                 }
2330
2331                 xiovec->iov_len += xbb_sg->nsect << 9;
2332
2333                 xuio.uio_resid += xbb_sg->nsect << 9;
2334
2335                 /*
2336                  * If the last sector is not the full page
2337                  * size count, the next segment will not be
2338                  * contiguous in KVA and we need a new iovec.
2339                  */
2340                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2341                         xiovec = NULL;
2342         }
2343
2344         xuio.uio_td = curthread;
2345
2346 #ifdef XBB_USE_BOUNCE_BUFFERS
2347         saved_uio_iovcnt = xuio.uio_iovcnt;
2348
2349         if (operation == BIO_WRITE) {
2350                 /* Copy the write data to the local buffer. */
2351                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2352                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2353                      seg_idx++, xiovec++, p_vaddr++) {
2354
2355                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2356                 }
2357         } else {
2358                 /*
2359                  * We only need to save off the iovecs in the case of a
2360                  * read, because the copy for the read happens after the
2361                  * VOP_READ().  (The uio will get modified in that call
2362                  * sequence.)
2363                  */
2364                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2365                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2366         }
2367 #endif /* XBB_USE_BOUNCE_BUFFERS */
2368
2369         vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2370         switch (operation) {
2371         case BIO_READ:
2372
2373                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2374                            device_get_unit(xbb->dev), xuio.uio_offset,
2375                            xuio.uio_resid);
2376
2377                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2378
2379                 /*
2380                  * UFS pays attention to IO_DIRECT for reads.  If the
2381                  * DIRECTIO option is configured into the kernel, it calls
2382                  * ffs_rawread().  But that only works for single-segment
2383                  * uios with user space addresses.  In our case, with a
2384                  * kernel uio, it still reads into the buffer cache, but it
2385                  * will just try to release the buffer from the cache later
2386                  * on in ffs_read().
2387                  *
2388                  * ZFS does not pay attention to IO_DIRECT for reads.
2389                  *
2390                  * UFS does not pay attention to IO_SYNC for reads.
2391                  *
2392                  * ZFS pays attention to IO_SYNC (which translates into the
2393                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2394                  * attempts to sync the file before reading.
2395                  *
2396                  * So, to attempt to provide some barrier semantics in the
2397                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2398                  */
2399                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2400                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2401
2402                 VOP_UNLOCK(xbb->vn, 0);
2403                 break;
2404         case BIO_WRITE: {
2405                 struct mount *mountpoint;
2406
2407                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2408                            device_get_unit(xbb->dev), xuio.uio_offset,
2409                            xuio.uio_resid);
2410
2411                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2412
2413                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2414
2415                 /*
2416                  * UFS pays attention to IO_DIRECT for writes.  The write
2417                  * is done asynchronously.  (Normally the write would just
2418                  * get put into cache.
2419                  *
2420                  * UFS pays attention to IO_SYNC for writes.  It will
2421                  * attempt to write the buffer out synchronously if that
2422                  * flag is set.
2423                  *
2424                  * ZFS does not pay attention to IO_DIRECT for writes.
2425                  *
2426                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2427                  * for writes.  It will flush the transaction from the
2428                  * cache before returning.
2429                  *
2430                  * So if we've got the BIO_ORDERED flag set, we want
2431                  * IO_SYNC in either the UFS or ZFS case.
2432                  */
2433                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2434                                   IO_SYNC : 0, file_data->cred);
2435                 VOP_UNLOCK(xbb->vn, 0);
2436
2437                 vn_finished_write(mountpoint);
2438
2439                 break;
2440         }
2441         default:
2442                 panic("invalid operation %d", operation);
2443                 /* NOTREACHED */
2444         }
2445         VFS_UNLOCK_GIANT(vfs_is_locked);
2446
2447 #ifdef XBB_USE_BOUNCE_BUFFERS
2448         /* We only need to copy here for read operations */
2449         if (operation == BIO_READ) {
2450
2451                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2452                      xiovec = file_data->saved_xiovecs;
2453                      seg_idx < saved_uio_iovcnt; seg_idx++,
2454                      xiovec++, p_vaddr++) {
2455
2456                         /*
2457                          * Note that we have to use the copy of the 
2458                          * io vector we made above.  uiomove() modifies
2459                          * the uio and its referenced vector as uiomove
2460                          * performs the copy, so we can't rely on any
2461                          * state from the original uio.
2462                          */
2463                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2464                 }
2465         }
2466 #endif /* XBB_USE_BOUNCE_BUFFERS */
2467
2468 bailout_send_response:
2469
2470         if (error != 0)
2471                 reqlist->status = BLKIF_RSP_ERROR;
2472
2473         xbb_complete_reqlist(xbb, reqlist);
2474
2475         return (0);
2476 }
2477
2478 /*--------------------------- Backend Configuration --------------------------*/
2479 /**
2480  * Close and cleanup any backend device/file specific state for this
2481  * block back instance. 
2482  *
2483  * \param xbb  Per-instance xbb configuration structure.
2484  */
2485 static void
2486 xbb_close_backend(struct xbb_softc *xbb)
2487 {
2488         DROP_GIANT();
2489         DPRINTF("closing dev=%s\n", xbb->dev_name);
2490         if (xbb->vn) {
2491                 int flags = FREAD;
2492                 int vfs_is_locked = 0;
2493
2494                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2495                         flags |= FWRITE;
2496
2497                 switch (xbb->device_type) {
2498                 case XBB_TYPE_DISK:
2499                         if (xbb->backend.dev.csw) {
2500                                 dev_relthread(xbb->backend.dev.cdev,
2501                                               xbb->backend.dev.dev_ref);
2502                                 xbb->backend.dev.csw  = NULL;
2503                                 xbb->backend.dev.cdev = NULL;
2504                         }
2505                         break;
2506                 case XBB_TYPE_FILE:
2507                         vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2508                         break;
2509                 case XBB_TYPE_NONE:
2510                 default:
2511                         panic("Unexpected backend type.");
2512                         break;
2513                 }
2514
2515                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2516                 xbb->vn = NULL;
2517
2518                 switch (xbb->device_type) {
2519                 case XBB_TYPE_DISK:
2520                         break;
2521                 case XBB_TYPE_FILE:
2522                         VFS_UNLOCK_GIANT(vfs_is_locked);
2523                         if (xbb->backend.file.cred != NULL) {
2524                                 crfree(xbb->backend.file.cred);
2525                                 xbb->backend.file.cred = NULL;
2526                         }
2527                         break;
2528                 case XBB_TYPE_NONE:
2529                 default:
2530                         panic("Unexpected backend type.");
2531                         break;
2532                 }
2533         }
2534         PICKUP_GIANT();
2535 }
2536
2537 /**
2538  * Open a character device to be used for backend I/O.
2539  *
2540  * \param xbb  Per-instance xbb configuration structure.
2541  *
2542  * \return  0 for success, errno codes for failure.
2543  */
2544 static int
2545 xbb_open_dev(struct xbb_softc *xbb)
2546 {
2547         struct vattr   vattr;
2548         struct cdev   *dev;
2549         struct cdevsw *devsw;
2550         int            error;
2551
2552         xbb->device_type = XBB_TYPE_DISK;
2553         xbb->dispatch_io = xbb_dispatch_dev;
2554         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2555         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2556                                              &xbb->backend.dev.dev_ref);
2557         if (xbb->backend.dev.csw == NULL)
2558                 panic("Unable to retrieve device switch");
2559
2560         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2561         if (error) {
2562                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2563                                  "vnode attributes for device %s",
2564                                  xbb->dev_name);
2565                 return (error);
2566         }
2567
2568
2569         dev = xbb->vn->v_rdev;
2570         devsw = dev->si_devsw;
2571         if (!devsw->d_ioctl) {
2572                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2573                                  "device %s!", xbb->dev_name);
2574                 return (ENODEV);
2575         }
2576
2577         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2578                                (caddr_t)&xbb->sector_size, FREAD,
2579                                curthread);
2580         if (error) {
2581                 xenbus_dev_fatal(xbb->dev, error,
2582                                  "error calling ioctl DIOCGSECTORSIZE "
2583                                  "for device %s", xbb->dev_name);
2584                 return (error);
2585         }
2586
2587         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2588                                (caddr_t)&xbb->media_size, FREAD,
2589                                curthread);
2590         if (error) {
2591                 xenbus_dev_fatal(xbb->dev, error,
2592                                  "error calling ioctl DIOCGMEDIASIZE "
2593                                  "for device %s", xbb->dev_name);
2594                 return (error);
2595         }
2596
2597         return (0);
2598 }
2599
2600 /**
2601  * Open a file to be used for backend I/O.
2602  *
2603  * \param xbb  Per-instance xbb configuration structure.
2604  *
2605  * \return  0 for success, errno codes for failure.
2606  */
2607 static int
2608 xbb_open_file(struct xbb_softc *xbb)
2609 {
2610         struct xbb_file_data *file_data;
2611         struct vattr          vattr;
2612         int                   error;
2613
2614         file_data = &xbb->backend.file;
2615         xbb->device_type = XBB_TYPE_FILE;
2616         xbb->dispatch_io = xbb_dispatch_file;
2617         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2618         if (error != 0) {
2619                 xenbus_dev_fatal(xbb->dev, error,
2620                                  "error calling VOP_GETATTR()"
2621                                  "for file %s", xbb->dev_name);
2622                 return (error);
2623         }
2624
2625         /*
2626          * Verify that we have the ability to upgrade to exclusive
2627          * access on this file so we can trap errors at open instead
2628          * of reporting them during first access.
2629          */
2630         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2631                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2632                 if (xbb->vn->v_iflag & VI_DOOMED) {
2633                         error = EBADF;
2634                         xenbus_dev_fatal(xbb->dev, error,
2635                                          "error locking file %s",
2636                                          xbb->dev_name);
2637
2638                         return (error);
2639                 }
2640         }
2641
2642         file_data->cred = crhold(curthread->td_ucred);
2643         xbb->media_size = vattr.va_size;
2644
2645         /*
2646          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2647          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2648          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2649          * may not work with other OSes as well.  So just export a sector
2650          * size of 512 bytes, which should work with any OS or
2651          * application.  Since our backing is a file, any block size will
2652          * work fine for the backing store.
2653          */
2654 #if 0
2655         xbb->sector_size = vattr.va_blocksize;
2656 #endif
2657         xbb->sector_size = 512;
2658
2659         /*
2660          * Sanity check.  The media size has to be at least one
2661          * sector long.
2662          */
2663         if (xbb->media_size < xbb->sector_size) {
2664                 error = EINVAL;
2665                 xenbus_dev_fatal(xbb->dev, error,
2666                                  "file %s size %ju < block size %u",
2667                                  xbb->dev_name,
2668                                  (uintmax_t)xbb->media_size,
2669                                  xbb->sector_size);
2670         }
2671         return (error);
2672 }
2673
2674 /**
2675  * Open the backend provider for this connection.
2676  *
2677  * \param xbb  Per-instance xbb configuration structure.
2678  *
2679  * \return  0 for success, errno codes for failure.
2680  */
2681 static int
2682 xbb_open_backend(struct xbb_softc *xbb)
2683 {
2684         struct nameidata nd;
2685         int              flags;
2686         int              error;
2687         int              vfs_is_locked;
2688
2689         flags = FREAD;
2690         error = 0;
2691
2692         DPRINTF("opening dev=%s\n", xbb->dev_name);
2693
2694         if (rootvnode == NULL) {
2695                 xenbus_dev_fatal(xbb->dev, ENOENT,
2696                                  "Root file system not mounted");
2697                 return (ENOENT);
2698         }
2699
2700         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2701                 flags |= FWRITE;
2702
2703         if (!curthread->td_proc->p_fd->fd_cdir) {
2704                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2705                 VREF(rootvnode);
2706         }
2707         if (!curthread->td_proc->p_fd->fd_rdir) {
2708                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2709                 VREF(rootvnode);
2710         }
2711         if (!curthread->td_proc->p_fd->fd_jdir) {
2712                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2713                 VREF(rootvnode);
2714         }
2715
2716  again:
2717         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2718         error = vn_open(&nd, &flags, 0, NULL);
2719         if (error) {
2720                 /*
2721                  * This is the only reasonable guess we can make as far as
2722                  * path if the user doesn't give us a fully qualified path.
2723                  * If they want to specify a file, they need to specify the
2724                  * full path.
2725                  */
2726                 if (xbb->dev_name[0] != '/') {
2727                         char *dev_path = "/dev/";
2728                         char *dev_name;
2729
2730                         /* Try adding device path at beginning of name */
2731                         dev_name = malloc(strlen(xbb->dev_name)
2732                                         + strlen(dev_path) + 1,
2733                                           M_XENBLOCKBACK, M_NOWAIT);
2734                         if (dev_name) {
2735                                 sprintf(dev_name, "%s%s", dev_path,
2736                                         xbb->dev_name);
2737                                 free(xbb->dev_name, M_XENBLOCKBACK);
2738                                 xbb->dev_name = dev_name;
2739                                 goto again;
2740                         }
2741                 }
2742                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2743                                  xbb->dev_name);
2744                 return (error);
2745         }
2746
2747         vfs_is_locked = NDHASGIANT(&nd);
2748
2749         NDFREE(&nd, NDF_ONLY_PNBUF);
2750                 
2751         xbb->vn = nd.ni_vp;
2752
2753         /* We only support disks and files. */
2754         if (vn_isdisk(xbb->vn, &error)) {
2755                 error = xbb_open_dev(xbb);
2756         } else if (xbb->vn->v_type == VREG) {
2757                 error = xbb_open_file(xbb);
2758         } else {
2759                 error = EINVAL;
2760                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2761                                  "or file", xbb->dev_name);
2762         }
2763         VOP_UNLOCK(xbb->vn, 0);
2764         VFS_UNLOCK_GIANT(vfs_is_locked);
2765
2766         if (error != 0) {
2767                 xbb_close_backend(xbb);
2768                 return (error);
2769         }
2770
2771         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2772         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2773
2774         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2775                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2776                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2777
2778         return (0);
2779 }
2780
2781 /*------------------------ Inter-Domain Communication ------------------------*/
2782 /**
2783  * Free dynamically allocated KVA or pseudo-physical address allocations.
2784  *
2785  * \param xbb  Per-instance xbb configuration structure.
2786  */
2787 static void
2788 xbb_free_communication_mem(struct xbb_softc *xbb)
2789 {
2790         if (xbb->kva != 0) {
2791 #ifndef XENHVM
2792                 kmem_free(kernel_map, xbb->kva, xbb->kva_size);
2793 #else
2794                 if (xbb->pseudo_phys_res != NULL) {
2795                         bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2796                                              xbb->pseudo_phys_res_id,
2797                                              xbb->pseudo_phys_res);
2798                         xbb->pseudo_phys_res = NULL;
2799                 }
2800 #endif
2801         }
2802         xbb->kva = 0;
2803         xbb->gnt_base_addr = 0;
2804         if (xbb->kva_free != NULL) {
2805                 free(xbb->kva_free, M_XENBLOCKBACK);
2806                 xbb->kva_free = NULL;
2807         }
2808 }
2809
2810 /**
2811  * Cleanup all inter-domain communication mechanisms.
2812  *
2813  * \param xbb  Per-instance xbb configuration structure.
2814  */
2815 static int
2816 xbb_disconnect(struct xbb_softc *xbb)
2817 {
2818         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2819         struct gnttab_unmap_grant_ref *op;
2820         u_int                          ring_idx;
2821         int                            error;
2822
2823         DPRINTF("\n");
2824
2825         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2826                 return (0);
2827
2828         if (xbb->irq != 0) {
2829                 unbind_from_irqhandler(xbb->irq);
2830                 xbb->irq = 0;
2831         }
2832
2833         mtx_unlock(&xbb->lock);
2834         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2835         mtx_lock(&xbb->lock);
2836
2837         /*
2838          * No new interrupts can generate work, but we must wait
2839          * for all currently active requests to drain.
2840          */
2841         if (xbb->active_request_count != 0)
2842                 return (EAGAIN);
2843         
2844         for (ring_idx = 0, op = ops;
2845              ring_idx < xbb->ring_config.ring_pages;
2846              ring_idx++, op++) {
2847
2848                 op->host_addr    = xbb->ring_config.gnt_addr
2849                                  + (ring_idx * PAGE_SIZE);
2850                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2851                 op->handle       = xbb->ring_config.handle[ring_idx];
2852         }
2853
2854         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2855                                           xbb->ring_config.ring_pages);
2856         if (error != 0)
2857                 panic("Grant table op failed (%d)", error);
2858
2859         xbb_free_communication_mem(xbb);
2860
2861         if (xbb->requests != NULL) {
2862                 free(xbb->requests, M_XENBLOCKBACK);
2863                 xbb->requests = NULL;
2864         }
2865
2866         if (xbb->request_lists != NULL) {
2867                 struct xbb_xen_reqlist *reqlist;
2868                 int i;
2869
2870                 /* There is one request list for ever allocated request. */
2871                 for (i = 0, reqlist = xbb->request_lists;
2872                      i < xbb->max_requests; i++, reqlist++){
2873 #ifdef XBB_USE_BOUNCE_BUFFERS
2874                         if (reqlist->bounce != NULL) {
2875                                 free(reqlist->bounce, M_XENBLOCKBACK);
2876                                 reqlist->bounce = NULL;
2877                         }
2878 #endif
2879                         if (reqlist->gnt_handles != NULL) {
2880                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2881                                 reqlist->gnt_handles = NULL;
2882                         }
2883                 }
2884                 free(xbb->request_lists, M_XENBLOCKBACK);
2885                 xbb->request_lists = NULL;
2886         }
2887
2888         xbb->flags &= ~XBBF_RING_CONNECTED;
2889         return (0);
2890 }
2891
2892 /**
2893  * Map shared memory ring into domain local address space, initialize
2894  * ring control structures, and bind an interrupt to the event channel
2895  * used to notify us of ring changes.
2896  *
2897  * \param xbb  Per-instance xbb configuration structure.
2898  */
2899 static int
2900 xbb_connect_ring(struct xbb_softc *xbb)
2901 {
2902         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2903         struct gnttab_map_grant_ref *gnt;
2904         u_int                        ring_idx;
2905         int                          error;
2906
2907         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2908                 return (0);
2909
2910         /*
2911          * Kva for our ring is at the tail of the region of kva allocated
2912          * by xbb_alloc_communication_mem().
2913          */
2914         xbb->ring_config.va = xbb->kva
2915                             + (xbb->kva_size
2916                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2917         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2918                                   + (xbb->kva_size
2919                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2920
2921         for (ring_idx = 0, gnt = gnts;
2922              ring_idx < xbb->ring_config.ring_pages;
2923              ring_idx++, gnt++) {
2924
2925                 gnt->host_addr = xbb->ring_config.gnt_addr
2926                                + (ring_idx * PAGE_SIZE);
2927                 gnt->flags     = GNTMAP_host_map;
2928                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2929                 gnt->dom       = xbb->otherend_id;
2930         }
2931
2932         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2933                                           xbb->ring_config.ring_pages);
2934         if (error)
2935                 panic("blkback: Ring page grant table op failed (%d)", error);
2936
2937         for (ring_idx = 0, gnt = gnts;
2938              ring_idx < xbb->ring_config.ring_pages;
2939              ring_idx++, gnt++) {
2940                 if (gnt->status != 0) {
2941                         xbb->ring_config.va = 0;
2942                         xenbus_dev_fatal(xbb->dev, EACCES,
2943                                          "Ring shared page mapping failed. "
2944                                          "Status %d.", gnt->status);
2945                         return (EACCES);
2946                 }
2947                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2948                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2949         }
2950
2951         /* Initialize the ring based on ABI. */
2952         switch (xbb->abi) {
2953         case BLKIF_PROTOCOL_NATIVE:
2954         {
2955                 blkif_sring_t *sring;
2956                 sring = (blkif_sring_t *)xbb->ring_config.va;
2957                 BACK_RING_INIT(&xbb->rings.native, sring,
2958                                xbb->ring_config.ring_pages * PAGE_SIZE);
2959                 break;
2960         }
2961         case BLKIF_PROTOCOL_X86_32:
2962         {
2963                 blkif_x86_32_sring_t *sring_x86_32;
2964                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2965                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2966                                xbb->ring_config.ring_pages * PAGE_SIZE);
2967                 break;
2968         }
2969         case BLKIF_PROTOCOL_X86_64:
2970         {
2971                 blkif_x86_64_sring_t *sring_x86_64;
2972                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2973                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2974                                xbb->ring_config.ring_pages * PAGE_SIZE);
2975                 break;
2976         }
2977         default:
2978                 panic("Unexpected blkif protocol ABI.");
2979         }
2980
2981         xbb->flags |= XBBF_RING_CONNECTED;
2982
2983         error =
2984             bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
2985                                                   xbb->ring_config.evtchn,
2986                                                   device_get_nameunit(xbb->dev),
2987                                                   xbb_intr, /*arg*/xbb,
2988                                                   INTR_TYPE_BIO | INTR_MPSAFE,
2989                                                   &xbb->irq);
2990         if (error) {
2991                 (void)xbb_disconnect(xbb);
2992                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2993                 return (error);
2994         }
2995
2996         DPRINTF("rings connected!\n");
2997
2998         return 0;
2999 }
3000
3001 /* Needed to make bit_alloc() macro work */
3002 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK,      \
3003                                    M_NOWAIT|M_ZERO);
3004
3005 /**
3006  * Size KVA and pseudo-physical address allocations based on negotiated
3007  * values for the size and number of I/O requests, and the size of our
3008  * communication ring.
3009  *
3010  * \param xbb  Per-instance xbb configuration structure.
3011  *
3012  * These address spaces are used to dynamically map pages in the
3013  * front-end's domain into our own.
3014  */
3015 static int
3016 xbb_alloc_communication_mem(struct xbb_softc *xbb)
3017 {
3018         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
3019         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
3020         xbb->kva_size = xbb->reqlist_kva_size +
3021                         (xbb->ring_config.ring_pages * PAGE_SIZE);
3022
3023         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
3024         if (xbb->kva_free == NULL)
3025                 return (ENOMEM);
3026
3027         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3028                 device_get_nameunit(xbb->dev), xbb->kva_size,
3029                 xbb->reqlist_kva_size);
3030 #ifndef XENHVM
3031         xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
3032         if (xbb->kva == 0)
3033                 return (ENOMEM);
3034         xbb->gnt_base_addr = xbb->kva;
3035 #else /* XENHVM */
3036         /*
3037          * Reserve a range of pseudo physical memory that we can map
3038          * into kva.  These pages will only be backed by machine
3039          * pages ("real memory") during the lifetime of front-end requests
3040          * via grant table operations.
3041          */
3042         xbb->pseudo_phys_res_id = 0;
3043         xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3044                                                   &xbb->pseudo_phys_res_id,
3045                                                   0, ~0, xbb->kva_size,
3046                                                   RF_ACTIVE);
3047         if (xbb->pseudo_phys_res == NULL) {
3048                 xbb->kva = 0;
3049                 return (ENOMEM);
3050         }
3051         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3052         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3053 #endif /* XENHVM */
3054
3055         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3056                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3057                 (uintmax_t)xbb->gnt_base_addr); 
3058         return (0);
3059 }
3060
3061 /**
3062  * Collect front-end information from the XenStore.
3063  *
3064  * \param xbb  Per-instance xbb configuration structure.
3065  */
3066 static int
3067 xbb_collect_frontend_info(struct xbb_softc *xbb)
3068 {
3069         char        protocol_abi[64];
3070         const char *otherend_path;
3071         int         error;
3072         u_int       ring_idx;
3073         u_int       ring_page_order;
3074         size_t      ring_size;
3075
3076         otherend_path = xenbus_get_otherend_path(xbb->dev);
3077
3078         /*
3079          * Protocol defaults valid even if all negotiation fails.
3080          */
3081         xbb->ring_config.ring_pages = 1;
3082         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3083         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
3084
3085         /*
3086          * Mandatory data (used in all versions of the protocol) first.
3087          */
3088         error = xs_scanf(XST_NIL, otherend_path,
3089                          "event-channel", NULL, "%" PRIu32,
3090                          &xbb->ring_config.evtchn);
3091         if (error != 0) {
3092                 xenbus_dev_fatal(xbb->dev, error,
3093                                  "Unable to retrieve event-channel information "
3094                                  "from frontend %s.  Unable to connect.",
3095                                  xenbus_get_otherend_path(xbb->dev));
3096                 return (error);
3097         }
3098
3099         /*
3100          * These fields are initialized to legacy protocol defaults
3101          * so we only need to fail if reading the updated value succeeds
3102          * and the new value is outside of its allowed range.
3103          *
3104          * \note xs_gather() returns on the first encountered error, so
3105          *       we must use independant calls in order to guarantee
3106          *       we don't miss information in a sparsly populated front-end
3107          *       tree.
3108          *
3109          * \note xs_scanf() does not update variables for unmatched
3110          *       fields.
3111          */
3112         ring_page_order = 0;
3113         (void)xs_scanf(XST_NIL, otherend_path,
3114                        "ring-page-order", NULL, "%u",
3115                        &ring_page_order);
3116         xbb->ring_config.ring_pages = 1 << ring_page_order;
3117         (void)xs_scanf(XST_NIL, otherend_path,
3118                        "num-ring-pages", NULL, "%u",
3119                        &xbb->ring_config.ring_pages);
3120         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3121         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3122
3123         (void)xs_scanf(XST_NIL, otherend_path,
3124                        "max-requests", NULL, "%u",
3125                        &xbb->max_requests);
3126
3127         (void)xs_scanf(XST_NIL, otherend_path,
3128                        "max-request-segments", NULL, "%u",
3129                        &xbb->max_request_segments);
3130
3131         (void)xs_scanf(XST_NIL, otherend_path,
3132                        "max-request-size", NULL, "%u",
3133                        &xbb->max_request_size);
3134
3135         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3136                 xenbus_dev_fatal(xbb->dev, EINVAL,
3137                                  "Front-end specified ring-pages of %u "
3138                                  "exceeds backend limit of %zu.  "
3139                                  "Unable to connect.",
3140                                  xbb->ring_config.ring_pages,
3141                                  XBB_MAX_RING_PAGES);
3142                 return (EINVAL);
3143         } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3144                 xenbus_dev_fatal(xbb->dev, EINVAL,
3145                                  "Front-end specified max_requests of %u "
3146                                  "exceeds backend limit of %u.  "
3147                                  "Unable to connect.",
3148                                  xbb->max_requests,
3149                                  XBB_MAX_REQUESTS);
3150                 return (EINVAL);
3151         } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3152                 xenbus_dev_fatal(xbb->dev, EINVAL,
3153                                  "Front-end specified max_requests_segments "
3154                                  "of %u exceeds backend limit of %u.  "
3155                                  "Unable to connect.",
3156                                  xbb->max_request_segments,
3157                                  XBB_MAX_SEGMENTS_PER_REQUEST);
3158                 return (EINVAL);
3159         } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3160                 xenbus_dev_fatal(xbb->dev, EINVAL,
3161                                  "Front-end specified max_request_size "
3162                                  "of %u exceeds backend limit of %u.  "
3163                                  "Unable to connect.",
3164                                  xbb->max_request_size,
3165                                  XBB_MAX_REQUEST_SIZE);
3166                 return (EINVAL);
3167         }
3168
3169         if (xbb->ring_config.ring_pages == 1) {
3170                 error = xs_gather(XST_NIL, otherend_path,
3171                                   "ring-ref", "%" PRIu32,
3172                                   &xbb->ring_config.ring_ref[0],
3173                                   NULL);
3174                 if (error != 0) {
3175                         xenbus_dev_fatal(xbb->dev, error,
3176                                          "Unable to retrieve ring information "
3177                                          "from frontend %s.  Unable to "
3178                                          "connect.",
3179                                          xenbus_get_otherend_path(xbb->dev));
3180                         return (error);
3181                 }
3182         } else {
3183                 /* Multi-page ring format. */
3184                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3185                      ring_idx++) {
3186                         char ring_ref_name[]= "ring_refXX";
3187
3188                         snprintf(ring_ref_name, sizeof(ring_ref_name),
3189                                  "ring-ref%u", ring_idx);
3190                         error = xs_scanf(XST_NIL, otherend_path,
3191                                          ring_ref_name, NULL, "%" PRIu32,
3192                                          &xbb->ring_config.ring_ref[ring_idx]);
3193                         if (error != 0) {
3194                                 xenbus_dev_fatal(xbb->dev, error,
3195                                                  "Failed to retriev grant "
3196                                                  "reference for page %u of "
3197                                                  "shared ring.  Unable "
3198                                                  "to connect.", ring_idx);
3199                                 return (error);
3200                         }
3201                 }
3202         }
3203
3204         error = xs_gather(XST_NIL, otherend_path,
3205                           "protocol", "%63s", protocol_abi,
3206                           NULL); 
3207         if (error != 0
3208          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3209                 /*
3210                  * Assume native if the frontend has not
3211                  * published ABI data or it has published and
3212                  * matches our own ABI.
3213                  */
3214                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3215         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3216
3217                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3218         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3219
3220                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3221         } else {
3222
3223                 xenbus_dev_fatal(xbb->dev, EINVAL,
3224                                  "Unknown protocol ABI (%s) published by "
3225                                  "frontend.  Unable to connect.", protocol_abi);
3226                 return (EINVAL);
3227         }
3228         return (0);
3229 }
3230
3231 /**
3232  * Allocate per-request data structures given request size and number
3233  * information negotiated with the front-end.
3234  *
3235  * \param xbb  Per-instance xbb configuration structure.
3236  */
3237 static int
3238 xbb_alloc_requests(struct xbb_softc *xbb)
3239 {
3240         struct xbb_xen_req *req;
3241         struct xbb_xen_req *last_req;
3242
3243         /*
3244          * Allocate request book keeping datastructures.
3245          */
3246         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3247                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3248         if (xbb->requests == NULL) {
3249                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3250                                   "Unable to allocate request structures");
3251                 return (ENOMEM);
3252         }
3253
3254         req      = xbb->requests;
3255         last_req = &xbb->requests[xbb->max_requests - 1];
3256         STAILQ_INIT(&xbb->request_free_stailq);
3257         while (req <= last_req) {
3258                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3259                 req++;
3260         }
3261         return (0);
3262 }
3263
3264 static int
3265 xbb_alloc_request_lists(struct xbb_softc *xbb)
3266 {
3267         struct xbb_xen_reqlist *reqlist;
3268         int                     i;
3269
3270         /*
3271          * If no requests can be merged, we need 1 request list per
3272          * in flight request.
3273          */
3274         xbb->request_lists = malloc(xbb->max_requests *
3275                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3276         if (xbb->request_lists == NULL) {
3277                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3278                                   "Unable to allocate request list structures");
3279                 return (ENOMEM);
3280         }
3281
3282         STAILQ_INIT(&xbb->reqlist_free_stailq);
3283         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3284         for (i = 0; i < xbb->max_requests; i++) {
3285                 int seg;
3286
3287                 reqlist      = &xbb->request_lists[i];
3288
3289                 reqlist->xbb = xbb;
3290
3291 #ifdef XBB_USE_BOUNCE_BUFFERS
3292                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3293                                          M_XENBLOCKBACK, M_NOWAIT);
3294                 if (reqlist->bounce == NULL) {
3295                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3296                                          "Unable to allocate request "
3297                                          "bounce buffers");
3298                         return (ENOMEM);
3299                 }
3300 #endif /* XBB_USE_BOUNCE_BUFFERS */
3301
3302                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3303                                               sizeof(*reqlist->gnt_handles),
3304                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3305                 if (reqlist->gnt_handles == NULL) {
3306                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3307                                           "Unable to allocate request "
3308                                           "grant references");
3309                         return (ENOMEM);
3310                 }
3311
3312                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3313                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3314
3315                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3316         }
3317         return (0);
3318 }
3319
3320 /**
3321  * Supply information about the physical device to the frontend
3322  * via XenBus.
3323  *
3324  * \param xbb  Per-instance xbb configuration structure.
3325  */
3326 static int
3327 xbb_publish_backend_info(struct xbb_softc *xbb)
3328 {
3329         struct xs_transaction xst;
3330         const char           *our_path;
3331         const char           *leaf;
3332         int                   error;
3333
3334         our_path = xenbus_get_node(xbb->dev);
3335         while (1) {
3336                 error = xs_transaction_start(&xst);
3337                 if (error != 0) {
3338                         xenbus_dev_fatal(xbb->dev, error,
3339                                          "Error publishing backend info "
3340                                          "(start transaction)");
3341                         return (error);
3342                 }
3343
3344                 leaf = "sectors";
3345                 error = xs_printf(xst, our_path, leaf,
3346                                   "%"PRIu64, xbb->media_num_sectors);
3347                 if (error != 0)
3348                         break;
3349
3350                 /* XXX Support all VBD attributes here. */
3351                 leaf = "info";
3352                 error = xs_printf(xst, our_path, leaf, "%u",
3353                                   xbb->flags & XBBF_READ_ONLY
3354                                 ? VDISK_READONLY : 0);
3355                 if (error != 0)
3356                         break;
3357
3358                 leaf = "sector-size";
3359                 error = xs_printf(xst, our_path, leaf, "%u",
3360                                   xbb->sector_size);
3361                 if (error != 0)
3362                         break;
3363
3364                 error = xs_transaction_end(xst, 0);
3365                 if (error == 0) {
3366                         return (0);
3367                 } else if (error != EAGAIN) {
3368                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3369                         return (error);
3370                 }
3371         }
3372
3373         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3374                         our_path, leaf);
3375         xs_transaction_end(xst, 1);
3376         return (error);
3377 }
3378
3379 /**
3380  * Connect to our blkfront peer now that it has completed publishing
3381  * its configuration into the XenStore.
3382  *
3383  * \param xbb  Per-instance xbb configuration structure.
3384  */
3385 static void
3386 xbb_connect(struct xbb_softc *xbb)
3387 {
3388         int error;
3389
3390         if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3391                 return;
3392
3393         if (xbb_collect_frontend_info(xbb) != 0)
3394                 return;
3395
3396         xbb->flags &= ~XBBF_SHUTDOWN;
3397
3398         /*
3399          * We limit the maximum number of reqlist segments to the maximum
3400          * number of segments in the ring, or our absolute maximum,
3401          * whichever is smaller.
3402          */
3403         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3404                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3405
3406         /*
3407          * The maximum size is simply a function of the number of segments
3408          * we can handle.
3409          */
3410         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3411
3412         /* Allocate resources whose size depends on front-end configuration. */
3413         error = xbb_alloc_communication_mem(xbb);
3414         if (error != 0) {
3415                 xenbus_dev_fatal(xbb->dev, error,
3416                                  "Unable to allocate communication memory");
3417                 return;
3418         }
3419
3420         error = xbb_alloc_requests(xbb);
3421         if (error != 0) {
3422                 /* Specific errors are reported by xbb_alloc_requests(). */
3423                 return;
3424         }
3425
3426         error = xbb_alloc_request_lists(xbb);
3427         if (error != 0) {
3428                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3429                 return;
3430         }
3431
3432         /*
3433          * Connect communication channel.
3434          */
3435         error = xbb_connect_ring(xbb);
3436         if (error != 0) {
3437                 /* Specific errors are reported by xbb_connect_ring(). */
3438                 return;
3439         }
3440         
3441         if (xbb_publish_backend_info(xbb) != 0) {
3442                 /*
3443                  * If we can't publish our data, we cannot participate
3444                  * in this connection, and waiting for a front-end state
3445                  * change will not help the situation.
3446                  */
3447                 (void)xbb_disconnect(xbb);
3448                 return;
3449         }
3450
3451         /* Ready for I/O. */
3452         xenbus_set_state(xbb->dev, XenbusStateConnected);
3453 }
3454
3455 /*-------------------------- Device Teardown Support -------------------------*/
3456 /**
3457  * Perform device shutdown functions.
3458  *
3459  * \param xbb  Per-instance xbb configuration structure.
3460  *
3461  * Mark this instance as shutting down, wait for any active I/O on the
3462  * backend device/file to drain, disconnect from the front-end, and notify
3463  * any waiters (e.g. a thread invoking our detach method) that detach can
3464  * now proceed.
3465  */
3466 static int
3467 xbb_shutdown(struct xbb_softc *xbb)
3468 {
3469         XenbusState frontState;
3470         int         error;
3471
3472         DPRINTF("\n");
3473
3474         /*
3475          * Due to the need to drop our mutex during some
3476          * xenbus operations, it is possible for two threads
3477          * to attempt to close out shutdown processing at
3478          * the same time.  Tell the caller that hits this
3479          * race to try back later. 
3480          */
3481         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3482                 return (EAGAIN);
3483
3484         xbb->flags |= XBBF_IN_SHUTDOWN;
3485         mtx_unlock(&xbb->lock);
3486
3487         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3488                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3489
3490         frontState = xenbus_get_otherend_state(xbb->dev);
3491         mtx_lock(&xbb->lock);
3492         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3493
3494         /* The front can submit I/O until entering the closed state. */
3495         if (frontState < XenbusStateClosed)
3496                 return (EAGAIN);
3497
3498         DPRINTF("\n");
3499
3500         /* Indicate shutdown is in progress. */
3501         xbb->flags |= XBBF_SHUTDOWN;
3502
3503         /* Disconnect from the front-end. */
3504         error = xbb_disconnect(xbb);
3505         if (error != 0) {
3506                 /*
3507                  * Requests still outstanding.  We'll be called again
3508                  * once they complete.
3509                  */
3510                 KASSERT(error == EAGAIN,
3511                         ("%s: Unexpected xbb_disconnect() failure %d",
3512                          __func__, error));
3513
3514                 return (error);
3515         }
3516
3517         DPRINTF("\n");
3518
3519         /* Indicate to xbb_detach() that is it safe to proceed. */
3520         wakeup(xbb);
3521
3522         return (0);
3523 }
3524
3525 /**
3526  * Report an attach time error to the console and Xen, and cleanup
3527  * this instance by forcing immediate detach processing.
3528  *
3529  * \param xbb  Per-instance xbb configuration structure.
3530  * \param err  Errno describing the error.
3531  * \param fmt  Printf style format and arguments
3532  */
3533 static void
3534 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3535 {
3536         va_list ap;
3537         va_list ap_hotplug;
3538
3539         va_start(ap, fmt);
3540         va_copy(ap_hotplug, ap);
3541         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3542                   "hotplug-error", fmt, ap_hotplug);
3543         va_end(ap_hotplug);
3544         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3545                   "hotplug-status", "error");
3546
3547         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3548         va_end(ap);
3549
3550         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3551                   "online", "0");
3552         xbb_detach(xbb->dev);
3553 }
3554
3555 /*---------------------------- NewBus Entrypoints ----------------------------*/
3556 /**
3557  * Inspect a XenBus device and claim it if is of the appropriate type.
3558  * 
3559  * \param dev  NewBus device object representing a candidate XenBus device.
3560  *
3561  * \return  0 for success, errno codes for failure.
3562  */
3563 static int
3564 xbb_probe(device_t dev)
3565 {
3566  
3567         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3568                 device_set_desc(dev, "Backend Virtual Block Device");
3569                 device_quiet(dev);
3570                 return (0);
3571         }
3572
3573         return (ENXIO);
3574 }
3575
3576 /**
3577  * Setup sysctl variables to control various Block Back parameters.
3578  *
3579  * \param xbb  Xen Block Back softc.
3580  *
3581  */
3582 static void
3583 xbb_setup_sysctl(struct xbb_softc *xbb)
3584 {
3585         struct sysctl_ctx_list *sysctl_ctx = NULL;
3586         struct sysctl_oid      *sysctl_tree = NULL;
3587         
3588         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3589         if (sysctl_ctx == NULL)
3590                 return;
3591
3592         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3593         if (sysctl_tree == NULL)
3594                 return;
3595
3596         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3597                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3598                        "fake the flush command");
3599
3600         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3601                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3602                        "send a real flush for N flush requests");
3603
3604         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3605                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3606                        "Don't coalesce contiguous requests");
3607
3608         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3609                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3610                          "how many I/O requests we have received");
3611
3612         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3613                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3614                          "how many I/O requests have been completed");
3615
3616         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3617                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3618                          "how many I/O dispatches were forced");
3619
3620         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3621                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3622                          "how many I/O dispatches were normal");
3623
3624         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3625                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3626                          "total number of I/O dispatches");
3627
3628         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3629                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3630                          "how many times we have run out of KVA");
3631
3632         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3633                          "request_shortages", CTLFLAG_RW,
3634                          &xbb->request_shortages,
3635                          "how many times we have run out of requests");
3636
3637         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3638                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3639                         "maximum outstanding requests (negotiated)");
3640
3641         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3642                         "max_request_segments", CTLFLAG_RD,
3643                         &xbb->max_request_segments, 0,
3644                         "maximum number of pages per requests (negotiated)");
3645
3646         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3647                         "max_request_size", CTLFLAG_RD,
3648                         &xbb->max_request_size, 0,
3649                         "maximum size in bytes of a request (negotiated)");
3650
3651         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3652                         "ring_pages", CTLFLAG_RD,
3653                         &xbb->ring_config.ring_pages, 0,
3654                         "communication channel pages (negotiated)");
3655 }
3656
3657 /**
3658  * Attach to a XenBus device that has been claimed by our probe routine.
3659  *
3660  * \param dev  NewBus device object representing this Xen Block Back instance.
3661  *
3662  * \return  0 for success, errno codes for failure.
3663  */
3664 static int
3665 xbb_attach(device_t dev)
3666 {
3667         struct xbb_softc        *xbb;
3668         int                      error;
3669         u_int                    max_ring_page_order;
3670
3671         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3672
3673         /*
3674          * Basic initialization.
3675          * After this block it is safe to call xbb_detach()
3676          * to clean up any allocated data for this instance.
3677          */
3678         xbb = device_get_softc(dev);
3679         xbb->dev = dev;
3680         xbb->otherend_id = xenbus_get_otherend_id(dev);
3681         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3682         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3683
3684         /*
3685          * Publish protocol capabilities for consumption by the
3686          * front-end.
3687          */
3688         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3689                           "feature-barrier", "1");
3690         if (error) {
3691                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3692                                   xenbus_get_node(xbb->dev));
3693                 return (error);
3694         }
3695
3696         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3697                           "feature-flush-cache", "1");
3698         if (error) {
3699                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3700                                   xenbus_get_node(xbb->dev));
3701                 return (error);
3702         }
3703
3704         /*
3705          * Amazon EC2 client compatility.  They refer to max-ring-pages
3706          * instead of to max-ring-page-order.
3707          */
3708         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3709                           "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3710         if (error) {
3711                 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3712                                   xenbus_get_node(xbb->dev));
3713                 return (error);
3714         }
3715
3716         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3717         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3718                           "max-ring-page-order", "%u", max_ring_page_order);
3719         if (error) {
3720                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3721                                   xenbus_get_node(xbb->dev));
3722                 return (error);
3723         }
3724
3725         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3726                           "max-requests", "%u", XBB_MAX_REQUESTS);
3727         if (error) {
3728                 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3729                                   xenbus_get_node(xbb->dev));
3730                 return (error);
3731         }
3732
3733         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3734                           "max-request-segments", "%u",
3735                           XBB_MAX_SEGMENTS_PER_REQUEST);
3736         if (error) {
3737                 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3738                                   xenbus_get_node(xbb->dev));
3739                 return (error);
3740         }
3741
3742         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3743                           "max-request-size", "%u",
3744                           XBB_MAX_REQUEST_SIZE);
3745         if (error) {
3746                 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3747                                   xenbus_get_node(xbb->dev));
3748                 return (error);
3749         }
3750
3751         /* Collect physical device information. */
3752         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3753                           "device-type", NULL, &xbb->dev_type,
3754                           NULL);
3755         if (error != 0)
3756                 xbb->dev_type = NULL;
3757
3758         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3759                           "mode", NULL, &xbb->dev_mode,
3760                           "params", NULL, &xbb->dev_name,
3761                           NULL);
3762         if (error != 0) {
3763                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3764                                   xenbus_get_node(dev));
3765                 return (ENXIO);
3766         }
3767
3768         /* Parse fopen style mode flags. */
3769         if (strchr(xbb->dev_mode, 'w') == NULL)
3770                 xbb->flags |= XBBF_READ_ONLY;
3771
3772         /*
3773          * Verify the physical device is present and can support
3774          * the desired I/O mode.
3775          */
3776         DROP_GIANT();
3777         error = xbb_open_backend(xbb);
3778         PICKUP_GIANT();
3779         if (error != 0) {
3780                 xbb_attach_failed(xbb, error, "Unable to open %s",
3781                                   xbb->dev_name);
3782                 return (ENXIO);
3783         }
3784
3785         /* Use devstat(9) for recording statistics. */
3786         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3787                                            xbb->sector_size,
3788                                            DEVSTAT_ALL_SUPPORTED,
3789                                            DEVSTAT_TYPE_DIRECT
3790                                          | DEVSTAT_TYPE_IF_OTHER,
3791                                            DEVSTAT_PRIORITY_OTHER);
3792
3793         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3794                                               xbb->sector_size,
3795                                               DEVSTAT_ALL_SUPPORTED,
3796                                               DEVSTAT_TYPE_DIRECT
3797                                             | DEVSTAT_TYPE_IF_OTHER,
3798                                               DEVSTAT_PRIORITY_OTHER);
3799         /*
3800          * Setup sysctl variables.
3801          */
3802         xbb_setup_sysctl(xbb);
3803
3804         /*
3805          * Create a taskqueue for doing work that must occur from a
3806          * thread context.
3807          */
3808         xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
3809                                              taskqueue_thread_enqueue,
3810                                              /*context*/&xbb->io_taskqueue);
3811         if (xbb->io_taskqueue == NULL) {
3812                 xbb_attach_failed(xbb, error, "Unable to create taskqueue");
3813                 return (ENOMEM);
3814         }
3815
3816         taskqueue_start_threads(&xbb->io_taskqueue,
3817                                 /*num threads*/1,
3818                                 /*priority*/PWAIT,
3819                                 /*thread name*/
3820                                 "%s taskq", device_get_nameunit(dev));
3821
3822         /* Update hot-plug status to satisfy xend. */
3823         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3824                           "hotplug-status", "connected");
3825         if (error) {
3826                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3827                                   xenbus_get_node(xbb->dev));
3828                 return (error);
3829         }
3830
3831         /* Tell the front end that we are ready to connect. */
3832         xenbus_set_state(dev, XenbusStateInitWait);
3833
3834         return (0);
3835 }
3836
3837 /**
3838  * Detach from a block back device instance.
3839  *
3840  * \param dev  NewBus device object representing this Xen Block Back instance.
3841  *
3842  * \return  0 for success, errno codes for failure.
3843  * 
3844  * \note A block back device may be detached at any time in its life-cycle,
3845  *       including part way through the attach process.  For this reason,
3846  *       initialization order and the intialization state checks in this
3847  *       routine must be carefully coupled so that attach time failures
3848  *       are gracefully handled.
3849  */
3850 static int
3851 xbb_detach(device_t dev)
3852 {
3853         struct xbb_softc *xbb;
3854
3855         DPRINTF("\n");
3856
3857         xbb = device_get_softc(dev);
3858         mtx_lock(&xbb->lock);
3859         while (xbb_shutdown(xbb) == EAGAIN) {
3860                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3861                        "xbb_shutdown", 0);
3862         }
3863         mtx_unlock(&xbb->lock);
3864
3865         DPRINTF("\n");
3866
3867         if (xbb->io_taskqueue != NULL)
3868                 taskqueue_free(xbb->io_taskqueue);
3869
3870         if (xbb->xbb_stats != NULL)
3871                 devstat_remove_entry(xbb->xbb_stats);
3872
3873         if (xbb->xbb_stats_in != NULL)
3874                 devstat_remove_entry(xbb->xbb_stats_in);
3875
3876         xbb_close_backend(xbb);
3877
3878         if (xbb->dev_mode != NULL) {
3879                 free(xbb->dev_mode, M_XENBUS);
3880                 xbb->dev_mode = NULL;
3881         }
3882
3883         if (xbb->dev_type != NULL) {
3884                 free(xbb->dev_type, M_XENBUS);
3885                 xbb->dev_type = NULL;
3886         }
3887
3888         if (xbb->dev_name != NULL) {
3889                 free(xbb->dev_name, M_XENBUS);
3890                 xbb->dev_name = NULL;
3891         }
3892
3893         mtx_destroy(&xbb->lock);
3894         return (0);
3895 }
3896
3897 /**
3898  * Prepare this block back device for suspension of this VM.
3899  * 
3900  * \param dev  NewBus device object representing this Xen Block Back instance.
3901  *
3902  * \return  0 for success, errno codes for failure.
3903  */
3904 static int
3905 xbb_suspend(device_t dev)
3906 {
3907 #ifdef NOT_YET
3908         struct xbb_softc *sc = device_get_softc(dev);
3909
3910         /* Prevent new requests being issued until we fix things up. */
3911         mtx_lock(&sc->xb_io_lock);
3912         sc->connected = BLKIF_STATE_SUSPENDED;
3913         mtx_unlock(&sc->xb_io_lock);
3914 #endif
3915
3916         return (0);
3917 }
3918
3919 /**
3920  * Perform any processing required to recover from a suspended state.
3921  * 
3922  * \param dev  NewBus device object representing this Xen Block Back instance.
3923  *
3924  * \return  0 for success, errno codes for failure.
3925  */
3926 static int
3927 xbb_resume(device_t dev)
3928 {
3929         return (0);
3930 }
3931
3932 /**
3933  * Handle state changes expressed via the XenStore by our front-end peer.
3934  *
3935  * \param dev             NewBus device object representing this Xen
3936  *                        Block Back instance.
3937  * \param frontend_state  The new state of the front-end.
3938  *
3939  * \return  0 for success, errno codes for failure.
3940  */
3941 static void
3942 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3943 {
3944         struct xbb_softc *xbb = device_get_softc(dev);
3945
3946         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3947                 xenbus_strstate(frontend_state),
3948                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3949
3950         switch (frontend_state) {
3951         case XenbusStateInitialising:
3952                 break;
3953         case XenbusStateInitialised:
3954         case XenbusStateConnected:
3955                 xbb_connect(xbb);
3956                 break;
3957         case XenbusStateClosing:
3958         case XenbusStateClosed:
3959                 mtx_lock(&xbb->lock);
3960                 xbb_shutdown(xbb);
3961                 mtx_unlock(&xbb->lock);
3962                 if (frontend_state == XenbusStateClosed)
3963                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3964                 break;
3965         default:
3966                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3967                                  frontend_state);
3968                 break;
3969         }
3970 }
3971
3972 /*---------------------------- NewBus Registration ---------------------------*/
3973 static device_method_t xbb_methods[] = {
3974         /* Device interface */
3975         DEVMETHOD(device_probe,         xbb_probe),
3976         DEVMETHOD(device_attach,        xbb_attach),
3977         DEVMETHOD(device_detach,        xbb_detach),
3978         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3979         DEVMETHOD(device_suspend,       xbb_suspend),
3980         DEVMETHOD(device_resume,        xbb_resume),
3981
3982         /* Xenbus interface */
3983         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3984
3985         { 0, 0 }
3986 };
3987
3988 static driver_t xbb_driver = {
3989         "xbbd",
3990         xbb_methods,
3991         sizeof(struct xbb_softc),
3992 };
3993 devclass_t xbb_devclass;
3994
3995 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);