1 /*-
2  * Copyright (c) 2009-2011 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  *          Ken Merry           (Spectra Logic Corporation)
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 /**
37  * \file blkback.c
38  *
39  * \brief Device driver supporting the vending of block storage from
40  *        a FreeBSD domain to other domains.
41  */
42
43 #include "opt_kdtrace.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49
50 #include <sys/bio.h>
51 #include <sys/bus.h>
52 #include <sys/conf.h>
53 #include <sys/devicestat.h>
54 #include <sys/disk.h>
55 #include <sys/fcntl.h>
56 #include <sys/filedesc.h>
57 #include <sys/kdb.h>
58 #include <sys/module.h>
59 #include <sys/namei.h>
60 #include <sys/proc.h>
61 #include <sys/rman.h>
62 #include <sys/taskqueue.h>
63 #include <sys/types.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/sysctl.h>
67 #include <sys/bitstring.h>
68 #include <sys/sdt.h>
69
70 #include <geom/geom.h>
71
72 #include <machine/_inttypes.h>
73
74 #include <vm/vm.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_kern.h>
77
78 #include <xen/xen-os.h>
79 #include <xen/blkif.h>
80 #include <xen/gnttab.h>
81 #include <xen/xen_intr.h>
82
83 #include <xen/interface/event_channel.h>
84 #include <xen/interface/grant_table.h>
85
86 #include <xen/xenbus/xenbusvar.h>
87
88 /*--------------------------- Compile-time Tunables --------------------------*/
89 /**
90  * The maximum number of outstanding request blocks (request headers plus
91  * additional segment blocks) we will allow in a negotiated block-front/back
92  * communication channel.
93  */
94 #define XBB_MAX_REQUESTS        256
95
96 /**
97  * \brief Define to force all I/O to be performed on memory owned by the
98  *        backend device, with a copy-in/out to the remote domain's memory.
99  *
100  * \note  This option is currently required when this driver's domain is
101  *        operating in HVM mode on a system using an IOMMU.
102  *
103  * This driver uses Xen's grant table API to gain access to the memory of
104  * the remote domains it serves.  When our domain is operating in PV mode,
105  * the grant table mechanism directly updates our domain's page table entries
106  * to point to the physical pages of the remote domain.  This scheme guarantees
107  * that blkback and the backing devices it uses can safely perform DMA
108  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
109  * ensure that our domain cannot DMA to pages owned by another domain.  As
110  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
111  * table API.  For this reason, in HVM mode, we must bounce all requests into
112  * memory that is mapped into our domain at domain startup and thus has
113  * valid IOMMU mappings.
114  */
115 #define XBB_USE_BOUNCE_BUFFERS
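/*
 * Illustrative consequence of this option (a note, not driver code): data
 * for every request is staged through the pre-allocated bounce region, so
 * completed reads must be copied back out to the granted front-end pages;
 * see, e.g., the XBB_USE_BOUNCE_BUFFERS memcpy() in xbb_bio_done() below.
 */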
116
117 /**
118  * \brief Define to enable rudimentary request logging to the console.
119  */
120 #undef XBB_DEBUG
121
122 /*---------------------------------- Macros ----------------------------------*/
123 /**
124  * Custom malloc type for all driver allocations.
125  */
126 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
127
128 #ifdef XBB_DEBUG
129 #define DPRINTF(fmt, args...)                                   \
130     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
131 #else
132 #define DPRINTF(fmt, args...) do {} while(0)
133 #endif
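/*
 * Illustrative usage sketch (not taken from the driver itself): with
 * XBB_DEBUG defined, a call such as
 *
 *     DPRINTF("mapped %u segments\n", nseg);
 *
 * expands to a printf() prefixed with the enclosing function name and line
 * number; with XBB_DEBUG undefined it compiles away to an empty statement.
 */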
134
135 /**
136  * The maximum mapped region size per request we will allow in a negotiated
137  * block-front/back communication channel.
138  */
139 #define XBB_MAX_REQUEST_SIZE                                    \
140         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
141
142 /**
143  * The maximum number of segments (within a request header and accompanying
144  * segment blocks) per request we will allow in a negotiated block-front/back
145  * communication channel.
146  */
147 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
148         (MIN(UIO_MAXIOV,                                        \
149              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
150                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
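/*
 * Worked example (a sketch under assumed values, not definitions from this
 * file): on amd64 with PAGE_SIZE = 4096, MAXPHYS = 128 KiB, UIO_MAXIOV = 1024
 * and BLKIF_MAX_SEGMENTS_PER_REQUEST = 11, the macros above evaluate to
 *
 *     XBB_MAX_REQUEST_SIZE         = MIN(131072, 11 * 4096)     = 45056
 *     XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(11, 11 + 1)) = 11
 *
 * i.e. a single request can map at most 11 pages (44 KiB) of data.
 */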
151
152 /**
153  * The maximum number of shared memory ring pages we will allow in a
154  * negotiated block-front/back communication channel.  Allow enough
155  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
156  */
157 #define XBB_MAX_RING_PAGES                                                  \
158         BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
159                        * XBB_MAX_REQUESTS)
160 /**
161  * The maximum number of segments that we will allow per request list.
162  * We limit this to the maximum number of segments per request, because
163  * that is already a reasonable number of segments to aggregate.  This
164  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
165  * because that would leave situations where we can't dispatch even one
166  * large request.
167  */
168 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
169
170 /*--------------------------- Forward Declarations ---------------------------*/
171 struct xbb_softc;
172 struct xbb_xen_req;
173
174 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
175                               ...) __attribute__((format(printf, 3, 4)));
176 static int  xbb_shutdown(struct xbb_softc *xbb);
177 static int  xbb_detach(device_t dev);
178
179 /*------------------------------ Data Structures -----------------------------*/
180
181 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
182
183 typedef enum {
184         XBB_REQLIST_NONE        = 0x00,
185         XBB_REQLIST_MAPPED      = 0x01
186 } xbb_reqlist_flags;
187
188 struct xbb_xen_reqlist {
189         /**
190          * Back reference to the parent block back instance for this
191          * request.  Used during bio_done handling.
192          */
193         struct xbb_softc        *xbb;
194
195         /**
196          * BLKIF_OP code for this request.
197          */
198         int                      operation;
199
200         /**
201          * Set to BLKIF_RSP_* to indicate request status.
202          *
203          * This field allows an error status to be recorded even if the
204          * delivery of this status must be deferred.  Deferred reporting
205          * is necessary, for example, when an error is detected during
206          * completion processing of one bio when other bios for this
207          * request are still outstanding.
208          */
209         int                      status;
210
211         /**
212          * Number of 512 byte sectors not transferred.
213          */
214         int                      residual_512b_sectors;
215
216         /**
217          * Starting sector number of the first request in the list.
218          */
219         off_t                    starting_sector_number;
220
221         /**
222          * If we're going to coalesce, the next contiguous sector would be
223          * this one.
224          */
225         off_t                    next_contig_sector;
226
227         /**
228          * Number of child requests in the list.
229          */
230         int                      num_children;
231
232         /**
233          * Number of I/O requests still pending on the backend.
234          */
235         int                      pendcnt;
236
237         /**
238          * Total number of segments for requests in the list.
239          */
240         int                      nr_segments;
241
242         /**
243          * Flags for this particular request list.
244          */
245         xbb_reqlist_flags        flags;
246
247         /**
248          * Kernel virtual address space reserved for this request
249          * list structure and used to map the remote domain's pages for
250          * this I/O into our domain's address space.
251          */
252         uint8_t                 *kva;
253
254         /**
255          * Base pseudo-physical address corresponding to the start
256          * of this request's kva region.
257          */
258         uint64_t                 gnt_base;
259
260
261 #ifdef XBB_USE_BOUNCE_BUFFERS
262         /**
263          * Pre-allocated domain local memory used to proxy remote
264          * domain memory during I/O operations.
265          */
266         uint8_t                 *bounce;
267 #endif
268
269         /**
270          * Array of grant handles (one per page) used to map this request.
271          */
272         grant_handle_t          *gnt_handles;
273
274         /**
275          * Device statistics request ordering type (ordered or simple).
276          */
277         devstat_tag_type         ds_tag_type;
278
279         /**
280          * Device statistics request type (read, write, no_data).
281          */
282         devstat_trans_flags      ds_trans_type;
283
284         /**
285          * The start time for this request.
286          */
287         struct bintime           ds_t0;
288
289         /**
290          * Linked list of contiguous requests with the same operation type.
291          */
292         struct xbb_xen_req_list  contig_req_list;
293
294         /**
295          * Linked list links used to aggregate idle requests in the
296          * request list free pool (xbb->reqlist_free_stailq) and pending
297          * requests waiting for execution (xbb->reqlist_pending_stailq).
298          */
299         STAILQ_ENTRY(xbb_xen_reqlist) links;
300 };
301
302 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
303
304 /**
305  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
306  */
307 struct xbb_xen_req {
308         /**
309          * Linked list links used to aggregate requests into a reqlist
310          * and to store them in the request free pool.
311          */
312         STAILQ_ENTRY(xbb_xen_req) links;
313
314         /**
315          * The remote domain's identifier for this I/O request.
316          */
317         uint64_t                  id;
318
319         /**
320          * The number of pages currently mapped for this request.
321          */
322         int                       nr_pages;
323
324         /**
325          * The number of 512 byte sectors comprising this request.
326          */
327         int                       nr_512b_sectors;
328
329         /**
330          * BLKIF_OP code for this request.
331          */
332         int                       operation;
333
334         /**
335          * Storage used for non-native ring requests.
336          */
337         blkif_request_t          ring_req_storage;
338
339         /**
340          * Pointer to the Xen request in the ring.
341          */
342         blkif_request_t         *ring_req;
343
344         /**
345          * Consumer index for this request.
346          */
347         RING_IDX                 req_ring_idx;
348
349         /**
350          * The start time for this request.
351          */
352         struct bintime           ds_t0;
353
354         /**
355          * Pointer back to our parent request list.
356          */
357         struct xbb_xen_reqlist  *reqlist;
358 };
359 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
360
361 /**
362  * \brief Configuration data for the shared memory request ring
363  *        used to communicate with the front-end client of this
364  *        driver.
365  */
366 struct xbb_ring_config {
367         /** KVA address where ring memory is mapped. */
368         vm_offset_t     va;
369
370         /** The pseudo-physical address where ring memory is mapped. */
371         uint64_t        gnt_addr;
372
373         /**
374          * Grant table handles, one per-ring page, returned by the
375          * hypervisor upon mapping of the ring and required to
376          * unmap it when a connection is torn down.
377          */
378         grant_handle_t  handle[XBB_MAX_RING_PAGES];
379
380         /**
381          * The device bus address returned by the hypervisor when
382          * mapping the ring and required to unmap it when a connection
383          * is torn down.
384          */
385         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
386
387         /** The number of ring pages mapped for the current connection. */
388         u_int           ring_pages;
389
390         /**
391          * The grant references, one per-ring page, supplied by the
392          * front-end, allowing us to reference the ring pages in the
393          * front-end's domain and to map these pages into our own domain.
394          */
395         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
396
397         /** The interrupt-driven event channel used to signal ring events. */
398         evtchn_port_t   evtchn;
399 };
400
401 /**
402  * Per-instance connection state flags.
403  */
404 typedef enum
405 {
406         /**
407          * The front-end requested a read-only mount of the
408          * back-end device/file.
409          */
410         XBBF_READ_ONLY         = 0x01,
411
412         /** Communication with the front-end has been established. */
413         XBBF_RING_CONNECTED    = 0x02,
414
415         /**
416          * Front-end requests exist in the ring and are waiting for
417          * xbb_xen_req objects to free up.
418          */
419         XBBF_RESOURCE_SHORTAGE = 0x04,
420
421         /** Connection teardown in progress. */
422         XBBF_SHUTDOWN          = 0x08,
423
424         /** A thread is already performing shutdown processing. */
425         XBBF_IN_SHUTDOWN       = 0x10
426 } xbb_flag_t;
427
428 /** Backend device type.  */
429 typedef enum {
430         /** Backend type unknown. */
431         XBB_TYPE_NONE           = 0x00,
432
433         /**
434          * Backend type disk (access via cdev switch
435          * strategy routine).
436          */
437         XBB_TYPE_DISK           = 0x01,
438
439         /** Backend type file (access via vnode operations). */
440         XBB_TYPE_FILE           = 0x02
441 } xbb_type;
442
443 /**
444  * \brief Structure used to memoize information about a per-request
445  *        scatter-gather list.
446  *
447  * The chief benefit of using this data structure is it avoids having
448  * to reparse the possibly discontiguous S/G list in the original
449  * request.  Due to the way that the mapping of the memory backing an
450  * I/O transaction is handled by Xen, a second pass is unavoidable.
451  * At least this way the second walk is a simple array traversal.
452  *
453  * \note A single Scatter/Gather element in the block interface covers
454  *       at most 1 machine page.  In this context a sector (blkif
455  *       nomenclature, not what I'd choose) is a 512b aligned unit
456  *       of mapping within the machine page referenced by an S/G
457  *       element.
458  */
459 struct xbb_sg {
460         /** The number of 512b data chunks mapped in this S/G element. */
461         int16_t nsect;
462
463         /**
464          * The index (0 based) of the first 512b data chunk mapped
465          * in this S/G element.
466          */
467         uint8_t first_sect;
468
469         /**
470          * The index (0 based) of the last 512b data chunk mapped
471          * in this S/G element.
472          */
473         uint8_t last_sect;
474 };
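/*
 * Interpretation example (illustrative only): an S/G element with
 * first_sect = 2, last_sect = 7 and nsect = 6 describes the six 512b
 * chunks spanning byte offsets 1024 through 4095 of the machine page
 * referenced by that element.
 */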
475
476 /**
477  * Character device backend specific configuration data.
478  */
479 struct xbb_dev_data {
480         /** Cdev used for device backend access.  */
481         struct cdev   *cdev;
482
483         /** Cdev switch used for device backend access.  */
484         struct cdevsw *csw;
485
486         /** Used to hold a reference on opened cdev backend devices. */
487         int            dev_ref;
488 };
489
490 /**
491  * File backend specific configuration data.
492  */
493 struct xbb_file_data {
494         /** Credentials to use for vnode backed (file based) I/O. */
495         struct ucred   *cred;
496
497         /**
498          * \brief Array of io vectors used to process file based I/O.
499          *
500          * Only a single file based request is outstanding per-xbb instance,
501          * so we only need one of these.
502          */
503         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
504 #ifdef XBB_USE_BOUNCE_BUFFERS
505
506         /**
507          * \brief Array of io vectors used to handle bouncing of file reads.
508          *
509          * Vnode operations are free to modify uio data during their
510          * execution.  In the case of a read with bounce buffering active,
511          * we need some of the data from the original uio in order to
512          * bounce-out the read data.  This array serves as the temporary
513          * storage for this saved data.
514          */
515         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
516
517         /**
518          * \brief Array of memoized bounce buffer kva offsets used
519          *        in the file based backend.
520          *
521          * Due to the way that the mapping of the memory backing an
522          * I/O transaction is handled by Xen, a second pass through
523          * the request sg elements is unavoidable. We memoize the computed
524          * bounce address here to reduce the cost of the second walk.
525          */
526         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
527 #endif /* XBB_USE_BOUNCE_BUFFERS */
528 };
529
530 /**
531  * Collection of backend type specific data.
532  */
533 union xbb_backend_data {
534         struct xbb_dev_data  dev;
535         struct xbb_file_data file;
536 };
537
538 /**
539  * Function signature of backend specific I/O handlers.
540  */
541 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
542                               struct xbb_xen_reqlist *reqlist, int operation,
543                               int flags);
544
545 /**
546  * Per-instance configuration data.
547  */
548 struct xbb_softc {
549
550         /**
551          * Task-queue used to process I/O requests.
552          */
553         struct taskqueue         *io_taskqueue;
554
555         /**
556          * Single "run the request queue" task enqueued
557          * on io_taskqueue.
558          */
559         struct task               io_task;
560
561         /** Device type for this instance. */
562         xbb_type                  device_type;
563
564         /** NewBus device corresponding to this instance. */
565         device_t                  dev;
566
567         /** Backend specific dispatch routine for this instance. */
568         xbb_dispatch_t            dispatch_io;
569
570         /** The number of requests outstanding on the backend device/file. */
571         int                       active_request_count;
572
573         /** Free pool of request tracking structures. */
574         struct xbb_xen_req_list   request_free_stailq;
575
576         /** Array, sized at connection time, of request tracking structures. */
577         struct xbb_xen_req       *requests;
578
579         /** Free pool of request list structures. */
580         struct xbb_xen_reqlist_list reqlist_free_stailq;
581
582         /** List of pending request lists awaiting execution. */
583         struct xbb_xen_reqlist_list reqlist_pending_stailq;
584
585         /** Array, sized at connection time, of request list structures. */
586         struct xbb_xen_reqlist   *request_lists;
587
588         /**
589          * Global pool of kva used for mapping remote domain ring
590          * and I/O transaction data.
591          */
592         vm_offset_t               kva;
593
594         /** Pseudo-physical address corresponding to kva. */
595         uint64_t                  gnt_base_addr;
596
597         /** The size of the global kva pool. */
598         int                       kva_size;
599
600         /** The size of the KVA area used for request lists. */
601         int                       reqlist_kva_size;
602
603         /** The number of pages of KVA used for request lists */
604         int                       reqlist_kva_pages;
605
606         /** Bitmap of free KVA pages */
607         bitstr_t                 *kva_free;
608
609         /**
610          * \brief Cached value of the front-end's domain id.
611          * 
612          * This value is used once for each mapped page in
613          * a transaction.  We cache it to avoid incurring the
614          * cost of an ivar access every time this is needed.
615          */
616         domid_t                   otherend_id;
617
618         /**
619          * \brief The blkif protocol abi in effect.
620          *
621          * There are situations where the back and front ends can
622          * have a different, native abi (e.g. intel x86_64 and
623          * 32bit x86 domains on the same machine).  The back-end
624          * always accommodates the front-end's native abi.  That
625          * value is pulled from the XenStore and recorded here.
626          */
627         int                       abi;
628
629         /**
630          * \brief The maximum number of requests and request lists allowed
631          *        to be in flight at a time.
632          *
633          * This value is negotiated via the XenStore.
634          */
635         u_int                     max_requests;
636
637         /**
638          * \brief The maximum number of segments (1 page per segment)
639          *        that can be mapped by a request.
640          *
641          * This value is negotiated via the XenStore.
642          */
643         u_int                     max_request_segments;
644
645         /**
646          * \brief Maximum number of segments per request list.
647          *
648          * This value is derived from and will generally be larger than
649          * max_request_segments.
650          */
651         u_int                     max_reqlist_segments;
652
653         /**
654          * The maximum size of any request to this back-end
655          * device.
656          *
657          * This value is negotiated via the XenStore.
658          */
659         u_int                     max_request_size;
660
661         /**
662          * The maximum size of any request list.  This is derived directly
663          * from max_reqlist_segments.
664          */
665         u_int                     max_reqlist_size;
666
667         /** Various configuration and state bit flags. */
668         xbb_flag_t                flags;
669
670         /** Ring mapping and interrupt configuration data. */
671         struct xbb_ring_config    ring_config;
672
673         /** Runtime, cross-abi safe, structures for ring access. */
674         blkif_back_rings_t        rings;
675
676         /** IRQ mapping for the communication ring event channel. */
677         xen_intr_handle_t         xen_intr_handle;
678
679         /**
680          * \brief Backend access mode flags (e.g. write, or read-only).
681          *
682          * This value is passed to us by the front-end via the XenStore.
683          */
684         char                     *dev_mode;
685
686         /**
687          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
688          *
689          * This value is passed to us by the front-end via the XenStore.
690          * Currently unused.
691          */
692         char                     *dev_type;
693
694         /**
695          * \brief Backend device/file identifier.
696          *
697          * This value is passed to us by the front-end via the XenStore.
698          * We expect this to be a POSIX path indicating the file or
699          * device to open.
700          */
701         char                     *dev_name;
702
703         /**
704          * Vnode corresponding to the backend device node or file
705          * we are accessing.
706          */
707         struct vnode             *vn;
708
709         union xbb_backend_data    backend;
710
711         /** The native sector size of the backend. */
712         u_int                     sector_size;
713
714         /** log2 of sector_size.  */
715         u_int                     sector_size_shift;
716
717         /** Size in bytes of the backend device or file.  */
718         off_t                     media_size;
719
720         /**
721          * \brief media_size expressed in terms of the backend native
722          *        sector size.
723          *
724          * (e.g. xbb->media_size >> xbb->sector_size_shift).
725          */
726         uint64_t                  media_num_sectors;
727
728         /**
729          * \brief Array of memoized scatter gather data computed during the
730          *        conversion of blkif ring requests to internal xbb_xen_req
731          *        structures.
732          *
733          * Ring processing is serialized so we only need one of these.
734          */
735         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
736
737         /**
738          * Temporary grant table map used in xbb_dispatch_io().  When
739          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
740          * stack could cause a stack overflow.
741          */
742         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
743
744         /** Mutex protecting per-instance data. */
745         struct mtx                lock;
746
747 #ifdef XENHVM
748         /**
749          * Resource representing allocated physical address space
750          * associated with our per-instance kva region.
751          */
752         struct resource          *pseudo_phys_res;
753
754         /** Resource id for allocated physical address space. */
755         int                       pseudo_phys_res_id;
756 #endif
757
758         /**
759          * I/O statistics from BlockBack dispatch down.  These are
760          * coalesced requests, and we start them right before execution.
761          */
762         struct devstat           *xbb_stats;
763
764         /**
765          * I/O statistics coming into BlockBack.  These are the requests as
766          * we get them from BlockFront.  They are started as soon as we
767          * receive a request, and completed when the I/O is complete.
768          */
769         struct devstat           *xbb_stats_in;
770
771         /** Disable sending flush to the backend */
772         int                       disable_flush;
773
774         /** Send a real flush for every N flush requests */
775         int                       flush_interval;
776
777         /** Count of flush requests in the interval */
778         int                       flush_count;
779
780         /** Don't coalesce requests if this is set */
781         int                       no_coalesce_reqs;
782
783         /** Number of requests we have received */
784         uint64_t                  reqs_received;
785
786         /** Number of requests we have completed */
787         uint64_t                  reqs_completed;
788
789         /** How many forced dispatches (i.e. without coalescing) have happened */
790         uint64_t                  forced_dispatch;
791
792         /** How many normal dispatches have happened */
793         uint64_t                  normal_dispatch;
794
795         /** How many total dispatches have happened */
796         uint64_t                  total_dispatch;
797
798         /** How many times we have run out of KVA */
799         uint64_t                  kva_shortages;
800
801         /** How many times we have run out of request structures */
802         uint64_t                  request_shortages;
803 };
804
805 /*---------------------------- Request Processing ----------------------------*/
806 /**
807  * Allocate an internal transaction tracking structure from the free pool.
808  *
809  * \param xbb  Per-instance xbb configuration structure.
810  *
811  * \return  On success, a pointer to the allocated xbb_xen_req structure.
812  *          Otherwise NULL.
813  */
814 static inline struct xbb_xen_req *
815 xbb_get_req(struct xbb_softc *xbb)
816 {
817         struct xbb_xen_req *req;
818
819         req = NULL;
820
821         mtx_assert(&xbb->lock, MA_OWNED);
822
823         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
824                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
825                 xbb->active_request_count++;
826         }
827
828         return (req);
829 }
830
831 /**
832  * Return an allocated transaction tracking structure to the free pool.
833  *
834  * \param xbb  Per-instance xbb configuration structure.
835  * \param req  The request structure to free.
836  */
837 static inline void
838 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
839 {
840         mtx_assert(&xbb->lock, MA_OWNED);
841
842         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
843         xbb->active_request_count--;
844
845         KASSERT(xbb->active_request_count >= 0,
846                 ("xbb_release_req: negative active count"));
847 }
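/*
 * Illustrative caller pattern (a sketch, not an actual call site in this
 * file): both helpers above assert that xbb->lock is held, so a hypothetical
 * caller brackets them itself:
 *
 *     mtx_lock(&xbb->lock);
 *     req = xbb_get_req(xbb);
 *     mtx_unlock(&xbb->lock);
 *     if (req == NULL) {
 *             // resource shortage; retry after a request is released
 *     }
 */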
848
849 /**
850  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
851  *
852  * \param xbb       Per-instance xbb configuration structure.
853  * \param req_list  The list of requests to free.
854  * \param nreqs     The number of items in the list.
855  */
856 static inline void
857 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
858                  int nreqs)
859 {
860         mtx_assert(&xbb->lock, MA_OWNED);
861
862         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
863         xbb->active_request_count -= nreqs;
864
865         KASSERT(xbb->active_request_count >= 0,
866                 ("xbb_release_reqs: negative active count"));
867 }
868
869 /**
870  * Given a page index and 512b sector offset within that page,
871  * calculate an offset into a request's kva region.
872  *
873  * \param reqlist The request structure whose kva region will be accessed.
874  * \param pagenr  The page index used to compute the kva offset.
875  * \param sector  The 512b sector index used to compute the page relative
876  *                kva offset.
877  *
878  * \return  The computed global KVA address.
879  */
880 static inline uint8_t *
881 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
882 {
883         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
884 }
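/*
 * Example of the arithmetic above (illustrative values): with 4 KiB pages,
 * pagenr = 2 and sector = 3 yield reqlist->kva + 2 * 4096 + 3 * 512,
 * i.e. an offset of 9728 bytes into the request list's kva region.
 */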
885
886 #ifdef XBB_USE_BOUNCE_BUFFERS
887 /**
888  * Given a page index and 512b sector offset within that page,
889  * calculate an offset into a request's local bounce memory region.
890  *
891  * \param reqlist The request structure whose bounce region will be accessed.
892  * \param pagenr  The page index used to compute the bounce offset.
893  * \param sector  The 512b sector index used to compute the page relative
894  *                bounce offset.
895  *
896  * \return  The computed global bounce buffer address.
897  */
898 static inline uint8_t *
899 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
900 {
901         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
902 }
903 #endif
904
905 /**
906  * Given a page number and 512b sector offset within that page,
907  * calculate an offset into the request's memory region that the
908  * underlying backend device/file should use for I/O.
909  *
910  * \param reqlist The request structure whose I/O region will be accessed.
911  * \param pagenr  The page index used to compute the I/O offset.
912  * \param sector  The 512b sector index used to compute the page relative
913  *                I/O offset.
914  *
915  * \return  The computed global I/O address.
916  *
917  * Depending on configuration, this will either be a local bounce buffer
918  * or a pointer to the memory mapped in from the front-end domain for
919  * this request.
920  */
921 static inline uint8_t *
922 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
923 {
924 #ifdef XBB_USE_BOUNCE_BUFFERS
925         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
926 #else
927         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
928 #endif
929 }
930
931 /**
932  * Given a page index and 512b sector offset within that page, calculate
933  * an offset into the local pseudo-physical address space used to map a
934  * front-end's request data into a request.
935  *
936  * \param reqlist The request list structure whose pseudo-physical region
937  *                will be accessed.
938  * \param pagenr  The page index used to compute the pseudo-physical offset.
939  * \param sector  The 512b sector index used to compute the page relative
940  *                pseudo-physical offset.
941  *
942  * \return  The computed global pseudo-physical address.
947  */
948 static inline uintptr_t
949 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
950 {
951         struct xbb_softc *xbb;
952
953         xbb = reqlist->xbb;
954
955         return ((uintptr_t)(xbb->gnt_base_addr +
956                 (uintptr_t)(reqlist->kva - xbb->kva) +
957                 (PAGE_SIZE * pagenr) + (sector << 9)));
958 }
959
960 /**
961  * Get Kernel Virtual Address space for mapping requests.
962  *
963  * \param xbb         Per-instance xbb configuration structure.
964  * \param nr_pages    Number of pages needed.
967  *
968  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
969  *
970  * Note:  This should be unnecessary once we have either chaining or
971  * scatter/gather support for struct bio.  At that point we'll be able to
972  * put multiple addresses and lengths in one bio/bio chain and won't need
973  * to map everything into one virtual segment.
974  */
975 static uint8_t *
976 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
977 {
978         intptr_t first_clear;
979         intptr_t num_clear;
980         uint8_t *free_kva;
981         int      i;
982
983         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
984
985         first_clear = 0;
986         free_kva = NULL;
987
988         mtx_lock(&xbb->lock);
989
990         /*
991          * Look for the first available page.  If there are none, we're done.
992          */
993         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
994
995         if (first_clear == -1)
996                 goto bailout;
997
998         /*
999          * Starting at the first available page, look for consecutive free
1000          * pages that will satisfy the user's request.
1001          */
1002         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1003                 /*
1004                  * If this is true, the page is used, so we have to reset
1005                  * the number of clear pages and the first clear page
1006                  * (since it pointed to a region with an insufficient number
1007                  * of clear pages).
1008                  */
1009                 if (bit_test(xbb->kva_free, i)) {
1010                         num_clear = 0;
1011                         first_clear = -1;
1012                         continue;
1013                 }
1014
1015                 if (first_clear == -1)
1016                         first_clear = i;
1017
1018                 /*
1019                  * If this is true, we've found a large enough free region
1020                  * to satisfy the request.
1021                  */
1022                 if (++num_clear == nr_pages) {
1023
1024                         bit_nset(xbb->kva_free, first_clear,
1025                                  first_clear + nr_pages - 1);
1026
1027                         free_kva = xbb->kva +
1028                                 (uint8_t *)(first_clear * PAGE_SIZE);
1029
1030                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1031                                 free_kva + (nr_pages * PAGE_SIZE) <=
1032                                 (uint8_t *)xbb->ring_config.va,
1033                                 ("Free KVA %p len %d out of range, "
1034                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1035                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1036                                  (uintmax_t)xbb->ring_config.va));
1037                         break;
1038                 }
1039         }
1040
1041 bailout:
1042
1043         if (free_kva == NULL) {
1044                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1045                 xbb->kva_shortages++;
1046         }
1047
1048         mtx_unlock(&xbb->lock);
1049
1050         return (free_kva);
1051 }
1052
1053 /**
1054  * Free allocated KVA.
1055  *
1056  * \param xbb       Per-instance xbb configuration structure.
1057  * \param kva_ptr   Pointer to allocated KVA region.  
1058  * \param nr_pages  Number of pages in the KVA region.
1059  */
1060 static void
1061 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1062 {
1063         intptr_t start_page;
1064
1065         mtx_assert(&xbb->lock, MA_OWNED);
1066
1067         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1068         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1069
1070 }
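/*
 * Locking note with an illustrative pairing (a sketch, not a call site in
 * this file): xbb_get_kva() acquires xbb->lock internally, while
 * xbb_free_kva() asserts that the caller already holds it:
 *
 *     kva = xbb_get_kva(xbb, nr_pages);
 *     if (kva != NULL) {
 *             ... perform the mapping and I/O ...
 *             mtx_lock(&xbb->lock);
 *             xbb_free_kva(xbb, kva, nr_pages);
 *             mtx_unlock(&xbb->lock);
 *     }
 */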
1071
1072 /**
1073  * Unmap the front-end pages associated with this I/O request.
1074  *
1075  * \param reqlist  The request list structure to unmap.
1076  */
1077 static void
1078 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1079 {
1080         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1081         u_int                         i;
1082         u_int                         invcount;
1083         int                           error;
1084
1085         invcount = 0;
1086         for (i = 0; i < reqlist->nr_segments; i++) {
1087
1088                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1089                         continue;
1090
1091                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1092                 unmap[invcount].dev_bus_addr = 0;
1093                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1094                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1095                 invcount++;
1096         }
1097
1098         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1099                                           unmap, invcount);
1100         KASSERT(error == 0, ("Grant table operation failed"));
1101 }
1102
1103 /**
1104  * Allocate an internal transaction tracking structure from the free pool.
1105  *
1106  * \param xbb  Per-instance xbb configuration structure.
1107  *
1108  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1109  *          Otherwise NULL.
1110  */
1111 static inline struct xbb_xen_reqlist *
1112 xbb_get_reqlist(struct xbb_softc *xbb)
1113 {
1114         struct xbb_xen_reqlist *reqlist;
1115
1116         reqlist = NULL;
1117
1118         mtx_assert(&xbb->lock, MA_OWNED);
1119
1120         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1121
1122                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1123                 reqlist->flags = XBB_REQLIST_NONE;
1124                 reqlist->kva = NULL;
1125                 reqlist->status = BLKIF_RSP_OKAY;
1126                 reqlist->residual_512b_sectors = 0;
1127                 reqlist->num_children = 0;
1128                 reqlist->nr_segments = 0;
1129                 STAILQ_INIT(&reqlist->contig_req_list);
1130         }
1131
1132         return (reqlist);
1133 }
1134
1135 /**
1136  * Return an allocated transaction tracking structure to the free pool.
1137  *
1138  * \param xbb        Per-instance xbb configuration structure.
1139  * \param reqlist    The request list structure to free.
1140  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
1141  *                   during a resource shortage condition.
1142  */
1143 static inline void
1144 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1145                     int wakeup)
1146 {
1147
1148         mtx_lock(&xbb->lock);
1149
1150         if (wakeup) {
1151                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1152                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1153         }
1154
1155         if (reqlist->kva != NULL)
1156                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1157
1158         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1159
1160         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1161
1162         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1163                 /*
1164                  * Shutdown is in progress.  See if we can
1165                  * progress further now that one more request
1166                  * has completed and been returned to the
1167                  * free pool.
1168                  */
1169                 xbb_shutdown(xbb);
1170         }
1171
1172         mtx_unlock(&xbb->lock);
1173
1174         if (wakeup != 0)
1175                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1176 }
1177
1178 /**
1179  * Request resources and do basic request setup.
1180  *
1181  * \param xbb          Per-instance xbb configuration structure.
1182  * \param reqlist      Pointer to reqlist pointer.
1183  * \param ring_req     Pointer to a block ring request.
1184  * \param ring_idx     The ring index of this request.
1185  *
1186  * \return  0 for success, non-zero for failure.
1187  */
1188 static int
1189 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1190                   blkif_request_t *ring_req, RING_IDX ring_idx)
1191 {
1192         struct xbb_xen_reqlist *nreqlist;
1193         struct xbb_xen_req     *nreq;
1194
1195         nreqlist = NULL;
1196         nreq     = NULL;
1197
1198         mtx_lock(&xbb->lock);
1199
1200         /*
1201          * We don't allow new resources to be allocated if we're in the
1202          * process of shutting down.
1203          */
1204         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1205                 mtx_unlock(&xbb->lock);
1206                 return (1);
1207         }
1208
1209         /*
1210          * Allocate a reqlist if the caller doesn't have one already.
1211          */
1212         if (*reqlist == NULL) {
1213                 nreqlist = xbb_get_reqlist(xbb);
1214                 if (nreqlist == NULL)
1215                         goto bailout_error;
1216         }
1217
1218         /* We always allocate a request. */
1219         nreq = xbb_get_req(xbb);
1220         if (nreq == NULL)
1221                 goto bailout_error;
1222
1223         mtx_unlock(&xbb->lock);
1224
1225         if (*reqlist == NULL) {
1226                 *reqlist = nreqlist;
1227                 nreqlist->operation = ring_req->operation;
1228                 nreqlist->starting_sector_number = ring_req->sector_number;
1229                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1230                                    links);
1231         }
1232
1233         nreq->reqlist = *reqlist;
1234         nreq->req_ring_idx = ring_idx;
1235         nreq->id = ring_req->id;
1236         nreq->operation = ring_req->operation;
1237
1238         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1239                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1240                 nreq->ring_req = &nreq->ring_req_storage;
1241         } else {
1242                 nreq->ring_req = ring_req;
1243         }
1244
1245         binuptime(&nreq->ds_t0);
1246         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1247         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1248         (*reqlist)->num_children++;
1249         (*reqlist)->nr_segments += ring_req->nr_segments;
1250
1251         return (0);
1252
1253 bailout_error:
1254
1255         /*
1256          * We're out of resources, so set the shortage flag.  The next time
1257          * a request is released, we'll try waking up the work thread to
1258          * see if we can allocate more resources.
1259          */
1260         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1261         xbb->request_shortages++;
1262
1263         if (nreq != NULL)
1264                 xbb_release_req(xbb, nreq);
1265
1266         mtx_unlock(&xbb->lock);
1267
1268         if (nreqlist != NULL)
1269                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1270
1271         return (1);
1272 }
1273
1274 /**
1275  * Create and transmit a response to a blkif request.
1276  * 
1277  * \param xbb     Per-instance xbb configuration structure.
1278  * \param req     The request structure to which to respond.
1279  * \param status  The status code to report.  See BLKIF_RSP_*
1280  *                in sys/xen/interface/io/blkif.h.
1281  */
1282 static void
1283 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1284 {
1285         blkif_response_t *resp;
1286         int               more_to_do;
1287         int               notify;
1288
1289         more_to_do = 0;
1290
1291         /*
1292          * Place on the response ring for the relevant domain.
1293          * For now, only the spacing between entries is different
1294          * in the different ABIs, not the response entry layout.
1295          */
1296         mtx_lock(&xbb->lock);
1297         switch (xbb->abi) {
1298         case BLKIF_PROTOCOL_NATIVE:
1299                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1300                                          xbb->rings.native.rsp_prod_pvt);
1301                 break;
1302         case BLKIF_PROTOCOL_X86_32:
1303                 resp = (blkif_response_t *)
1304                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1305                                       xbb->rings.x86_32.rsp_prod_pvt);
1306                 break;
1307         case BLKIF_PROTOCOL_X86_64:
1308                 resp = (blkif_response_t *)
1309                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1310                                       xbb->rings.x86_64.rsp_prod_pvt);
1311                 break;
1312         default:
1313                 panic("Unexpected blkif protocol ABI.");
1314         }
1315
1316         resp->id        = req->id;
1317         resp->operation = req->operation;
1318         resp->status    = status;
1319
1320         xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1321         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1322
1323         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1324
1325                 /*
1326                  * Tail check for pending requests. Allows frontend to avoid
1327                  * notifications if requests are already in flight (lower
1328                  * overheads and promotes batching).
1329                  */
1330                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1331         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1332
1333                 more_to_do = 1;
1334         }
1335
1336         xbb->reqs_completed++;
1337
1338         mtx_unlock(&xbb->lock);
1339
1340         if (more_to_do)
1341                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1342
1343         if (notify)
1344                 xen_intr_signal(xbb->xen_intr_handle);
1345 }
1346
1347 /**
1348  * Complete a request list.
1349  *
1350  * \param xbb        Per-instance xbb configuration structure.
1351  * \param reqlist    Allocated internal request list structure.
1352  */
1353 static void
1354 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1355 {
1356         struct xbb_xen_req *nreq;
1357         off_t               sectors_sent;
1358
1359         sectors_sent = 0;
1360
1361         if (reqlist->flags & XBB_REQLIST_MAPPED)
1362                 xbb_unmap_reqlist(reqlist);
1363
1364         /*
1365          * All I/O is done, send the response.  A lock should not be
1366          * necessary here because the request list is complete, and
1367          * therefore this is the only context accessing this request
1368          * right now.  The functions we call do their own locking if
1369          * necessary.
1370          */
1371         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1372                 off_t cur_sectors_sent;
1373
1374                 xbb_send_response(xbb, nreq, reqlist->status);
1375
1376                 /* We don't report bytes sent if there is an error. */
1377                 if (reqlist->status == BLKIF_RSP_OKAY)
1378                         cur_sectors_sent = nreq->nr_512b_sectors;
1379                 else
1380                         cur_sectors_sent = 0;
1381
1382                 sectors_sent += cur_sectors_sent;
1383
1384                 devstat_end_transaction(xbb->xbb_stats_in,
1385                                         /*bytes*/cur_sectors_sent << 9,
1386                                         reqlist->ds_tag_type,
1387                                         reqlist->ds_trans_type,
1388                                         /*now*/NULL,
1389                                         /*then*/&nreq->ds_t0);
1390         }
1391
1392         /*
1393          * Take out any sectors not sent.  If we wind up negative (which
1394          * might happen if an error is reported as well as a residual), just
1395          * report 0 sectors sent.
1396          */
1397         sectors_sent -= reqlist->residual_512b_sectors;
1398         if (sectors_sent < 0)
1399                 sectors_sent = 0;
1400
1401         devstat_end_transaction(xbb->xbb_stats,
1402                                 /*bytes*/ sectors_sent << 9,
1403                                 reqlist->ds_tag_type,
1404                                 reqlist->ds_trans_type,
1405                                 /*now*/NULL,
1406                                 /*then*/&reqlist->ds_t0);
1407
1408         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1409 }
1410
1411 /**
1412  * Completion handler for buffer I/O requests issued by the device
1413  * backend driver.
1414  *
1415  * \param bio  The buffer I/O request on which to perform completion
1416  *             processing.
1417  */
1418 static void
1419 xbb_bio_done(struct bio *bio)
1420 {
1421         struct xbb_softc       *xbb;
1422         struct xbb_xen_reqlist *reqlist;
1423
1424         reqlist = bio->bio_caller1;
1425         xbb     = reqlist->xbb;
1426
1427         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1428
1429         /*
1430          * This is a bit imprecise.  With aggregated I/O a single
1431          * request list can contain multiple front-end requests and
1432          * a multiple bios may point to a single request.  By carefully
1433          * walking the request list, we could map residuals and errors
1434          * back to the original front-end request, but the interface
1435          * isn't sufficiently rich for us to properly report the error.
1436          * So, we just treat the entire request list as having failed if an
1437          * error occurs on any part.  And, if an error occurs, we treat
1438          * the amount of data transferred as 0.
1439          *
1440          * For residuals, we report it on the overall aggregated device,
1441          * but not on the individual requests, since we don't currently
1442          * do the work to determine the front-end request to which the
1443          * residual applies.
1444          */
1445         if (bio->bio_error) {
1446                 DPRINTF("BIO returned error %d for operation on device %s\n",
1447                         bio->bio_error, xbb->dev_name);
1448                 reqlist->status = BLKIF_RSP_ERROR;
1449
1450                 if (bio->bio_error == ENXIO
1451                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1452
1453                         /*
1454                          * Backend device has disappeared.  Signal the
1455                          * front-end that we (the device proxy) want to
1456                          * go away.
1457                          */
1458                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1459                 }
1460         }
1461
1462 #ifdef XBB_USE_BOUNCE_BUFFERS
1463         if (bio->bio_cmd == BIO_READ) {
1464                 vm_offset_t kva_offset;
1465
1466                 kva_offset = (vm_offset_t)bio->bio_data
1467                            - (vm_offset_t)reqlist->bounce;
1468                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1469                        bio->bio_data, bio->bio_bcount);
1470         }
1471 #endif /* XBB_USE_BOUNCE_BUFFERS */
1472
1473         /*
1474          * Decrement the pending count for the request list.  When we're
1475          * done with the requests, send status back for all of them.
1476          */
1477         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1478                 xbb_complete_reqlist(xbb, reqlist);
1479
1480         g_destroy_bio(bio);
1481 }
1482
1483 /**
1484  * Parse a blkif request into an internal request structure and send
1485  * it to the backend for processing.
1486  *
1487  * \param xbb       Per-instance xbb configuration structure.
1488  * \param reqlist   Allocated internal request list structure.
1489  *
1490  * \return          On success, 0.  For resource shortages, non-zero.
1491  *  
1492  * This routine performs the backend common aspects of request parsing
1493  * including compiling an internal request structure, parsing the S/G
1494  * list and any secondary ring requests in which they may reside, and
1495  * the mapping of front-end I/O pages into our domain.
1496  */
1497 static int
1498 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1499 {
1500         struct xbb_sg                *xbb_sg;
1501         struct gnttab_map_grant_ref  *map;
1502         struct blkif_request_segment *sg;
1503         struct blkif_request_segment *last_block_sg;
1504         struct xbb_xen_req           *nreq;
1505         u_int                         nseg;
1506         u_int                         seg_idx;
1507         u_int                         block_segs;
1508         int                           nr_sects;
1509         int                           total_sects;
1510         int                           operation;
1511         uint8_t                       bio_flags;
1512         int                           error;
1513
1514         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1515         bio_flags            = 0;
1516         total_sects          = 0;
1517         nr_sects             = 0;
1518
1519         /*
1520          * First determine whether we have enough free KVA to satisfy this
1521          * request list.  If not, tell xbb_run_queue() so it can go to
1522          * sleep until we have more KVA.
1523          */
1524         reqlist->kva = NULL;
1525         if (reqlist->nr_segments != 0) {
1526                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1527                 if (reqlist->kva == NULL) {
1528                         /*
1529                          * If we're out of KVA, return ENOMEM.
1530                          */
1531                         return (ENOMEM);
1532                 }
1533         }
1534
1535         binuptime(&reqlist->ds_t0);
1536         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1537
1538         switch (reqlist->operation) {
1539         case BLKIF_OP_WRITE_BARRIER:
1540                 bio_flags       |= BIO_ORDERED;
1541                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1542                 /* FALLTHROUGH */
1543         case BLKIF_OP_WRITE:
1544                 operation = BIO_WRITE;
1545                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1546                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1547                         DPRINTF("Attempt to write to read only device %s\n",
1548                                 xbb->dev_name);
1549                         reqlist->status = BLKIF_RSP_ERROR;
1550                         goto send_response;
1551                 }
1552                 break;
1553         case BLKIF_OP_READ:
1554                 operation = BIO_READ;
1555                 reqlist->ds_trans_type = DEVSTAT_READ;
1556                 break;
1557         case BLKIF_OP_FLUSH_DISKCACHE:
1558                 /*
1559                  * If the user has requested that we disable flush
1560                  * support, just complete the requests successfully
1561                  * without issuing a flush to the backend.
1562                  */
1563                 if (xbb->disable_flush != 0) {
1564                         goto send_response;
1565                 }
1566
1567                 /*
1568                  * The user has requested that we only send a real flush
1569                  * for every N flush requests.  So keep count, and either
1570                  * complete the request immediately or queue it for the
1571                  * backend.
1572                  */
1573                 if (xbb->flush_interval != 0) {
1574                         if (++(xbb->flush_count) < xbb->flush_interval)
1575                                 goto send_response;
1576                         else
1577                                 xbb->flush_count = 0;
1578                 }
1579
1580                 operation = BIO_FLUSH;
1581                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1582                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1583                 goto do_dispatch;
1584                 /*NOTREACHED*/
1585         default:
1586                 DPRINTF("error: unknown block io operation [%d]\n",
1587                         reqlist->operation);
1588                 reqlist->status = BLKIF_RSP_ERROR;
1589                 goto send_response;
1590         }
1591
1592         reqlist->xbb  = xbb;
1593         xbb_sg        = xbb->xbb_sgs;
1594         map           = xbb->maps;
1595         seg_idx       = 0;
1596
1597         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1598                 blkif_request_t         *ring_req;
1599                 RING_IDX                 req_ring_idx;
1600                 u_int                    req_seg_idx;
1601
1602                 ring_req              = nreq->ring_req;
1603                 req_ring_idx          = nreq->req_ring_idx;
1604                 nr_sects              = 0;
1605                 nseg                  = ring_req->nr_segments;
1606                 nreq->nr_pages        = nseg;
1607                 nreq->nr_512b_sectors = 0;
1608                 req_seg_idx           = 0;
1609                 sg                    = NULL;
1610
1611                 /* Check that number of segments is sane. */
1612                 if (__predict_false(nseg == 0)
1613                  || __predict_false(nseg > xbb->max_request_segments)) {
1614                         DPRINTF("Bad number of segments in request (%d)\n",
1615                                 nseg);
1616                         reqlist->status = BLKIF_RSP_ERROR;
1617                         goto send_response;
1618                 }
1619
1620                 block_segs    = MIN(nreq->nr_pages,
1621                                     BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1622                 sg            = ring_req->seg;
1623                 last_block_sg = sg + block_segs;
1624                 while (1) {
1625
1626                         while (sg < last_block_sg) {
1627                                 KASSERT(seg_idx <
1628                                         XBB_MAX_SEGMENTS_PER_REQLIST,
1629                                         ("seg_idx %d is too large, max "
1630                                         "segs %d\n", seg_idx,
1631                                         XBB_MAX_SEGMENTS_PER_REQLIST));
1632                         
1633                                 xbb_sg->first_sect = sg->first_sect;
1634                                 xbb_sg->last_sect  = sg->last_sect;
1635                                 xbb_sg->nsect =
1636                                     (int8_t)(sg->last_sect -
1637                                     sg->first_sect + 1);
1638
1639                                 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1640                                  || (xbb_sg->nsect <= 0)) {
1641                                         reqlist->status = BLKIF_RSP_ERROR;
1642                                         goto send_response;
1643                                 }
1644
1645                                 nr_sects += xbb_sg->nsect;
1646                                 map->host_addr = xbb_get_gntaddr(reqlist,
1647                                                         seg_idx, /*sector*/0);
1648                                 KASSERT(map->host_addr + PAGE_SIZE <=
1649                                         xbb->ring_config.gnt_addr,
1650                                         ("Host address %#jx len %d overlaps "
1651                                          "ring address %#jx\n",
1652                                         (uintmax_t)map->host_addr, PAGE_SIZE,
1653                                         (uintmax_t)xbb->ring_config.gnt_addr));
1654                                         
1655                                 map->flags     = GNTMAP_host_map;
1656                                 map->ref       = sg->gref;
1657                                 map->dom       = xbb->otherend_id;
1658                                 if (operation == BIO_WRITE)
1659                                         map->flags |= GNTMAP_readonly;
1660                                 sg++;
1661                                 map++;
1662                                 xbb_sg++;
1663                                 seg_idx++;
1664                                 req_seg_idx++;
1665                         }
1666
1667                         block_segs = MIN(nseg - req_seg_idx,
1668                                          BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1669                         if (block_segs == 0)
1670                                 break;
1671
1672                         /*
1673                          * Fetch the next request block full of SG elements.
1674                          * For now, only the spacing between entries is
1675                          * different in the different ABIs, not the sg entry
1676                          * layout.
1677                          */
1678                         req_ring_idx++;
1679                         switch (xbb->abi) {
1680                         case BLKIF_PROTOCOL_NATIVE:
1681                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.native,
1682                                                            req_ring_idx);
1683                                 break;
1684                         case BLKIF_PROTOCOL_X86_32:
1685                         {
1686                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_32,
1687                                                            req_ring_idx);
1688                                 break;
1689                         }
1690                         case BLKIF_PROTOCOL_X86_64:
1691                         {
1692                                 sg = BLKRING_GET_SEG_BLOCK(&xbb->rings.x86_64,
1693                                                            req_ring_idx);
1694                                 break;
1695                         }
1696                         default:
1697                                 panic("Unexpected blkif protocol ABI.");
1698                                 /* NOTREACHED */
1699                         } 
1700                         last_block_sg = sg + block_segs;
1701                 }
1702
1703                 /* Convert to the disk's sector size */
1704                 nreq->nr_512b_sectors = nr_sects;
1705                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1706                 total_sects += nr_sects;
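                /*
                 * For example, with a 4096-byte backing-store sector size
                 * (sector_size_shift == 12), a request spanning 64 512-byte
                 * sectors converts to (64 << 9) >> 12 == 8 native sectors.
                 */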
1707
1708                 if ((nreq->nr_512b_sectors &
1709                     ((xbb->sector_size >> 9) - 1)) != 0) {
1710                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1711                                       "a multiple of the backing store sector "
1712                                       "size (%d)\n", __func__,
1713                                       nreq->nr_512b_sectors << 9,
1714                                       xbb->sector_size);
1715                         reqlist->status = BLKIF_RSP_ERROR;
1716                         goto send_response;
1717                 }
1718         }
1719
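        /*
         * Batch-map every front-end grant reference in this request list
         * with a single grant table hypercall.
         */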
1720         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1721                                           xbb->maps, reqlist->nr_segments);
1722         if (error != 0)
1723                 panic("Grant table operation failed (%d)", error);
1724
1725         reqlist->flags |= XBB_REQLIST_MAPPED;
1726
1727         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1728              seg_idx++, map++){
1729
1730                 if (__predict_false(map->status != 0)) {
1731                         DPRINTF("invalid buffer -- could not remap "
1732                                 "it (%d)\n", map->status);
1733                         DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1734                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1735                                 map->host_addr, map->flags, map->ref,
1736                                 map->dom);
1737                         reqlist->status = BLKIF_RSP_ERROR;
1738                         goto send_response;
1739                 }
1740
1741                 reqlist->gnt_handles[seg_idx] = map->handle;
1742         }
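        /* Reject I/O that would extend past the end of the backing store. */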
1743         if (reqlist->starting_sector_number + total_sects >
1744             xbb->media_num_sectors) {
1745
1746                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1747                         "extends past end of device %s\n",
1748                         operation == BIO_READ ? "read" : "write",
1749                         reqlist->starting_sector_number,
1750                         reqlist->starting_sector_number + total_sects,
1751                         xbb->dev_name); 
1752                 reqlist->status = BLKIF_RSP_ERROR;
1753                 goto send_response;
1754         }
1755
1756 do_dispatch:
1757
1758         error = xbb->dispatch_io(xbb,
1759                                  reqlist,
1760                                  operation,
1761                                  bio_flags);
1762
1763         if (error != 0) {
1764                 reqlist->status = BLKIF_RSP_ERROR;
1765                 goto send_response;
1766         }
1767
1768         return (0);
1769
1770 send_response:
1771
1772         xbb_complete_reqlist(xbb, reqlist);
1773
1774         return (0);
1775 }
1776
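/**
 * Count the number of 512-byte sectors described by a ring request's
 * segment list, stopping at the first segment with an invalid length.
 *
 * \param ring_req  The front-end request to examine.
 *
 * \return  The number of 512-byte sectors spanned by the request.
 */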
1777 static __inline int
1778 xbb_count_sects(blkif_request_t *ring_req)
1779 {
1780         int i;
1781         int cur_size = 0;
1782
1783         for (i = 0; i < ring_req->nr_segments; i++) {
1784                 int nsect;
1785
1786                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1787                         ring_req->seg[i].first_sect + 1);
1788                 if (nsect <= 0)
1789                         break;
1790
1791                 cur_size += nsect;
1792         }
1793
1794         return (cur_size);
1795 }
1796
1797 /**
1798  * Process incoming requests from the shared communication ring in response
1799  * to a signal on the ring's event channel.
1800  *
1801  * \param context  Callback argument registered during task initialization -
1802  *                 the xbb_softc for this instance.
1803  * \param pending  The number of taskqueue_enqueue events that have
1804  *                 occurred since this handler was last run.
1805  */
1806 static void
1807 xbb_run_queue(void *context, int pending)
1808 {
1809         struct xbb_softc       *xbb;
1810         blkif_back_rings_t     *rings;
1811         RING_IDX                rp;
1812         uint64_t                cur_sector;
1813         int                     cur_operation;
1814         struct xbb_xen_reqlist *reqlist;
1815
1816
1817         xbb   = (struct xbb_softc *)context;
1818         rings = &xbb->rings;
1819
1820         /*
1821          * Work gather and dispatch loop.  Note that we have a bias here
1822          * towards gathering I/O sent by blockfront.  We first gather up
1823          * everything in the ring, as long as we have resources.  Then we
1824          * dispatch one request, and then attempt to gather up any
1825          * additional requests that have come in while we were dispatching
1826          * the request.
1827          *
1828          * This allows us to get a clearer picture (via devstat) of how
1829          * many requests blockfront is queueing to us at any given time.
1830          */
1831         for (;;) {
1832                 int retval;
1833
1834                 /*
1835                  * Initialize reqlist to the last element in the pending
1836                  * queue, if there is one.  This allows us to add more
1837                  * requests to that request list, if we have room.
1838                  */
1839                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1840                                       xbb_xen_reqlist, links);
1841                 if (reqlist != NULL) {
1842                         cur_sector = reqlist->next_contig_sector;
1843                         cur_operation = reqlist->operation;
1844                 } else {
1845                         cur_operation = 0;
1846                         cur_sector    = 0;
1847                 }
1848
1849                 /*
1850                  * Cache req_prod to avoid accessing a cache line shared
1851                  * with the frontend.
1852                  */
1853                 rp = rings->common.sring->req_prod;
1854
1855                 /* Ensure we see queued requests up to 'rp'. */
1856                 rmb();
1857
1858                 /**
1859                  * Run so long as there is work to consume and the generation
1860                  * of a response will not overflow the ring.
1861                  *
1862                  * @note There's a 1 to 1 relationship between requests and
1863                  *       responses, so an overflow should never occur.  This
1864                  *       test is to protect our domain from digesting bogus
1865                  *       data.  Shouldn't we log this?
1866                  */
1867                 while (rings->common.req_cons != rp
1868                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1869                                                   rings->common.req_cons) == 0){
1870                         blkif_request_t         ring_req_storage;
1871                         blkif_request_t        *ring_req;
1872                         int                     cur_size;
1873
1874                         switch (xbb->abi) {
1875                         case BLKIF_PROTOCOL_NATIVE:
1876                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1877                                     rings->common.req_cons);
1878                                 break;
1879                         case BLKIF_PROTOCOL_X86_32:
1880                         {
1881                                 struct blkif_x86_32_request *ring_req32;
1882
1883                                 ring_req32 = RING_GET_REQUEST(
1884                                     &xbb->rings.x86_32, rings->common.req_cons);
1885                                 blkif_get_x86_32_req(&ring_req_storage,
1886                                                      ring_req32);
1887                                 ring_req = &ring_req_storage;
1888                                 break;
1889                         }
1890                         case BLKIF_PROTOCOL_X86_64:
1891                         {
1892                                 struct blkif_x86_64_request *ring_req64;
1893
1894                                 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1895                                     rings->common.req_cons);
1896                                 blkif_get_x86_64_req(&ring_req_storage,
1897                                                      ring_req64);
1898                                 ring_req = &ring_req_storage;
1899                                 break;
1900                         }
1901                         default:
1902                                 panic("Unexpected blkif protocol ABI.");
1903                                 /* NOTREACHED */
1904                         } 
1905
1906                         /*
1907                          * Check for situations that would require closing
1908                          * off this I/O for further coalescing:
1909                          *  - Coalescing is turned off.
1910                          *  - Current I/O is out of sequence with the previous
1911                          *    I/O.
1912                          *  - Coalesced I/O would be too large.
1913                          */
1914                         if ((reqlist != NULL)
1915                          && ((xbb->no_coalesce_reqs != 0)
1916                           || ((xbb->no_coalesce_reqs == 0)
1917                            && ((ring_req->sector_number != cur_sector)
1918                             || (ring_req->operation != cur_operation)
1919                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1920                                  xbb->max_reqlist_segments))))) {
1921                                 reqlist = NULL;
1922                         }
1923
1924                         /*
1925                          * Grab and check for all resources in one shot.
1926                          * If we can't get all of the resources we need,
1927                          * the shortage is noted and the thread will get
1928                          * woken up when more resources are available.
1929                          */
1930                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1931                                                    xbb->rings.common.req_cons);
1932
1933                         if (retval != 0) {
1934                                 /*
1935                                  * Resource shortage has been recorded.
1936                                  * We'll be scheduled to run once a request
1937                                  * object frees up due to a completion.
1938                                  */
1939                                 break;
1940                         }
1941
1942                         /*
1943                          * Signify that we can overwrite this request with
1944                          * a response by incrementing our consumer index.
1945                          * The response won't be generated until after
1946                          * we've already consumed all necessary data out
1947                          * of the version of the request in the ring buffer
1948                          * (for native mode).  We must update the consumer
1949                          * index before issuing back-end I/O so there is
1950                          * no possibility that it will complete and a
1951                          * response be generated before we make room in
1952                          * the queue for that response.
1953                          */
1954                         xbb->rings.common.req_cons +=
1955                             BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1956                         xbb->reqs_received++;
1957
1958                         cur_size = xbb_count_sects(ring_req);
1959                         cur_sector = ring_req->sector_number + cur_size;
1960                         reqlist->next_contig_sector = cur_sector;
1961                         cur_operation = ring_req->operation;
1962                 }
1963
1964                 /* Check for I/O to dispatch */
1965                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1966                 if (reqlist == NULL) {
1967                         /*
1968                          * We're out of work to do, put the task queue to
1969                          * sleep.
1970                          */
1971                         break;
1972                 }
1973
1974                 /*
1975                  * Grab the first request off the queue and attempt
1976                  * to dispatch it.
1977                  */
1978                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1979
1980                 retval = xbb_dispatch_io(xbb, reqlist);
1981                 if (retval != 0) {
1982                         /*
1983                          * xbb_dispatch_io() returns non-zero only when
1984                          * there is a resource shortage.  If that's the
1985                          * case, re-queue this request on the head of the
1986                          * queue, and go to sleep until we have more
1987                          * resources.
1988                          */
1989                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1990                                            reqlist, links);
1991                         break;
1992                 } else {
1993                         /*
1994                          * If we still have anything on the queue after
1995                          * removing the head entry, that is because we
1996                          * met one of the criteria to create a new
1997                          * request list (outlined above), and we'll call
1998                          * that a forced dispatch for statistical purposes.
1999                          *
2000                          * Otherwise, if there is only one element on the
2001                          * queue, we coalesced everything available on
2002                          * the ring and we'll call that a normal dispatch.
2003                          */
2004                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2005
2006                         if (reqlist != NULL)
2007                                 xbb->forced_dispatch++;
2008                         else
2009                                 xbb->normal_dispatch++;
2010
2011                         xbb->total_dispatch++;
2012                 }
2013         }
2014 }
2015
2016 /**
2017  * Interrupt handler bound to the shared ring's event channel.
2018  *
2019  * \param arg  Callback argument registered during event channel
2020  *             binding - the xbb_softc for this instance.
2021  */
2022 static int
2023 xbb_filter(void *arg)
2024 {
2025         struct xbb_softc *xbb;
2026
2027         /* Defer to taskqueue thread. */
2028         xbb = (struct xbb_softc *)arg;
2029         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
2030
2031         return (FILTER_HANDLED);
2032 }
2033
2034 SDT_PROVIDER_DEFINE(xbb);
2035 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, flush, "int");
2036 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, read, "int", "uint64_t",
2037                   "uint64_t");
2038 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, write, "int",
2039                   "uint64_t", "uint64_t");
2040
2041 /*----------------------------- Backend Handlers -----------------------------*/
2042 /**
2043  * Backend handler for character device access.
2044  *
2045  * \param xbb        Per-instance xbb configuration structure.
2046  * \param reqlist    Allocated internal request list structure.
2047  * \param operation  BIO_* I/O operation code.
2048  * \param bio_flags  Additional bio_flag data to pass to any generated
2049  *                   bios (e.g. BIO_ORDERED).
2050  *
2051  * \return  0 for success, errno codes for failure.
2052  */
2053 static int
2054 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2055                  int operation, int bio_flags)
2056 {
2057         struct xbb_dev_data *dev_data;
2058         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2059         off_t                bio_offset;
2060         struct bio          *bio;
2061         struct xbb_sg       *xbb_sg;
2062         u_int                nbio;
2063         u_int                bio_idx;
2064         u_int                nseg;
2065         u_int                seg_idx;
2066         int                  error;
2067
2068         dev_data   = &xbb->backend.dev;
2069         bio_offset = (off_t)reqlist->starting_sector_number
2070                    << xbb->sector_size_shift;
2071         error      = 0;
2072         nbio       = 0;
2073         bio_idx    = 0;
2074
2075         if (operation == BIO_FLUSH) {
2076                 bio = g_new_bio();
2077                 if (__predict_false(bio == NULL)) {
2078                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2079                         error = ENOMEM;
2080                         return (error);
2081                 }
2082
2083                 bio->bio_cmd     = BIO_FLUSH;
2084                 bio->bio_flags  |= BIO_ORDERED;
2085                 bio->bio_dev     = dev_data->cdev;
2086                 bio->bio_offset  = 0;
2087                 bio->bio_data    = 0;
2088                 bio->bio_done    = xbb_bio_done;
2089                 bio->bio_caller1 = reqlist;
2090                 bio->bio_pblkno  = 0;
2091
2092                 reqlist->pendcnt = 1;
2093
2094                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2095                            device_get_unit(xbb->dev));
2096
2097                 (*dev_data->csw->d_strategy)(bio);
2098
2099                 return (0);
2100         }
2101
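        /*
         * Walk the scatter/gather list, accumulating contiguous segments
         * into bios and starting a new bio whenever the mapped KVA for
         * the request becomes discontiguous.
         */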
2102         xbb_sg = xbb->xbb_sgs;
2103         bio    = NULL;
2104         nseg = reqlist->nr_segments;
2105
2106         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2107
2108                 /*
2109                  * KVA will not be contiguous, so any additional
2110                  * I/O will need to be represented in a new bio.
2111                  */
2112                 if ((bio != NULL)
2113                  && (xbb_sg->first_sect != 0)) {
2114                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2115                                 printf("%s: Discontiguous I/O request "
2116                                        "from domain %d ends on "
2117                                        "non-sector boundary\n",
2118                                        __func__, xbb->otherend_id);
2119                                 error = EINVAL;
2120                                 goto fail_free_bios;
2121                         }
2122                         bio = NULL;
2123                 }
2124
2125                 if (bio == NULL) {
2126                         /*
2127                          * Make sure that the start of this bio is
2128                          * aligned to a device sector.
2129                          */
2130                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2131                                 printf("%s: Misaligned I/O request "
2132                                        "from domain %d\n", __func__,
2133                                        xbb->otherend_id);
2134                                 error = EINVAL;
2135                                 goto fail_free_bios;
2136                         }
2137
2138                         bio = bios[nbio++] = g_new_bio();
2139                         if (__predict_false(bio == NULL)) {
2140                                 error = ENOMEM;
2141                                 goto fail_free_bios;
2142                         }
2143                         bio->bio_cmd     = operation;
2144                         bio->bio_flags  |= bio_flags;
2145                         bio->bio_dev     = dev_data->cdev;
2146                         bio->bio_offset  = bio_offset;
2147                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2148                                                 xbb_sg->first_sect);
2149                         bio->bio_done    = xbb_bio_done;
2150                         bio->bio_caller1 = reqlist;
2151                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2152                 }
2153
2154                 bio->bio_length += xbb_sg->nsect << 9;
2155                 bio->bio_bcount  = bio->bio_length;
2156                 bio_offset      += xbb_sg->nsect << 9;
2157
2158                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2159
2160                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2161                                 printf("%s: Discontiguous I/O request "
2162                                        "from domain %d ends on "
2163                                        "non-sector boundary\n",
2164                                        __func__, xbb->otherend_id);
2165                                 error = EINVAL;
2166                                 goto fail_free_bios;
2167                         }
2168                         /*
2169                          * KVA will not be contiguous, so any additional
2170                          * I/O will need to be represented in a new bio.
2171                          */
2172                         bio = NULL;
2173                 }
2174         }
2175
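        /*
         * Record the number of bios in flight; xbb_bio_done() completes
         * the request list once this count drops to zero.
         */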
2176         reqlist->pendcnt = nbio;
2177
2178         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2179         {
2180 #ifdef XBB_USE_BOUNCE_BUFFERS
2181                 vm_offset_t kva_offset;
2182
2183                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2184                            - (vm_offset_t)reqlist->bounce;
2185                 if (operation == BIO_WRITE) {
2186                         memcpy(bios[bio_idx]->bio_data,
2187                                (uint8_t *)reqlist->kva + kva_offset,
2188                                bios[bio_idx]->bio_bcount);
2189                 }
2190 #endif
2191                 if (operation == BIO_READ) {
2192                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2193                                    device_get_unit(xbb->dev),
2194                                    bios[bio_idx]->bio_offset,
2195                                    bios[bio_idx]->bio_length);
2196                 } else if (operation == BIO_WRITE) {
2197                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2198                                    device_get_unit(xbb->dev),
2199                                    bios[bio_idx]->bio_offset,
2200                                    bios[bio_idx]->bio_length);
2201                 }
2202                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2203         }
2204
2205         return (error);
2206
2207 fail_free_bios:
2208         for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2209                 g_destroy_bio(bios[bio_idx]);
2210         
2211         return (error);
2212 }
2213
2214 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, flush, "int");
2215 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, read, "int", "uint64_t",
2216                   "uint64_t");
2217 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, write, "int",
2218                   "uint64_t", "uint64_t");
2219
2220 /**
2221  * Backend handler for file access.
2222  *
2223  * \param xbb        Per-instance xbb configuration structure.
2224  * \param reqlist    Allocated internal request list.
2225  * \param operation  BIO_* I/O operation code.
2226  * \param flags      Additional bio_flag data to pass to any generated bios
2227  *                   (e.g. BIO_ORDERED).
2228  *
2229  * \return  0 for success, errno codes for failure.
2230  */
2231 static int
2232 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2233                   int operation, int flags)
2234 {
2235         struct xbb_file_data *file_data;
2236         u_int                 seg_idx;
2237         u_int                 nseg;
2238         off_t                 sectors_sent;
2239         struct uio            xuio;
2240         struct xbb_sg        *xbb_sg;
2241         struct iovec         *xiovec;
2242 #ifdef XBB_USE_BOUNCE_BUFFERS
2243         void                **p_vaddr;
2244         int                   saved_uio_iovcnt;
2245 #endif /* XBB_USE_BOUNCE_BUFFERS */
2246         int                   error;
2247
2248         file_data = &xbb->backend.file;
2249         sectors_sent = 0;
2250         error = 0;
2251         bzero(&xuio, sizeof(xuio));
2252
2253         switch (operation) {
2254         case BIO_READ:
2255                 xuio.uio_rw = UIO_READ;
2256                 break;
2257         case BIO_WRITE:
2258                 xuio.uio_rw = UIO_WRITE;
2259                 break;
2260         case BIO_FLUSH: {
2261                 struct mount *mountpoint;
2262
2263                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2264                            device_get_unit(xbb->dev));
2265
2266                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2267
2268                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2269                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2270                 VOP_UNLOCK(xbb->vn, 0);
2271
2272                 vn_finished_write(mountpoint);
2273
2274                 goto bailout_send_response;
2275                 /* NOTREACHED */
2276         }
2277         default:
2278                 panic("invalid operation %d", operation);
2279                 /* NOTREACHED */
2280         }
2281         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2282                         << xbb->sector_size_shift;
2283         xuio.uio_segflg = UIO_SYSSPACE;
2284         xuio.uio_iov = file_data->xiovecs;
2285         xuio.uio_iovcnt = 0;
2286         xbb_sg = xbb->xbb_sgs;
2287         nseg = reqlist->nr_segments;
2288
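        /*
         * Translate the scatter/gather list into a kernel uio, merging
         * segments that are contiguous in KVA into a single iovec.
         */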
2289         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2290
2291                 /*
2292                  * If the first sector is not 0, the KVA will
2293                  * not be contiguous and we'll need to go on
2294                  * to another segment.
2295                  */
2296                 if (xbb_sg->first_sect != 0)
2297                         xiovec = NULL;
2298
2299                 if (xiovec == NULL) {
2300                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2301                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2302                             seg_idx, xbb_sg->first_sect);
2303 #ifdef XBB_USE_BOUNCE_BUFFERS
2304                         /*
2305                          * Store the address of the incoming
2306                          * buffer at this particular offset
2307                          * as well, so we can do the copy
2308                          * later without having to do more
2309                          * work to recalculate this address.
2310                          */
2311                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2312                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2313                             xbb_sg->first_sect);
2314 #endif /* XBB_USE_BOUNCE_BUFFERS */
2315                         xiovec->iov_len = 0;
2316                         xuio.uio_iovcnt++;
2317                 }
2318
2319                 xiovec->iov_len += xbb_sg->nsect << 9;
2320
2321                 xuio.uio_resid += xbb_sg->nsect << 9;
2322
2323                 /*
2324                  * If the last sector is not the full page
2325                  * size count, the next segment will not be
2326                  * contiguous in KVA and we need a new iovec.
2327                  */
2328                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2329                         xiovec = NULL;
2330         }
2331
2332         xuio.uio_td = curthread;
2333
2334 #ifdef XBB_USE_BOUNCE_BUFFERS
2335         saved_uio_iovcnt = xuio.uio_iovcnt;
2336
2337         if (operation == BIO_WRITE) {
2338                 /* Copy the write data to the local buffer. */
2339                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2340                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2341                      seg_idx++, xiovec++, p_vaddr++) {
2342
2343                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2344                 }
2345         } else {
2346                 /*
2347                  * We only need to save off the iovecs in the case of a
2348                  * read, because the copy for the read happens after the
2349                  * VOP_READ().  (The uio will get modified in that call
2350                  * sequence.)
2351                  */
2352                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2353                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2354         }
2355 #endif /* XBB_USE_BOUNCE_BUFFERS */
2356
2357         switch (operation) {
2358         case BIO_READ:
2359
2360                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2361                            device_get_unit(xbb->dev), xuio.uio_offset,
2362                            xuio.uio_resid);
2363
2364                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2365
2366                 /*
2367                  * UFS pays attention to IO_DIRECT for reads.  If the
2368                  * DIRECTIO option is configured into the kernel, it calls
2369                  * ffs_rawread().  But that only works for single-segment
2370                  * uios with user space addresses.  In our case, with a
2371                  * kernel uio, it still reads into the buffer cache, but it
2372                  * will just try to release the buffer from the cache later
2373                  * on in ffs_read().
2374                  *
2375                  * ZFS does not pay attention to IO_DIRECT for reads.
2376                  *
2377                  * UFS does not pay attention to IO_SYNC for reads.
2378                  *
2379                  * ZFS pays attention to IO_SYNC (which translates into the
2380                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2381                  * attempts to sync the file before reading.
2382                  *
2383                  * So, to attempt to provide some barrier semantics in the
2384                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2385                  */
2386                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2387                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2388
2389                 VOP_UNLOCK(xbb->vn, 0);
2390                 break;
2391         case BIO_WRITE: {
2392                 struct mount *mountpoint;
2393
2394                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2395                            device_get_unit(xbb->dev), xuio.uio_offset,
2396                            xuio.uio_resid);
2397
2398                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2399
2400                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2401
2402                 /*
2403                  * UFS pays attention to IO_DIRECT for writes.  The write
2404                  * is done asynchronously.  (Normally the write would just
2405                  * get put into the cache.)
2406                  *
2407                  * UFS pays attention to IO_SYNC for writes.  It will
2408                  * attempt to write the buffer out synchronously if that
2409                  * flag is set.
2410                  *
2411                  * ZFS does not pay attention to IO_DIRECT for writes.
2412                  *
2413                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2414                  * for writes.  It will flush the transaction from the
2415                  * cache before returning.
2416                  *
2417                  * So if we've got the BIO_ORDERED flag set, we want
2418                  * IO_SYNC in either the UFS or ZFS case.
2419                  */
2420                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2421                                   IO_SYNC : 0, file_data->cred);
2422                 VOP_UNLOCK(xbb->vn, 0);
2423
2424                 vn_finished_write(mountpoint);
2425
2426                 break;
2427         }
2428         default:
2429                 panic("invalid operation %d", operation);
2430                 /* NOTREACHED */
2431         }
2432
2433 #ifdef XBB_USE_BOUNCE_BUFFERS
2434         /* We only need to copy here for read operations */
2435         if (operation == BIO_READ) {
2436
2437                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2438                      xiovec = file_data->saved_xiovecs;
2439                      seg_idx < saved_uio_iovcnt; seg_idx++,
2440                      xiovec++, p_vaddr++) {
2441
2442                         /*
2443                          * Note that we have to use the copy of the 
2444                          * io vector we made above.  uiomove() modifies
2445                          * the uio and its referenced vector as uiomove
2446                          * performs the copy, so we can't rely on any
2447                          * state from the original uio.
2448                          */
2449                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2450                 }
2451         }
2452 #endif /* XBB_USE_BOUNCE_BUFFERS */
2453
2454 bailout_send_response:
2455
2456         if (error != 0)
2457                 reqlist->status = BLKIF_RSP_ERROR;
2458
2459         xbb_complete_reqlist(xbb, reqlist);
2460
2461         return (0);
2462 }
2463
2464 /*--------------------------- Backend Configuration --------------------------*/
2465 /**
2466  * Close and cleanup any backend device/file specific state for this
2467  * block back instance. 
2468  *
2469  * \param xbb  Per-instance xbb configuration structure.
2470  */
2471 static void
2472 xbb_close_backend(struct xbb_softc *xbb)
2473 {
2474         DROP_GIANT();
2475         DPRINTF("closing dev=%s\n", xbb->dev_name);
2476         if (xbb->vn) {
2477                 int flags = FREAD;
2478
2479                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2480                         flags |= FWRITE;
2481
2482                 switch (xbb->device_type) {
2483                 case XBB_TYPE_DISK:
2484                         if (xbb->backend.dev.csw) {
2485                                 dev_relthread(xbb->backend.dev.cdev,
2486                                               xbb->backend.dev.dev_ref);
2487                                 xbb->backend.dev.csw  = NULL;
2488                                 xbb->backend.dev.cdev = NULL;
2489                         }
2490                         break;
2491                 case XBB_TYPE_FILE:
2492                         break;
2493                 case XBB_TYPE_NONE:
2494                 default:
2495                         panic("Unexpected backend type.");
2496                         break;
2497                 }
2498
2499                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2500                 xbb->vn = NULL;
2501
2502                 switch (xbb->device_type) {
2503                 case XBB_TYPE_DISK:
2504                         break;
2505                 case XBB_TYPE_FILE:
2506                         if (xbb->backend.file.cred != NULL) {
2507                                 crfree(xbb->backend.file.cred);
2508                                 xbb->backend.file.cred = NULL;
2509                         }
2510                         break;
2511                 case XBB_TYPE_NONE:
2512                 default:
2513                         panic("Unexpected backend type.");
2514                         break;
2515                 }
2516         }
2517         PICKUP_GIANT();
2518 }
2519
2520 /**
2521  * Open a character device to be used for backend I/O.
2522  *
2523  * \param xbb  Per-instance xbb configuration structure.
2524  *
2525  * \return  0 for success, errno codes for failure.
2526  */
2527 static int
2528 xbb_open_dev(struct xbb_softc *xbb)
2529 {
2530         struct vattr   vattr;
2531         struct cdev   *dev;
2532         struct cdevsw *devsw;
2533         int            error;
2534
2535         xbb->device_type = XBB_TYPE_DISK;
2536         xbb->dispatch_io = xbb_dispatch_dev;
2537         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2538         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2539                                              &xbb->backend.dev.dev_ref);
2540         if (xbb->backend.dev.csw == NULL)
2541                 panic("Unable to retrieve device switch");
2542
2543         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2544         if (error) {
2545                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2546                                  "vnode attributes for device %s",
2547                                  xbb->dev_name);
2548                 return (error);
2549         }
2550
2551
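        /*
         * Query the character device for its sector size and media size
         * via the DIOCGSECTORSIZE and DIOCGMEDIASIZE ioctls.
         */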
2552         dev = xbb->vn->v_rdev;
2553         devsw = dev->si_devsw;
2554         if (!devsw->d_ioctl) {
2555                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2556                                  "device %s!", xbb->dev_name);
2557                 return (ENODEV);
2558         }
2559
2560         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2561                                (caddr_t)&xbb->sector_size, FREAD,
2562                                curthread);
2563         if (error) {
2564                 xenbus_dev_fatal(xbb->dev, error,
2565                                  "error calling ioctl DIOCGSECTORSIZE "
2566                                  "for device %s", xbb->dev_name);
2567                 return (error);
2568         }
2569
2570         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2571                                (caddr_t)&xbb->media_size, FREAD,
2572                                curthread);
2573         if (error) {
2574                 xenbus_dev_fatal(xbb->dev, error,
2575                                  "error calling ioctl DIOCGMEDIASIZE "
2576                                  "for device %s", xbb->dev_name);
2577                 return (error);
2578         }
2579
2580         return (0);
2581 }
2582
2583 /**
2584  * Open a file to be used for backend I/O.
2585  *
2586  * \param xbb  Per-instance xbb configuration structure.
2587  *
2588  * \return  0 for success, errno codes for failure.
2589  */
2590 static int
2591 xbb_open_file(struct xbb_softc *xbb)
2592 {
2593         struct xbb_file_data *file_data;
2594         struct vattr          vattr;
2595         int                   error;
2596
2597         file_data = &xbb->backend.file;
2598         xbb->device_type = XBB_TYPE_FILE;
2599         xbb->dispatch_io = xbb_dispatch_file;
2600         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2601         if (error != 0) {
2602                 xenbus_dev_fatal(xbb->dev, error,
2603                                  "error calling VOP_GETATTR() "
2604                                  "for file %s", xbb->dev_name);
2605                 return (error);
2606         }
2607
2608         /*
2609          * Verify that we have the ability to upgrade to exclusive
2610          * access on this file so we can trap errors at open instead
2611          * of reporting them during first access.
2612          */
2613         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2614                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2615                 if (xbb->vn->v_iflag & VI_DOOMED) {
2616                         error = EBADF;
2617                         xenbus_dev_fatal(xbb->dev, error,
2618                                          "error locking file %s",
2619                                          xbb->dev_name);
2620
2621                         return (error);
2622                 }
2623         }
2624
2625         file_data->cred = crhold(curthread->td_ucred);
2626         xbb->media_size = vattr.va_size;
2627
2628         /*
2629          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2630          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2631          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2632          * may not work with other OSes as well.  So just export a sector
2633          * size of 512 bytes, which should work with any OS or
2634          * application.  Since our backing is a file, any block size will
2635          * work fine for the backing store.
2636          */
2637 #if 0
2638         xbb->sector_size = vattr.va_blocksize;
2639 #endif
2640         xbb->sector_size = 512;
2641
2642         /*
2643          * Sanity check.  The media size has to be at least one
2644          * sector long.
2645          */
2646         if (xbb->media_size < xbb->sector_size) {
2647                 error = EINVAL;
2648                 xenbus_dev_fatal(xbb->dev, error,
2649                                  "file %s size %ju < block size %u",
2650                                  xbb->dev_name,
2651                                  (uintmax_t)xbb->media_size,
2652                                  xbb->sector_size);
2653         }
2654         return (error);
2655 }
2656
2657 /**
2658  * Open the backend provider for this connection.
2659  *
2660  * \param xbb  Per-instance xbb configuration structure.
2661  *
2662  * \return  0 for success, errno codes for failure.
2663  */
2664 static int
2665 xbb_open_backend(struct xbb_softc *xbb)
2666 {
2667         struct nameidata nd;
2668         int              flags;
2669         int              error;
2670
2671         flags = FREAD;
2672         error = 0;
2673
2674         DPRINTF("opening dev=%s\n", xbb->dev_name);
2675
2676         if (rootvnode == NULL) {
2677                 xenbus_dev_fatal(xbb->dev, ENOENT,
2678                                  "Root file system not mounted");
2679                 return (ENOENT);
2680         }
2681
2682         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2683                 flags |= FWRITE;
2684
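        /*
         * The path lookup below requires current, root, and jail
         * directories for this thread; if they have not been set (as may
         * be the case in a kernel context), point them at the root vnode.
         */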
2685         if (!curthread->td_proc->p_fd->fd_cdir) {
2686                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2687                 VREF(rootvnode);
2688         }
2689         if (!curthread->td_proc->p_fd->fd_rdir) {
2690                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2691                 VREF(rootvnode);
2692         }
2693         if (!curthread->td_proc->p_fd->fd_jdir) {
2694                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2695                 VREF(rootvnode);
2696         }
2697
2698  again:
2699         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2700         error = vn_open(&nd, &flags, 0, NULL);
2701         if (error) {
2702                 /*
2703                  * This is the only reasonable guess we can make about the
2704                  * path if the user doesn't give us a fully qualified path.
2705                  * If they want to specify a file, they need to specify the
2706                  * full path.
2707                  */
2708                 if (xbb->dev_name[0] != '/') {
2709                         char *dev_path = "/dev/";
2710                         char *dev_name;
2711
2712                         /* Try adding device path at beginning of name */
2713                         dev_name = malloc(strlen(xbb->dev_name)
2714                                         + strlen(dev_path) + 1,
2715                                           M_XENBLOCKBACK, M_NOWAIT);
2716                         if (dev_name) {
2717                                 sprintf(dev_name, "%s%s", dev_path,
2718                                         xbb->dev_name);
2719                                 free(xbb->dev_name, M_XENBLOCKBACK);
2720                                 xbb->dev_name = dev_name;
2721                                 goto again;
2722                         }
2723                 }
2724                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2725                                  xbb->dev_name);
2726                 return (error);
2727         }
2728
2729         NDFREE(&nd, NDF_ONLY_PNBUF);
2730                 
2731         xbb->vn = nd.ni_vp;
2732
2733         /* We only support disks and files. */
2734         if (vn_isdisk(xbb->vn, &error)) {
2735                 error = xbb_open_dev(xbb);
2736         } else if (xbb->vn->v_type == VREG) {
2737                 error = xbb_open_file(xbb);
2738         } else {
2739                 error = EINVAL;
2740                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2741                                  "or file", xbb->dev_name);
2742         }
2743         VOP_UNLOCK(xbb->vn, 0);
2744
2745         if (error != 0) {
2746                 xbb_close_backend(xbb);
2747                 return (error);
2748         }
2749
2750         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2751         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
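        /*
         * For example, a 512-byte sector size gives fls(512) - 1 == 9,
         * so media_num_sectors == media_size >> 9.
         */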
2752
2753         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2754                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2755                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2756
2757         return (0);
2758 }
2759
2760 /*------------------------ Inter-Domain Communication ------------------------*/
2761 /**
2762  * Free dynamically allocated KVA or pseudo-physical address allocations.
2763  *
2764  * \param xbb  Per-instance xbb configuration structure.
2765  */
2766 static void
2767 xbb_free_communication_mem(struct xbb_softc *xbb)
2768 {
2769         if (xbb->kva != 0) {
2770 #ifndef XENHVM
2771                 kva_free(xbb->kva, xbb->kva_size);
2772 #else
2773                 if (xbb->pseudo_phys_res != NULL) {
2774                         bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2775                                              xbb->pseudo_phys_res_id,
2776                                              xbb->pseudo_phys_res);
2777                         xbb->pseudo_phys_res = NULL;
2778                 }
2779 #endif
2780         }
2781         xbb->kva = 0;
2782         xbb->gnt_base_addr = 0;
2783         if (xbb->kva_free != NULL) {
2784                 free(xbb->kva_free, M_XENBLOCKBACK);
2785                 xbb->kva_free = NULL;
2786         }
2787 }
2788
2789 /**
2790  * Cleanup all inter-domain communication mechanisms.
2791  *
2792  * \param xbb  Per-instance xbb configuration structure.
2793  */
2794 static int
2795 xbb_disconnect(struct xbb_softc *xbb)
2796 {
2797         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2798         struct gnttab_unmap_grant_ref *op;
2799         u_int                          ring_idx;
2800         int                            error;
2801
2802         DPRINTF("\n");
2803
2804         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2805                 return (0);
2806
2807         xen_intr_unbind(&xbb->xen_intr_handle);
2808
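        /*
         * The mutex is dropped around taskqueue_drain() because the I/O
         * task may itself acquire xbb->lock; draining while holding the
         * lock could deadlock.
         */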
2809         mtx_unlock(&xbb->lock);
2810         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2811         mtx_lock(&xbb->lock);
2812
2813         /*
2814          * No new interrupts can generate work, but we must wait
2815          * for all currently active requests to drain.
2816          */
2817         if (xbb->active_request_count != 0)
2818                 return (EAGAIN);
2819         
2820         for (ring_idx = 0, op = ops;
2821              ring_idx < xbb->ring_config.ring_pages;
2822              ring_idx++, op++) {
2823
2824                 op->host_addr    = xbb->ring_config.gnt_addr
2825                                  + (ring_idx * PAGE_SIZE);
2826                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2827                 op->handle       = xbb->ring_config.handle[ring_idx];
2828         }
2829
2830         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2831                                           xbb->ring_config.ring_pages);
2832         if (error != 0)
2833                 panic("Grant table op failed (%d)", error);
2834
2835         xbb_free_communication_mem(xbb);
2836
2837         if (xbb->requests != NULL) {
2838                 free(xbb->requests, M_XENBLOCKBACK);
2839                 xbb->requests = NULL;
2840         }
2841
2842         if (xbb->request_lists != NULL) {
2843                 struct xbb_xen_reqlist *reqlist;
2844                 int i;
2845
2846                         /* There is one request list for every allocated request. */
2847                 for (i = 0, reqlist = xbb->request_lists;
2848                      i < xbb->max_requests; i++, reqlist++){
2849 #ifdef XBB_USE_BOUNCE_BUFFERS
2850                         if (reqlist->bounce != NULL) {
2851                                 free(reqlist->bounce, M_XENBLOCKBACK);
2852                                 reqlist->bounce = NULL;
2853                         }
2854 #endif
2855                         if (reqlist->gnt_handles != NULL) {
2856                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2857                                 reqlist->gnt_handles = NULL;
2858                         }
2859                 }
2860                 free(xbb->request_lists, M_XENBLOCKBACK);
2861                 xbb->request_lists = NULL;
2862         }
2863
2864         xbb->flags &= ~XBBF_RING_CONNECTED;
2865         return (0);
2866 }
2867
2868 /**
2869  * Map shared memory ring into domain local address space, initialize
2870  * ring control structures, and bind an interrupt to the event channel
2871  * used to notify us of ring changes.
2872  *
2873  * \param xbb  Per-instance xbb configuration structure.
2874  */
2875 static int
2876 xbb_connect_ring(struct xbb_softc *xbb)
2877 {
2878         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2879         struct gnttab_map_grant_ref *gnt;
2880         u_int                        ring_idx;
2881         int                          error;
2882
2883         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2884                 return (0);
2885
2886         /*
2887          * The KVA for our ring is at the tail of the region of KVA allocated
2888          * by xbb_alloc_communication_mem().
2889          */
2890         xbb->ring_config.va = xbb->kva
2891                             + (xbb->kva_size
2892                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2893         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2894                                   + (xbb->kva_size
2895                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2896
2897         for (ring_idx = 0, gnt = gnts;
2898              ring_idx < xbb->ring_config.ring_pages;
2899              ring_idx++, gnt++) {
2900
2901                 gnt->host_addr = xbb->ring_config.gnt_addr
2902                                + (ring_idx * PAGE_SIZE);
2903                 gnt->flags     = GNTMAP_host_map;
2904                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2905                 gnt->dom       = xbb->otherend_id;
2906         }
2907
2908         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2909                                           xbb->ring_config.ring_pages);
2910         if (error)
2911                 panic("blkback: Ring page grant table op failed (%d)", error);
2912
2913         for (ring_idx = 0, gnt = gnts;
2914              ring_idx < xbb->ring_config.ring_pages;
2915              ring_idx++, gnt++) {
2916                 if (gnt->status != 0) {
2917                         xbb->ring_config.va = 0;
2918                         xenbus_dev_fatal(xbb->dev, EACCES,
2919                                          "Ring shared page mapping failed. "
2920                                          "Status %d.", gnt->status);
2921                         return (EACCES);
2922                 }
2923                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2924                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2925         }
2926
2927         /* Initialize the ring based on ABI. */
2928         switch (xbb->abi) {
2929         case BLKIF_PROTOCOL_NATIVE:
2930         {
2931                 blkif_sring_t *sring;
2932                 sring = (blkif_sring_t *)xbb->ring_config.va;
2933                 BACK_RING_INIT(&xbb->rings.native, sring,
2934                                xbb->ring_config.ring_pages * PAGE_SIZE);
2935                 break;
2936         }
2937         case BLKIF_PROTOCOL_X86_32:
2938         {
2939                 blkif_x86_32_sring_t *sring_x86_32;
2940                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2941                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2942                                xbb->ring_config.ring_pages * PAGE_SIZE);
2943                 break;
2944         }
2945         case BLKIF_PROTOCOL_X86_64:
2946         {
2947                 blkif_x86_64_sring_t *sring_x86_64;
2948                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2949                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2950                                xbb->ring_config.ring_pages * PAGE_SIZE);
2951                 break;
2952         }
2953         default:
2954                 panic("Unexpected blkif protocol ABI.");
2955         }
2956
2957         xbb->flags |= XBBF_RING_CONNECTED;
2958
2959         error = xen_intr_bind_remote_port(xbb->dev,
2960                                           xbb->otherend_id,
2961                                           xbb->ring_config.evtchn,
2962                                           xbb_filter,
2963                                           /*ithread_handler*/NULL,
2964                                           /*arg*/xbb,
2965                                           INTR_TYPE_BIO | INTR_MPSAFE,
2966                                           &xbb->xen_intr_handle);
2967         if (error) {
2968                 (void)xbb_disconnect(xbb);
2969                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2970                 return (error);
2971         }
2972
2973         DPRINTF("rings connected!\n");
2974
2975         return (0);
2976 }
2977
2978 /* Needed to make bit_alloc() macro work */
2979 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK,      \
2980                                    M_NOWAIT|M_ZERO)
2981
2982 /**
2983  * Size KVA and pseudo-physical address allocations based on negotiated
2984  * values for the size and number of I/O requests, and the size of our
2985  * communication ring.
2986  *
2987  * \param xbb  Per-instance xbb configuration structure.
2988  *
2989  * These address spaces are used to dynamically map pages in the
2990  * front-end's domain into our own.
2991  */
2992 static int
2993 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2994 {
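        /*
         * Layout of the region sized here (illustrative):
         *
         *   kva                                               kva + kva_size
         *   |<------- reqlist_kva_size ------->|<- ring_pages * PAGE_SIZE ->|
         *   |    request segment mappings      |  shared communication ring |
         *
         * xbb_connect_ring() maps the shared ring at the tail of this
         * region; the remainder is handed out on demand for request
         * segment mappings and tracked by the kva_free bitmap.
         */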
2995         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2996         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2997         xbb->kva_size = xbb->reqlist_kva_size +
2998                         (xbb->ring_config.ring_pages * PAGE_SIZE);
2999
3000         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
3001         if (xbb->kva_free == NULL)
3002                 return (ENOMEM);
3003
3004         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
3005                 device_get_nameunit(xbb->dev), xbb->kva_size,
3006                 xbb->reqlist_kva_size);
3007 #ifndef XENHVM
3008         xbb->kva = kva_alloc(xbb->kva_size);
3009         if (xbb->kva == 0)
3010                 return (ENOMEM);
3011         xbb->gnt_base_addr = xbb->kva;
3012 #else /* XENHVM */
3013         /*
3014          * Reserve a range of pseudo physical memory that we can map
3015          * into kva.  These pages will only be backed by machine
3016          * pages ("real memory") during the lifetime of front-end requests
3017          * via grant table operations.
3018          */
3019         xbb->pseudo_phys_res_id = 0;
3020         xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3021                                                   &xbb->pseudo_phys_res_id,
3022                                                   0, ~0, xbb->kva_size,
3023                                                   RF_ACTIVE);
3024         if (xbb->pseudo_phys_res == NULL) {
3025                 xbb->kva = 0;
3026                 return (ENOMEM);
3027         }
3028         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3029         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3030 #endif /* XENHVM */
3031
3032         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3033                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3034                 (uintmax_t)xbb->gnt_base_addr); 
3035         return (0);
3036 }
3037
3038 /**
3039  * Collect front-end information from the XenStore.
3040  *
3041  * \param xbb  Per-instance xbb configuration structure.
3042  */
3043 static int
3044 xbb_collect_frontend_info(struct xbb_softc *xbb)
3045 {
3046         char        protocol_abi[64];
3047         const char *otherend_path;
3048         int         error;
3049         u_int       ring_idx;
3050         u_int       ring_page_order;
3051         size_t      ring_size;
3052
3053         otherend_path = xenbus_get_otherend_path(xbb->dev);
3054
3055         /*
3056          * Protocol defaults valid even if all negotiation fails.
3057          */
3058         xbb->ring_config.ring_pages = 1;
3059         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3060         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
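        /*
         * For reference, the front-end nodes consumed below all live under
         * otherend_path; the values shown are purely illustrative:
         *
         *   event-channel        = "12"        (mandatory)
         *   ring-page-order      = "2"         (or "num-ring-pages")
         *   ring-ref / ring-ref0 ... ring-refN  (mandatory)
         *   max-requests, max-request-segments, max-request-size
         *   protocol             = one of the XEN_IO_PROTO_ABI_* strings
         *
         * Optional nodes that are absent leave the legacy defaults
         * assigned above in place.
         */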
3061
3062         /*
3063          * Mandatory data (used in all versions of the protocol) first.
3064          */
3065         error = xs_scanf(XST_NIL, otherend_path,
3066                          "event-channel", NULL, "%" PRIu32,
3067                          &xbb->ring_config.evtchn);
3068         if (error != 0) {
3069                 xenbus_dev_fatal(xbb->dev, error,
3070                                  "Unable to retrieve event-channel information "
3071                                  "from frontend %s.  Unable to connect.",
3072                                  xenbus_get_otherend_path(xbb->dev));
3073                 return (error);
3074         }
3075
3076         /*
3077          * These fields are initialized to legacy protocol defaults
3078          * so we only need to fail if reading the updated value succeeds
3079          * and the new value is outside of its allowed range.
3080          *
3081          * \note xs_gather() returns on the first encountered error, so
3082          *       we must use independent calls in order to guarantee
3083          *       we don't miss information in a sparsely populated front-end
3084          *       tree.
3085          *
3086          * \note xs_scanf() does not update variables for unmatched
3087          *       fields.
3088          */
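        /*
         * Ring size negotiation: some front-ends publish ring-page-order
         * (log2 of the page count) while others publish num-ring-pages
         * directly.  Accept either, letting an explicit page count
         * override the order-derived value when both are present.
         */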
3089         ring_page_order = 0;
3090         (void)xs_scanf(XST_NIL, otherend_path,
3091                        "ring-page-order", NULL, "%u",
3092                        &ring_page_order);
3093         xbb->ring_config.ring_pages = 1 << ring_page_order;
3094         (void)xs_scanf(XST_NIL, otherend_path,
3095                        "num-ring-pages", NULL, "%u",
3096                        &xbb->ring_config.ring_pages);
3097         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3098         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3099
3100         (void)xs_scanf(XST_NIL, otherend_path,
3101                        "max-requests", NULL, "%u",
3102                        &xbb->max_requests);
3103
3104         (void)xs_scanf(XST_NIL, otherend_path,
3105                        "max-request-segments", NULL, "%u",
3106                        &xbb->max_request_segments);
3107
3108         (void)xs_scanf(XST_NIL, otherend_path,
3109                        "max-request-size", NULL, "%u",
3110                        &xbb->max_request_size);
3111
3112         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3113                 xenbus_dev_fatal(xbb->dev, EINVAL,
3114                                  "Front-end specified ring-pages of %u "
3115                                  "exceeds backend limit of %zu.  "
3116                                  "Unable to connect.",
3117                                  xbb->ring_config.ring_pages,
3118                                  XBB_MAX_RING_PAGES);
3119                 return (EINVAL);
3120         } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3121                 xenbus_dev_fatal(xbb->dev, EINVAL,
3122                                  "Front-end specified max_requests of %u "
3123                                  "exceeds backend limit of %u.  "
3124                                  "Unable to connect.",
3125                                  xbb->max_requests,
3126                                  XBB_MAX_REQUESTS);
3127                 return (EINVAL);
3128         } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3129                 xenbus_dev_fatal(xbb->dev, EINVAL,
3130                                  "Front-end specified max_request_segments "
3131                                  "of %u exceeds backend limit of %u.  "
3132                                  "Unable to connect.",
3133                                  xbb->max_request_segments,
3134                                  XBB_MAX_SEGMENTS_PER_REQUEST);
3135                 return (EINVAL);
3136         } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3137                 xenbus_dev_fatal(xbb->dev, EINVAL,
3138                                  "Front-end specified max_request_size "
3139                                  "of %u exceeds backend limit of %u.  "
3140                                  "Unable to connect.",
3141                                  xbb->max_request_size,
3142                                  XBB_MAX_REQUEST_SIZE);
3143                 return (EINVAL);
3144         }
3145
3146         if (xbb->ring_config.ring_pages == 1) {
3147                 error = xs_gather(XST_NIL, otherend_path,
3148                                   "ring-ref", "%" PRIu32,
3149                                   &xbb->ring_config.ring_ref[0],
3150                                   NULL);
3151                 if (error != 0) {
3152                         xenbus_dev_fatal(xbb->dev, error,
3153                                          "Unable to retrieve ring information "
3154                                          "from frontend %s.  Unable to "
3155                                          "connect.",
3156                                          xenbus_get_otherend_path(xbb->dev));
3157                         return (error);
3158                 }
3159         } else {
3160                 /* Multi-page ring format. */
3161                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3162                      ring_idx++) {
3163                         char ring_ref_name[] = "ring_refXX";
3164
3165                         snprintf(ring_ref_name, sizeof(ring_ref_name),
3166                                  "ring-ref%u", ring_idx);
3167                         error = xs_scanf(XST_NIL, otherend_path,
3168                                          ring_ref_name, NULL, "%" PRIu32,
3169                                          &xbb->ring_config.ring_ref[ring_idx]);
3170                         if (error != 0) {
3171                                 xenbus_dev_fatal(xbb->dev, error,
3172                                                  "Failed to retrieve grant "
3173                                                  "reference for page %u of "
3174                                                  "shared ring.  Unable "
3175                                                  "to connect.", ring_idx);
3176                                 return (error);
3177                         }
3178                 }
3179         }
3180
3181         error = xs_gather(XST_NIL, otherend_path,
3182                           "protocol", "%63s", protocol_abi,
3183                           NULL); 
3184         if (error != 0
3185          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3186                 /*
3187                  * Assume native if the frontend has not
3188                  * published ABI data or it has published and
3189                  * matches our own ABI.
3190                  */
3191                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3192         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3193
3194                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3195         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3196
3197                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3198         } else {
3199
3200                 xenbus_dev_fatal(xbb->dev, EINVAL,
3201                                  "Unknown protocol ABI (%s) published by "
3202                                  "frontend.  Unable to connect.", protocol_abi);
3203                 return (EINVAL);
3204         }
3205         return (0);
3206 }
3207
3208 /**
3209  * Allocate per-request data structures given request size and number
3210  * information negotiated with the front-end.
3211  *
3212  * \param xbb  Per-instance xbb configuration structure.
3213  */
3214 static int
3215 xbb_alloc_requests(struct xbb_softc *xbb)
3216 {
3217         struct xbb_xen_req *req;
3218         struct xbb_xen_req *last_req;
3219
3220         /*
3221          * Allocate request bookkeeping data structures.
3222          */
3223         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3224                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3225         if (xbb->requests == NULL) {
3226                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3227                                   "Unable to allocate request structures");
3228                 return (ENOMEM);
3229         }
3230
3231         req      = xbb->requests;
3232         last_req = &xbb->requests[xbb->max_requests - 1];
3233         STAILQ_INIT(&xbb->request_free_stailq);
3234         while (req <= last_req) {
3235                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3236                 req++;
3237         }
3238         return (0);
3239 }
3240
3241 static int
3242 xbb_alloc_request_lists(struct xbb_softc *xbb)
3243 {
3244         struct xbb_xen_reqlist *reqlist;
3245         int                     i;
3246
3247         /*
3248          * If no requests can be merged, we need 1 request list per
3249          * in flight request.
3250          */
3251         xbb->request_lists = malloc(xbb->max_requests *
3252                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3253         if (xbb->request_lists == NULL) {
3254                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3255                                   "Unable to allocate request list structures");
3256                 return (ENOMEM);
3257         }
3258
3259         STAILQ_INIT(&xbb->reqlist_free_stailq);
3260         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3261         for (i = 0; i < xbb->max_requests; i++) {
3262                 int seg;
3263
3264                 reqlist      = &xbb->request_lists[i];
3265
3266                 reqlist->xbb = xbb;
3267
3268 #ifdef XBB_USE_BOUNCE_BUFFERS
3269                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3270                                          M_XENBLOCKBACK, M_NOWAIT);
3271                 if (reqlist->bounce == NULL) {
3272                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3273                                          "Unable to allocate request "
3274                                          "bounce buffers");
3275                         return (ENOMEM);
3276                 }
3277 #endif /* XBB_USE_BOUNCE_BUFFERS */
3278
3279                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3280                                               sizeof(*reqlist->gnt_handles),
3281                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3282                 if (reqlist->gnt_handles == NULL) {
3283                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3284                                           "Unable to allocate request "
3285                                           "grant references");
3286                         return (ENOMEM);
3287                 }
3288
3289                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3290                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3291
3292                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3293         }
3294         return (0);
3295 }
3296
3297 /**
3298  * Supply information about the physical device to the frontend
3299  * via XenBus.
3300  *
3301  * \param xbb  Per-instance xbb configuration structure.
3302  */
3303 static int
3304 xbb_publish_backend_info(struct xbb_softc *xbb)
3305 {
3306         struct xs_transaction xst;
3307         const char           *our_path;
3308         const char           *leaf;
3309         int                   error;
3310
3311         our_path = xenbus_get_node(xbb->dev);
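        /*
         * Publish within a XenStore transaction; if ending the transaction
         * returns EAGAIN (it raced with another XenStore update), loop and
         * redo the entire sequence.
         */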
3312         while (1) {
3313                 error = xs_transaction_start(&xst);
3314                 if (error != 0) {
3315                         xenbus_dev_fatal(xbb->dev, error,
3316                                          "Error publishing backend info "
3317                                          "(start transaction)");
3318                         return (error);
3319                 }
3320
3321                 leaf = "sectors";
3322                 error = xs_printf(xst, our_path, leaf,
3323                                   "%"PRIu64, xbb->media_num_sectors);
3324                 if (error != 0)
3325                         break;
3326
3327                 /* XXX Support all VBD attributes here. */
3328                 leaf = "info";
3329                 error = xs_printf(xst, our_path, leaf, "%u",
3330                                   xbb->flags & XBBF_READ_ONLY
3331                                 ? VDISK_READONLY : 0);
3332                 if (error != 0)
3333                         break;
3334
3335                 leaf = "sector-size";
3336                 error = xs_printf(xst, our_path, leaf, "%u",
3337                                   xbb->sector_size);
3338                 if (error != 0)
3339                         break;
3340
3341                 error = xs_transaction_end(xst, 0);
3342                 if (error == 0) {
3343                         return (0);
3344                 } else if (error != EAGAIN) {
3345                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3346                         return (error);
3347                 }
3348         }
3349
3350         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3351                         our_path, leaf);
3352         xs_transaction_end(xst, 1);
3353         return (error);
3354 }
3355
3356 /**
3357  * Connect to our blkfront peer now that it has completed publishing
3358  * its configuration into the XenStore.
3359  *
3360  * \param xbb  Per-instance xbb configuration structure.
3361  */
3362 static void
3363 xbb_connect(struct xbb_softc *xbb)
3364 {
3365         int error;
3366
3367         if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3368                 return;
3369
3370         if (xbb_collect_frontend_info(xbb) != 0)
3371                 return;
3372
3373         xbb->flags &= ~XBBF_SHUTDOWN;
3374
3375         /*
3376          * We limit the maximum number of reqlist segments to the maximum
3377          * number of segments in the ring, or our absolute maximum,
3378          * whichever is smaller.
3379          */
3380         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3381                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3382
3383         /*
3384          * The maximum size is simply a function of the number of segments
3385          * we can handle.
3386          */
3387         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3388
3389         /* Allocate resources whose size depends on front-end configuration. */
3390         error = xbb_alloc_communication_mem(xbb);
3391         if (error != 0) {
3392                 xenbus_dev_fatal(xbb->dev, error,
3393                                  "Unable to allocate communication memory");
3394                 return;
3395         }
3396
3397         error = xbb_alloc_requests(xbb);
3398         if (error != 0) {
3399                 /* Specific errors are reported by xbb_alloc_requests(). */
3400                 return;
3401         }
3402
3403         error = xbb_alloc_request_lists(xbb);
3404         if (error != 0) {
3405                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3406                 return;
3407         }
3408
3409         /*
3410          * Connect communication channel.
3411          */
3412         error = xbb_connect_ring(xbb);
3413         if (error != 0) {
3414                 /* Specific errors are reported by xbb_connect_ring(). */
3415                 return;
3416         }
3417         
3418         if (xbb_publish_backend_info(xbb) != 0) {
3419                 /*
3420                  * If we can't publish our data, we cannot participate
3421                  * in this connection, and waiting for a front-end state
3422                  * change will not help the situation.
3423                  */
3424                 (void)xbb_disconnect(xbb);
3425                 return;
3426         }
3427
3428         /* Ready for I/O. */
3429         xenbus_set_state(xbb->dev, XenbusStateConnected);
3430 }
3431
3432 /*-------------------------- Device Teardown Support -------------------------*/
3433 /**
3434  * Perform device shutdown functions.
3435  *
3436  * \param xbb  Per-instance xbb configuration structure.
3437  *
3438  * Mark this instance as shutting down, wait for any active I/O on the
3439  * backend device/file to drain, disconnect from the front-end, and notify
3440  * any waiters (e.g. a thread invoking our detach method) that detach can
3441  * now proceed.
3442  */
3443 static int
3444 xbb_shutdown(struct xbb_softc *xbb)
3445 {
3446         XenbusState frontState;
3447         int         error;
3448
3449         DPRINTF("\n");
3450
3451         /*
3452          * Due to the need to drop our mutex during some
3453          * xenbus operations, it is possible for two threads
3454          * to attempt to close out shutdown processing at
3455          * the same time.  Tell the caller that hits this
3456          * race to try back later. 
3457          * race to try again later.
3458         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3459                 return (EAGAIN);
3460
3461         xbb->flags |= XBBF_IN_SHUTDOWN;
3462         mtx_unlock(&xbb->lock);
3463
3464         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3465                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3466
3467         frontState = xenbus_get_otherend_state(xbb->dev);
3468         mtx_lock(&xbb->lock);
3469         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3470
3471         /* The front-end can submit I/O until it enters the Closed state. */
3472         if (frontState < XenbusStateClosed)
3473                 return (EAGAIN);
3474
3475         DPRINTF("\n");
3476
3477         /* Indicate shutdown is in progress. */
3478         xbb->flags |= XBBF_SHUTDOWN;
3479
3480         /* Disconnect from the front-end. */
3481         error = xbb_disconnect(xbb);
3482         if (error != 0) {
3483                 /*
3484                  * Requests still outstanding.  We'll be called again
3485                  * once they complete.
3486                  */
3487                 KASSERT(error == EAGAIN,
3488                         ("%s: Unexpected xbb_disconnect() failure %d",
3489                          __func__, error));
3490
3491                 return (error);
3492         }
3493
3494         DPRINTF("\n");
3495
3496         /* Indicate to xbb_detach() that it is safe to proceed. */
3497         wakeup(xbb);
3498
3499         return (0);
3500 }
3501
3502 /**
3503  * Report an attach time error to the console and Xen, and cleanup
3504  * this instance by forcing immediate detach processing.
3505  *
3506  * \param xbb  Per-instance xbb configuration structure.
3507  * \param err  Errno describing the error.
3508  * \param fmt  Printf style format and arguments
3509  */
3510 static void
3511 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3512 {
3513         va_list ap;
3514         va_list ap_hotplug;
3515
3516         va_start(ap, fmt);
3517         va_copy(ap_hotplug, ap);
3518         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3519                   "hotplug-error", fmt, ap_hotplug);
3520         va_end(ap_hotplug);
3521         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3522                   "hotplug-status", "error");
3523
3524         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3525         va_end(ap);
3526
3527         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3528                   "online", "0");
3529         xbb_detach(xbb->dev);
3530 }
3531
3532 /*---------------------------- NewBus Entrypoints ----------------------------*/
3533 /**
3534  * Inspect a XenBus device and claim it if it is of the appropriate type.
3535  * 
3536  * \param dev  NewBus device object representing a candidate XenBus device.
3537  *
3538  * \return  0 for success, errno codes for failure.
3539  */
3540 static int
3541 xbb_probe(device_t dev)
3542 {
3543  
3544         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3545                 device_set_desc(dev, "Backend Virtual Block Device");
3546                 device_quiet(dev);
3547                 return (0);
3548         }
3549
3550         return (ENXIO);
3551 }
3552
3553 /**
3554  * Setup sysctl variables to control various Block Back parameters.
3555  *
3556  * \param xbb  Xen Block Back softc.
3557  *
3558  */
3559 static void
3560 xbb_setup_sysctl(struct xbb_softc *xbb)
3561 {
3562         struct sysctl_ctx_list *sysctl_ctx = NULL;
3563         struct sysctl_oid      *sysctl_tree = NULL;
3564         
3565         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3566         if (sysctl_ctx == NULL)
3567                 return;
3568
3569         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3570         if (sysctl_tree == NULL)
3571                 return;
3572
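        /*
         * The nodes added below appear under this device's sysctl tree,
         * e.g. (unit number illustrative):
         *
         *   sysctl dev.xbbd.0.max_requests
         *   sysctl dev.xbbd.0.reqs_completed
         */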
3573         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3574                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3575                        "fake the flush command");
3576
3577         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3578                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3579                        "send a real flush for N flush requests");
3580
3581         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3582                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3583                        "Don't coalesce contiguous requests");
3584
3585         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3586                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3587                          "how many I/O requests we have received");
3588
3589         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3590                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3591                          "how many I/O requests have been completed");
3592
3593         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3594                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3595                          "how many I/O dispatches were forced");
3596
3597         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3598                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3599                          "how many I/O dispatches were normal");
3600
3601         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3602                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3603                          "total number of I/O dispatches");
3604
3605         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3606                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3607                          "how many times we have run out of KVA");
3608
3609         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3610                          "request_shortages", CTLFLAG_RW,
3611                          &xbb->request_shortages,
3612                          "how many times we have run out of requests");
3613
3614         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3615                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3616                         "maximum outstanding requests (negotiated)");
3617
3618         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3619                         "max_request_segments", CTLFLAG_RD,
3620                         &xbb->max_request_segments, 0,
3621                         "maximum number of pages per request (negotiated)");
3622
3623         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3624                         "max_request_size", CTLFLAG_RD,
3625                         &xbb->max_request_size, 0,
3626                         "maximum size in bytes of a request (negotiated)");
3627
3628         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3629                         "ring_pages", CTLFLAG_RD,
3630                         &xbb->ring_config.ring_pages, 0,
3631                         "communication channel pages (negotiated)");
3632 }
3633
3634 /**
3635  * Attach to a XenBus device that has been claimed by our probe routine.
3636  *
3637  * \param dev  NewBus device object representing this Xen Block Back instance.
3638  *
3639  * \return  0 for success, errno codes for failure.
3640  */
3641 static int
3642 xbb_attach(device_t dev)
3643 {
3644         struct xbb_softc        *xbb;
3645         int                      error;
3646         u_int                    max_ring_page_order;
3647
3648         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3649
3650         /*
3651          * Basic initialization.
3652          * After this block it is safe to call xbb_detach()
3653          * to clean up any allocated data for this instance.
3654          */
3655         xbb = device_get_softc(dev);
3656         xbb->dev = dev;
3657         xbb->otherend_id = xenbus_get_otherend_id(dev);
3658         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3659         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3660
3661         /*
3662          * Publish protocol capabilities for consumption by the
3663          * front-end.
3664          */
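        /*
         * The nodes written below (feature-barrier, feature-flush-cache,
         * max-ring-pages, max-ring-page-order, max-requests,
         * max-request-segments, and max-request-size) advertise our limits
         * so the front-end can negotiate within them.
         */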
3665         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3666                           "feature-barrier", "1");
3667         if (error) {
3668                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3669                                   xenbus_get_node(xbb->dev));
3670                 return (error);
3671         }
3672
3673         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3674                           "feature-flush-cache", "1");
3675         if (error) {
3676                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3677                                   xenbus_get_node(xbb->dev));
3678                 return (error);
3679         }
3680
3681         /*
3682          * Amazon EC2 client compatibility.  They refer to max-ring-pages
3683          * instead of max-ring-page-order.
3684          */
3685         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3686                           "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3687         if (error) {
3688                 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3689                                   xenbus_get_node(xbb->dev));
3690                 return (error);
3691         }
3692
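        /*
         * flsl() returns the 1-based index of the highest set bit, so for
         * a power-of-two XBB_MAX_RING_PAGES this is log2() of the page
         * count (e.g. 32 pages would yield an order of 5).
         */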
3693         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3694         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3695                           "max-ring-page-order", "%u", max_ring_page_order);
3696         if (error) {
3697                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3698                                   xenbus_get_node(xbb->dev));
3699                 return (error);
3700         }
3701
3702         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3703                           "max-requests", "%u", XBB_MAX_REQUESTS);
3704         if (error) {
3705                 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3706                                   xenbus_get_node(xbb->dev));
3707                 return (error);
3708         }
3709
3710         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3711                           "max-request-segments", "%u",
3712                           XBB_MAX_SEGMENTS_PER_REQUEST);
3713         if (error) {
3714                 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3715                                   xenbus_get_node(xbb->dev));
3716                 return (error);
3717         }
3718
3719         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3720                           "max-request-size", "%u",
3721                           XBB_MAX_REQUEST_SIZE);
3722         if (error) {
3723                 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3724                                   xenbus_get_node(xbb->dev));
3725                 return (error);
3726         }
3727
3728         /* Collect physical device information. */
3729         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3730                           "device-type", NULL, &xbb->dev_type,
3731                           NULL);
3732         if (error != 0)
3733                 xbb->dev_type = NULL;
3734
3735         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3736                           "mode", NULL, &xbb->dev_mode,
3737                           "params", NULL, &xbb->dev_name,
3738                           NULL);
3739         if (error != 0) {
3740                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3741                                   xenbus_get_node(dev));
3742                 return (ENXIO);
3743         }
3744
3745         /* Parse fopen style mode flags. */
3746         if (strchr(xbb->dev_mode, 'w') == NULL)
3747                 xbb->flags |= XBBF_READ_ONLY;
3748
3749         /*
3750          * Verify the physical device is present and can support
3751          * the desired I/O mode.
3752          */
3753         DROP_GIANT();
3754         error = xbb_open_backend(xbb);
3755         PICKUP_GIANT();
3756         if (error != 0) {
3757                 xbb_attach_failed(xbb, error, "Unable to open %s",
3758                                   xbb->dev_name);
3759                 return (ENXIO);
3760         }
3761
3762         /* Use devstat(9) for recording statistics. */
3763         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3764                                            xbb->sector_size,
3765                                            DEVSTAT_ALL_SUPPORTED,
3766                                            DEVSTAT_TYPE_DIRECT
3767                                          | DEVSTAT_TYPE_IF_OTHER,
3768                                            DEVSTAT_PRIORITY_OTHER);
3769
3770         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3771                                               xbb->sector_size,
3772                                               DEVSTAT_ALL_SUPPORTED,
3773                                               DEVSTAT_TYPE_DIRECT
3774                                             | DEVSTAT_TYPE_IF_OTHER,
3775                                               DEVSTAT_PRIORITY_OTHER);
3776         /*
3777          * Setup sysctl variables.
3778          */
3779         xbb_setup_sysctl(xbb);
3780
3781         /*
3782          * Create a taskqueue for doing work that must occur from a
3783          * thread context.
3784          */
3785         xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
3786                                                   M_NOWAIT,
3787                                                   taskqueue_thread_enqueue,
3788                                                   /*context*/&xbb->io_taskqueue);
3789         if (xbb->io_taskqueue == NULL) {
3790                 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3791                 return (ENOMEM);
3792         }
3793
3794         taskqueue_start_threads(&xbb->io_taskqueue,
3795                                 /*num threads*/1,
3796                                 /*priority*/PWAIT,
3797                                 /*thread name*/
3798                                 "%s taskq", device_get_nameunit(dev));
3799
3800         /* Update hot-plug status to satisfy xend. */
3801         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3802                           "hotplug-status", "connected");
3803         if (error) {
3804                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3805                                   xenbus_get_node(xbb->dev));
3806                 return (error);
3807         }
3808
3809         /* Tell the front end that we are ready to connect. */
3810         xenbus_set_state(dev, XenbusStateInitWait);
3811
3812         return (0);
3813 }
3814
3815 /**
3816  * Detach from a block back device instance.
3817  *
3818  * \param dev  NewBus device object representing this Xen Block Back instance.
3819  *
3820  * \return  0 for success, errno codes for failure.
3821  * 
3822  * \note A block back device may be detached at any time in its life-cycle,
3823  *       including part way through the attach process.  For this reason,
3824  *       initialization order and the initialization state checks in this
3825  *       routine must be carefully coupled so that attach time failures
3826  *       are gracefully handled.
3827  */
3828 static int
3829 xbb_detach(device_t dev)
3830 {
3831         struct xbb_softc *xbb;
3832
3833         DPRINTF("\n");
3834
3835         xbb = device_get_softc(dev);
3836         mtx_lock(&xbb->lock);
3837         while (xbb_shutdown(xbb) == EAGAIN) {
3838                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3839                        "xbb_shutdown", 0);
3840         }
3841         mtx_unlock(&xbb->lock);
3842
3843         DPRINTF("\n");
3844
3845         if (xbb->io_taskqueue != NULL)
3846                 taskqueue_free(xbb->io_taskqueue);
3847
3848         if (xbb->xbb_stats != NULL)
3849                 devstat_remove_entry(xbb->xbb_stats);
3850
3851         if (xbb->xbb_stats_in != NULL)
3852                 devstat_remove_entry(xbb->xbb_stats_in);
3853
3854         xbb_close_backend(xbb);
3855
3856         if (xbb->dev_mode != NULL) {
3857                 free(xbb->dev_mode, M_XENBUS);
3858                 xbb->dev_mode = NULL;
3859         }
3860
3861         if (xbb->dev_type != NULL) {
3862                 free(xbb->dev_type, M_XENBUS);
3863                 xbb->dev_type = NULL;
3864         }
3865
3866         if (xbb->dev_name != NULL) {
3867                 free(xbb->dev_name, M_XENBUS);
3868                 xbb->dev_name = NULL;
3869         }
3870
3871         mtx_destroy(&xbb->lock);
3872         return (0);
3873 }
3874
3875 /**
3876  * Prepare this block back device for suspension of this VM.
3877  * 
3878  * \param dev  NewBus device object representing this Xen Block Back instance.
3879  *
3880  * \return  0 for success, errno codes for failure.
3881  */
3882 static int
3883 xbb_suspend(device_t dev)
3884 {
3885 #ifdef NOT_YET
3886         struct xbb_softc *sc = device_get_softc(dev);
3887
3888         /* Prevent new requests being issued until we fix things up. */
3889         mtx_lock(&sc->xb_io_lock);
3890         sc->connected = BLKIF_STATE_SUSPENDED;
3891         mtx_unlock(&sc->xb_io_lock);
3892 #endif
3893
3894         return (0);
3895 }
3896
3897 /**
3898  * Perform any processing required to recover from a suspended state.
3899  * 
3900  * \param dev  NewBus device object representing this Xen Block Back instance.
3901  *
3902  * \return  0 for success, errno codes for failure.
3903  */
3904 static int
3905 xbb_resume(device_t dev)
3906 {
3907         return (0);
3908 }
3909
3910 /**
3911  * Handle state changes expressed via the XenStore by our front-end peer.
3912  *
3913  * \param dev             NewBus device object representing this Xen
3914  *                        Block Back instance.
3915  * \param frontend_state  The new state of the front-end.
3918  */
3919 static void
3920 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3921 {
3922         struct xbb_softc *xbb = device_get_softc(dev);
3923
3924         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3925                 xenbus_strstate(frontend_state),
3926                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3927
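        /*
         * Connect once the front-end reports Initialised or Connected and
         * begin teardown on Closing/Closed; the remaining states are
         * either transient or unexpected here.
         */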
3928         switch (frontend_state) {
3929         case XenbusStateInitialising:
3930                 break;
3931         case XenbusStateInitialised:
3932         case XenbusStateConnected:
3933                 xbb_connect(xbb);
3934                 break;
3935         case XenbusStateClosing:
3936         case XenbusStateClosed:
3937                 mtx_lock(&xbb->lock);
3938                 xbb_shutdown(xbb);
3939                 mtx_unlock(&xbb->lock);
3940                 if (frontend_state == XenbusStateClosed)
3941                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3942                 break;
3943         default:
3944                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3945                                  frontend_state);
3946                 break;
3947         }
3948 }
3949
3950 /*---------------------------- NewBus Registration ---------------------------*/
3951 static device_method_t xbb_methods[] = {
3952         /* Device interface */
3953         DEVMETHOD(device_probe,         xbb_probe),
3954         DEVMETHOD(device_attach,        xbb_attach),
3955         DEVMETHOD(device_detach,        xbb_detach),
3956         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3957         DEVMETHOD(device_suspend,       xbb_suspend),
3958         DEVMETHOD(device_resume,        xbb_resume),
3959
3960         /* Xenbus interface */
3961         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3962
3963         { 0, 0 }
3964 };
3965
3966 static driver_t xbb_driver = {
3967         "xbbd",
3968         xbb_methods,
3969         sizeof(struct xbb_softc),
3970 };
3971 devclass_t xbb_devclass;
3972
3973 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);