1 /*-
2  * Copyright (c) 2009-2011 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  *          Ken Merry           (Spectra Logic Corporation)
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 /**
37  * \file blkback.c
38  *
39  * \brief Device driver supporting the vending of block storage from
40  *        a FreeBSD domain to other domains.
41  */
42
43 #include "opt_kdtrace.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/malloc.h>
49
50 #include <sys/bio.h>
51 #include <sys/bus.h>
52 #include <sys/conf.h>
53 #include <sys/devicestat.h>
54 #include <sys/disk.h>
55 #include <sys/fcntl.h>
56 #include <sys/filedesc.h>
57 #include <sys/kdb.h>
58 #include <sys/module.h>
59 #include <sys/namei.h>
60 #include <sys/proc.h>
61 #include <sys/rman.h>
62 #include <sys/taskqueue.h>
63 #include <sys/types.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/sysctl.h>
67 #include <sys/bitstring.h>
68 #include <sys/sdt.h>
69
70 #include <geom/geom.h>
71
72 #include <machine/_inttypes.h>
73
74 #include <vm/vm.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_kern.h>
77
78 #include <xen/xen-os.h>
79 #include <xen/blkif.h>
80 #include <xen/gnttab.h>
81 #include <xen/xen_intr.h>
82
83 #include <xen/interface/event_channel.h>
84 #include <xen/interface/grant_table.h>
85
86 #include <xen/xenbus/xenbusvar.h>
87
88 /*--------------------------- Compile-time Tunables --------------------------*/
89 /**
90  * The maximum number of shared memory ring pages we will allow in a
91  * negotiated block-front/back communication channel.  Allow enough
92  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
93  */
94 #define XBB_MAX_RING_PAGES              32
95
96 /**
97  * The maximum number of outstanding request blocks (request headers plus
98  * additional segment blocks) we will allow in a negotiated block-front/back
99  * communication channel.
100  */
101 #define XBB_MAX_REQUESTS                                        \
102         __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
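/*
 * Illustrative sizing only (not used by the code): __CONST_RING_SIZE()
 * divides the shared ring area by the size of one ring slot and rounds
 * the result down to a power of two.  Assuming 4KiB pages and the
 * standard blkif slot size, 32 ring pages work out to roughly 1024
 * request slots; other configurations will differ.
 */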
103
104 /**
105  * \brief Define to force all I/O to be performed on memory owned by the
106  *        backend device, with a copy-in/out to the remote domain's memory.
107  *
108  * \note  This option is currently required when this driver's domain is
109  *        operating in HVM mode on a system using an IOMMU.
110  *
111  * This driver uses Xen's grant table API to gain access to the memory of
112  * the remote domains it serves.  When our domain is operating in PV mode,
113  * the grant table mechanism directly updates our domain's page table entries
114  * to point to the physical pages of the remote domain.  This scheme guarantees
115  * that blkback and the backing devices it uses can safely perform DMA
116  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
117  * ensure that our domain cannot DMA to pages owned by another domain.  As
118  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
119  * table API.  For this reason, in HVM mode, we must bounce all requests into
120  * memory that is mapped into our domain at domain startup and thus has
121  * valid IOMMU mappings.
122  */
123 #define XBB_USE_BOUNCE_BUFFERS
124
125 /**
126  * \brief Define to enable rudimentary request logging to the console.
127  */
128 #undef XBB_DEBUG
129
130 /*---------------------------------- Macros ----------------------------------*/
131 /**
132  * Custom malloc type for all driver allocations.
133  */
134 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
135
136 #ifdef XBB_DEBUG
137 #define DPRINTF(fmt, args...)                                   \
138     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
139 #else
140 #define DPRINTF(fmt, args...) do {} while(0)
141 #endif
142
143 /**
144  * The maximum mapped region size per request we will allow in a negotiated
145  * block-front/back communication channel.
146  */
147 #define XBB_MAX_REQUEST_SIZE                                    \
148         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
149
150 /**
151  * The maximum number of segments (within a request header and accompanying
152  * segment blocks) per request we will allow in a negotiated block-front/back
153  * communication channel.
154  */
155 #define XBB_MAX_SEGMENTS_PER_REQUEST                            \
156         (MIN(UIO_MAXIOV,                                        \
157              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,                \
158                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
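/*
 * Illustrative sizing only: assuming the common values of PAGE_SIZE = 4KiB,
 * MAXPHYS = 128KiB, and BLKIF_MAX_SEGMENTS_PER_REQUEST = 11,
 * XBB_MAX_REQUEST_SIZE evaluates to MIN(128KiB, 44KiB) = 44KiB and
 * XBB_MAX_SEGMENTS_PER_REQUEST to MIN(1024, MIN(11, 12)) = 11.  Other
 * configurations may yield different values.
 */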
159
160 /**
161  * The maximum number of ring pages that we can allow per request list.
162  * We limit this to the maximum number of segments per request, because
163  * that is already a reasonable number of segments to aggregate.  This
164  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
165  * because that would leave situations where we can't dispatch even one
166  * large request.
167  */
168 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
169
170 /*--------------------------- Forward Declarations ---------------------------*/
171 struct xbb_softc;
172 struct xbb_xen_req;
173
174 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
175                               ...) __attribute__((format(printf, 3, 4)));
176 static int  xbb_shutdown(struct xbb_softc *xbb);
177 static int  xbb_detach(device_t dev);
178
179 /*------------------------------ Data Structures -----------------------------*/
180
181 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
182
183 typedef enum {
184         XBB_REQLIST_NONE        = 0x00,
185         XBB_REQLIST_MAPPED      = 0x01
186 } xbb_reqlist_flags;
187
188 struct xbb_xen_reqlist {
189         /**
190          * Back reference to the parent block back instance for this
191          * request.  Used during bio_done handling.
192          */
193         struct xbb_softc        *xbb;
194
195         /**
196          * BLKIF_OP code for this request.
197          */
198         int                      operation;
199
200         /**
201          * Set to BLKIF_RSP_* to indicate request status.
202          *
203          * This field allows an error status to be recorded even if the
204          * delivery of this status must be deferred.  Deferred reporting
205          * is necessary, for example, when an error is detected during
206          * completion processing of one bio when other bios for this
207          * request are still outstanding.
208          */
209         int                      status;
210
211         /**
212          * Number of 512 byte sectors not transferred.
213          */
214         int                      residual_512b_sectors;
215
216         /**
217          * Starting sector number of the first request in the list.
218          */
219         off_t                    starting_sector_number;
220
221         /**
222          * If we're going to coalesce, the next contiguous sector would be
223          * this one.
224          */
225         off_t                    next_contig_sector;
226
227         /**
228          * Number of child requests in the list.
229          */
230         int                      num_children;
231
232         /**
233          * Number of I/O requests still pending on the backend.
234          */
235         int                      pendcnt;
236
237         /**
238          * Total number of segments for requests in the list.
239          */
240         int                      nr_segments;
241
242         /**
243          * Flags for this particular request list.
244          */
245         xbb_reqlist_flags        flags;
246
247         /**
248          * Kernel virtual address space reserved for this request
249          * list structure and used to map the remote domain's pages for
250          * this I/O into our domain's address space.
251          */
252         uint8_t                 *kva;
253
254         /**
255          * Base pseudo-physical address corresponding to the start
256          * of this request's kva region.
257          */
258         uint64_t                 gnt_base;
259
260
261 #ifdef XBB_USE_BOUNCE_BUFFERS
262         /**
263          * Pre-allocated domain local memory used to proxy remote
264          * domain memory during I/O operations.
265          */
266         uint8_t                 *bounce;
267 #endif
268
269         /**
270          * Array of grant handles (one per page) used to map this request.
271          */
272         grant_handle_t          *gnt_handles;
273
274         /**
275          * Device statistics request ordering type (ordered or simple).
276          */
277         devstat_tag_type         ds_tag_type;
278
279         /**
280          * Device statistics request type (read, write, no_data).
281          */
282         devstat_trans_flags      ds_trans_type;
283
284         /**
285          * The start time for this request.
286          */
287         struct bintime           ds_t0;
288
289         /**
290          * Linked list of contiguous requests with the same operation type.
291          */
292         struct xbb_xen_req_list  contig_req_list;
293
294         /**
295          * Linked list links used to aggregate idle requests in the
296          * request list free pool (xbb->reqlist_free_stailq) and pending
297          * requests waiting for execution (xbb->reqlist_pending_stailq).
298          */
299         STAILQ_ENTRY(xbb_xen_reqlist) links;
300 };
301
302 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
303
304 /**
305  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
306  */
307 struct xbb_xen_req {
308         /**
309          * Linked list links used to aggregate requests into a reqlist
310          * and to store them in the request free pool.
311          */
312         STAILQ_ENTRY(xbb_xen_req) links;
313
314         /**
315          * The remote domain's identifier for this I/O request.
316          */
317         uint64_t                  id;
318
319         /**
320          * The number of pages currently mapped for this request.
321          */
322         int                       nr_pages;
323
324         /**
325          * The number of 512 byte sectors comprising this request.
326          */
327         int                       nr_512b_sectors;
328
329         /**
330          * BLKIF_OP code for this request.
331          */
332         int                       operation;
333
334         /**
335          * Storage used for non-native ring requests.
336          */
337         blkif_request_t          ring_req_storage;
338
339         /**
340          * Pointer to the Xen request in the ring.
341          */
342         blkif_request_t         *ring_req;
343
344         /**
345          * Consumer index for this request.
346          */
347         RING_IDX                 req_ring_idx;
348
349         /**
350          * The start time for this request.
351          */
352         struct bintime           ds_t0;
353
354         /**
355          * Pointer back to our parent request list.
356          */
357         struct xbb_xen_reqlist  *reqlist;
358 };
359 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
360
361 /**
362  * \brief Configuration data for the shared memory request ring
363  *        used to communicate with the front-end client of this
363  *        driver.
365  */
366 struct xbb_ring_config {
367         /** KVA address where ring memory is mapped. */
368         vm_offset_t     va;
369
370         /** The pseudo-physical address where ring memory is mapped.*/
371         uint64_t        gnt_addr;
372
373         /**
374          * Grant table handles, one per-ring page, returned by the
375          * hypervisor upon mapping of the ring and required to
376          * unmap it when a connection is torn down.
377          */
378         grant_handle_t  handle[XBB_MAX_RING_PAGES];
379
380         /**
381          * The device bus address returned by the hypervisor when
382          * mapping the ring and required to unmap it when a connection
383          * is torn down.
384          */
385         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
386
387         /** The number of ring pages mapped for the current connection. */
388         u_int           ring_pages;
389
390         /**
391          * The grant references, one per-ring page, supplied by the
392          * front-end, allowing us to reference the ring pages in the
393          * front-end's domain and to map these pages into our own domain.
394          */
395         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
396
397         /** The interrupt-driven event channel used to signal ring events. */
398         evtchn_port_t   evtchn;
399 };
400
401 /**
402  * Per-instance connection state flags.
403  */
404 typedef enum
405 {
406         /**
407          * The front-end requested a read-only mount of the
408          * back-end device/file.
409          */
410         XBBF_READ_ONLY         = 0x01,
411
412         /** Communication with the front-end has been established. */
413         XBBF_RING_CONNECTED    = 0x02,
414
415         /**
416          * Front-end requests exist in the ring and are waiting for
417          * xbb_xen_req objects to free up.
418          */
419         XBBF_RESOURCE_SHORTAGE = 0x04,
420
421         /** Connection teardown in progress. */
422         XBBF_SHUTDOWN          = 0x08,
423
424         /** A thread is already performing shutdown processing. */
425         XBBF_IN_SHUTDOWN       = 0x10
426 } xbb_flag_t;
427
428 /** Backend device type.  */
429 typedef enum {
430         /** Backend type unknown. */
431         XBB_TYPE_NONE           = 0x00,
432
433         /**
434          * Backend type disk (access via cdev switch
435          * strategy routine).
436          */
437         XBB_TYPE_DISK           = 0x01,
438
439         /** Backend type file (access via vnode operations). */
440         XBB_TYPE_FILE           = 0x02
441 } xbb_type;
442
443 /**
444  * \brief Structure used to memoize information about a per-request
445  *        scatter-gather list.
446  *
447  * The chief benefit of using this data structure is it avoids having
448  * to reparse the possibly discontiguous S/G list in the original
449  * request.  Due to the way that the mapping of the memory backing an
450  * I/O transaction is handled by Xen, a second pass is unavoidable.
451  * At least this way the second walk is a simple array traversal.
452  *
453  * \note A single Scatter/Gather element in the block interface covers
454  *       at most 1 machine page.  In this context a sector (blkif
455  *       nomenclature, not what I'd choose) is a 512b aligned unit
456  *       of mapping within the machine page referenced by an S/G
457  *       element.
458  */
459 struct xbb_sg {
460         /** The number of 512b data chunks mapped in this S/G element. */
461         int16_t nsect;
462
463         /**
464          * The index (0 based) of the first 512b data chunk mapped
465          * in this S/G element.
466          */
467         uint8_t first_sect;
468
469         /**
470          * The index (0 based) of the last 512b data chunk mapped
471          * in this S/G element.
472          */
473         uint8_t last_sect;
474 };
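/*
 * Example (illustrative only): an S/G element mapping 512b chunks 2
 * through 5 of a page would be recorded as first_sect = 2, last_sect = 5,
 * nsect = 4, i.e. a 2KiB transfer starting 1KiB into the page.
 */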
475
476 /**
477  * Character device backend specific configuration data.
478  */
479 struct xbb_dev_data {
480         /** Cdev used for device backend access.  */
481         struct cdev   *cdev;
482
483         /** Cdev switch used for device backend access.  */
484         struct cdevsw *csw;
485
486         /** Used to hold a reference on opened cdev backend devices. */
487         int            dev_ref;
488 };
489
490 /**
491  * File backend specific configuration data.
492  */
493 struct xbb_file_data {
494         /** Credentials to use for vnode backed (file based) I/O. */
495         struct ucred   *cred;
496
497         /**
498          * \brief Array of io vectors used to process file based I/O.
499          *
500          * Only a single file based request is outstanding per-xbb instance,
501          * so we only need one of these.
502          */
503         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
504 #ifdef XBB_USE_BOUNCE_BUFFERS
505
506         /**
507          * \brief Array of io vectors used to handle bouncing of file reads.
508          *
509          * Vnode operations are free to modify uio data during their
510          * execution.  In the case of a read with bounce buffering active,
511          * we need some of the data from the original uio in order to
512          * bounce-out the read data.  This array serves as the temporary
513          * storage for this saved data.
514          */
515         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
516
517         /**
518          * \brief Array of memoized bounce buffer kva offsets used
519          *        in the file based backend.
520          *
521          * Due to the way that the mapping of the memory backing an
522          * I/O transaction is handled by Xen, a second pass through
523          * the request sg elements is unavoidable. We memoize the computed
524          * bounce address here to reduce the cost of the second walk.
525          */
526         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
527 #endif /* XBB_USE_BOUNCE_BUFFERS */
528 };
529
530 /**
531  * Collection of backend type specific data.
532  */
533 union xbb_backend_data {
534         struct xbb_dev_data  dev;
535         struct xbb_file_data file;
536 };
537
538 /**
539  * Function signature of backend specific I/O handlers.
540  */
541 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
542                               struct xbb_xen_reqlist *reqlist, int operation,
543                               int flags);
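/*
 * Note: the concrete cdev- and vnode-backed dispatch routines defined
 * later in this file follow this signature; the appropriate one is
 * recorded in xbb_softc.dispatch_io (below) when the backend is opened.
 */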
544
545 /**
546  * Per-instance configuration data.
547  */
548 struct xbb_softc {
549
550         /**
551          * Task-queue used to process I/O requests.
552          */
553         struct taskqueue         *io_taskqueue;
554
555         /**
556          * Single "run the request queue" task enqueued
557          * on io_taskqueue.
558          */
559         struct task               io_task;
560
561         /** Device type for this instance. */
562         xbb_type                  device_type;
563
564         /** NewBus device corresponding to this instance. */
565         device_t                  dev;
566
567         /** Backend specific dispatch routine for this instance. */
568         xbb_dispatch_t            dispatch_io;
569
570         /** The number of requests outstanding on the backend device/file. */
571         int                       active_request_count;
572
573         /** Free pool of request tracking structures. */
574         struct xbb_xen_req_list   request_free_stailq;
575
576         /** Array, sized at connection time, of request tracking structures. */
577         struct xbb_xen_req       *requests;
578
579         /** Free pool of request list structures. */
580         struct xbb_xen_reqlist_list reqlist_free_stailq;
581
582         /** List of pending request lists awaiting execution. */
583         struct xbb_xen_reqlist_list reqlist_pending_stailq;
584
585         /** Array, sized at connection time, of request list structures. */
586         struct xbb_xen_reqlist   *request_lists;
587
588         /**
589          * Global pool of kva used for mapping remote domain ring
590          * and I/O transaction data.
591          */
592         vm_offset_t               kva;
593
594         /** Pseudo-physical address corresponding to kva. */
595         uint64_t                  gnt_base_addr;
596
597         /** The size of the global kva pool. */
598         int                       kva_size;
599
600         /** The size of the KVA area used for request lists. */
601         int                       reqlist_kva_size;
602
603         /** The number of pages of KVA used for request lists */
604         int                       reqlist_kva_pages;
605
606         /** Bitmap of free KVA pages */
607         bitstr_t                 *kva_free;
608
609         /**
610          * \brief Cached value of the front-end's domain id.
611          * 
612          * This value is used once for each mapped page in
613          * a transaction.  We cache it to avoid incurring the
614          * cost of an ivar access every time this is needed.
615          */
616         domid_t                   otherend_id;
617
618         /**
619          * \brief The blkif protocol abi in effect.
620          *
621          * There are situations where the back and front ends can
622          * have a different, native abi (e.g. intel x86_64 and
623          * 32bit x86 domains on the same machine).  The back-end
624          * always accommodates the front-end's native abi.  That
625          * value is pulled from the XenStore and recorded here.
626          */
627         int                       abi;
628
629         /**
630          * \brief The maximum number of requests and request lists allowed
631          *        to be in flight at a time.
632          *
633          * This value is negotiated via the XenStore.
634          */
635         u_int                     max_requests;
636
637         /**
638          * \brief The maximum number of segments (1 page per segment)
639          *        that can be mapped by a request.
640          *
641          * This value is negotiated via the XenStore.
642          */
643         u_int                     max_request_segments;
644
645         /**
646          * \brief Maximum number of segments per request list.
647          *
648          * This value is derived from and will generally be larger than
649          * max_request_segments.
650          */
651         u_int                     max_reqlist_segments;
652
653         /**
654          * The maximum size of any request to this back-end
655          * device.
656          *
657          * This value is negotiated via the XenStore.
658          */
659         u_int                     max_request_size;
660
661         /**
662          * The maximum size of any request list.  This is derived directly
663          * from max_reqlist_segments.
664          */
665         u_int                     max_reqlist_size;
666
667         /** Various configuration and state bit flags. */
668         xbb_flag_t                flags;
669
670         /** Ring mapping and interrupt configuration data. */
671         struct xbb_ring_config    ring_config;
672
673         /** Runtime, cross-abi safe, structures for ring access. */
674         blkif_back_rings_t        rings;
675
676         /** IRQ mapping for the communication ring event channel. */
677         xen_intr_handle_t         xen_intr_handle;
678
679         /**
680          * \brief Backend access mode flags (e.g. write, or read-only).
681          *
682          * This value is passed to us by the front-end via the XenStore.
683          */
684         char                     *dev_mode;
685
686         /**
687          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
688          *
689          * This value is passed to us by the front-end via the XenStore.
690          * Currently unused.
691          */
692         char                     *dev_type;
693
694         /**
695          * \brief Backend device/file identifier.
696          *
697          * This value is passed to us by the front-end via the XenStore.
698          * We expect this to be a POSIX path indicating the file or
699          * device to open.
700          */
701         char                     *dev_name;
702
703         /**
704          * Vnode corresponding to the backend device node or file
705          * we are accessing.
706          */
707         struct vnode             *vn;
708
709         union xbb_backend_data    backend;
710
711         /** The native sector size of the backend. */
712         u_int                     sector_size;
713
714         /** log2 of sector_size.  */
715         u_int                     sector_size_shift;
716
717         /** Size in bytes of the backend device or file.  */
718         off_t                     media_size;
719
720         /**
721          * \brief media_size expressed in terms of the backend native
722          *        sector size.
723          *
724          * (e.g. xbb->media_size >> xbb->sector_size_shift).
725          */
726         uint64_t                  media_num_sectors;
727
728         /**
729          * \brief Array of memoized scatter gather data computed during the
730          *        conversion of blkif ring requests to internal xbb_xen_req
731          *        structures.
732          *
733          * Ring processing is serialized so we only need one of these.
734          */
735         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
736
737         /**
738          * Temporary grant table map used in xbb_dispatch_io().  When
739          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
740          * stack could cause a stack overflow.
741          */
742         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
743
744         /** Mutex protecting per-instance data. */
745         struct mtx                lock;
746
747 #ifdef XENHVM
748         /**
749          * Resource representing allocated physical address space
750          * associated with our per-instance kva region.
751          */
752         struct resource          *pseudo_phys_res;
753
754         /** Resource id for allocated physical address space. */
755         int                       pseudo_phys_res_id;
756 #endif
757
758         /**
759          * I/O statistics from BlockBack dispatch down.  These are
760          * coalesced requests, and we start them right before execution.
761          */
762         struct devstat           *xbb_stats;
763
764         /**
765          * I/O statistics coming into BlockBack.  These are the requests as
766          * we get them from BlockFront.  They are started as soon as we
767          * receive a request, and completed when the I/O is complete.
768          */
769         struct devstat           *xbb_stats_in;
770
771         /** Disable sending flush to the backend */
772         int                       disable_flush;
773
774         /** Send a real flush for every N flush requests */
775         int                       flush_interval;
776
777         /** Count of flush requests in the interval */
778         int                       flush_count;
779
780         /** Don't coalesce requests if this is set */
781         int                       no_coalesce_reqs;
782
783         /** Number of requests we have received */
784         uint64_t                  reqs_received;
785
786         /** Number of requests we have completed */
787         uint64_t                  reqs_completed;
788
789         /** How many forced dispatches (i.e. without coalescing) have happened */
790         uint64_t                  forced_dispatch;
791
792         /** How many normal dispatches have happened */
793         uint64_t                  normal_dispatch;
794
795         /** How many total dispatches have happened */
796         uint64_t                  total_dispatch;
797
798         /** How many times we have run out of KVA */
799         uint64_t                  kva_shortages;
800
801         /** How many times we have run out of request structures */
802         uint64_t                  request_shortages;
803 };
804
805 /*---------------------------- Request Processing ----------------------------*/
806 /**
807  * Allocate an internal transaction tracking structure from the free pool.
808  *
809  * \param xbb  Per-instance xbb configuration structure.
810  *
811  * \return  On success, a pointer to the allocated xbb_xen_req structure.
812  *          Otherwise NULL.
813  */
814 static inline struct xbb_xen_req *
815 xbb_get_req(struct xbb_softc *xbb)
816 {
817         struct xbb_xen_req *req;
818
819         req = NULL;
820
821         mtx_assert(&xbb->lock, MA_OWNED);
822
823         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
824                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
825                 xbb->active_request_count++;
826         }
827
828         return (req);
829 }
830
831 /**
832  * Return an allocated transaction tracking structure to the free pool.
833  *
834  * \param xbb  Per-instance xbb configuration structure.
835  * \param req  The request structure to free.
836  */
837 static inline void
838 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
839 {
840         mtx_assert(&xbb->lock, MA_OWNED);
841
842         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
843         xbb->active_request_count--;
844
845         KASSERT(xbb->active_request_count >= 0,
846                 ("xbb_release_req: negative active count"));
847 }
848
849 /**
850  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
851  *
852  * \param xbb       Per-instance xbb configuration structure.
853  * \param req_list  The list of requests to free.
854  * \param nreqs     The number of items in the list.
855  */
856 static inline void
857 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
858                  int nreqs)
859 {
860         mtx_assert(&xbb->lock, MA_OWNED);
861
862         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
863         xbb->active_request_count -= nreqs;
864
865         KASSERT(xbb->active_request_count >= 0,
866                 ("xbb_release_reqs: negative active count"));
867 }
868
869 /**
870  * Given a page index and 512b sector offset within that page,
871  * calculate an offset into a request's kva region.
872  *
873  * \param reqlist The request structure whose kva region will be accessed.
874  * \param pagenr  The page index used to compute the kva offset.
875  * \param sector  The 512b sector index used to compute the page relative
876  *                kva offset.
877  *
878  * \return  The computed global KVA address.
879  */
880 static inline uint8_t *
881 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
882 {
883         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
884 }
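/*
 * For example (illustrative only), xbb_reqlist_vaddr(reqlist, 2, 3) yields
 * reqlist->kva + 2 * PAGE_SIZE + 3 * 512: the fourth 512b chunk within the
 * third page mapped for the request.
 */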
885
886 #ifdef XBB_USE_BOUNCE_BUFFERS
887 /**
888  * Given a page index and 512b sector offset within that page,
889  * calculate an offset into a request's local bounce memory region.
890  *
891  * \param reqlist The request structure whose bounce region will be accessed.
892  * \param pagenr  The page index used to compute the bounce offset.
893  * \param sector  The 512b sector index used to compute the page relative
894  *                bounce offset.
895  *
896  * \return  The computed global bounce buffer address.
897  */
898 static inline uint8_t *
899 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
900 {
901         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
902 }
903 #endif
904
905 /**
906  * Given a page number and 512b sector offset within that page,
907  * calculate an offset into the request's memory region that the
908  * underlying backend device/file should use for I/O.
909  *
910  * \param reqlist The request structure whose I/O region will be accessed.
911  * \param pagenr  The page index used to compute the I/O offset.
912  * \param sector  The 512b sector index used to compute the page relative
913  *                I/O offset.
914  *
915  * \return  The computed global I/O address.
916  *
917  * Depending on configuration, this will either be a local bounce buffer
918  * or a pointer to the memory mapped in from the front-end domain for
919  * this request.
920  */
921 static inline uint8_t *
922 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
923 {
924 #ifdef XBB_USE_BOUNCE_BUFFERS
925         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
926 #else
927         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
928 #endif
929 }
930
931 /**
932  * Given a page index and 512b sector offset within that page, calculate
933  * an offset into the local pseudo-physical address space used to map a
934  * front-end's request data into a request.
935  *
936  * \param reqlist The request list structure whose pseudo-physical region
937  *                will be accessed.
938  * \param pagenr  The page index used to compute the pseudo-physical offset.
939  * \param sector  The 512b sector index used to compute the page relative
940  *                pseudo-physical offset.
941  *
942  * \return  The computed global pseudo-physical address.
943  *
944  * Unlike xbb_reqlist_ioaddr(), this address always refers to the
945  * grant-mapped region backing the request list, regardless of whether
946  * bounce buffers are in use.
947  */
948 static inline uintptr_t
949 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
950 {
951         struct xbb_softc *xbb;
952
953         xbb = reqlist->xbb;
954
955         return ((uintptr_t)(xbb->gnt_base_addr +
956                 (uintptr_t)(reqlist->kva - xbb->kva) +
957                 (PAGE_SIZE * pagenr) + (sector << 9)));
958 }
959
960 /**
961  * Get Kernel Virtual Address space for mapping requests.
962  *
963  * \param xbb         Per-instance xbb configuration structure.
964  * \param nr_pages    Number of pages needed.
967  *
968  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
969  *
970  * Note:  This should be unnecessary once we have either chaining or
971  * scatter/gather support for struct bio.  At that point we'll be able to
972  * put multiple addresses and lengths in one bio/bio chain and won't need
973  * to map everything into one virtual segment.
974  */
975 static uint8_t *
976 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
977 {
978         intptr_t first_clear;
979         intptr_t num_clear;
980         uint8_t *free_kva;
981         int      i;
982
983         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
984
985         first_clear = 0;
986         free_kva = NULL;
987
988         mtx_lock(&xbb->lock);
989
990         /*
991          * Look for the first available page.  If there are none, we're done.
992          */
993         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
994
995         if (first_clear == -1)
996                 goto bailout;
997
998         /*
999          * Starting at the first available page, look for consecutive free
1000          * pages that will satisfy the user's request.
1001          */
1002         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1003                 /*
1004                  * If this is true, the page is used, so we have to reset
1005                  * the number of clear pages and the first clear page
1006                  * (since it pointed to a region with an insufficient number
1007                  * of clear pages).
1008                  */
1009                 if (bit_test(xbb->kva_free, i)) {
1010                         num_clear = 0;
1011                         first_clear = -1;
1012                         continue;
1013                 }
1014
1015                 if (first_clear == -1)
1016                         first_clear = i;
1017
1018                 /*
1019                  * If this is true, we've found a large enough free region
1020                  * to satisfy the request.
1021                  */
1022                 if (++num_clear == nr_pages) {
1023
1024                         bit_nset(xbb->kva_free, first_clear,
1025                                  first_clear + nr_pages - 1);
1026
1027                         free_kva = (uint8_t *)xbb->kva +
1028                                 (first_clear * PAGE_SIZE);
1029
1030                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1031                                 free_kva + (nr_pages * PAGE_SIZE) <=
1032                                 (uint8_t *)xbb->ring_config.va,
1033                                 ("Free KVA %p len %d out of range, "
1034                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1035                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1036                                  (uintmax_t)xbb->ring_config.va));
1037                         break;
1038                 }
1039         }
1040
1041 bailout:
1042
1043         if (free_kva == NULL) {
1044                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1045                 xbb->kva_shortages++;
1046         }
1047
1048         mtx_unlock(&xbb->lock);
1049
1050         return (free_kva);
1051 }
1052
1053 /**
1054  * Free allocated KVA.
1055  *
1056  * \param xbb       Per-instance xbb configuration structure.
1057  * \param kva_ptr   Pointer to allocated KVA region.  
1058  * \param nr_pages  Number of pages in the KVA region.
1059  */
1060 static void
1061 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1062 {
1063         intptr_t start_page;
1064
1065         mtx_assert(&xbb->lock, MA_OWNED);
1066
1067         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1068         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1069
1070 }
1071
1072 /**
1073  * Unmap the front-end pages associated with this I/O request.
1074  *
1075  * \param reqlist  The request list structure to unmap.
1076  */
1077 static void
1078 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1079 {
1080         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1081         u_int                         i;
1082         u_int                         invcount;
1083         int                           error;
1084
1085         invcount = 0;
1086         for (i = 0; i < reqlist->nr_segments; i++) {
1087
1088                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1089                         continue;
1090
1091                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1092                 unmap[invcount].dev_bus_addr = 0;
1093                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1094                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1095                 invcount++;
1096         }
1097
1098         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1099                                           unmap, invcount);
1100         KASSERT(error == 0, ("Grant table operation failed"));
1101 }
1102
1103 /**
1104  * Allocate an internal transaction tracking structure from the free pool.
1105  *
1106  * \param xbb  Per-instance xbb configuration structure.
1107  *
1108  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1109  *          Otherwise NULL.
1110  */
1111 static inline struct xbb_xen_reqlist *
1112 xbb_get_reqlist(struct xbb_softc *xbb)
1113 {
1114         struct xbb_xen_reqlist *reqlist;
1115
1116         reqlist = NULL;
1117
1118         mtx_assert(&xbb->lock, MA_OWNED);
1119
1120         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1121
1122                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1123                 reqlist->flags = XBB_REQLIST_NONE;
1124                 reqlist->kva = NULL;
1125                 reqlist->status = BLKIF_RSP_OKAY;
1126                 reqlist->residual_512b_sectors = 0;
1127                 reqlist->num_children = 0;
1128                 reqlist->nr_segments = 0;
1129                 STAILQ_INIT(&reqlist->contig_req_list);
1130         }
1131
1132         return (reqlist);
1133 }
1134
1135 /**
1136  * Return an allocated transaction tracking structure to the free pool.
1137  *
1138  * \param xbb        Per-instance xbb configuration structure.
1139  * \param reqlist    The request list structure to free.
1140  * \param wakeup     If set, wake up the work thread if freeing this reqlist
1141  *                   during a resource shortage condition.
1142  */
1143 static inline void
1144 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1145                     int wakeup)
1146 {
1147
1148         mtx_lock(&xbb->lock);
1149
1150         if (wakeup) {
1151                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1152                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1153         }
1154
1155         if (reqlist->kva != NULL)
1156                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1157
1158         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1159
1160         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1161
1162         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1163                 /*
1164                  * Shutdown is in progress.  See if we can
1165                  * progress further now that one more request
1166                  * has completed and been returned to the
1167                  * free pool.
1168                  */
1169                 xbb_shutdown(xbb);
1170         }
1171
1172         mtx_unlock(&xbb->lock);
1173
1174         if (wakeup != 0)
1175                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1176 }
1177
1178 /**
1179  * Request resources and do basic request setup.
1180  *
1181  * \param xbb          Per-instance xbb configuration structure.
1182  * \param reqlist      Pointer to reqlist pointer.
1183  * \param ring_req     Pointer to a block ring request.
1184  * \param ring_idx     The ring index of this request.
1185  *
1186  * \return  0 for success, non-zero for failure.
1187  */
1188 static int
1189 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1190                   blkif_request_t *ring_req, RING_IDX ring_idx)
1191 {
1192         struct xbb_xen_reqlist *nreqlist;
1193         struct xbb_xen_req     *nreq;
1194
1195         nreqlist = NULL;
1196         nreq     = NULL;
1197
1198         mtx_lock(&xbb->lock);
1199
1200         /*
1201          * We don't allow new resources to be allocated if we're in the
1202          * process of shutting down.
1203          */
1204         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1205                 mtx_unlock(&xbb->lock);
1206                 return (1);
1207         }
1208
1209         /*
1210          * Allocate a reqlist if the caller doesn't have one already.
1211          */
1212         if (*reqlist == NULL) {
1213                 nreqlist = xbb_get_reqlist(xbb);
1214                 if (nreqlist == NULL)
1215                         goto bailout_error;
1216         }
1217
1218         /* We always allocate a request. */
1219         nreq = xbb_get_req(xbb);
1220         if (nreq == NULL)
1221                 goto bailout_error;
1222
1223         mtx_unlock(&xbb->lock);
1224
1225         if (*reqlist == NULL) {
1226                 *reqlist = nreqlist;
1227                 nreqlist->operation = ring_req->operation;
1228                 nreqlist->starting_sector_number = ring_req->sector_number;
1229                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1230                                    links);
1231         }
1232
1233         nreq->reqlist = *reqlist;
1234         nreq->req_ring_idx = ring_idx;
1235         nreq->id = ring_req->id;
1236         nreq->operation = ring_req->operation;
1237
1238         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1239                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1240                 nreq->ring_req = &nreq->ring_req_storage;
1241         } else {
1242                 nreq->ring_req = ring_req;
1243         }
1244
1245         binuptime(&nreq->ds_t0);
1246         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1247         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1248         (*reqlist)->num_children++;
1249         (*reqlist)->nr_segments += ring_req->nr_segments;
1250
1251         return (0);
1252
1253 bailout_error:
1254
1255         /*
1256          * We're out of resources, so set the shortage flag.  The next time
1257          * a request is released, we'll try waking up the work thread to
1258          * see if we can allocate more resources.
1259          */
1260         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1261         xbb->request_shortages++;
1262
1263         if (nreq != NULL)
1264                 xbb_release_req(xbb, nreq);
1265
1266         mtx_unlock(&xbb->lock);
1267
1268         if (nreqlist != NULL)
1269                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1270
1271         return (1);
1272 }
1273
1274 /**
1275  * Create and transmit a response to a blkif request.
1276  * 
1277  * \param xbb     Per-instance xbb configuration structure.
1278  * \param req     The request structure to which to respond.
1279  * \param status  The status code to report.  See BLKIF_RSP_*
1280  *                in sys/xen/interface/io/blkif.h.
1281  */
1282 static void
1283 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1284 {
1285         blkif_response_t *resp;
1286         int               more_to_do;
1287         int               notify;
1288
1289         more_to_do = 0;
1290
1291         /*
1292          * Place on the response ring for the relevant domain.
1293          * For now, only the spacing between entries is different
1294          * in the different ABIs, not the response entry layout.
1295          */
1296         mtx_lock(&xbb->lock);
1297         switch (xbb->abi) {
1298         case BLKIF_PROTOCOL_NATIVE:
1299                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1300                                          xbb->rings.native.rsp_prod_pvt);
1301                 break;
1302         case BLKIF_PROTOCOL_X86_32:
1303                 resp = (blkif_response_t *)
1304                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1305                                       xbb->rings.x86_32.rsp_prod_pvt);
1306                 break;
1307         case BLKIF_PROTOCOL_X86_64:
1308                 resp = (blkif_response_t *)
1309                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1310                                       xbb->rings.x86_64.rsp_prod_pvt);
1311                 break;
1312         default:
1313                 panic("Unexpected blkif protocol ABI.");
1314         }
1315
1316         resp->id        = req->id;
1317         resp->operation = req->operation;
1318         resp->status    = status;
1319
1320         xbb->rings.common.rsp_prod_pvt++;
1321         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1322
1323         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1324
1325                 /*
1326                  * Tail check for pending requests. Allows frontend to avoid
1327                  * notifications if requests are already in flight (lower
1328                  * overheads and promotes batching).
1329                  */
1330                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1331         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1332
1333                 more_to_do = 1;
1334         }
1335
1336         xbb->reqs_completed++;
1337
1338         mtx_unlock(&xbb->lock);
1339
1340         if (more_to_do)
1341                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1342
1343         if (notify)
1344                 xen_intr_signal(xbb->xen_intr_handle);
1345 }
1346
1347 /**
1348  * Complete a request list.
1349  *
1350  * \param xbb        Per-instance xbb configuration structure.
1351  * \param reqlist    Allocated internal request list structure.
1352  */
1353 static void
1354 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1355 {
1356         struct xbb_xen_req *nreq;
1357         off_t               sectors_sent;
1358
1359         sectors_sent = 0;
1360
1361         if (reqlist->flags & XBB_REQLIST_MAPPED)
1362                 xbb_unmap_reqlist(reqlist);
1363
1364         /*
1365          * All I/O is done, send the response.  A lock should not be
1366          * necessary here because the request list is complete, and
1367          * therefore this is the only context accessing this request
1368          * right now.  The functions we call do their own locking if
1369          * necessary.
1370          */
1371         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1372                 off_t cur_sectors_sent;
1373
1374                 xbb_send_response(xbb, nreq, reqlist->status);
1375
1376                 /* We don't report bytes sent if there is an error. */
1377                 if (reqlist->status == BLKIF_RSP_OKAY)
1378                         cur_sectors_sent = nreq->nr_512b_sectors;
1379                 else
1380                         cur_sectors_sent = 0;
1381
1382                 sectors_sent += cur_sectors_sent;
1383
1384                 devstat_end_transaction(xbb->xbb_stats_in,
1385                                         /*bytes*/cur_sectors_sent << 9,
1386                                         reqlist->ds_tag_type,
1387                                         reqlist->ds_trans_type,
1388                                         /*now*/NULL,
1389                                         /*then*/&nreq->ds_t0);
1390         }
1391
1392         /*
1393          * Take out any sectors not sent.  If we wind up negative (which
1394          * might happen if an error is reported as well as a residual), just
1395          * report 0 sectors sent.
1396          */
1397         sectors_sent -= reqlist->residual_512b_sectors;
1398         if (sectors_sent < 0)
1399                 sectors_sent = 0;
1400
1401         devstat_end_transaction(xbb->xbb_stats,
1402                                 /*bytes*/ sectors_sent << 9,
1403                                 reqlist->ds_tag_type,
1404                                 reqlist->ds_trans_type,
1405                                 /*now*/NULL,
1406                                 /*then*/&reqlist->ds_t0);
1407
1408         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1409 }
1410
1411 /**
1412  * Completion handler for buffer I/O requests issued by the device
1413  * backend driver.
1414  *
1415  * \param bio  The buffer I/O request on which to perform completion
1416  *             processing.
1417  */
1418 static void
1419 xbb_bio_done(struct bio *bio)
1420 {
1421         struct xbb_softc       *xbb;
1422         struct xbb_xen_reqlist *reqlist;
1423
1424         reqlist = bio->bio_caller1;
1425         xbb     = reqlist->xbb;
1426
1427         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1428
1429         /*
1430          * This is a bit imprecise.  With aggregated I/O a single
1431          * request list can contain multiple front-end requests and
1432  * multiple bios may point to a single request.  By carefully
1433          * walking the request list, we could map residuals and errors
1434          * back to the original front-end request, but the interface
1435          * isn't sufficiently rich for us to properly report the error.
1436          * So, we just treat the entire request list as having failed if an
1437          * error occurs on any part.  And, if an error occurs, we treat
1438          * the amount of data transferred as 0.
1439          *
1440  * For residuals, we report them on the overall aggregated device,
1441  * but not on the individual requests, since we don't currently
1442  * do the work to determine to which front-end request the
1443  * residual applies.
1444          */
1445         if (bio->bio_error) {
1446                 DPRINTF("BIO returned error %d for operation on device %s\n",
1447                         bio->bio_error, xbb->dev_name);
1448                 reqlist->status = BLKIF_RSP_ERROR;
1449
1450                 if (bio->bio_error == ENXIO
1451                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1452
1453                         /*
1454                          * Backend device has disappeared.  Signal the
1455                          * front-end that we (the device proxy) want to
1456                          * go away.
1457                          */
1458                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1459                 }
1460         }
1461
1462 #ifdef XBB_USE_BOUNCE_BUFFERS
1463         if (bio->bio_cmd == BIO_READ) {
1464                 vm_offset_t kva_offset;
1465
1466                 kva_offset = (vm_offset_t)bio->bio_data
1467                            - (vm_offset_t)reqlist->bounce;
1468                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1469                        bio->bio_data, bio->bio_bcount);
1470         }
1471 #endif /* XBB_USE_BOUNCE_BUFFERS */
1472
1473         /*
1474          * Decrement the pending count for the request list.  When we're
1475          * done with the requests, send status back for all of them.
1476          */
1477         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1478                 xbb_complete_reqlist(xbb, reqlist);
1479
1480         g_destroy_bio(bio);
1481 }
1482
1483 /**
1484  * Parse a blkif request into an internal request structure and send
1485  * it to the backend for processing.
1486  *
1487  * \param xbb       Per-instance xbb configuration structure.
1488  * \param reqlist   Allocated internal request list structure.
1489  *
1490  * \return          On success, 0.  For resource shortages, non-zero.
1491  *  
1492  * This routine performs the backend common aspects of request parsing
1493  * including compiling an internal request structure, parsing the S/G
1494  * list and any secondary ring requests in which they may reside, and
1495  * the mapping of front-end I/O pages into our domain.
1496  */
1497 static int
1498 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1499 {
1500         struct xbb_sg                *xbb_sg;
1501         struct gnttab_map_grant_ref  *map;
1502         struct blkif_request_segment *sg;
1503         struct blkif_request_segment *last_block_sg;
1504         struct xbb_xen_req           *nreq;
1505         u_int                         nseg;
1506         u_int                         seg_idx;
1507         u_int                         block_segs;
1508         int                           nr_sects;
1509         int                           total_sects;
1510         int                           operation;
1511         uint8_t                       bio_flags;
1512         int                           error;
1513
1514         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1515         bio_flags            = 0;
1516         total_sects          = 0;
1517         nr_sects             = 0;
1518
1519         /*
1520          * First determine whether we have enough free KVA to satisfy this
1521          * request list.  If not, tell xbb_run_queue() so it can go to
1522          * sleep until we have more KVA.
1523          */
1524         reqlist->kva = NULL;
1525         if (reqlist->nr_segments != 0) {
1526                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1527                 if (reqlist->kva == NULL) {
1528                         /*
1529                          * If we're out of KVA, return ENOMEM.
1530                          */
1531                         return (ENOMEM);
1532                 }
1533         }
1534
1535         binuptime(&reqlist->ds_t0);
1536         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1537
1538         switch (reqlist->operation) {
1539         case BLKIF_OP_WRITE_BARRIER:
1540                 bio_flags       |= BIO_ORDERED;
1541                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1542                 /* FALLTHROUGH */
1543         case BLKIF_OP_WRITE:
1544                 operation = BIO_WRITE;
1545                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1546                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1547                         DPRINTF("Attempt to write to read only device %s\n",
1548                                 xbb->dev_name);
1549                         reqlist->status = BLKIF_RSP_ERROR;
1550                         goto send_response;
1551                 }
1552                 break;
1553         case BLKIF_OP_READ:
1554                 operation = BIO_READ;
1555                 reqlist->ds_trans_type = DEVSTAT_READ;
1556                 break;
1557         case BLKIF_OP_FLUSH_DISKCACHE:
1558                 /*
1559                  * If the user has requested that we disable flush
1560                  * support, we just complete the requests successfully
1561                  * without passing them down to the backing store.
1562                  */
1563                 if (xbb->disable_flush != 0) {
1564                         goto send_response;
1565                 }
1566
1567                 /*
1568                  * The user has requested that we only send a real flush
1569                  * for every N flush requests.  So keep count, and either
1570                  * complete the request immediately or queue it for the
1571                  * backend.
1572                  */
1573                 if (xbb->flush_interval != 0) {
1574                         if (++(xbb->flush_count) < xbb->flush_interval) {
1575                                 goto send_response;
1576                         } else
1577                                 xbb->flush_count = 0;
1578                 }
1579
1580                 operation = BIO_FLUSH;
1581                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1582                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1583                 goto do_dispatch;
1584                 /*NOTREACHED*/
1585         default:
1586                 DPRINTF("error: unknown block io operation [%d]\n",
1587                         reqlist->operation);
1588                 reqlist->status = BLKIF_RSP_ERROR;
1589                 goto send_response;
1590         }
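             /*
              * Only reads and writes reach this point: cache flushes jump
              * straight to do_dispatch below, and read-only violations and
              * unknown opcodes have already been failed, so operation is
              * now either BIO_READ or BIO_WRITE.
              */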
1591
1592         reqlist->xbb  = xbb;
1593         xbb_sg        = xbb->xbb_sgs;
1594         map           = xbb->maps;
1595         seg_idx       = 0;
1596
1597         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1598                 blkif_request_t         *ring_req;
1599                 RING_IDX                 req_ring_idx;
1600                 u_int                    req_seg_idx;
1601
1602                 ring_req              = nreq->ring_req;
1603                 req_ring_idx          = nreq->req_ring_idx;
1604                 nr_sects              = 0;
1605                 nseg                  = ring_req->nr_segments;
1606                 nreq->nr_pages        = nseg;
1607                 nreq->nr_512b_sectors = 0;
1608                 req_seg_idx           = 0;
1609                 sg                    = NULL;
1610
1611                 /* Check that number of segments is sane. */
1612                 if (__predict_false(nseg == 0)
1613                  || __predict_false(nseg > xbb->max_request_segments)) {
1614                         DPRINTF("Bad number of segments in request (%d)\n",
1615                                 nseg);
1616                         reqlist->status = BLKIF_RSP_ERROR;
1617                         goto send_response;
1618                 }
1619
1620                 block_segs    = nseg;
1621                 sg            = ring_req->seg;
1622                 last_block_sg = sg + block_segs;
1623
1624                 while (sg < last_block_sg) {
1625                         KASSERT(seg_idx <
1626                                 XBB_MAX_SEGMENTS_PER_REQLIST,
1627                                 ("seg_idx %d is too large, max "
1628                                 "segs %d\n", seg_idx,
1629                                 XBB_MAX_SEGMENTS_PER_REQLIST));
1630
1631                         xbb_sg->first_sect = sg->first_sect;
1632                         xbb_sg->last_sect  = sg->last_sect;
1633                         xbb_sg->nsect =
1634                             (int8_t)(sg->last_sect -
1635                             sg->first_sect + 1);
1636
1637                         if ((sg->last_sect >= (PAGE_SIZE >> 9))
1638                          || (xbb_sg->nsect <= 0)) {
1639                                 reqlist->status = BLKIF_RSP_ERROR;
1640                                 goto send_response;
1641                         }
1642
1643                         nr_sects += xbb_sg->nsect;
1644                         map->host_addr = xbb_get_gntaddr(reqlist,
1645                                                 seg_idx, /*sector*/0);
1646                         KASSERT(map->host_addr + PAGE_SIZE <=
1647                                 xbb->ring_config.gnt_addr,
1648                                 ("Host address %#jx len %d overlaps "
1649                                  "ring address %#jx\n",
1650                                 (uintmax_t)map->host_addr, PAGE_SIZE,
1651                                 (uintmax_t)xbb->ring_config.gnt_addr));
1652
1653                         map->flags     = GNTMAP_host_map;
1654                         map->ref       = sg->gref;
1655                         map->dom       = xbb->otherend_id;
1656                         if (operation == BIO_WRITE)
1657                                 map->flags |= GNTMAP_readonly;
1658                         sg++;
1659                         map++;
1660                         xbb_sg++;
1661                         seg_idx++;
1662                         req_seg_idx++;
1663                 }
1664
1665                 /* Convert to the disk's sector size */
1666                 nreq->nr_512b_sectors = nr_sects;
1667                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1668                 total_sects += nr_sects;
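                     /*
                      * For example, with a 4096-byte backing sector size
                      * (sector_size_shift == 12), eight 512-byte ring
                      * sectors convert to (8 << 9) >> 12 == 1 device
                      * sector here.
                      */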
1669
1670                 if ((nreq->nr_512b_sectors &
1671                     ((xbb->sector_size >> 9) - 1)) != 0) {
1672                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1673                                       "a multiple of the backing store sector "
1674                                       "size (%d)\n", __func__,
1675                                       nreq->nr_512b_sectors << 9,
1676                                       xbb->sector_size);
1677                         reqlist->status = BLKIF_RSP_ERROR;
1678                         goto send_response;
1679                 }
1680         }
1681
1682         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1683                                           xbb->maps, reqlist->nr_segments);
1684         if (error != 0)
1685                 panic("Grant table operation failed (%d)", error);
1686
1687         reqlist->flags |= XBB_REQLIST_MAPPED;
1688
1689         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1690              seg_idx++, map++){
1691
1692                 if (__predict_false(map->status != 0)) {
1693                         DPRINTF("invalid buffer -- could not remap "
1694                                 "it (%d)\n", map->status);
1695                         DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1696                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1697                                 map->host_addr, map->flags, map->ref,
1698                                 map->dom);
1699                         reqlist->status = BLKIF_RSP_ERROR;
1700                         goto send_response;
1701                 }
1702
1703                 reqlist->gnt_handles[seg_idx] = map->handle;
1704         }
1705         if (reqlist->starting_sector_number + total_sects >
1706             xbb->media_num_sectors) {
1707
1708                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1709                         "extends past end of device %s\n",
1710                         operation == BIO_READ ? "read" : "write",
1711                         reqlist->starting_sector_number,
1712                         reqlist->starting_sector_number + total_sects,
1713                         xbb->dev_name); 
1714                 reqlist->status = BLKIF_RSP_ERROR;
1715                 goto send_response;
1716         }
1717
1718 do_dispatch:
1719
1720         error = xbb->dispatch_io(xbb,
1721                                  reqlist,
1722                                  operation,
1723                                  bio_flags);
1724
1725         if (error != 0) {
1726                 reqlist->status = BLKIF_RSP_ERROR;
1727                 goto send_response;
1728         }
1729
1730         return (0);
1731
1732 send_response:
1733
1734         xbb_complete_reqlist(xbb, reqlist);
1735
1736         return (0);
1737 }
1738
1739 static __inline int
1740 xbb_count_sects(blkif_request_t *ring_req)
1741 {
1742         int i;
1743         int cur_size = 0;
1744
1745         for (i = 0; i < ring_req->nr_segments; i++) {
1746                 int nsect;
1747
1748                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1749                         ring_req->seg[i].first_sect + 1);
1750                 if (nsect <= 0)
1751                         break;
1752
1753                 cur_size += nsect;
1754         }
1755
1756         return (cur_size);
1757 }
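     /*
      * xbb_run_queue() uses this per-request sector count to compute the
      * first sector following a request (sector_number + size), which is
      * what decides whether the next ring entry is contiguous with the
      * current request list and may be coalesced into it.
      */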
1758
1759 /**
1760  * Process incoming requests from the shared communication ring in response
1761  * to a signal on the ring's event channel.
1762  *
1763  * \param context  Callback argument registered during task initialization -
1764  *                 the xbb_softc for this instance.
1765  * \param pending  The number of taskqueue_enqueue events that have
1766  *                 occurred since this handler was last run.
1767  */
1768 static void
1769 xbb_run_queue(void *context, int pending)
1770 {
1771         struct xbb_softc       *xbb;
1772         blkif_back_rings_t     *rings;
1773         RING_IDX                rp;
1774         uint64_t                cur_sector;
1775         int                     cur_operation;
1776         struct xbb_xen_reqlist *reqlist;
1777
1778
1779         xbb   = (struct xbb_softc *)context;
1780         rings = &xbb->rings;
1781
1782         /*
1783          * Work gather and dispatch loop.  Note that we have a bias here
1784          * towards gathering I/O sent by blockfront.  We first gather up
1785          * everything in the ring, as long as we have resources.  Then we
1786          * dispatch one request, and then attempt to gather up any
1787          * additional requests that have come in while we were dispatching
1788          * the request.
1789          *
1790          * This allows us to get a clearer picture (via devstat) of how
1791          * many requests blockfront is queueing to us at any given time.
1792          */
1793         for (;;) {
1794                 int retval;
1795
1796                 /*
1797                  * Initialize reqlist to the last element in the pending
1798                  * queue, if there is one.  This allows us to add more
1799                  * requests to that request list, if we have room.
1800                  */
1801                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1802                                       xbb_xen_reqlist, links);
1803                 if (reqlist != NULL) {
1804                         cur_sector = reqlist->next_contig_sector;
1805                         cur_operation = reqlist->operation;
1806                 } else {
1807                         cur_operation = 0;
1808                         cur_sector    = 0;
1809                 }
1810
1811                 /*
1812                  * Cache req_prod to avoid accessing a cache line shared
1813                  * with the frontend.
1814                  */
1815                 rp = rings->common.sring->req_prod;
1816
1817                 /* Ensure we see queued requests up to 'rp'. */
1818                 rmb();
1819
1820                 /**
1821                  * Run so long as there is work to consume and the generation
1822                  * of a response will not overflow the ring.
1823                  *
1824                  * @note There's a 1 to 1 relationship between requests and
1825                  *       responses, so an overflow should never occur.  This
1826                  *       test is to protect our domain from digesting bogus
1827                  *       data.  Shouldn't we log this?
1828                  */
1829                 while (rings->common.req_cons != rp
1830                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1831                                                   rings->common.req_cons) == 0){
1832                         blkif_request_t         ring_req_storage;
1833                         blkif_request_t        *ring_req;
1834                         int                     cur_size;
1835
1836                         switch (xbb->abi) {
1837                         case BLKIF_PROTOCOL_NATIVE:
1838                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1839                                     rings->common.req_cons);
1840                                 break;
1841                         case BLKIF_PROTOCOL_X86_32:
1842                         {
1843                                 struct blkif_x86_32_request *ring_req32;
1844
1845                                 ring_req32 = RING_GET_REQUEST(
1846                                     &xbb->rings.x86_32, rings->common.req_cons);
1847                                 blkif_get_x86_32_req(&ring_req_storage,
1848                                                      ring_req32);
1849                                 ring_req = &ring_req_storage;
1850                                 break;
1851                         }
1852                         case BLKIF_PROTOCOL_X86_64:
1853                         {
1854                                 struct blkif_x86_64_request *ring_req64;
1855
1856                                 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1857                                     rings->common.req_cons);
1858                                 blkif_get_x86_64_req(&ring_req_storage,
1859                                                      ring_req64);
1860                                 ring_req = &ring_req_storage;
1861                                 break;
1862                         }
1863                         default:
1864                                 panic("Unexpected blkif protocol ABI.");
1865                                 /* NOTREACHED */
1866                         } 
1867
1868                         /*
1869                          * Check for situations that would require closing
1870                          * off this I/O for further coalescing:
1871                          *  - Coalescing is turned off.
1872                          *  - Current I/O is out of sequence with the previous
1873                          *    I/O.
1874                          *  - Coalesced I/O would be too large.
1875                          */
1876                         if ((reqlist != NULL)
1877                          && ((xbb->no_coalesce_reqs != 0)
1878                           || ((xbb->no_coalesce_reqs == 0)
1879                            && ((ring_req->sector_number != cur_sector)
1880                             || (ring_req->operation != cur_operation)
1881                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1882                                  xbb->max_reqlist_segments))))) {
1883                                 reqlist = NULL;
1884                         }
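                             /*
                              * For example, a read of eight 512-byte
                              * sectors at sector 0 followed by a read
                              * starting at sector 8 stays in the same
                              * request list, while a change of operation,
                              * a gap in sector numbers, or too many total
                              * segments starts a new one.
                              */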
1885
1886                         /*
1887                          * Grab and check for all resources in one shot.
1888                          * If we can't get all of the resources we need,
1889                          * the shortage is noted and the thread will get
1890                          * woken up when more resources are available.
1891                          */
1892                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1893                                                    xbb->rings.common.req_cons);
1894
1895                         if (retval != 0) {
1896                                 /*
1897                                  * Resource shortage has been recorded.
1898                                  * We'll be scheduled to run once a request
1899                                  * object frees up due to a completion.
1900                                  */
1901                                 break;
1902                         }
1903
1904                         /*
1905                          * Signify that we can overwrite this request with
1906                          * a response by incrementing our consumer index.
1907                          * The response won't be generated until after
1908                          * we've already consumed all necessary data out
1909                          * of the version of the request in the ring buffer
1910                          * (for native mode).  We must update the consumer
1911                          * index  before issueing back-end I/O so there is
1912                          * index before issuing back-end I/O so there is
1913                          * response be generated before we make room in 
1914                          * the queue for that response.
1915                          */
1916                         xbb->rings.common.req_cons++;
1917                         xbb->reqs_received++;
1918
1919                         cur_size = xbb_count_sects(ring_req);
1920                         cur_sector = ring_req->sector_number + cur_size;
1921                         reqlist->next_contig_sector = cur_sector;
1922                         cur_operation = ring_req->operation;
1923                 }
1924
1925                 /* Check for I/O to dispatch */
1926                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1927                 if (reqlist == NULL) {
1928                         /*
1929                          * We're out of work to do, put the task queue to
1930                          * sleep.
1931                          */
1932                         break;
1933                 }
1934
1935                 /*
1936                  * Grab the first request off the queue and attempt
1937                  * to dispatch it.
1938                  */
1939                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1940
1941                 retval = xbb_dispatch_io(xbb, reqlist);
1942                 if (retval != 0) {
1943                         /*
1944                          * xbb_dispatch_io() returns non-zero only when
1945                          * there is a resource shortage.  If that's the
1946                          * case, re-queue this request on the head of the
1947                          * queue, and go to sleep until we have more
1948                          * resources.
1949                          */
1950                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1951                                            reqlist, links);
1952                         break;
1953                 } else {
1954                         /*
1955                          * If we still have anything on the queue after
1956                          * removing the head entry, that is because we
1957                          * met one of the criteria to create a new
1958                          * request list (outlined above), and we'll call
1959                          * that a forced dispatch for statistical purposes.
1960                          *
1961                          * Otherwise, if there is only one element on the
1962                          * queue, we coalesced everything available on
1963                          * the ring and we'll call that a normal dispatch.
1964                          */
1965                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1966
1967                         if (reqlist != NULL)
1968                                 xbb->forced_dispatch++;
1969                         else
1970                                 xbb->normal_dispatch++;
1971
1972                         xbb->total_dispatch++;
1973                 }
1974         }
1975 }
1976
1977 /**
1978  * Interrupt handler bound to the shared ring's event channel.
1979  *
1980  * \param arg  Callback argument registered during event channel
1981  *             binding - the xbb_softc for this instance.
1982  */
1983 static int
1984 xbb_filter(void *arg)
1985 {
1986         struct xbb_softc *xbb;
1987
1988         /* Defer to taskqueue thread. */
1989         xbb = (struct xbb_softc *)arg;
1990         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1991
1992         return (FILTER_HANDLED);
1993 }
1994
1995 SDT_PROVIDER_DEFINE(xbb);
1996 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
1997 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
1998                   "uint64_t");
1999 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
2000                   "uint64_t", "uint64_t");
2001
2002 /*----------------------------- Backend Handlers -----------------------------*/
2003 /**
2004  * Backend handler for character device access.
2005  *
2006  * \param xbb        Per-instance xbb configuration structure.
2007  * \param reqlist    Allocated internal request list structure.
2008  * \param operation  BIO_* I/O operation code.
2009  * \param bio_flags  Additional bio_flag data to pass to any generated
2010  *                   bios (e.g. BIO_ORDERED).
2011  *
2012  * \return  0 for success, errno codes for failure.
2013  */
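     /*
      * The loop below packs the request list's scatter/gather entries into
      * as few struct bio's as possible: a new bio is started whenever a
      * segment does not begin at sector 0 of its page, or the previous
      * segment did not run to the end of its page, since the mapped KVA is
      * only contiguous within those bounds.
      */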
2014 static int
2015 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2016                  int operation, int bio_flags)
2017 {
2018         struct xbb_dev_data *dev_data;
2019         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2020         off_t                bio_offset;
2021         struct bio          *bio;
2022         struct xbb_sg       *xbb_sg;
2023         u_int                nbio;
2024         u_int                bio_idx;
2025         u_int                nseg;
2026         u_int                seg_idx;
2027         int                  error;
2028
2029         dev_data   = &xbb->backend.dev;
2030         bio_offset = (off_t)reqlist->starting_sector_number
2031                    << xbb->sector_size_shift;
2032         error      = 0;
2033         nbio       = 0;
2034         bio_idx    = 0;
2035
2036         if (operation == BIO_FLUSH) {
2037                 bio = g_new_bio();
2038                 if (__predict_false(bio == NULL)) {
2039                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2040                         error = ENOMEM;
2041                         return (error);
2042                 }
2043
2044                 bio->bio_cmd     = BIO_FLUSH;
2045                 bio->bio_flags  |= BIO_ORDERED;
2046                 bio->bio_dev     = dev_data->cdev;
2047                 bio->bio_offset  = 0;
2048                 bio->bio_data    = 0;
2049                 bio->bio_done    = xbb_bio_done;
2050                 bio->bio_caller1 = reqlist;
2051                 bio->bio_pblkno  = 0;
2052
2053                 reqlist->pendcnt = 1;
2054
2055                 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2056                            device_get_unit(xbb->dev));
2057
2058                 (*dev_data->csw->d_strategy)(bio);
2059
2060                 return (0);
2061         }
2062
2063         xbb_sg = xbb->xbb_sgs;
2064         bio    = NULL;
2065         nseg = reqlist->nr_segments;
2066
2067         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2068
2069                 /*
2070                  * KVA will not be contiguous, so any additional
2071                  * I/O will need to be represented in a new bio.
2072                  */
2073                 if ((bio != NULL)
2074                  && (xbb_sg->first_sect != 0)) {
2075                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2076                                 printf("%s: Discontiguous I/O request "
2077                                        "from domain %d ends on "
2078                                        "non-sector boundary\n",
2079                                        __func__, xbb->otherend_id);
2080                                 error = EINVAL;
2081                                 goto fail_free_bios;
2082                         }
2083                         bio = NULL;
2084                 }
2085
2086                 if (bio == NULL) {
2087                         /*
2088                          * Make sure that the start of this bio is
2089                          * aligned to a device sector.
2090                          */
2091                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2092                                 printf("%s: Misaligned I/O request "
2093                                        "from domain %d\n", __func__,
2094                                        xbb->otherend_id);
2095                                 error = EINVAL;
2096                                 goto fail_free_bios;
2097                         }
2098
2099                         bio = bios[nbio++] = g_new_bio();
2100                         if (__predict_false(bio == NULL)) {
2101                                 error = ENOMEM;
2102                                 goto fail_free_bios;
2103                         }
2104                         bio->bio_cmd     = operation;
2105                         bio->bio_flags  |= bio_flags;
2106                         bio->bio_dev     = dev_data->cdev;
2107                         bio->bio_offset  = bio_offset;
2108                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2109                                                 xbb_sg->first_sect);
2110                         bio->bio_done    = xbb_bio_done;
2111                         bio->bio_caller1 = reqlist;
2112                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2113                 }
2114
2115                 bio->bio_length += xbb_sg->nsect << 9;
2116                 bio->bio_bcount  = bio->bio_length;
2117                 bio_offset      += xbb_sg->nsect << 9;
2118
2119                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2120
2121                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2122                                 printf("%s: Discontiguous I/O request "
2123                                        "from domain %d ends on "
2124                                        "non-sector boundary\n",
2125                                        __func__, xbb->otherend_id);
2126                                 error = EINVAL;
2127                                 goto fail_free_bios;
2128                         }
2129                         /*
2130                          * KVA will not be contiguous, so any additional
2131                          * I/O will need to be represented in a new bio.
2132                          */
2133                         bio = NULL;
2134                 }
2135         }
2136
2137         reqlist->pendcnt = nbio;
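             /*
              * Each bio completion decrements pendcnt in xbb_bio_done();
              * the response for the whole request list is only sent once
              * the last of these nbio bios has finished.
              */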
2138
2139         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2140         {
2141 #ifdef XBB_USE_BOUNCE_BUFFERS
2142                 vm_offset_t kva_offset;
2143
2144                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2145                            - (vm_offset_t)reqlist->bounce;
2146                 if (operation == BIO_WRITE) {
2147                         memcpy(bios[bio_idx]->bio_data,
2148                                (uint8_t *)reqlist->kva + kva_offset,
2149                                bios[bio_idx]->bio_bcount);
2150                 }
2151 #endif
2152                 if (operation == BIO_READ) {
2153                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2154                                    device_get_unit(xbb->dev),
2155                                    bios[bio_idx]->bio_offset,
2156                                    bios[bio_idx]->bio_length);
2157                 } else if (operation == BIO_WRITE) {
2158                         SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2159                                    device_get_unit(xbb->dev),
2160                                    bios[bio_idx]->bio_offset,
2161                                    bios[bio_idx]->bio_length);
2162                 }
2163                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2164         }
2165
2166         return (error);
2167
2168 fail_free_bios:
2169         for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
2170                 g_destroy_bio(bios[bio_idx]);
2171         
2172         return (error);
2173 }
2174
2175 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
2176 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
2177                   "uint64_t");
2178 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
2179                   "uint64_t", "uint64_t");
2180
2181 /**
2182  * Backend handler for file access.
2183  *
2184  * \param xbb        Per-instance xbb configuration structure.
2185  * \param reqlist    Allocated internal request list.
2186  * \param operation  BIO_* I/O operation code.
2187  * \param flags      Additional bio_flag data to pass to any generated bios
2188  *                   (e.g. BIO_ORDERED).
2189  *
2190  * \return  0 for success, errno codes for failure.
2191  */
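     /*
      * The file backend expresses the whole request list as a single
      * struct uio whose iovecs follow the same KVA-contiguity rules as the
      * bio path above, and then hands it to VOP_READ(), VOP_WRITE(), or
      * VOP_FSYNC() on the backing vnode.
      */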
2192 static int
2193 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2194                   int operation, int flags)
2195 {
2196         struct xbb_file_data *file_data;
2197         u_int                 seg_idx;
2198         u_int                 nseg;
2199         off_t                 sectors_sent;
2200         struct uio            xuio;
2201         struct xbb_sg        *xbb_sg;
2202         struct iovec         *xiovec;
2203 #ifdef XBB_USE_BOUNCE_BUFFERS
2204         void                **p_vaddr;
2205         int                   saved_uio_iovcnt;
2206 #endif /* XBB_USE_BOUNCE_BUFFERS */
2207         int                   error;
2208
2209         file_data = &xbb->backend.file;
2210         sectors_sent = 0;
2211         error = 0;
2212         bzero(&xuio, sizeof(xuio));
2213
2214         switch (operation) {
2215         case BIO_READ:
2216                 xuio.uio_rw = UIO_READ;
2217                 break;
2218         case BIO_WRITE:
2219                 xuio.uio_rw = UIO_WRITE;
2220                 break;
2221         case BIO_FLUSH: {
2222                 struct mount *mountpoint;
2223
2224                 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2225                            device_get_unit(xbb->dev));
2226
2227                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2228
2229                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2230                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2231                 VOP_UNLOCK(xbb->vn, 0);
2232
2233                 vn_finished_write(mountpoint);
2234
2235                 goto bailout_send_response;
2236                 /* NOTREACHED */
2237         }
2238         default:
2239                 panic("invalid operation %d", operation);
2240                 /* NOTREACHED */
2241         }
2242         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2243                         << xbb->sector_size_shift;
2244         xuio.uio_segflg = UIO_SYSSPACE;
2245         xuio.uio_iov = file_data->xiovecs;
2246         xuio.uio_iovcnt = 0;
2247         xbb_sg = xbb->xbb_sgs;
2248         nseg = reqlist->nr_segments;
2249
2250         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2251
2252                 /*
2253                  * If the first sector is not 0, the KVA will
2254                  * not be contiguous and we'll need to start
2255                  * a new iovec.
2256                  */
2257                 if (xbb_sg->first_sect != 0)
2258                         xiovec = NULL;
2259
2260                 if (xiovec == NULL) {
2261                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2262                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2263                             seg_idx, xbb_sg->first_sect);
2264 #ifdef XBB_USE_BOUNCE_BUFFERS
2265                         /*
2266                          * Store the address of the incoming
2267                          * buffer at this particular offset
2268                          * as well, so we can do the copy
2269                          * later without having to do more
2270                          * work to recalculate this address.
2271                          */
2272                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2273                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2274                             xbb_sg->first_sect);
2275 #endif /* XBB_USE_BOUNCE_BUFFERS */
2276                         xiovec->iov_len = 0;
2277                         xuio.uio_iovcnt++;
2278                 }
2279
2280                 xiovec->iov_len += xbb_sg->nsect << 9;
2281
2282                 xuio.uio_resid += xbb_sg->nsect << 9;
2283
2284                 /*
2285                  * If this segment does not extend to the end
2286                  * of its page, the next segment will not be
2287                  * contiguous in KVA and we need a new iovec.
2288                  */
2289                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2290                         xiovec = NULL;
2291         }
2292
2293         xuio.uio_td = curthread;
2294
2295 #ifdef XBB_USE_BOUNCE_BUFFERS
2296         saved_uio_iovcnt = xuio.uio_iovcnt;
2297
2298         if (operation == BIO_WRITE) {
2299                 /* Copy the write data to the local buffer. */
2300                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2301                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2302                      seg_idx++, xiovec++, p_vaddr++) {
2303
2304                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2305                 }
2306         } else {
2307                 /*
2308                  * We only need to save off the iovecs in the case of a
2309                  * read, because the copy for the read happens after the
2310                  * VOP_READ().  (The uio will get modified in that call
2311                  * sequence.)
2312                  */
2313                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2314                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2315         }
2316 #endif /* XBB_USE_BOUNCE_BUFFERS */
2317
2318         switch (operation) {
2319         case BIO_READ:
2320
2321                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2322                            device_get_unit(xbb->dev), xuio.uio_offset,
2323                            xuio.uio_resid);
2324
2325                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2326
2327                 /*
2328                  * UFS pays attention to IO_DIRECT for reads.  If the
2329                  * DIRECTIO option is configured into the kernel, it calls
2330                  * ffs_rawread().  But that only works for single-segment
2331                  * uios with user space addresses.  In our case, with a
2332                  * kernel uio, it still reads into the buffer cache, but it
2333                  * will just try to release the buffer from the cache later
2334                  * on in ffs_read().
2335                  *
2336                  * ZFS does not pay attention to IO_DIRECT for reads.
2337                  *
2338                  * UFS does not pay attention to IO_SYNC for reads.
2339                  *
2340                  * ZFS pays attention to IO_SYNC (which translates into the
2341                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2342                  * attempts to sync the file before reading.
2343                  *
2344                  * So, to attempt to provide some barrier semantics in the
2345                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2346                  */
2347                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2348                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2349
2350                 VOP_UNLOCK(xbb->vn, 0);
2351                 break;
2352         case BIO_WRITE: {
2353                 struct mount *mountpoint;
2354
2355                 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2356                            device_get_unit(xbb->dev), xuio.uio_offset,
2357                            xuio.uio_resid);
2358
2359                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2360
2361                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2362
2363                 /*
2364                  * UFS pays attention to IO_DIRECT for writes.  The write
2365                  * is done asynchronously.  (Normally the write would just
2366                  * get put into the cache.)
2367                  *
2368                  * UFS pays attention to IO_SYNC for writes.  It will
2369                  * attempt to write the buffer out synchronously if that
2370                  * flag is set.
2371                  *
2372                  * ZFS does not pay attention to IO_DIRECT for writes.
2373                  *
2374                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2375                  * for writes.  It will flush the transaction from the
2376                  * cache before returning.
2377                  *
2378                  * So if we've got the BIO_ORDERED flag set, we want
2379                  * IO_SYNC in either the UFS or ZFS case.
2380                  */
2381                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2382                                   IO_SYNC : 0, file_data->cred);
2383                 VOP_UNLOCK(xbb->vn, 0);
2384
2385                 vn_finished_write(mountpoint);
2386
2387                 break;
2388         }
2389         default:
2390                 panic("invalid operation %d", operation);
2391                 /* NOTREACHED */
2392         }
2393
2394 #ifdef XBB_USE_BOUNCE_BUFFERS
2395         /* We only need to copy here for read operations */
2396         if (operation == BIO_READ) {
2397
2398                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2399                      xiovec = file_data->saved_xiovecs;
2400                      seg_idx < saved_uio_iovcnt; seg_idx++,
2401                      xiovec++, p_vaddr++) {
2402
2403                         /*
2404                          * Note that we have to use the copy of the 
2405                          * io vector we made above.  uiomove() modifies
2406                          * the uio and its referenced vector as uiomove
2407                          * performs the copy, so we can't rely on any
2408                          * state from the original uio.
2409                          */
2410                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2411                 }
2412         }
2413 #endif /* XBB_USE_BOUNCE_BUFFERS */
2414
2415 bailout_send_response:
2416
2417         if (error != 0)
2418                 reqlist->status = BLKIF_RSP_ERROR;
2419
2420         xbb_complete_reqlist(xbb, reqlist);
2421
2422         return (0);
2423 }
2424
2425 /*--------------------------- Backend Configuration --------------------------*/
2426 /**
2427  * Close and cleanup any backend device/file specific state for this
2428  * block back instance. 
2429  *
2430  * \param xbb  Per-instance xbb configuration structure.
2431  */
2432 static void
2433 xbb_close_backend(struct xbb_softc *xbb)
2434 {
2435         DROP_GIANT();
2436         DPRINTF("closing dev=%s\n", xbb->dev_name);
2437         if (xbb->vn) {
2438                 int flags = FREAD;
2439
2440                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2441                         flags |= FWRITE;
2442
2443                 switch (xbb->device_type) {
2444                 case XBB_TYPE_DISK:
2445                         if (xbb->backend.dev.csw) {
2446                                 dev_relthread(xbb->backend.dev.cdev,
2447                                               xbb->backend.dev.dev_ref);
2448                                 xbb->backend.dev.csw  = NULL;
2449                                 xbb->backend.dev.cdev = NULL;
2450                         }
2451                         break;
2452                 case XBB_TYPE_FILE:
2453                         break;
2454                 case XBB_TYPE_NONE:
2455                 default:
2456                         panic("Unexpected backend type.");
2457                         break;
2458                 }
2459
2460                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2461                 xbb->vn = NULL;
2462
2463                 switch (xbb->device_type) {
2464                 case XBB_TYPE_DISK:
2465                         break;
2466                 case XBB_TYPE_FILE:
2467                         if (xbb->backend.file.cred != NULL) {
2468                                 crfree(xbb->backend.file.cred);
2469                                 xbb->backend.file.cred = NULL;
2470                         }
2471                         break;
2472                 case XBB_TYPE_NONE:
2473                 default:
2474                         panic("Unexpected backend type.");
2475                         break;
2476                 }
2477         }
2478         PICKUP_GIANT();
2479 }
2480
2481 /**
2482  * Open a character device to be used for backend I/O.
2483  *
2484  * \param xbb  Per-instance xbb configuration structure.
2485  *
2486  * \return  0 for success, errno codes for failure.
2487  */
2488 static int
2489 xbb_open_dev(struct xbb_softc *xbb)
2490 {
2491         struct vattr   vattr;
2492         struct cdev   *dev;
2493         struct cdevsw *devsw;
2494         int            error;
2495
2496         xbb->device_type = XBB_TYPE_DISK;
2497         xbb->dispatch_io = xbb_dispatch_dev;
2498         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2499         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2500                                              &xbb->backend.dev.dev_ref);
2501         if (xbb->backend.dev.csw == NULL)
2502                 panic("Unable to retrieve device switch");
2503
2504         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2505         if (error) {
2506                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2507                                  "vnode attributes for device %s",
2508                                  xbb->dev_name);
2509                 return (error);
2510         }
2511
2512
2513         dev = xbb->vn->v_rdev;
2514         devsw = dev->si_devsw;
2515         if (!devsw->d_ioctl) {
2516                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2517                                  "device %s!", xbb->dev_name);
2518                 return (ENODEV);
2519         }
2520
2521         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2522                                (caddr_t)&xbb->sector_size, FREAD,
2523                                curthread);
2524         if (error) {
2525                 xenbus_dev_fatal(xbb->dev, error,
2526                                  "error calling ioctl DIOCGSECTORSIZE "
2527                                  "for device %s", xbb->dev_name);
2528                 return (error);
2529         }
2530
2531         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2532                                (caddr_t)&xbb->media_size, FREAD,
2533                                curthread);
2534         if (error) {
2535                 xenbus_dev_fatal(xbb->dev, error,
2536                                  "error calling ioctl DIOCGMEDIASIZE "
2537                                  "for device %s", xbb->dev_name);
2538                 return (error);
2539         }
2540
2541         return (0);
2542 }
2543
2544 /**
2545  * Open a file to be used for backend I/O.
2546  *
2547  * \param xbb  Per-instance xbb configuration structure.
2548  *
2549  * \return  0 for success, errno codes for failure.
2550  */
2551 static int
2552 xbb_open_file(struct xbb_softc *xbb)
2553 {
2554         struct xbb_file_data *file_data;
2555         struct vattr          vattr;
2556         int                   error;
2557
2558         file_data = &xbb->backend.file;
2559         xbb->device_type = XBB_TYPE_FILE;
2560         xbb->dispatch_io = xbb_dispatch_file;
2561         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2562         if (error != 0) {
2563                 xenbus_dev_fatal(xbb->dev, error,
2564                                  "error calling VOP_GETATTR() "
2565                                  "for file %s", xbb->dev_name);
2566                 return (error);
2567         }
2568
2569         /*
2570          * Verify that we have the ability to upgrade to exclusive
2571          * access on this file so we can trap errors at open instead
2572          * of reporting them during first access.
2573          */
2574         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2575                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2576                 if (xbb->vn->v_iflag & VI_DOOMED) {
2577                         error = EBADF;
2578                         xenbus_dev_fatal(xbb->dev, error,
2579                                          "error locking file %s",
2580                                          xbb->dev_name);
2581
2582                         return (error);
2583                 }
2584         }
2585
2586         file_data->cred = crhold(curthread->td_ucred);
2587         xbb->media_size = vattr.va_size;
2588
2589         /*
2590          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2591          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2592          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2593          * may not work with other OSes as well.  So just export a sector
2594          * size of 512 bytes, which should work with any OS or
2595          * application.  Since our backing is a file, any block size will
2596          * work fine for the backing store.
2597          */
2598 #if 0
2599         xbb->sector_size = vattr.va_blocksize;
2600 #endif
2601         xbb->sector_size = 512;
2602
2603         /*
2604          * Sanity check.  The media size has to be at least one
2605          * sector long.
2606          */
2607         if (xbb->media_size < xbb->sector_size) {
2608                 error = EINVAL;
2609                 xenbus_dev_fatal(xbb->dev, error,
2610                                  "file %s size %ju < block size %u",
2611                                  xbb->dev_name,
2612                                  (uintmax_t)xbb->media_size,
2613                                  xbb->sector_size);
2614         }
2615         return (error);
2616 }
2617
2618 /**
2619  * Open the backend provider for this connection.
2620  *
2621  * \param xbb  Per-instance xbb configuration structure.
2622  *
2623  * \return  0 for success, errno codes for failure.
2624  */
2625 static int
2626 xbb_open_backend(struct xbb_softc *xbb)
2627 {
2628         struct nameidata nd;
2629         int              flags;
2630         int              error;
2631
2632         flags = FREAD;
2633         error = 0;
2634
2635         DPRINTF("opening dev=%s\n", xbb->dev_name);
2636
2637         if (rootvnode == NULL) {
2638                 xenbus_dev_fatal(xbb->dev, ENOENT,
2639                                  "Root file system not mounted");
2640                 return (ENOENT);
2641         }
2642
2643         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2644                 flags |= FWRITE;
2645
2646         if (!curthread->td_proc->p_fd->fd_cdir) {
2647                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2648                 VREF(rootvnode);
2649         }
2650         if (!curthread->td_proc->p_fd->fd_rdir) {
2651                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2652                 VREF(rootvnode);
2653         }
2654         if (!curthread->td_proc->p_fd->fd_jdir) {
2655                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2656                 VREF(rootvnode);
2657         }
2658
2659  again:
2660         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2661         error = vn_open(&nd, &flags, 0, NULL);
2662         if (error) {
2663                 /*
2664                  * This is the only reasonable guess we can make as far as
2665                  * path if the user doesn't give us a fully qualified path.
2666                  * If they want to specify a file, they need to specify the
2667                  * full path.
2668                  */
2669                 if (xbb->dev_name[0] != '/') {
2670                         char *dev_path = "/dev/";
2671                         char *dev_name;
2672
2673                         /* Try adding device path at beginning of name */
2674                         dev_name = malloc(strlen(xbb->dev_name)
2675                                         + strlen(dev_path) + 1,
2676                                           M_XENBLOCKBACK, M_NOWAIT);
2677                         if (dev_name) {
2678                                 sprintf(dev_name, "%s%s", dev_path,
2679                                         xbb->dev_name);
2680                                 free(xbb->dev_name, M_XENBLOCKBACK);
2681                                 xbb->dev_name = dev_name;
2682                                 goto again;
2683                         }
2684                 }
2685                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2686                                  xbb->dev_name);
2687                 return (error);
2688         }
2689
2690         NDFREE(&nd, NDF_ONLY_PNBUF);
2691                 
2692         xbb->vn = nd.ni_vp;
2693
2694         /* We only support disks and files. */
2695         if (vn_isdisk(xbb->vn, &error)) {
2696                 error = xbb_open_dev(xbb);
2697         } else if (xbb->vn->v_type == VREG) {
2698                 error = xbb_open_file(xbb);
2699         } else {
2700                 error = EINVAL;
2701                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2702                                  "or file", xbb->dev_name);
2703         }
2704         VOP_UNLOCK(xbb->vn, 0);
2705
2706         if (error != 0) {
2707                 xbb_close_backend(xbb);
2708                 return (error);
2709         }
2710
2711         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2712         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
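             /*
              * For example, a 512-byte sector size yields
              * sector_size_shift == fls(512) - 1 == 9, so a 1GB backing
              * store is exported as 1073741824 >> 9 == 2097152 sectors.
              */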
2713
2714         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2715                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2716                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2717
2718         return (0);
2719 }
2720
2721 /*------------------------ Inter-Domain Communication ------------------------*/
2722 /**
2723  * Free dynamically allocated KVA or pseudo-physical address allocations.
2724  *
2725  * \param xbb  Per-instance xbb configuration structure.
2726  */
2727 static void
2728 xbb_free_communication_mem(struct xbb_softc *xbb)
2729 {
2730         if (xbb->kva != 0) {
2731 #ifndef XENHVM
2732                 kva_free(xbb->kva, xbb->kva_size);
2733 #else
2734                 if (xbb->pseudo_phys_res != NULL) {
2735                         bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2736                                              xbb->pseudo_phys_res_id,
2737                                              xbb->pseudo_phys_res);
2738                         xbb->pseudo_phys_res = NULL;
2739                 }
2740 #endif
2741         }
2742         xbb->kva = 0;
2743         xbb->gnt_base_addr = 0;
2744         if (xbb->kva_free != NULL) {
2745                 free(xbb->kva_free, M_XENBLOCKBACK);
2746                 xbb->kva_free = NULL;
2747         }
2748 }
2749
2750 /**
2751  * Cleanup all inter-domain communication mechanisms.
2752  *
2753  * \param xbb  Per-instance xbb configuration structure.
2754  */
2755 static int
2756 xbb_disconnect(struct xbb_softc *xbb)
2757 {
2758         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2759         struct gnttab_unmap_grant_ref *op;
2760         u_int                          ring_idx;
2761         int                            error;
2762
2763         DPRINTF("\n");
2764
2765         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2766                 return (0);
2767
2768         xen_intr_unbind(&xbb->xen_intr_handle);
2769
2770         mtx_unlock(&xbb->lock);
2771         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2772         mtx_lock(&xbb->lock);
2773
2774         /*
2775          * No new interrupts can generate work, but we must wait
2776          * for all currently active requests to drain.
2777          */
2778         if (xbb->active_request_count != 0)
2779                 return (EAGAIN);
2780         
2781         for (ring_idx = 0, op = ops;
2782              ring_idx < xbb->ring_config.ring_pages;
2783              ring_idx++, op++) {
2784
2785                 op->host_addr    = xbb->ring_config.gnt_addr
2786                                  + (ring_idx * PAGE_SIZE);
2787                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2788                 op->handle       = xbb->ring_config.handle[ring_idx];
2789         }
2790
2791         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2792                                           xbb->ring_config.ring_pages);
2793         if (error != 0)
2794                 panic("Grant table op failed (%d)", error);
2795
2796         xbb_free_communication_mem(xbb);
2797
2798         if (xbb->requests != NULL) {
2799                 free(xbb->requests, M_XENBLOCKBACK);
2800                 xbb->requests = NULL;
2801         }
2802
2803         if (xbb->request_lists != NULL) {
2804                 struct xbb_xen_reqlist *reqlist;
2805                 int i;
2806
2807                 /* There is one request list for every allocated request. */
2808                 for (i = 0, reqlist = xbb->request_lists;
2809                      i < xbb->max_requests; i++, reqlist++){
2810 #ifdef XBB_USE_BOUNCE_BUFFERS
2811                         if (reqlist->bounce != NULL) {
2812                                 free(reqlist->bounce, M_XENBLOCKBACK);
2813                                 reqlist->bounce = NULL;
2814                         }
2815 #endif
2816                         if (reqlist->gnt_handles != NULL) {
2817                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2818                                 reqlist->gnt_handles = NULL;
2819                         }
2820                 }
2821                 free(xbb->request_lists, M_XENBLOCKBACK);
2822                 xbb->request_lists = NULL;
2823         }
2824
2825         xbb->flags &= ~XBBF_RING_CONNECTED;
2826         return (0);
2827 }
2828
2829 /**
2830  * Map shared memory ring into domain local address space, initialize
2831  * ring control structures, and bind an interrupt to the event channel
2832  * used to notify us of ring changes.
2833  *
2834  * \param xbb  Per-instance xbb configuration structure.
2835  */
2836 static int
2837 xbb_connect_ring(struct xbb_softc *xbb)
2838 {
2839         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2840         struct gnttab_map_grant_ref *gnt;
2841         u_int                        ring_idx;
2842         int                          error;
2843
2844         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2845                 return (0);
2846
2847         /*
2848          * Kva for our ring is at the tail of the region of kva allocated
2849          * by xbb_alloc_communication_mem().
2850          */
2851         xbb->ring_config.va = xbb->kva
2852                             + (xbb->kva_size
2853                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2854         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2855                                   + (xbb->kva_size
2856                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2857
2858         for (ring_idx = 0, gnt = gnts;
2859              ring_idx < xbb->ring_config.ring_pages;
2860              ring_idx++, gnt++) {
2861
2862                 gnt->host_addr = xbb->ring_config.gnt_addr
2863                                + (ring_idx * PAGE_SIZE);
2864                 gnt->flags     = GNTMAP_host_map;
2865                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2866                 gnt->dom       = xbb->otherend_id;
2867         }
2868
2869         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2870                                           xbb->ring_config.ring_pages);
2871         if (error)
2872                 panic("blkback: Ring page grant table op failed (%d)", error);
2873
2874         for (ring_idx = 0, gnt = gnts;
2875              ring_idx < xbb->ring_config.ring_pages;
2876              ring_idx++, gnt++) {
2877                 if (gnt->status != 0) {
2878                         xbb->ring_config.va = 0;
2879                         xenbus_dev_fatal(xbb->dev, EACCES,
2880                                          "Ring shared page mapping failed. "
2881                                          "Status %d.", gnt->status);
2882                         return (EACCES);
2883                 }
2884                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2885                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2886         }
2887
2888         /* Initialize the ring based on the negotiated ABI. */
2889         switch (xbb->abi) {
2890         case BLKIF_PROTOCOL_NATIVE:
2891         {
2892                 blkif_sring_t *sring;
2893                 sring = (blkif_sring_t *)xbb->ring_config.va;
2894                 BACK_RING_INIT(&xbb->rings.native, sring,
2895                                xbb->ring_config.ring_pages * PAGE_SIZE);
2896                 break;
2897         }
2898         case BLKIF_PROTOCOL_X86_32:
2899         {
2900                 blkif_x86_32_sring_t *sring_x86_32;
2901                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2902                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2903                                xbb->ring_config.ring_pages * PAGE_SIZE);
2904                 break;
2905         }
2906         case BLKIF_PROTOCOL_X86_64:
2907         {
2908                 blkif_x86_64_sring_t *sring_x86_64;
2909                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2910                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2911                                xbb->ring_config.ring_pages * PAGE_SIZE);
2912                 break;
2913         }
2914         default:
2915                 panic("Unexpected blkif protocol ABI.");
2916         }
2917
2918         xbb->flags |= XBBF_RING_CONNECTED;
2919
2920         error = xen_intr_bind_remote_port(xbb->dev,
2921                                           xbb->otherend_id,
2922                                           xbb->ring_config.evtchn,
2923                                           xbb_filter,
2924                                           /*ithread_handler*/NULL,
2925                                           /*arg*/xbb,
2926                                           INTR_TYPE_BIO | INTR_MPSAFE,
2927                                           &xbb->xen_intr_handle);
2928         if (error) {
2929                 (void)xbb_disconnect(xbb);
2930                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2931                 return (error);
2932         }
2933
2934         DPRINTF("rings connected!\n");
2935
2936         return 0;
2937 }
2938
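/*
 * Once BACK_RING_INIT() has run, requests are consumed with the standard
 * Xen shared-ring macros.  A minimal sketch for the native ABI follows
 * (illustration only, not the driver's actual dispatch loop; "ring"
 * stands in for xbb->rings.native):
 *
 *	blkif_request_t req;
 *	RING_IDX        cons = ring.req_cons;
 *
 *	if (RING_HAS_UNCONSUMED_REQUESTS(&ring)) {
 *		xen_rmb();
 *		memcpy(&req, RING_GET_REQUEST(&ring, cons), sizeof(req));
 *		ring.req_cons = ++cons;
 *		... validate req and queue it for backend I/O ...
 *	}
 */
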
2939 /* Needed to make bit_alloc() macro work */
2940 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK,      \
2941                                    M_NOWAIT|M_ZERO);
2942
2943 /**
2944  * Size KVA and pseudo-physical address allocations based on negotiated
2945  * values for the size and number of I/O requests, and the size of our
2946  * communication ring.
2947  *
2948  * \param xbb  Per-instance xbb configuration structure.
2949  *
2950  * These address spaces are used to dynamically map pages in the
2951  * front-end's domain into our own.
2952  */
2953 static int
2954 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2955 {
2956         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2957         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2958         xbb->kva_size = xbb->reqlist_kva_size +
2959                         (xbb->ring_config.ring_pages * PAGE_SIZE);
2960
2961         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
2962         if (xbb->kva_free == NULL)
2963                 return (ENOMEM);
2964
2965         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
2966                 device_get_nameunit(xbb->dev), xbb->kva_size,
2967                 xbb->reqlist_kva_size);
2968 #ifndef XENHVM
2969         xbb->kva = kva_alloc(xbb->kva_size);
2970         if (xbb->kva == 0)
2971                 return (ENOMEM);
2972         xbb->gnt_base_addr = xbb->kva;
2973 #else /* XENHVM */
2974         /*
2975          * Reserve a range of pseudo physical memory that we can map
2976          * into kva.  These pages will only be backed by machine
2977          * pages ("real memory") during the lifetime of front-end requests
2978          * via grant table operations.
2979          */
2980         xbb->pseudo_phys_res_id = 0;
2981         xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
2982                                                   &xbb->pseudo_phys_res_id,
2983                                                   0, ~0, xbb->kva_size,
2984                                                   RF_ACTIVE);
2985         if (xbb->pseudo_phys_res == NULL) {
2986                 xbb->kva = 0;
2987                 return (ENOMEM);
2988         }
2989         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
2990         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
2991 #endif /* XENHVM */
2992
2993         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
2994                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
2995                 (uintmax_t)xbb->gnt_base_addr); 
2996         return (0);
2997 }
2998
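/*
 * The kva_free bitstring allocated above tracks which of the
 * reqlist_kva_pages pages in this region are free.  A minimal sketch of
 * how a free page index maps into both address spaces (illustration only;
 * "idx" is a hypothetical index found via bitstring(3)):
 *
 *	int idx;
 *
 *	bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &idx);
 *	if (idx != -1) {
 *		bit_set(xbb->kva_free, idx);
 *		kva_page = xbb->kva + ((vm_offset_t)idx * PAGE_SIZE);
 *		gnt_page = xbb->gnt_base_addr + ((vm_offset_t)idx * PAGE_SIZE);
 *	}
 */
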
2999 /**
3000  * Collect front-end information from the XenStore.
3001  *
3002  * \param xbb  Per-instance xbb configuration structure.
3003  */
3004 static int
3005 xbb_collect_frontend_info(struct xbb_softc *xbb)
3006 {
3007         char        protocol_abi[64];
3008         const char *otherend_path;
3009         int         error;
3010         u_int       ring_idx;
3011         u_int       ring_page_order;
3012         size_t      ring_size;
3013
3014         otherend_path = xenbus_get_otherend_path(xbb->dev);
3015
3016         /*
3017          * Protocol defaults valid even if all negotiation fails.
3018          */
3019         xbb->ring_config.ring_pages = 1;
3020         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_REQUEST;
3021         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
3022
3023         /*
3024          * Mandatory data (used in all versions of the protocol) first.
3025          */
3026         error = xs_scanf(XST_NIL, otherend_path,
3027                          "event-channel", NULL, "%" PRIu32,
3028                          &xbb->ring_config.evtchn);
3029         if (error != 0) {
3030                 xenbus_dev_fatal(xbb->dev, error,
3031                                  "Unable to retrieve event-channel information "
3032                                  "from frontend %s.  Unable to connect.",
3033                                  xenbus_get_otherend_path(xbb->dev));
3034                 return (error);
3035         }
3036
3037         /*
3038          * These fields are initialized to legacy protocol defaults
3039          * so we only need to fail if reading the updated value succeeds
3040          * and the new value is outside of its allowed range.
3041          *
3042          * \note xs_gather() returns on the first encountered error, so
3043          *       we must use independent calls in order to guarantee
3044          *       we don't miss information in a sparsely populated front-end
3045          *       tree.
3046          *
3047          * \note xs_scanf() does not update variables for unmatched
3048          *       fields.
3049          */
3050         ring_page_order = 0;
3051         xbb->max_requests = 32;
3052
3053         (void)xs_scanf(XST_NIL, otherend_path,
3054                        "ring-page-order", NULL, "%u",
3055                        &ring_page_order);
3056         xbb->ring_config.ring_pages = 1 << ring_page_order;
3057         ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
3058         xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
3059
3060         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3061                 xenbus_dev_fatal(xbb->dev, EINVAL,
3062                                  "Front-end specified ring-pages of %u "
3063                                  "exceeds backend limit of %u.  "
3064                                  "Unable to connect.",
3065                                  xbb->ring_config.ring_pages,
3066                                  XBB_MAX_RING_PAGES);
3067                 return (EINVAL);
3068         }
3069
3070         if (xbb->ring_config.ring_pages == 1) {
3071                 error = xs_gather(XST_NIL, otherend_path,
3072                                   "ring-ref", "%" PRIu32,
3073                                   &xbb->ring_config.ring_ref[0],
3074                                   NULL);
3075                 if (error != 0) {
3076                         xenbus_dev_fatal(xbb->dev, error,
3077                                          "Unable to retrieve ring information "
3078                                          "from frontend %s.  Unable to "
3079                                          "connect.",
3080                                          xenbus_get_otherend_path(xbb->dev));
3081                         return (error);
3082                 }
3083         } else {
3084                 /* Multi-page ring format. */
3085                 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
3086                      ring_idx++) {
3087                         char ring_ref_name[] = "ring_refXX";
3088
3089                         snprintf(ring_ref_name, sizeof(ring_ref_name),
3090                                  "ring-ref%u", ring_idx);
3091                         error = xs_scanf(XST_NIL, otherend_path,
3092                                          ring_ref_name, NULL, "%" PRIu32,
3093                                          &xbb->ring_config.ring_ref[ring_idx]);
3094                         if (error != 0) {
3095                                 xenbus_dev_fatal(xbb->dev, error,
3096                                                  "Failed to retrieve grant "
3097                                                  "reference for page %u of "
3098                                                  "shared ring.  Unable "
3099                                                  "to connect.", ring_idx);
3100                                 return (error);
3101                         }
3102                 }
3103         }
3104
3105         error = xs_gather(XST_NIL, otherend_path,
3106                           "protocol", "%63s", protocol_abi,
3107                           NULL); 
3108         if (error != 0
3109          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3110                 /*
3111                  * Assume native if the frontend has not
3112                  * published ABI data or it has published and
3113                  * matches our own ABI.
3114                  */
3115                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3116         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3117
3118                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3119         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3120
3121                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3122         } else {
3123
3124                 xenbus_dev_fatal(xbb->dev, EINVAL,
3125                                  "Unknown protocol ABI (%s) published by "
3126                                  "frontend.  Unable to connect.", protocol_abi);
3127                 return (EINVAL);
3128         }
3129         return (0);
3130 }
3131
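/*
 * For reference, a multi-page-ring front-end typically publishes a tree
 * of the following shape under its device node (values are examples
 * only):
 *
 *	event-channel   = "11"
 *	ring-page-order = "2"
 *	ring-ref0       = "8"
 *	ring-ref1       = "9"
 *	ring-ref2       = "10"
 *	ring-ref3       = "11"
 *	protocol        = "x86_64-abi"
 *
 * A single-page front-end publishes a lone "ring-ref" node instead and
 * may omit "ring-page-order" and "protocol" entirely, in which case the
 * legacy defaults established above apply.
 */
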
3132 /**
3133  * Allocate per-request data structures given request size and number
3134  * information negotiated with the front-end.
3135  *
3136  * \param xbb  Per-instance xbb configuration structure.
3137  */
3138 static int
3139 xbb_alloc_requests(struct xbb_softc *xbb)
3140 {
3141         struct xbb_xen_req *req;
3142         struct xbb_xen_req *last_req;
3143
3144         /*
3145          * Allocate request bookkeeping data structures.
3146          */
3147         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3148                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3149         if (xbb->requests == NULL) {
3150                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3151                                   "Unable to allocate request structures");
3152                 return (ENOMEM);
3153         }
3154
3155         req      = xbb->requests;
3156         last_req = &xbb->requests[xbb->max_requests - 1];
3157         STAILQ_INIT(&xbb->request_free_stailq);
3158         while (req <= last_req) {
3159                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3160                 req++;
3161         }
3162         return (0);
3163 }
3164
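/*
 * Requests are later drawn from and returned to request_free_stailq
 * using the usual queue(3) idiom.  A minimal sketch (illustration only;
 * the driver's accessors additionally take xbb->lock and maintain
 * active_request_count):
 *
 *	struct xbb_xen_req *req;
 *
 *	req = STAILQ_FIRST(&xbb->request_free_stailq);
 *	if (req != NULL)
 *		STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
 *	...
 *	STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
 */
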
3165 static int
3166 xbb_alloc_request_lists(struct xbb_softc *xbb)
3167 {
3168         struct xbb_xen_reqlist *reqlist;
3169         int                     i;
3170
3171         /*
3172          * If no requests can be merged, we need one request list per
3173          * in-flight request.
3174          */
3175         xbb->request_lists = malloc(xbb->max_requests *
3176                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3177         if (xbb->request_lists == NULL) {
3178                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3179                                   "Unable to allocate request list structures");
3180                 return (ENOMEM);
3181         }
3182
3183         STAILQ_INIT(&xbb->reqlist_free_stailq);
3184         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3185         for (i = 0; i < xbb->max_requests; i++) {
3186                 int seg;
3187
3188                 reqlist      = &xbb->request_lists[i];
3189
3190                 reqlist->xbb = xbb;
3191
3192 #ifdef XBB_USE_BOUNCE_BUFFERS
3193                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3194                                          M_XENBLOCKBACK, M_NOWAIT);
3195                 if (reqlist->bounce == NULL) {
3196                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3197                                          "Unable to allocate request "
3198                                          "bounce buffers");
3199                         return (ENOMEM);
3200                 }
3201 #endif /* XBB_USE_BOUNCE_BUFFERS */
3202
3203                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3204                                               sizeof(*reqlist->gnt_handles),
3205                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3206                 if (reqlist->gnt_handles == NULL) {
3207                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3208                                           "Unable to allocate request "
3209                                           "grant references");
3210                         return (ENOMEM);
3211                 }
3212
3213                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3214                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3215
3216                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3217         }
3218         return (0);
3219 }
3220
3221 /**
3222  * Supply information about the physical device to the frontend
3223  * via XenBus.
3224  *
3225  * \param xbb  Per-instance xbb configuration structure.
3226  */
3227 static int
3228 xbb_publish_backend_info(struct xbb_softc *xbb)
3229 {
3230         struct xs_transaction xst;
3231         const char           *our_path;
3232         const char           *leaf;
3233         int                   error;
3234
3235         our_path = xenbus_get_node(xbb->dev);
3236         while (1) {
3237                 error = xs_transaction_start(&xst);
3238                 if (error != 0) {
3239                         xenbus_dev_fatal(xbb->dev, error,
3240                                          "Error publishing backend info "
3241                                          "(start transaction)");
3242                         return (error);
3243                 }
3244
3245                 leaf = "sectors";
3246                 error = xs_printf(xst, our_path, leaf,
3247                                   "%"PRIu64, xbb->media_num_sectors);
3248                 if (error != 0)
3249                         break;
3250
3251                 /* XXX Support all VBD attributes here. */
3252                 leaf = "info";
3253                 error = xs_printf(xst, our_path, leaf, "%u",
3254                                   xbb->flags & XBBF_READ_ONLY
3255                                 ? VDISK_READONLY : 0);
3256                 if (error != 0)
3257                         break;
3258
3259                 leaf = "sector-size";
3260                 error = xs_printf(xst, our_path, leaf, "%u",
3261                                   xbb->sector_size);
3262                 if (error != 0)
3263                         break;
3264
3265                 error = xs_transaction_end(xst, 0);
3266                 if (error == 0) {
3267                         return (0);
3268                 } else if (error != EAGAIN) {
3269                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3270                         return (error);
3271                 }
3272         }
3273
3274         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3275                         our_path, leaf);
3276         xs_transaction_end(xst, 1);
3277         return (error);
3278 }
3279
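/*
 * The peer blkfront instance reads these nodes from our side of the
 * store once we reach the Connected state.  A minimal sketch of such a
 * read from the front-end's perspective (illustration only; the real
 * front-end uses its own variable types and formats):
 *
 *	uintmax_t sectors;
 *	u_int     sector_size, info;
 *	int       error;
 *
 *	error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
 *			  "sectors", "%ju", &sectors,
 *			  "info", "%u", &info,
 *			  "sector-size", "%u", &sector_size,
 *			  NULL);
 */
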
3280 /**
3281  * Connect to our blkfront peer now that it has completed publishing
3282  * its configuration into the XenStore.
3283  *
3284  * \param xbb  Per-instance xbb configuration structure.
3285  */
3286 static void
3287 xbb_connect(struct xbb_softc *xbb)
3288 {
3289         int error;
3290
3291         if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3292                 return;
3293
3294         if (xbb_collect_frontend_info(xbb) != 0)
3295                 return;
3296
3297         xbb->flags &= ~XBBF_SHUTDOWN;
3298
3299         /*
3300          * We limit the maximum number of reqlist segments to the maximum
3301          * number of segments in the ring, or our absolute maximum,
3302          * whichever is smaller.
3303          */
3304         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3305                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3306
3307         /*
3308          * The maximum size is simply a function of the number of segments
3309          * we can handle.
3310          */
3311         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3312
3313         /* Allocate resources whose size depends on front-end configuration. */
3314         error = xbb_alloc_communication_mem(xbb);
3315         if (error != 0) {
3316                 xenbus_dev_fatal(xbb->dev, error,
3317                                  "Unable to allocate communication memory");
3318                 return;
3319         }
3320
3321         error = xbb_alloc_requests(xbb);
3322         if (error != 0) {
3323                 /* Specific errors are reported by xbb_alloc_requests(). */
3324                 return;
3325         }
3326
3327         error = xbb_alloc_request_lists(xbb);
3328         if (error != 0) {
3329                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3330                 return;
3331         }
3332
3333         /*
3334          * Connect communication channel.
3335          */
3336         error = xbb_connect_ring(xbb);
3337         if (error != 0) {
3338                 /* Specific errors are reported by xbb_connect_ring(). */
3339                 return;
3340         }
3341         
3342         if (xbb_publish_backend_info(xbb) != 0) {
3343                 /*
3344                  * If we can't publish our data, we cannot participate
3345                  * in this connection, and waiting for a front-end state
3346                  * change will not help the situation.
3347                  */
3348                 (void)xbb_disconnect(xbb);
3349                 return;
3350         }
3351
3352         /* Ready for I/O. */
3353         xenbus_set_state(xbb->dev, XenbusStateConnected);
3354 }
3355
3356 /*-------------------------- Device Teardown Support -------------------------*/
3357 /**
3358  * Perform device shutdown functions.
3359  *
3360  * \param xbb  Per-instance xbb configuration structure.
3361  *
3362  * Mark this instance as shutting down, wait for any active I/O on the
3363  * backend device/file to drain, disconnect from the front-end, and notify
3364  * any waiters (e.g. a thread invoking our detach method) that detach can
3365  * now proceed.
3366  */
3367 static int
3368 xbb_shutdown(struct xbb_softc *xbb)
3369 {
3370         XenbusState frontState;
3371         int         error;
3372
3373         DPRINTF("\n");
3374
3375         /*
3376          * Due to the need to drop our mutex during some
3377          * xenbus operations, it is possible for two threads
3378          * to attempt to close out shutdown processing at
3379          * the same time.  Tell any caller that hits this
3380          * race to try again later.
3381          */
3382         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3383                 return (EAGAIN);
3384
3385         xbb->flags |= XBBF_IN_SHUTDOWN;
3386         mtx_unlock(&xbb->lock);
3387
3388         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3389                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3390
3391         frontState = xenbus_get_otherend_state(xbb->dev);
3392         mtx_lock(&xbb->lock);
3393         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3394
3395         /* The front-end can submit I/O until it enters the Closed state. */
3396         if (frontState < XenbusStateClosed)
3397                 return (EAGAIN);
3398
3399         DPRINTF("\n");
3400
3401         /* Indicate shutdown is in progress. */
3402         xbb->flags |= XBBF_SHUTDOWN;
3403
3404         /* Disconnect from the front-end. */
3405         error = xbb_disconnect(xbb);
3406         if (error != 0) {
3407                 /*
3408                  * Requests still outstanding.  We'll be called again
3409                  * once they complete.
3410                  */
3411                 KASSERT(error == EAGAIN,
3412                         ("%s: Unexpected xbb_disconnect() failure %d",
3413                          __func__, error));
3414
3415                 return (error);
3416         }
3417
3418         DPRINTF("\n");
3419
3420         /* Indicate to xbb_detach() that it is safe to proceed. */
3421         wakeup(xbb);
3422
3423         return (0);
3424 }
3425
3426 /**
3427  * Report an attach-time error to the console and Xen, and clean up
3428  * this instance by forcing immediate detach processing.
3429  *
3430  * \param xbb  Per-instance xbb configuration structure.
3431  * \param err  Errno describing the error.
3432  * \param fmt  Printf style format and arguments
3433  */
3434 static void
3435 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3436 {
3437         va_list ap;
3438         va_list ap_hotplug;
3439
3440         va_start(ap, fmt);
3441         va_copy(ap_hotplug, ap);
3442         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3443                   "hotplug-error", fmt, ap_hotplug);
3444         va_end(ap_hotplug);
3445         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3446                   "hotplug-status", "error");
3447
3448         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3449         va_end(ap);
3450
3451         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3452                   "online", "0");
3453         xbb_detach(xbb->dev);
3454 }
3455
3456 /*---------------------------- NewBus Entrypoints ----------------------------*/
3457 /**
3458  * Inspect a XenBus device and claim it if it is of the appropriate type.
3459  * 
3460  * \param dev  NewBus device object representing a candidate XenBus device.
3461  *
3462  * \return  0 for success, errno codes for failure.
3463  */
3464 static int
3465 xbb_probe(device_t dev)
3466 {
3467  
3468         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3469                 device_set_desc(dev, "Backend Virtual Block Device");
3470                 device_quiet(dev);
3471                 return (0);
3472         }
3473
3474         return (ENXIO);
3475 }
3476
3477 /**
3478  * Set up sysctl variables to control various Block Back parameters.
3479  *
3480  * \param xbb  Xen Block Back softc.
3481  *
3482  */
3483 static void
3484 xbb_setup_sysctl(struct xbb_softc *xbb)
3485 {
3486         struct sysctl_ctx_list *sysctl_ctx = NULL;
3487         struct sysctl_oid      *sysctl_tree = NULL;
3488         
3489         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3490         if (sysctl_ctx == NULL)
3491                 return;
3492
3493         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3494         if (sysctl_tree == NULL)
3495                 return;
3496
3497         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3498                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3499                        "fake the flush command");
3500
3501         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3502                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3503                        "send a real flush for N flush requests");
3504
3505         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3506                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3507                        "Don't coalesce contiguous requests");
3508
3509         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3510                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3511                          "how many I/O requests we have received");
3512
3513         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3514                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3515                          "how many I/O requests have been completed");
3516
3517         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3518                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3519                          "how many I/O dispatches were forced");
3520
3521         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3522                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3523                          "how many I/O dispatches were normal");
3524
3525         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3526                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3527                          "total number of I/O dispatches");
3528
3529         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3530                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3531                          "how many times we have run out of KVA");
3532
3533         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3534                          "request_shortages", CTLFLAG_RW,
3535                          &xbb->request_shortages,
3536                          "how many times we have run out of requests");
3537
3538         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3539                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3540                         "maximum outstanding requests (negotiated)");
3541
3542         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3543                         "max_request_segments", CTLFLAG_RD,
3544                         &xbb->max_request_segments, 0,
3545                         "maximum number of pages per request (negotiated)");
3546
3547         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3548                         "max_request_size", CTLFLAG_RD,
3549                         &xbb->max_request_size, 0,
3550                         "maximum size in bytes of a request (negotiated)");
3551
3552         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3553                         "ring_pages", CTLFLAG_RD,
3554                         &xbb->ring_config.ring_pages, 0,
3555                         "communication channel pages (negotiated)");
3556 }
3557
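/*
 * These nodes land under this device's sysctl tree and can be inspected
 * from userland.  A minimal sketch, assuming the first xbbd instance
 * (dev.xbbd.0) and the sysctl(3) interface:
 *
 *	uint64_t completed;
 *	size_t   len = sizeof(completed);
 *
 *	if (sysctlbyname("dev.xbbd.0.reqs_completed",
 *			 &completed, &len, NULL, 0) == 0)
 *		printf("%ju requests completed\n", (uintmax_t)completed);
 */
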
3558 /**
3559  * Attach to a XenBus device that has been claimed by our probe routine.
3560  *
3561  * \param dev  NewBus device object representing this Xen Block Back instance.
3562  *
3563  * \return  0 for success, errno codes for failure.
3564  */
3565 static int
3566 xbb_attach(device_t dev)
3567 {
3568         struct xbb_softc        *xbb;
3569         int                      error;
3570         u_int                    max_ring_page_order;
3571
3572         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3573
3574         /*
3575          * Basic initialization.
3576          * After this block it is safe to call xbb_detach()
3577          * to clean up any allocated data for this instance.
3578          */
3579         xbb = device_get_softc(dev);
3580         xbb->dev = dev;
3581         xbb->otherend_id = xenbus_get_otherend_id(dev);
3582         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3583         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3584
3585         /*
3586          * Publish protocol capabilities for consumption by the
3587          * front-end.
3588          */
3589         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3590                           "feature-barrier", "1");
3591         if (error) {
3592                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3593                                   xenbus_get_node(xbb->dev));
3594                 return (error);
3595         }
3596
3597         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3598                           "feature-flush-cache", "1");
3599         if (error) {
3600                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3601                                   xenbus_get_node(xbb->dev));
3602                 return (error);
3603         }
3604
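	/*
	 * Advertise the largest ring we will accept as a power-of-two page
	 * order: with XBB_MAX_RING_PAGES == 32, flsl(32) - 1 == 5, i.e. up
	 * to 2^5 == 32 ring pages.
	 */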
3605         max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3606         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3607                           "max-ring-page-order", "%u", max_ring_page_order);
3608         if (error) {
3609                 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3610                                   xenbus_get_node(xbb->dev));
3611                 return (error);
3612         }
3613
3614         /* Collect physical device information. */
3615         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3616                           "device-type", NULL, &xbb->dev_type,
3617                           NULL);
3618         if (error != 0)
3619                 xbb->dev_type = NULL;
3620
3621         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3622                           "mode", NULL, &xbb->dev_mode,
3623                           "params", NULL, &xbb->dev_name,
3624                           NULL);
3625         if (error != 0) {
3626                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3627                                   xenbus_get_node(dev));
3628                 return (ENXIO);
3629         }
3630
3631         /* Parse fopen style mode flags. */
3632         if (strchr(xbb->dev_mode, 'w') == NULL)
3633                 xbb->flags |= XBBF_READ_ONLY;
3634
3635         /*
3636          * Verify the physical device is present and can support
3637          * the desired I/O mode.
3638          */
3639         DROP_GIANT();
3640         error = xbb_open_backend(xbb);
3641         PICKUP_GIANT();
3642         if (error != 0) {
3643                 xbb_attach_failed(xbb, error, "Unable to open %s",
3644                                   xbb->dev_name);
3645                 return (ENXIO);
3646         }
3647
3648         /* Use devstat(9) for recording statistics. */
3649         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3650                                            xbb->sector_size,
3651                                            DEVSTAT_ALL_SUPPORTED,
3652                                            DEVSTAT_TYPE_DIRECT
3653                                          | DEVSTAT_TYPE_IF_OTHER,
3654                                            DEVSTAT_PRIORITY_OTHER);
3655
3656         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3657                                               xbb->sector_size,
3658                                               DEVSTAT_ALL_SUPPORTED,
3659                                               DEVSTAT_TYPE_DIRECT
3660                                             | DEVSTAT_TYPE_IF_OTHER,
3661                                               DEVSTAT_PRIORITY_OTHER);
3662         /*
3663          * Set up sysctl variables.
3664          */
3665         xbb_setup_sysctl(xbb);
3666
3667         /*
3668          * Create a taskqueue for doing work that must occur from a
3669          * thread context.
3670          */
3671         xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
3672                                                   M_NOWAIT,
3673                                                   taskqueue_thread_enqueue,
3674                                                   /*context*/&xbb->io_taskqueue);
3675         if (xbb->io_taskqueue == NULL) {
3676                 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3677                 return (ENOMEM);
3678         }
3679
3680         taskqueue_start_threads(&xbb->io_taskqueue,
3681                                 /*num threads*/1,
3682                                 /*priority*/PWAIT,
3683                                 /*thread name*/
3684                                 "%s taskq", device_get_nameunit(dev));
3685
3686         /* Update hot-plug status to satisfy xend. */
3687         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3688                           "hotplug-status", "connected");
3689         if (error) {
3690                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3691                                   xenbus_get_node(xbb->dev));
3692                 return (error);
3693         }
3694
3695         /* Tell the front end that we are ready to connect. */
3696         xenbus_set_state(dev, XenbusStateInitWait);
3697
3698         return (0);
3699 }
3700
3701 /**
3702  * Detach from a block back device instance.
3703  *
3704  * \param dev  NewBus device object representing this Xen Block Back instance.
3705  *
3706  * \return  0 for success, errno codes for failure.
3707  * 
3708  * \note A block back device may be detached at any time in its life-cycle,
3709  *       including part way through the attach process.  For this reason,
3710  *       initialization order and the initialization state checks in this
3711  *       routine must be carefully coupled so that attach time failures
3712  *       are gracefully handled.
3713  */
3714 static int
3715 xbb_detach(device_t dev)
3716 {
3717         struct xbb_softc *xbb;
3718
3719         DPRINTF("\n");
3720
3721         xbb = device_get_softc(dev);
3722         mtx_lock(&xbb->lock);
3723         while (xbb_shutdown(xbb) == EAGAIN) {
3724                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3725                        "xbb_shutdown", 0);
3726         }
3727         mtx_unlock(&xbb->lock);
3728
3729         DPRINTF("\n");
3730
3731         if (xbb->io_taskqueue != NULL)
3732                 taskqueue_free(xbb->io_taskqueue);
3733
3734         if (xbb->xbb_stats != NULL)
3735                 devstat_remove_entry(xbb->xbb_stats);
3736
3737         if (xbb->xbb_stats_in != NULL)
3738                 devstat_remove_entry(xbb->xbb_stats_in);
3739
3740         xbb_close_backend(xbb);
3741
3742         if (xbb->dev_mode != NULL) {
3743                 free(xbb->dev_mode, M_XENBUS);
3744                 xbb->dev_mode = NULL;
3745         }
3746
3747         if (xbb->dev_type != NULL) {
3748                 free(xbb->dev_type, M_XENBUS);
3749                 xbb->dev_type = NULL;
3750         }
3751
3752         if (xbb->dev_name != NULL) {
3753                 free(xbb->dev_name, M_XENBUS);
3754                 xbb->dev_name = NULL;
3755         }
3756
3757         mtx_destroy(&xbb->lock);
3758         return (0);
3759 }
3760
3761 /**
3762  * Prepare this block back device for suspension of this VM.
3763  * 
3764  * \param dev  NewBus device object representing this Xen Block Back instance.
3765  *
3766  * \return  0 for success, errno codes for failure.
3767  */
3768 static int
3769 xbb_suspend(device_t dev)
3770 {
3771 #ifdef NOT_YET
3772         struct xbb_softc *sc = device_get_softc(dev);
3773
3774         /* Prevent new requests being issued until we fix things up. */
3775         mtx_lock(&sc->xb_io_lock);
3776         sc->connected = BLKIF_STATE_SUSPENDED;
3777         mtx_unlock(&sc->xb_io_lock);
3778 #endif
3779
3780         return (0);
3781 }
3782
3783 /**
3784  * Perform any processing required to recover from a suspended state.
3785  * 
3786  * \param dev  NewBus device object representing this Xen Block Back instance.
3787  *
3788  * \return  0 for success, errno codes for failure.
3789  */
3790 static int
3791 xbb_resume(device_t dev)
3792 {
3793         return (0);
3794 }
3795
3796 /**
3797  * Handle state changes expressed via the XenStore by our front-end peer.
3798  *
3799  * \param dev             NewBus device object representing this Xen
3800  *                        Block Back instance.
3801  * \param frontend_state  The new state of the front-end.
3804  */
3805 static void
3806 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3807 {
3808         struct xbb_softc *xbb = device_get_softc(dev);
3809
3810         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3811                 xenbus_strstate(frontend_state),
3812                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3813
3814         switch (frontend_state) {
3815         case XenbusStateInitialising:
3816                 break;
3817         case XenbusStateInitialised:
3818         case XenbusStateConnected:
3819                 xbb_connect(xbb);
3820                 break;
3821         case XenbusStateClosing:
3822         case XenbusStateClosed:
3823                 mtx_lock(&xbb->lock);
3824                 xbb_shutdown(xbb);
3825                 mtx_unlock(&xbb->lock);
3826                 if (frontend_state == XenbusStateClosed)
3827                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3828                 break;
3829         default:
3830                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3831                                  frontend_state);
3832                 break;
3833         }
3834 }
3835
3836 /*---------------------------- NewBus Registration ---------------------------*/
3837 static device_method_t xbb_methods[] = {
3838         /* Device interface */
3839         DEVMETHOD(device_probe,         xbb_probe),
3840         DEVMETHOD(device_attach,        xbb_attach),
3841         DEVMETHOD(device_detach,        xbb_detach),
3842         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3843         DEVMETHOD(device_suspend,       xbb_suspend),
3844         DEVMETHOD(device_resume,        xbb_resume),
3845
3846         /* Xenbus interface */
3847         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3848
3849         { 0, 0 }
3850 };
3851
3852 static driver_t xbb_driver = {
3853         "xbbd",
3854         xbb_methods,
3855         sizeof(struct xbb_softc),
3856 };
3857 devclass_t xbb_devclass;
3858
3859 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);