1 /*-
2  * Copyright (c) 2009-2011 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  *          Ken Merry           (Spectra Logic Corporation)
32  */
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 /**
37  * \file blkback.c
38  *
39  * \brief Device driver supporting the vending of block storage from
40  *        a FreeBSD domain to other domains.
41  */
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47
48 #include <sys/bio.h>
49 #include <sys/bus.h>
50 #include <sys/conf.h>
51 #include <sys/devicestat.h>
52 #include <sys/disk.h>
53 #include <sys/fcntl.h>
54 #include <sys/filedesc.h>
55 #include <sys/kdb.h>
56 #include <sys/module.h>
57 #include <sys/namei.h>
58 #include <sys/proc.h>
59 #include <sys/rman.h>
60 #include <sys/taskqueue.h>
61 #include <sys/types.h>
62 #include <sys/vnode.h>
63 #include <sys/mount.h>
64 #include <sys/sysctl.h>
65 #include <sys/bitstring.h>
66
67 #include <geom/geom.h>
68
69 #include <machine/_inttypes.h>
70 #include <machine/xen/xen-os.h>
71
72 #include <vm/vm.h>
73 #include <vm/vm_extern.h>
74 #include <vm/vm_kern.h>
75
76 #include <xen/blkif.h>
77 #include <xen/evtchn.h>
78 #include <xen/gnttab.h>
79 #include <xen/xen_intr.h>
80
81 #include <xen/interface/event_channel.h>
82 #include <xen/interface/grant_table.h>
83
84 #include <xen/xenbus/xenbusvar.h>
85
86 /*--------------------------- Compile-time Tunables --------------------------*/
87 /**
88  * The maximum number of outstanding request blocks (request headers plus
89  * additional segment blocks) we will allow in a negotiated block-front/back
90  * communication channel.
91  */
92 #define XBB_MAX_REQUESTS        256
93
94 /**
95  * \brief Define to force all I/O to be performed on memory owned by the
96  *        backend device, with a copy-in/out to the remote domain's memory.
97  *
98  * \note  This option is currently required when this driver's domain is
99  *        operating in HVM mode on a system using an IOMMU.
100  *
101  * This driver uses Xen's grant table API to gain access to the memory of
102  * the remote domains it serves.  When our domain is operating in PV mode,
103  * the grant table mechanism directly updates our domain's page table entries
104  * to point to the physical pages of the remote domain.  This scheme guarantees
105  * that blkback and the backing devices it uses can safely perform DMA
106  * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
107  * ensure that our domain cannot DMA to pages owned by another domain.  As
108  * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
109  * table API.  For this reason, in HVM mode, we must bounce all requests into
110  * memory that is mapped into our domain at domain startup and thus has
111  * valid IOMMU mappings.
112  */
113 #define XBB_USE_BOUNCE_BUFFERS
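/*
 * Illustrative sketch (not part of the driver) of the bounce-buffer data
 * flow described above, assuming bouncing is enabled.  Backend I/O always
 * targets reqlist->bounce; the grant-mapped front-end pages at
 * reqlist->kva are only touched by explicit copies:
 *
 *   write:  memcpy(bounce_addr, kva_addr, len);    before dispatch
 *           ...BIO_WRITE/VOP_WRITE issued against bounce_addr...
 *
 *   read:   ...BIO_READ/VOP_READ issued into bounce_addr...
 *           memcpy(kva_addr, bounce_addr, len);    on completion
 *                                                  (see xbb_bio_done())
 */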
114
115 /**
116  * \brief Define to enable rudimentary request logging to the console.
117  */
118 #undef XBB_DEBUG
119
120 /*---------------------------------- Macros ----------------------------------*/
121 /**
122  * Custom malloc type for all driver allocations.
123  */
124 MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
125
126 #ifdef XBB_DEBUG
127 #define DPRINTF(fmt, args...) \
128     printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
129 #else
130 #define DPRINTF(fmt, args...) do {} while(0)
131 #endif
132
133 /**
134  * The maximum mapped region size per request we will allow in a negotiated
135  * block-front/back communication channel.
136  */
137 #define XBB_MAX_REQUEST_SIZE            \
138         MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
139
140 /**
141  * The maximum number of segments (within a request header and accompanying
142  * segment blocks) per request we will allow in a negotiated block-front/back
143  * communication channel.
144  */
145 #define XBB_MAX_SEGMENTS_PER_REQUEST                    \
146         (MIN(UIO_MAXIOV,                                \
147              MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST,        \
148                  (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
149
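/*
 * Worked example (illustrative only): on amd64 with MAXPHYS = 128KB,
 * PAGE_SIZE = 4KB, UIO_MAXIOV = 1024, and assuming the blkif interface
 * permits at least 33 segments per request, the macros above evaluate to:
 *
 *   XBB_MAX_REQUEST_SIZE         = MIN(128KB, segs * 4KB)       = 128KB
 *   XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024, MIN(segs, 32 + 1)) = 33
 *
 * i.e. a single request maps at most 33 pages covering 128KB of data.
 */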
150 /**
151  * The maximum number of shared memory ring pages we will allow in a
152  * negotiated block-front/back communication channel.  Allow enough
153  * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
154  */
155 #define XBB_MAX_RING_PAGES                                                  \
156         BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
157                        * XBB_MAX_REQUESTS)
158 /**
159  * The maximum number of segments that we will allow per request list.
160  * We limit this to the maximum number of segments per request, because
161  * that is already a reasonable number of segments to aggregate.  This
162  * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
163  * because that would leave situations where we can't dispatch even one
164  * large request.
165  */
166 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
167
168 /*--------------------------- Forward Declarations ---------------------------*/
169 struct xbb_softc;
170 struct xbb_xen_req;
171
172 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
173                               ...) __attribute__((format(printf, 3, 4)));
174 static int  xbb_shutdown(struct xbb_softc *xbb);
175 static int  xbb_detach(device_t dev);
176
177 /*------------------------------ Data Structures -----------------------------*/
178
179 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
180
181 typedef enum {
182         XBB_REQLIST_NONE        = 0x00,
183         XBB_REQLIST_MAPPED      = 0x01
184 } xbb_reqlist_flags;
185
186 struct xbb_xen_reqlist {
187         /**
188          * Back reference to the parent block back instance for this
189          * request.  Used during bio_done handling.
190          */
191         struct xbb_softc        *xbb;
192
193         /**
194          * BLKIF_OP code for this request.
195          */
196         int                      operation;
197
198         /**
199          * Set to BLKIF_RSP_* to indicate request status.
200          *
201          * This field allows an error status to be recorded even if the
202          * delivery of this status must be deferred.  Deferred reporting
203          * is necessary, for example, when an error is detected during
204          * completion processing of one bio when other bios for this
205          * request are still outstanding.
206          */
207         int                      status;
208
209         /**
210          * Number of 512 byte sectors not transferred.
211          */
212         int                      residual_512b_sectors;
213
214         /**
215          * Starting sector number of the first request in the list.
216          */
217         off_t                    starting_sector_number;
218
219         /**
220          * If we're going to coalesce, the next contiguous sector would be
221          * this one.
222          */
223         off_t                    next_contig_sector;
224
225         /**
226          * Number of child requests in the list.
227          */
228         int                      num_children;
229
230         /**
231          * Number of I/O requests dispatched to the backend.
232          */
233         int                      pendcnt;
234
235         /**
236          * Total number of segments for requests in the list.
237          */
238         int                      nr_segments;
239
240         /**
241          * Flags for this particular request list.
242          */
243         xbb_reqlist_flags        flags;
244
245         /**
246          * Kernel virtual address space reserved for this request
247          * list structure and used to map the remote domain's pages for
248          * this I/O, into our domain's address space.
249          */
250         uint8_t                 *kva;
251
252         /**
253          * Base pseudo-physical address corresponding to the start
254          * of this request's kva region.
255          */
256         uint64_t                 gnt_base;
257
258
259 #ifdef XBB_USE_BOUNCE_BUFFERS
260         /**
261          * Pre-allocated domain local memory used to proxy remote
262          * domain memory during I/O operations.
263          */
264         uint8_t                 *bounce;
265 #endif
266
267         /**
268          * Array of grant handles (one per page) used to map this request.
269          */
270         grant_handle_t          *gnt_handles;
271
272         /**
273          * Device statistics request ordering type (ordered or simple).
274          */
275         devstat_tag_type         ds_tag_type;
276
277         /**
278          * Device statistics request type (read, write, no_data).
279          */
280         devstat_trans_flags      ds_trans_type;
281
282         /**
283          * The start time for this request.
284          */
285         struct bintime           ds_t0;
286
287         /**
288          * Linked list of contiguous requests with the same operation type.
289          */
290         struct xbb_xen_req_list  contig_req_list;
291
292         /**
293          * Linked list links used to aggregate idle requests in the
294          * request list free pool (xbb->reqlist_free_stailq) and pending
295          * requests waiting for execution (xbb->reqlist_pending_stailq).
296          */
297         STAILQ_ENTRY(xbb_xen_reqlist) links;
298 };
299
300 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
301
302 /**
303  * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
304  */
305 struct xbb_xen_req {
306         /**
307          * Linked list links used to aggregate requests into a reqlist
308          * and to store them in the request free pool.
309          */
310         STAILQ_ENTRY(xbb_xen_req) links;
311
312         /**
313          * The remote domain's identifier for this I/O request.
314          */
315         uint64_t                  id;
316
317         /**
318          * The number of pages currently mapped for this request.
319          */
320         int                       nr_pages;
321
322         /**
323          * The number of 512 byte sectors comprising this request.
324          */
325         int                       nr_512b_sectors;
326
327         /**
328          * The number of struct bio requests still outstanding for this
329          * request on the backend device.  This field is only used for  
330          * device (rather than file) backed I/O.
331          */
332         int                       pendcnt;
333
334         /**
335          * BLKIF_OP code for this request.
336          */
337         int                       operation;
338
339         /**
340          * Storage used for non-native ring requests.
341          */
342         blkif_request_t          ring_req_storage;
343
344         /**
345          * Pointer to the Xen request in the ring.
346          */
347         blkif_request_t         *ring_req;
348
349         /**
350          * Consumer index for this request.
351          */
352         RING_IDX                 req_ring_idx;
353
354         /**
355          * The start time for this request.
356          */
357         struct bintime           ds_t0;
358
359         /**
360          * Pointer back to our parent request list.
361          */
362         struct xbb_xen_reqlist  *reqlist;
363 };
364 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
365
366 /**
367  * \brief Configuration data for the shared memory request ring
368  *        used to communicate with the front-end client of this
369  *        driver.
370  */
371 struct xbb_ring_config {
372         /** KVA address where ring memory is mapped. */
373         vm_offset_t     va;
374
375         /** The pseudo-physical address where ring memory is mapped.*/
376         uint64_t        gnt_addr;
377
378         /**
379          * Grant table handles, one per-ring page, returned by the
380          * hypervisor upon mapping of the ring and required to
381          * unmap it when a connection is torn down.
382          */
383         grant_handle_t  handle[XBB_MAX_RING_PAGES];
384
385         /**
386          * The device bus address returned by the hypervisor when
387          * mapping the ring and required to unmap it when a connection
388          * is torn down.
389          */
390         uint64_t        bus_addr[XBB_MAX_RING_PAGES];
391
392         /** The number of ring pages mapped for the current connection. */
393         u_int           ring_pages;
394
395         /**
396          * The grant references, one per-ring page, supplied by the
397          * front-end, allowing us to reference the ring pages in the
398          * front-end's domain and to map these pages into our own domain.
399          */
400         grant_ref_t     ring_ref[XBB_MAX_RING_PAGES];
401
402         /** The interrupt-driven event channel used to signal ring events. */
403         evtchn_port_t   evtchn;
404 };
405
406 /**
407  * Per-instance connection state flags.
408  */
409 typedef enum
410 {
411         /**
412          * The front-end requested a read-only mount of the
413          * back-end device/file.
414          */
415         XBBF_READ_ONLY         = 0x01,
416
417         /** Communication with the front-end has been established. */
418         XBBF_RING_CONNECTED    = 0x02,
419
420         /**
421          * Front-end requests exist in the ring and are waiting for
422          * xbb_xen_req objects to free up.
423          */
424         XBBF_RESOURCE_SHORTAGE = 0x04,
425
426         /** Connection teardown in progress. */
427         XBBF_SHUTDOWN          = 0x08,
428
429         /** A thread is already performing shutdown processing. */
430         XBBF_IN_SHUTDOWN       = 0x10
431 } xbb_flag_t;
432
433 /** Backend device type.  */
434 typedef enum {
435         /** Backend type unknown. */
436         XBB_TYPE_NONE           = 0x00,
437
438         /**
439          * Backend type disk (access via cdev switch
440          * strategy routine).
441          */
442         XBB_TYPE_DISK           = 0x01,
443
444         /** Backend type file (access via vnode operations). */
445         XBB_TYPE_FILE           = 0x02
446 } xbb_type;
447
448 /**
449  * \brief Structure used to memoize information about a per-request
450  *        scatter-gather list.
451  *
452  * The chief benefit of using this data structure is it avoids having
453  * to reparse the possibly discontiguous S/G list in the original
454  * request.  Due to the way that the mapping of the memory backing an
455  * I/O transaction is handled by Xen, a second pass is unavoidable.
456  * At least this way the second walk is a simple array traversal.
457  *
458  * \note A single Scatter/Gather element in the block interface covers
459  *       at most 1 machine page.  In this context a sector (blkif
460  *       nomenclature, not what I'd choose) is a 512b aligned unit
461  *       of mapping within the machine page referenced by an S/G
462  *       element.
463  */
464 struct xbb_sg {
465         /** The number of 512b data chunks mapped in this S/G element. */
466         int16_t nsect;
467
468         /**
469          * The index (0 based) of the first 512b data chunk mapped
470          * in this S/G element.
471          */
472         uint8_t first_sect;
473
474         /**
475          * The index (0 based) of the last 512b data chunk mapped
476          * in this S/G element.
477          */
478         uint8_t last_sect;
479 };
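/*
 * Example (illustrative): with 4KB pages there are eight 512b chunks per
 * page.  An S/G element covering only chunks 2 through 5 of its page
 * would be memoized as
 *
 *   struct xbb_sg sg = { .nsect = 4, .first_sect = 2, .last_sect = 5 };
 *
 * while a fully mapped page is { .nsect = 8, .first_sect = 0, .last_sect = 7 }.
 */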
480
481 /**
482  * Character device backend specific configuration data.
483  */
484 struct xbb_dev_data {
485         /** Cdev used for device backend access.  */
486         struct cdev   *cdev;
487
488         /** Cdev switch used for device backend access.  */
489         struct cdevsw *csw;
490
491         /** Used to hold a reference on opened cdev backend devices. */
492         int            dev_ref;
493 };
494
495 /**
496  * File backend specific configuration data.
497  */
498 struct xbb_file_data {
499         /** Credentials to use for vnode backed (file based) I/O. */
500         struct ucred   *cred;
501
502         /**
503          * \brief Array of io vectors used to process file based I/O.
504          *
505          * Only a single file based request is outstanding per-xbb instance,
506          * so we only need one of these.
507          */
508         struct iovec    xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
509 #ifdef XBB_USE_BOUNCE_BUFFERS
510
511         /**
512          * \brief Array of io vectors used to handle bouncing of file reads.
513          *
514          * Vnode operations are free to modify uio data during their
515          * execution.  In the case of a read with bounce buffering active,
516          * we need some of the data from the original uio in order to
517          * bounce-out the read data.  This array serves as the temporary
518          * storage for this saved data.
519          */
520         struct iovec    saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
521
522         /**
523          * \brief Array of memoized bounce buffer kva offsets used
524          *        in the file based backend.
525          *
526          * Due to the way that the mapping of the memory backing an
527          * I/O transaction is handled by Xen, a second pass through
528          * the request sg elements is unavoidable. We memoize the computed
529          * bounce address here to reduce the cost of the second walk.
530          */
531         void            *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
532 #endif /* XBB_USE_BOUNCE_BUFFERS */
533 };
534
535 /**
536  * Collection of backend type specific data.
537  */
538 union xbb_backend_data {
539         struct xbb_dev_data  dev;
540         struct xbb_file_data file;
541 };
542
543 /**
544  * Function signature of backend specific I/O handlers.
545  */
546 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
547                               struct xbb_xen_reqlist *reqlist, int operation,
548                               int flags);
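/*
 * Hypothetical usage sketch (handler names are illustrative, not taken
 * from this file): at connect time an instance stores the handler that
 * matches its backend type, e.g.
 *
 *   xbb->dispatch_io = (xbb->device_type == XBB_TYPE_DISK)
 *                    ? xbb_dispatch_dev : xbb_dispatch_file;
 *
 * so generic request parsing can hand a reqlist to the backend without
 * caring whether it is a cdev or a vnode.
 */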
549
550 /**
551  * Per-instance configuration data.
552  */
553 struct xbb_softc {
554
555         /**
556          * Task-queue used to process I/O requests.
557          */
558         struct taskqueue         *io_taskqueue;
559
560         /**
561          * Single "run the request queue" task enqueued
562          * on io_taskqueue.
563          */
564         struct task               io_task;
565
566         /** Device type for this instance. */
567         xbb_type                  device_type;
568
569         /** NewBus device corresponding to this instance. */
570         device_t                  dev;
571
572         /** Backend specific dispatch routine for this instance. */
573         xbb_dispatch_t            dispatch_io;
574
575         /** The number of requests outstanding on the backend device/file. */
576         int                       active_request_count;
577
578         /** Free pool of request tracking structures. */
579         struct xbb_xen_req_list   request_free_stailq;
580
581         /** Array, sized at connection time, of request tracking structures. */
582         struct xbb_xen_req       *requests;
583
584         /** Free pool of request list structures. */
585         struct xbb_xen_reqlist_list reqlist_free_stailq;
586
587         /** List of pending request lists awaiting execution. */
588         struct xbb_xen_reqlist_list reqlist_pending_stailq;
589
590         /** Array, sized at connection time, of request list structures. */
591         struct xbb_xen_reqlist   *request_lists;
592
593         /**
594          * Global pool of kva used for mapping remote domain ring
595          * and I/O transaction data.
596          */
597         vm_offset_t               kva;
598
599         /** Pseudo-physical address corresponding to kva. */
600         uint64_t                  gnt_base_addr;
601
602         /** The size of the global kva pool. */
603         int                       kva_size;
604
605         /** The size of the KVA area used for request lists. */
606         int                       reqlist_kva_size;
607
608         /** The number of pages of KVA used for request lists */
609         int                       reqlist_kva_pages;
610
611         /** Bitmap of free KVA pages */
612         bitstr_t                 *kva_free;
613
614         /**
615          * \brief Cached value of the front-end's domain id.
616          * 
617          * This value is used once for each mapped page in
618          * a transaction.  We cache it to avoid incurring the
619          * cost of an ivar access every time this is needed.
620          */
621         domid_t                   otherend_id;
622
623         /**
624          * \brief The blkif protocol abi in effect.
625          *
626          * There are situations where the back and front ends can
627          * have a different native abi (e.g. Intel x86_64 and
628          * 32-bit x86 domains on the same machine).  The back-end
629          * always accommodates the front-end's native abi.  That
630          * value is pulled from the XenStore and recorded here.
631          */
632         int                       abi;
633
634         /**
635          * \brief The maximum number of requests and request lists allowed
636          *        to be in flight at a time.
637          *
638          * This value is negotiated via the XenStore.
639          */
640         u_int                     max_requests;
641
642         /**
643          * \brief The maximum number of segments (1 page per segment)
644          *        that can be mapped by a request.
645          *
646          * This value is negotiated via the XenStore.
647          */
648         u_int                     max_request_segments;
649
650         /**
651          * \brief Maximum number of segments per request list.
652          *
653          * This value is derived from and will generally be larger than
654          * max_request_segments.
655          */
656         u_int                     max_reqlist_segments;
657
658         /**
659          * The maximum size of any request to this back-end
660          * device.
661          *
662          * This value is negotiated via the XenStore.
663          */
664         u_int                     max_request_size;
665
666         /**
667          * The maximum size of any request list.  This is derived directly
668          * from max_reqlist_segments.
669          */
670         u_int                     max_reqlist_size;
671
672         /** Various configuration and state bit flags. */
673         xbb_flag_t                flags;
674
675         /** Ring mapping and interrupt configuration data. */
676         struct xbb_ring_config    ring_config;
677
678         /** Runtime, cross-abi safe, structures for ring access. */
679         blkif_back_rings_t        rings;
680
681         /** IRQ mapping for the communication ring event channel. */
682         int                       irq;
683
684         /**
685          * \brief Backend access mode flags (e.g. write, or read-only).
686          *
687          * This value is passed to us by the front-end via the XenStore.
688          */
689         char                     *dev_mode;
690
691         /**
692          * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
693          *
694          * This value is passed to us by the front-end via the XenStore.
695          * Currently unused.
696          */
697         char                     *dev_type;
698
699         /**
700          * \brief Backend device/file identifier.
701          *
702          * This value is passed to us by the front-end via the XenStore.
703          * We expect this to be a POSIX path indicating the file or
704          * device to open.
705          */
706         char                     *dev_name;
707
708         /**
709          * Vnode corresponding to the backend device node or file
710          * we are accessing.
711          */
712         struct vnode             *vn;
713
714         union xbb_backend_data    backend;
715
716         /** The native sector size of the backend. */
717         u_int                     sector_size;
718
719         /** log2 of sector_size.  */
720         u_int                     sector_size_shift;
721
722         /** Size in bytes of the backend device or file.  */
723         off_t                     media_size;
724
725         /**
726          * \brief media_size expressed in terms of the backend native
727          *        sector size.
728          *
729          * (e.g. xbb->media_size >> xbb->sector_size_shift).
730          */
731         uint64_t                  media_num_sectors;
732
733         /**
734          * \brief Array of memoized scatter gather data computed during the
735          *        conversion of blkif ring requests to internal xbb_xen_req
736          *        structures.
737          *
738          * Ring processing is serialized so we only need one of these.
739          */
740         struct xbb_sg             xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
741
742         /**
743          * Temporary grant table map used in xbb_dispatch_io().  When
744          * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
745          * stack could cause a stack overflow.
746          */
747         struct gnttab_map_grant_ref   maps[XBB_MAX_SEGMENTS_PER_REQLIST];
748
749         /** Mutex protecting per-instance data. */
750         struct mtx                lock;
751
752 #ifdef XENHVM
753         /**
754          * Resource representing allocated physical address space
755          * associated with our per-instance kva region.
756          */
757         struct resource          *pseudo_phys_res;
758
759         /** Resource id for allocated physical address space. */
760         int                       pseudo_phys_res_id;
761 #endif
762
763         /**
764          * I/O statistics from BlockBack dispatch down.  These are
765          * coalesced requests, and we start them right before execution.
766          */
767         struct devstat           *xbb_stats;
768
769         /**
770          * I/O statistics coming into BlockBack.  These are the requests as
771          * we get them from BlockFront.  They are started as soon as we
772          * receive a request, and completed when the I/O is complete.
773          */
774         struct devstat           *xbb_stats_in;
775
776         /** Disable sending flush to the backend */
777         int                       disable_flush;
778
779         /** Send a real flush for every N flush requests */
780         int                       flush_interval;
781
782         /** Count of flush requests in the interval */
783         int                       flush_count;
784
785         /** Don't coalesce requests if this is set */
786         int                       no_coalesce_reqs;
787
788         /** Number of requests we have received */
789         uint64_t                  reqs_received;
790
791         /** Number of requests we have completed */
792         uint64_t                  reqs_completed;
793
794         /** How many forced dispatches (i.e. without coalescing) have happened */
795         uint64_t                  forced_dispatch;
796
797         /** How many normal dispatches have happened */
798         uint64_t                  normal_dispatch;
799
800         /** How many total dispatches have happened */
801         uint64_t                  total_dispatch;
802
803         /** How many times we have run out of KVA */
804         uint64_t                  kva_shortages;
805
806         /** How many times we have run out of request structures */
807         uint64_t                  request_shortages;
808 };
809
810 /*---------------------------- Request Processing ----------------------------*/
811 /**
812  * Allocate an internal transaction tracking structure from the free pool.
813  *
814  * \param xbb  Per-instance xbb configuration structure.
815  *
816  * \return  On success, a pointer to the allocated xbb_xen_req structure.
817  *          Otherwise NULL.
818  */
819 static inline struct xbb_xen_req *
820 xbb_get_req(struct xbb_softc *xbb)
821 {
822         struct xbb_xen_req *req;
823
824         req = NULL;
825
826         mtx_assert(&xbb->lock, MA_OWNED);
827
828         if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
829                 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
830                 xbb->active_request_count++;
831         }
832
833         return (req);
834 }
835
836 /**
837  * Return an allocated transaction tracking structure to the free pool.
838  *
839  * \param xbb  Per-instance xbb configuration structure.
840  * \param req  The request structure to free.
841  */
842 static inline void
843 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
844 {
845         mtx_assert(&xbb->lock, MA_OWNED);
846
847         STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
848         xbb->active_request_count--;
849
850         KASSERT(xbb->active_request_count >= 0,
851                 ("xbb_release_req: negative active count"));
852 }
853
854 /**
855  * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
856  *
857  * \param xbb       Per-instance xbb configuration structure.
858  * \param req_list  The list of requests to free.
859  * \param nreqs     The number of items in the list.
860  */
861 static inline void
862 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
863                  int nreqs)
864 {
865         mtx_assert(&xbb->lock, MA_OWNED);
866
867         STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
868         xbb->active_request_count -= nreqs;
869
870         KASSERT(xbb->active_request_count >= 0,
871                 ("xbb_release_reqs: negative active count"));
872 }
873
874 /**
875  * Given a page index and 512b sector offset within that page,
876  * calculate an offset into a request's kva region.
877  *
878  * \param reqlist The request structure whose kva region will be accessed.
879  * \param pagenr  The page index used to compute the kva offset.
880  * \param sector  The 512b sector index used to compute the page relative
881  *                kva offset.
882  *
883  * \return  The computed global KVA address.
884  */
885 static inline uint8_t *
886 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
887 {
888         return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
889 }
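/*
 * Example (illustrative, assuming 4KB pages): for pagenr = 2 and
 * sector = 3 this resolves to
 *
 *   reqlist->kva + (4096 * 2) + (3 << 9) == reqlist->kva + 9728
 *
 * i.e. 1536 bytes into the third mapped page.
 */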
890
891 #ifdef XBB_USE_BOUNCE_BUFFERS
892 /**
893  * Given a page index and 512b sector offset within that page,
894  * calculate an offset into a request's local bounce memory region.
895  *
896  * \param reqlist The request structure whose bounce region will be accessed.
897  * \param pagenr  The page index used to compute the bounce offset.
898  * \param sector  The 512b sector index used to compute the page relative
899  *                bounce offset.
900  *
901  * \return  The computed global bounce buffer address.
902  */
903 static inline uint8_t *
904 xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
905 {
906         return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
907 }
908 #endif
909
910 /**
911  * Given a page number and 512b sector offset within that page,
912  * calculate an offset into the request's memory region that the
913  * underlying backend device/file should use for I/O.
914  *
915  * \param reqlist The request structure whose I/O region will be accessed.
916  * \param pagenr  The page index used to compute the I/O offset.
917  * \param sector  The 512b sector index used to compute the page relative
918  *                I/O offset.
919  *
920  * \return  The computed global I/O address.
921  *
922  * Depending on configuration, this will either be a local bounce buffer
923  * or a pointer to the memory mapped in from the front-end domain for
924  * this request.
925  */
926 static inline uint8_t *
927 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
928 {
929 #ifdef XBB_USE_BOUNCE_BUFFERS
930         return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
931 #else
932         return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
933 #endif
934 }
935
936 /**
937  * Given a page index and 512b sector offset within that page, calculate
938  * an offset into the local pseudo-physical address space used to map a
939  * front-end's request data into a request.
940  *
941  * \param reqlist The request list structure whose pseudo-physical region
942  *                will be accessed.
943  * \param pagenr  The page index used to compute the pseudo-physical offset.
944  * \param sector  The 512b sector index used to compute the page relative
945  *                pseudo-physical offset.
946  *
947  * \return  The computed global pseudo-physical address.
948  *
949  * Unlike xbb_reqlist_ioaddr(), this address always falls within the
950  * driver's pseudo-physical grant region; it is the host_addr value
951  * used when mapping or unmapping this request's pages.
952  */
953 static inline uintptr_t
954 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
955 {
956         struct xbb_softc *xbb;
957
958         xbb = reqlist->xbb;
959
960         return ((uintptr_t)(xbb->gnt_base_addr +
961                 (uintptr_t)(reqlist->kva - xbb->kva) +
962                 (PAGE_SIZE * pagenr) + (sector << 9)));
963 }
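/*
 * Example (illustrative): the value is the reqlist's offset within the
 * global KVA pool re-based onto the pseudo-physical grant region.  For a
 * reqlist whose kva begins one page into the pool, pagenr = 0 and
 * sector = 0 yield
 *
 *   xbb->gnt_base_addr + PAGE_SIZE
 *
 * which is the host_addr later handed to the grant table map/unmap ops.
 */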
964
965 /**
966  * Get Kernel Virtual Address space for mapping requests.
967  *
968  * \param xbb         Per-instance xbb configuration structure.
969  * \param nr_pages    Number of pages needed.
972  *
973  * \return  On success, a pointer to the allocated KVA region.  Otherwise NULL.
974  *
975  * Note:  This should be unnecessary once we have either chaining or
976  * scatter/gather support for struct bio.  At that point we'll be able to
977  * put multiple addresses and lengths in one bio/bio chain and won't need
978  * to map everything into one virtual segment.
979  */
980 static uint8_t *
981 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
982 {
983         intptr_t first_clear, num_clear;
984         uint8_t *free_kva;
985         int i;
986
987         KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
988
989         first_clear = 0;
990         free_kva = NULL;
991
992         mtx_lock(&xbb->lock);
993
994         /*
995          * Look for the first available page.  If there are none, we're done.
996          */
997         bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
998
999         if (first_clear == -1)
1000                 goto bailout;
1001
1002         /*
1003          * Starting at the first available page, look for consecutive free
1004          * pages that will satisfy the user's request.
1005          */
1006         for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
1007                 /*
1008                  * If this is true, the page is used, so we have to reset
1009                  * the number of clear pages and the first clear page
1010                  * (since it pointed to a region with an insufficient number
1011                  * of clear pages).
1012                  */
1013                 if (bit_test(xbb->kva_free, i)) {
1014                         num_clear = 0;
1015                         first_clear = -1;
1016                         continue;
1017                 }
1018
1019                 if (first_clear == -1)
1020                         first_clear = i;
1021
1022                 /*
1023                  * If this is true, we've found a large enough free region
1024                  * to satisfy the request.
1025                  */
1026                 if (++num_clear == nr_pages) {
1027
1028                         bit_nset(xbb->kva_free, first_clear,
1029                                  first_clear + nr_pages - 1);
1030
1031                         free_kva = xbb->kva +
1032                                 (uint8_t *)(first_clear * PAGE_SIZE);
1033
1034                         KASSERT(free_kva >= (uint8_t *)xbb->kva &&
1035                                 free_kva + (nr_pages * PAGE_SIZE) <=
1036                                 (uint8_t *)xbb->ring_config.va,
1037                                 ("Free KVA %p len %d out of range, "
1038                                  "kva = %#jx, ring VA = %#jx\n", free_kva,
1039                                  nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
1040                                  (uintmax_t)xbb->ring_config.va));
1041                         break;
1042                 }
1043         }
1044
1045 bailout:
1046
1047         if (free_kva == NULL) {
1048                 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1049                 xbb->kva_shortages++;
1050         }
1051
1052         mtx_unlock(&xbb->lock);
1053
1054         return (free_kva);
1055 }
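/*
 * Minimal sketch (illustrative, assuming the bitstring(3) API and a
 * hypothetical 8 page pool) of the same first-fit scan performed above:
 *
 *   static bitstr_t bit_decl(map, 8);    0 = free, 1 = allocated
 *   int first_clear;
 *
 *   bit_nset(map, 0, 1);                 pages 0 and 1 are busy
 *   bit_ffc(map, 8, &first_clear);       first_clear == 2
 *   if (first_clear != -1)
 *           bit_nset(map, first_clear,   claim a 3 page run
 *                    first_clear + 3 - 1);
 *
 * The driver additionally walks forward from first_clear to verify the
 * run is long enough (nr_pages) before claiming it.
 */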
1056
1057 /**
1058  * Free allocated KVA.
1059  *
1060  * \param xbb       Per-instance xbb configuration structure.
1061  * \param kva_ptr   Pointer to allocated KVA region.  
1062  * \param nr_pages  Number of pages in the KVA region.
1063  */
1064 static void
1065 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
1066 {
1067         intptr_t start_page;
1068
1069         mtx_assert(&xbb->lock, MA_OWNED);
1070
1071         start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
1072         bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
1073
1074 }
1075
1076 /**
1077  * Unmap the front-end pages associated with this I/O request.
1078  *
1079  * \param reqlist  The request list structure to unmap.
1080  */
1081 static void
1082 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1083 {
1084         struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1085         u_int                         i;
1086         u_int                         invcount;
1087         int                           error;
1088
1089         invcount = 0;
1090         for (i = 0; i < reqlist->nr_segments; i++) {
1091
1092                 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1093                         continue;
1094
1095                 unmap[invcount].host_addr    = xbb_get_gntaddr(reqlist, i, 0);
1096                 unmap[invcount].dev_bus_addr = 0;
1097                 unmap[invcount].handle       = reqlist->gnt_handles[i];
1098                 reqlist->gnt_handles[i]      = GRANT_REF_INVALID;
1099                 invcount++;
1100         }
1101
1102         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1103                                           unmap, invcount);
1104         KASSERT(error == 0, ("Grant table operation failed"));
1105 }
1106
1107 /**
1108  * Allocate an internal request list tracking structure from the free pool.
1109  *
1110  * \param xbb  Per-instance xbb configuration structure.
1111  *
1112  * \return  On success, a pointer to the allocated xbb_xen_reqlist structure.
1113  *          Otherwise NULL.
1114  */
1115 static inline struct xbb_xen_reqlist *
1116 xbb_get_reqlist(struct xbb_softc *xbb)
1117 {
1118         struct xbb_xen_reqlist *reqlist;
1119
1120         reqlist = NULL;
1121
1122         mtx_assert(&xbb->lock, MA_OWNED);
1123
1124         if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1125
1126                 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1127                 reqlist->flags = XBB_REQLIST_NONE;
1128                 reqlist->kva = NULL;
1129                 reqlist->status = BLKIF_RSP_OKAY;
1130                 reqlist->residual_512b_sectors = 0;
1131                 reqlist->num_children = 0;
1132                 reqlist->nr_segments = 0;
1133                 STAILQ_INIT(&reqlist->contig_req_list);
1134         }
1135
1136         return (reqlist);
1137 }
1138
1139 /**
1140  * Return an allocated transaction tracking structure to the free pool.
1141  *
1142  * \param xbb        Per-instance xbb configuration structure.
1143  * \param reqlist    The request list structure to free.
1144  * \param wakeup     If set, wakeup the work thread if freeing this reqlist
1145  *                   during a resource shortage condition.
1146  */
1147 static inline void
1148 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1149                     int wakeup)
1150 {
1151
1152         mtx_lock(&xbb->lock);
1153
1154         if (wakeup) {
1155                 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1156                 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1157         }
1158
1159         if (reqlist->kva != NULL)
1160                 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1161
1162         xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1163
1164         STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1165
1166         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1167                 /*
1168                  * Shutdown is in progress.  See if we can
1169                  * progress further now that one more request
1170                  * has completed and been returned to the
1171                  * free pool.
1172                  */
1173                 xbb_shutdown(xbb);
1174         }
1175
1176         mtx_unlock(&xbb->lock);
1177
1178         if (wakeup != 0)
1179                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1180 }
1181
1182 /**
1183  * Request resources and do basic request setup.
1184  *
1185  * \param xbb          Per-instance xbb configuration structure.
1186  * \param reqlist      Pointer to reqlist pointer.
1187  * \param ring_req     Pointer to a block ring request.
1188  * \param ring_idx     The ring index of this request.
1189  *
1190  * \return  0 for success, non-zero for failure.
1191  */
1192 static int
1193 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1194                   blkif_request_t *ring_req, RING_IDX ring_idx)
1195 {
1196         struct xbb_xen_reqlist *nreqlist;
1197         struct xbb_xen_req     *nreq;
1198
1199         nreqlist = NULL;
1200         nreq     = NULL;
1201
1202         mtx_lock(&xbb->lock);
1203
1204         /*
1205          * We don't allow new resources to be allocated if we're in the
1206          * process of shutting down.
1207          */
1208         if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1209                 mtx_unlock(&xbb->lock);
1210                 return (1);
1211         }
1212
1213         /*
1214          * Allocate a reqlist if the caller doesn't have one already.
1215          */
1216         if (*reqlist == NULL) {
1217                 nreqlist = xbb_get_reqlist(xbb);
1218                 if (nreqlist == NULL)
1219                         goto bailout_error;
1220         }
1221
1222         /* We always allocate a request. */
1223         nreq = xbb_get_req(xbb);
1224         if (nreq == NULL)
1225                 goto bailout_error;
1226
1227         mtx_unlock(&xbb->lock);
1228
1229         if (*reqlist == NULL) {
1230                 *reqlist = nreqlist;
1231                 nreqlist->operation = ring_req->operation;
1232                 nreqlist->starting_sector_number = ring_req->sector_number;
1233                 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1234                                    links);
1235         }
1236
1237         nreq->reqlist = *reqlist;
1238         nreq->req_ring_idx = ring_idx;
1239
1240         if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1241                 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1242                 nreq->ring_req = &nreq->ring_req_storage;
1243         } else {
1244                 nreq->ring_req = ring_req;
1245         }
1246
1247         binuptime(&nreq->ds_t0);
1248         devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1249         STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1250         (*reqlist)->num_children++;
1251         (*reqlist)->nr_segments += ring_req->nr_segments;
1252
1253         return (0);
1254
1255 bailout_error:
1256
1257         /*
1258          * We're out of resources, so set the shortage flag.  The next time
1259          * a request is released, we'll try waking up the work thread to
1260          * see if we can allocate more resources.
1261          */
1262         xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1263         xbb->request_shortages++;
1264
1265         if (nreq != NULL)
1266                 xbb_release_req(xbb, nreq);
1267
1268         mtx_unlock(&xbb->lock);
1269
1270         if (nreqlist != NULL)
1271                 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1272
1273         return (1);
1274 }
1275
1276 /**
1277  * Create and transmit a response to a blkif request.
1278  * 
1279  * \param xbb     Per-instance xbb configuration structure.
1280  * \param req     The request structure to which to respond.
1281  * \param status  The status code to report.  See BLKIF_RSP_*
1282  *                in sys/xen/interface/io/blkif.h.
1283  */
1284 static void
1285 xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1286 {
1287         blkif_response_t *resp;
1288         int               more_to_do;
1289         int               notify;
1290
1291         more_to_do = 0;
1292
1293         /*
1294          * Place on the response ring for the relevant domain.
1295          * For now, only the spacing between entries is different
1296          * in the different ABIs, not the response entry layout.
1297          */
1298         mtx_lock(&xbb->lock);
1299         switch (xbb->abi) {
1300         case BLKIF_PROTOCOL_NATIVE:
1301                 resp = RING_GET_RESPONSE(&xbb->rings.native,
1302                                          xbb->rings.native.rsp_prod_pvt);
1303                 break;
1304         case BLKIF_PROTOCOL_X86_32:
1305                 resp = (blkif_response_t *)
1306                     RING_GET_RESPONSE(&xbb->rings.x86_32,
1307                                       xbb->rings.x86_32.rsp_prod_pvt);
1308                 break;
1309         case BLKIF_PROTOCOL_X86_64:
1310                 resp = (blkif_response_t *)
1311                     RING_GET_RESPONSE(&xbb->rings.x86_64,
1312                                       xbb->rings.x86_64.rsp_prod_pvt);
1313                 break;
1314         default:
1315                 panic("Unexpected blkif protocol ABI.");
1316         }
1317
1318         resp->id        = req->id;
1319         resp->operation = req->operation;
1320         resp->status    = status;
1321
1322         xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
1323         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
1324
1325         if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1326
1327                 /*
1328                  * Tail check for pending requests. Allows frontend to avoid
1329                  * notifications if requests are already in flight (lower
1330                  * overheads and promotes batching).
1331                  */
1332                 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1333         } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1334
1335                 more_to_do = 1;
1336         }
1337
1338         xbb->reqs_completed++;
1339
1340         mtx_unlock(&xbb->lock);
1341
1342         if (more_to_do)
1343                 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
1344
1345         if (notify)
1346                 notify_remote_via_irq(xbb->irq);
1347 }
1348
1349 /**
1350  * Complete a request list.
1351  *
1352  * \param xbb        Per-instance xbb configuration structure.
1353  * \param reqlist    Allocated internal request list structure.
1354  */
1355 static void
1356 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1357 {
1358         struct xbb_xen_req *nreq;
1359         off_t               sectors_sent;
1360
1361         sectors_sent = 0;
1362
1363         if (reqlist->flags & XBB_REQLIST_MAPPED)
1364                 xbb_unmap_reqlist(reqlist);
1365
1366         /*
1367          * All I/O is done, send the response.  A lock should not be
1368          * necessary here because the request list is complete, and
1369          * therefore this is the only context accessing this request
1370          * right now.  The functions we call do their own locking if
1371          * necessary.
1372          */
1373         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1374                 off_t cur_sectors_sent;
1375
1376                 xbb_send_response(xbb, nreq, reqlist->status);
1377
1378                 /* We don't report bytes sent if there is an error. */
1379                 if (reqlist->status == BLKIF_RSP_OKAY)
1380                         cur_sectors_sent = nreq->nr_512b_sectors;
1381                 else
1382                         cur_sectors_sent = 0;
1383
1384                 sectors_sent += cur_sectors_sent;
1385
1386                 devstat_end_transaction(xbb->xbb_stats_in,
1387                                         /*bytes*/cur_sectors_sent << 9,
1388                                         reqlist->ds_tag_type,
1389                                         reqlist->ds_trans_type,
1390                                         /*now*/NULL,
1391                                         /*then*/&nreq->ds_t0);
1392         }
1393
1394         /*
1395          * Take out any sectors not sent.  If we wind up negative (which
1396          * might happen if an error is reported as well as a residual), just
1397          * report 0 sectors sent.
1398          */
1399         sectors_sent -= reqlist->residual_512b_sectors;
1400         if (sectors_sent < 0)
1401                 sectors_sent = 0;
1402
1403         devstat_end_transaction(xbb->xbb_stats,
1404                                 /*bytes*/ sectors_sent << 9,
1405                                 reqlist->ds_tag_type,
1406                                 reqlist->ds_trans_type,
1407                                 /*now*/NULL,
1408                                 /*then*/&reqlist->ds_t0);
1409
1410         xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1411 }
1412
1413 /**
1414  * Completion handler for buffer I/O requests issued by the device
1415  * backend driver.
1416  *
1417  * \param bio  The buffer I/O request on which to perform completion
1418  *             processing.
1419  */
1420 static void
1421 xbb_bio_done(struct bio *bio)
1422 {
1423         struct xbb_softc       *xbb;
1424         struct xbb_xen_reqlist *reqlist;
1425
1426         reqlist = bio->bio_caller1;
1427         xbb     = reqlist->xbb;
1428
1429         reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1430
1431         /*
1432          * This is a bit imprecise.  With aggregated I/O a single
1433          * request list can contain multiple front-end requests, and
1434          * multiple bios may point to a single request.  By carefully
1435          * walking the request list, we could map residuals and errors
1436          * back to the original front-end request, but the interface
1437          * isn't sufficiently rich for us to properly report the error.
1438          * So, we just treat the entire request list as having failed if an
1439          * error occurs on any part.  And, if an error occurs, we treat
1440          * the amount of data transferred as 0.
1441          *
1442          * For residuals, we report it on the overall aggregated device,
1443          * but not on the individual requests, since we don't currently
1444          * do the work to determine which front-end request to which the
1445          * residual applies.
1446          */
1447         if (bio->bio_error) {
1448                 DPRINTF("BIO returned error %d for operation on device %s\n",
1449                         bio->bio_error, xbb->dev_name);
1450                 reqlist->status = BLKIF_RSP_ERROR;
1451
1452                 if (bio->bio_error == ENXIO
1453                  && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1454
1455                         /*
1456                          * Backend device has disappeared.  Signal the
1457                          * front-end that we (the device proxy) want to
1458                          * go away.
1459                          */
1460                         xenbus_set_state(xbb->dev, XenbusStateClosing);
1461                 }
1462         }
1463
1464 #ifdef XBB_USE_BOUNCE_BUFFERS
1465         if (bio->bio_cmd == BIO_READ) {
1466                 vm_offset_t kva_offset;
1467
1468                 kva_offset = (vm_offset_t)bio->bio_data
1469                            - (vm_offset_t)reqlist->bounce;
1470                 memcpy((uint8_t *)reqlist->kva + kva_offset,
1471                        bio->bio_data, bio->bio_bcount);
1472         }
1473 #endif /* XBB_USE_BOUNCE_BUFFERS */
1474
1475         /*
1476          * Decrement the pending count for the request list.  When we're
1477          * done with the requests, send status back for all of them.
1478          */
1479         if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1480                 xbb_complete_reqlist(xbb, reqlist);
1481
1482         g_destroy_bio(bio);
1483 }
1484
1485 /**
1486  * Parse a blkif request into an internal request structure and send
1487  * it to the backend for processing.
1488  *
1489  * \param xbb       Per-instance xbb configuration structure.
1490  * \param reqlist   Allocated internal request list structure.
1491  *
1492  * \return          On success, 0.  For resource shortages, non-zero.
1493  *  
1494  * This routine performs the backend common aspects of request parsing
1495  * including compiling an internal request structure, parsing the S/G
1496  * list and any secondary ring requests in which they may reside, and
1497  * the mapping of front-end I/O pages into our domain.
1498  */
1499 static int
1500 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1501 {
1502         struct xbb_sg                *xbb_sg;
1503         struct gnttab_map_grant_ref  *map;
1504         struct blkif_request_segment *sg;
1505         struct blkif_request_segment *last_block_sg;
1506         struct xbb_xen_req           *nreq;
1507         u_int                         nseg;
1508         u_int                         seg_idx;
1509         u_int                         block_segs;
1510         int                           nr_sects;
1511         int                           total_sects;
1512         int                           operation;
1513         uint8_t                       bio_flags;
1514         int                           error;
1515
1516         reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1517         bio_flags            = 0;
1518         total_sects          = 0;
1519         nr_sects             = 0;
1520
1521         /*
1522          * First determine whether we have enough free KVA to satisfy this
1523          * request list.  If not, tell xbb_run_queue() so it can go to
1524          * sleep until we have more KVA.
1525          */
1526         reqlist->kva = NULL;
1527         if (reqlist->nr_segments != 0) {
1528                 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1529                 if (reqlist->kva == NULL) {
1530                         /*
1531                          * If we're out of KVA, return ENOMEM.
1532                          */
1533                         return (ENOMEM);
1534                 }
1535         }
1536
1537         binuptime(&reqlist->ds_t0);
1538         devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1539
1540         switch (reqlist->operation) {
1541         case BLKIF_OP_WRITE_BARRIER:
1542                 bio_flags       |= BIO_ORDERED;
1543                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1544                 /* FALLTHROUGH */
1545         case BLKIF_OP_WRITE:
1546                 operation = BIO_WRITE;
1547                 reqlist->ds_trans_type = DEVSTAT_WRITE;
1548                 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1549                         DPRINTF("Attempt to write to read only device %s\n",
1550                                 xbb->dev_name);
1551                         reqlist->status = BLKIF_RSP_ERROR;
1552                         goto send_response;
1553                 }
1554                 break;
1555         case BLKIF_OP_READ:
1556                 operation = BIO_READ;
1557                 reqlist->ds_trans_type = DEVSTAT_READ;
1558                 break;
1559         case BLKIF_OP_FLUSH_DISKCACHE:
1560                 /*
1561                  * If the user has requested that we disable flush
1562                  * support, just complete flush requests successfully
1563                  * without passing them to the backend.
1564                  */
1565                 if (xbb->disable_flush != 0) {
1566                         goto send_response;
1567                 }
1568
1569                 /*
1570                  * The user has requested that we only send a real flush
1571                  * for every N flush requests.  So keep count, and either
1572                  * complete the request immediately or queue it for the
1573                  * backend.
1574                  */
1575                 if (xbb->flush_interval != 0) {
1576                         if (++(xbb->flush_count) < xbb->flush_interval) {
1577                                 goto send_response;
1578                         } else
1579                                 xbb->flush_count = 0;
1580                 }
1581
1582                 operation = BIO_FLUSH;
1583                 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1584                 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1585                 goto do_dispatch;
1586                 /*NOTREACHED*/
1587         default:
1588                 DPRINTF("error: unknown block io operation [%d]\n",
1589                         reqlist->operation);
1590                 reqlist->status = BLKIF_RSP_ERROR;
1591                 goto send_response;
1592         }
1593
1594         reqlist->xbb  = xbb;
1595         xbb_sg        = xbb->xbb_sgs;
1596         map           = xbb->maps;
1597         seg_idx       = 0;
1598
1599         STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1600                 blkif_request_t         *ring_req;
1601                 RING_IDX                 req_ring_idx;
1602                 u_int                    req_seg_idx;
1603
1604                 ring_req              = nreq->ring_req;
1605                 req_ring_idx          = nreq->req_ring_idx;
1606                 nr_sects              = 0;
1607                 nseg                  = ring_req->nr_segments;
1608                 nreq->id              = ring_req->id;
1609                 nreq->nr_pages        = nseg;
1610                 nreq->nr_512b_sectors = 0;
1611                 req_seg_idx           = 0;
1612                 sg                    = NULL;
1613
1614                 /* Check that number of segments is sane. */
1615                 if (unlikely(nseg == 0)
1616                  || unlikely(nseg > xbb->max_request_segments)) {
1617                         DPRINTF("Bad number of segments in request (%d)\n",
1618                                 nseg);
1619                         reqlist->status = BLKIF_RSP_ERROR;
1620                         goto send_response;
1621                 }
1622
1623                 block_segs    = MIN(nreq->nr_pages,
1624                                     BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1625                 sg            = ring_req->seg;
1626                 last_block_sg = sg + block_segs;
1627                 while (1) {
1628
1629                         while (sg < last_block_sg) {
1630                                 KASSERT(seg_idx <
1631                                         XBB_MAX_SEGMENTS_PER_REQLIST,
1632                                         ("seg_idx %d is too large, max "
1633                                         "segs %d\n", seg_idx,
1634                                         XBB_MAX_SEGMENTS_PER_REQLIST));
1635                         
1636                                 xbb_sg->first_sect = sg->first_sect;
1637                                 xbb_sg->last_sect  = sg->last_sect;
1638                                 xbb_sg->nsect =
1639                                     (int8_t)(sg->last_sect -
1640                                     sg->first_sect + 1);
1641
1642                                 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1643                                  || (xbb_sg->nsect <= 0)) {
1644                                         reqlist->status = BLKIF_RSP_ERROR;
1645                                         goto send_response;
1646                                 }
1647
1648                                 nr_sects += xbb_sg->nsect;
1649                                 map->host_addr = xbb_get_gntaddr(reqlist,
1650                                                         seg_idx, /*sector*/0);
1651                                 KASSERT(map->host_addr + PAGE_SIZE <=
1652                                         xbb->ring_config.gnt_addr,
1653                                         ("Host address %#jx len %d overlaps "
1654                                          "ring address %#jx\n",
1655                                         (uintmax_t)map->host_addr, PAGE_SIZE,
1656                                         (uintmax_t)xbb->ring_config.gnt_addr));
1657                                         
1658                                 map->flags     = GNTMAP_host_map;
1659                                 map->ref       = sg->gref;
1660                                 map->dom       = xbb->otherend_id;
1661                                 if (operation == BIO_WRITE)
1662                                         map->flags |= GNTMAP_readonly;
1663                                 sg++;
1664                                 map++;
1665                                 xbb_sg++;
1666                                 seg_idx++;
1667                                 req_seg_idx++;
1668                         }
1669
1670                         block_segs = MIN(nseg - req_seg_idx,
1671                                          BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1672                         if (block_segs == 0)
1673                                 break;
1674
1675                         /*
1676                          * Fetch the next request block full of SG elements.
1677                          * For now, only the spacing between entries is
1678                          * different in the different ABIs, not the sg entry
1679                          * layout.
1680                          */
1681                         req_ring_idx++;
1682                         switch (xbb->abi) {
1683                         case BLKIF_PROTOCOL_NATIVE:
1684                                 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
1685                                                             req_ring_idx);
1686                                 break;
1687                         case BLKIF_PROTOCOL_X86_32:
1688                         {
1689                                 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
1690                                                             req_ring_idx);
1691                                 break;
1692                         }
1693                         case BLKIF_PROTOCOL_X86_64:
1694                         {
1695                                 sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
1696                                                             req_ring_idx);
1697                                 break;
1698                         }
1699                         default:
1700                                 panic("Unexpected blkif protocol ABI.");
1701                                 /* NOTREACHED */
1702                         } 
1703                         last_block_sg = sg + block_segs;
1704                 }
1705
1706                 /* Convert to the disk's sector size */
1707                 nreq->nr_512b_sectors = nr_sects;
1708                 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1709                 total_sects += nr_sects;
1710
1711                 if ((nreq->nr_512b_sectors &
1712                     ((xbb->sector_size >> 9) - 1)) != 0) {
1713                         device_printf(xbb->dev, "%s: I/O size (%d) is not "
1714                                       "a multiple of the backing store sector "
1715                                       "size (%d)\n", __func__,
1716                                       nreq->nr_512b_sectors << 9,
1717                                       xbb->sector_size);
1718                         reqlist->status = BLKIF_RSP_ERROR;
1719                         goto send_response;
1720                 }
1721         }
1722
1723         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1724                                           xbb->maps, reqlist->nr_segments);
1725         if (error != 0)
1726                 panic("Grant table operation failed (%d)", error);
1727
1728         reqlist->flags |= XBB_REQLIST_MAPPED;
1729
1730         for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1731              seg_idx++, map++){
1732
1733                 if (unlikely(map->status != 0)) {
1734                         DPRINTF("invalid buffer -- could not remap "
1735                                 "it (%d)\n", map->status);
1736                         DPRINTF("Mapping(%d): Host Addr 0x%lx, flags "
1737                                 "0x%x ref 0x%x, dom %d\n", seg_idx,
1738                                 map->host_addr, map->flags, map->ref,
1739                                 map->dom);
1740                         reqlist->status = BLKIF_RSP_ERROR;
1741                         goto send_response;
1742                 }
1743
1744                 reqlist->gnt_handles[seg_idx] = map->handle;
1745         }
1746         if (reqlist->starting_sector_number + total_sects >
1747             xbb->media_num_sectors) {
1748
1749                 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1750                         "extends past end of device %s\n",
1751                         operation == BIO_READ ? "read" : "write",
1752                         reqlist->starting_sector_number,
1753                         reqlist->starting_sector_number + total_sects,
1754                         xbb->dev_name); 
1755                 reqlist->status = BLKIF_RSP_ERROR;
1756                 goto send_response;
1757         }
1758
1759 do_dispatch:
1760
1761         error = xbb->dispatch_io(xbb,
1762                                  reqlist,
1763                                  operation,
1764                                  bio_flags);
1765
1766         if (error != 0) {
1767                 reqlist->status = BLKIF_RSP_ERROR;
1768                 goto send_response;
1769         }
1770
1771         return (0);
1772
1773 send_response:
1774
1775         xbb_complete_reqlist(xbb, reqlist);
1776
1777         return (0);
1778 }
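
/*
 * Illustrative sketch (not part of the original source): the per-segment
 * grant mapping that xbb_dispatch_io() performs above, reduced to a single
 * hypothetical grant reference.  As in the loop above, GNTMAP_readonly is
 * requested for writes because the backend only reads the front-end's
 * pages in that case.  Error handling is simplified.
 */
static int
example_map_one_grant(struct xbb_softc *xbb, grant_ref_t gref,
                      vm_offset_t host_addr, int write_op,
                      grant_handle_t *handle)
{
        struct gnttab_map_grant_ref map;
        int error;

        map.host_addr = host_addr;
        map.flags     = GNTMAP_host_map;
        map.ref       = gref;
        map.dom       = xbb->otherend_id;
        if (write_op)
                map.flags |= GNTMAP_readonly;

        error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &map, 1);
        if (error != 0 || map.status != 0)
                return (EFAULT);

        *handle = map.handle;
        return (0);
}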
1779
1780 static __inline int
1781 xbb_count_sects(blkif_request_t *ring_req)
1782 {
1783         int i;
1784         int cur_size = 0;
1785
1786         for (i = 0; i < ring_req->nr_segments; i++) {
1787                 int nsect;
1788
1789                 nsect = (int8_t)(ring_req->seg[i].last_sect -
1790                         ring_req->seg[i].first_sect + 1);
1791                 if (nsect <= 0)
1792                         break;
1793
1794                 cur_size += nsect;
1795         }
1796
1797         return (cur_size);
1798 }
1799
1800 /**
1801  * Process incoming requests from the shared communication ring in response
1802  * to a signal on the ring's event channel.
1803  *
1804  * \param context  Callback argument registered during task initialization -
1805  *                 the xbb_softc for this instance.
1806  * \param pending  The number of taskqueue_enqueue events that have
1807  *                 occurred since this handler was last run.
1808  */
1809 static void
1810 xbb_run_queue(void *context, int pending)
1811 {
1812         struct xbb_softc       *xbb;
1813         blkif_back_rings_t     *rings;
1814         RING_IDX                rp;
1815         uint64_t                cur_sector;
1816         int                     cur_operation;
1817         struct xbb_xen_reqlist *reqlist;
1818
1819
1820         xbb           = (struct xbb_softc *)context;
1821         rings         = &xbb->rings;
1822
1823         /*
1824          * Work gather and dispatch loop.  Note that we have a bias here
1825          * towards gathering I/O sent by blockfront.  We first gather up
1826          * everything in the ring, as long as we have resources.  Then we
1827          * dispatch one request, and then attempt to gather up any
1828          * additional requests that have come in while we were dispatching
1829          * the request.
1830          *
1831          * This allows us to get a clearer picture (via devstat) of how
1832          * many requests blockfront is queueing to us at any given time.
1833          */
1834         for (;;) {
1835                 int retval;
1836
1837                 /*
1838                  * Initialize reqlist to the last element in the pending
1839                  * queue, if there is one.  This allows us to add more
1840                  * requests to that request list, if we have room.
1841                  */
1842                 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1843                                       xbb_xen_reqlist, links);
1844                 if (reqlist != NULL) {
1845                         cur_sector = reqlist->next_contig_sector;
1846                         cur_operation = reqlist->operation;
1847                 } else {
1848                         cur_operation = 0;
1849                         cur_sector    = 0;
1850                 }
1851
1852                 /*
1853                  * Cache req_prod to avoid accessing a cache line shared
1854                  * with the frontend.
1855                  */
1856                 rp = rings->common.sring->req_prod;
1857
1858                 /* Ensure we see queued requests up to 'rp'. */
1859                 rmb();
1860
1861                 /**
1862                  * Run so long as there is work to consume and the generation
1863                  * of a response will not overflow the ring.
1864                  *
1865                  * @note There's a 1 to 1 relationship between requests and
1866                  *       responses, so an overflow should never occur.  This
1867                  *       test is to protect our domain from digesting bogus
1868                  *       data.  Shouldn't we log this?
1869                  */
1870                 while (rings->common.req_cons != rp
1871                     && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1872                                                   rings->common.req_cons) == 0){
1873                         blkif_request_t         ring_req_storage;
1874                         blkif_request_t        *ring_req;
1875                         int                     cur_size;
1876
1877                         switch (xbb->abi) {
1878                         case BLKIF_PROTOCOL_NATIVE:
1879                                 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1880                                     rings->common.req_cons);
1881                                 break;
1882                         case BLKIF_PROTOCOL_X86_32:
1883                         {
1884                                 struct blkif_x86_32_request *ring_req32;
1885
1886                                 ring_req32 = RING_GET_REQUEST(
1887                                     &xbb->rings.x86_32, rings->common.req_cons);
1888                                 blkif_get_x86_32_req(&ring_req_storage,
1889                                                      ring_req32);
1890                                 ring_req = &ring_req_storage;
1891                                 break;
1892                         }
1893                         case BLKIF_PROTOCOL_X86_64:
1894                         {
1895                                 struct blkif_x86_64_request *ring_req64;
1896
1897                                 ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1898                                     rings->common.req_cons);
1899                                 blkif_get_x86_64_req(&ring_req_storage,
1900                                                      ring_req64);
1901                                 ring_req = &ring_req_storage;
1902                                 break;
1903                         }
1904                         default:
1905                                 panic("Unexpected blkif protocol ABI.");
1906                                 /* NOTREACHED */
1907                         } 
1908
1909                         /*
1910                          * Check for situations that would require closing
1911                          * off this I/O for further coalescing:
1912                          *  - Coalescing is turned off.
1913                          *  - Current I/O is out of sequence with the previous
1914                          *    I/O.
1915                          *  - Coalesced I/O would be too large.
1916                          */
1917                         if ((reqlist != NULL)
1918                          && ((xbb->no_coalesce_reqs != 0)
1919                           || ((xbb->no_coalesce_reqs == 0)
1920                            && ((ring_req->sector_number != cur_sector)
1921                             || (ring_req->operation != cur_operation)
1922                             || ((ring_req->nr_segments + reqlist->nr_segments) >
1923                                  xbb->max_reqlist_segments))))) {
1924                                 reqlist = NULL;
1925                         }
1926
1927                         /*
1928                          * Grab and check for all resources in one shot.
1929                          * If we can't get all of the resources we need,
1930                          * the shortage is noted and the thread will get
1931                          * woken up when more resources are available.
1932                          */
1933                         retval = xbb_get_resources(xbb, &reqlist, ring_req,
1934                                                    xbb->rings.common.req_cons);
1935
1936                         if (retval != 0) {
1937                                 /*
1938                                  * Resource shortage has been recorded.
1939                                  * We'll be scheduled to run once a request
1940                                  * object frees up due to a completion.
1941                                  */
1942                                 break;
1943                         }
1944
1945                         /*
1946                          * Signify that we can overwrite this request with
1947                          * a response by incrementing our consumer index.
1948                          * The response won't be generated until after
1949                          * we've already consumed all necessary data out
1950                          * of the version of the request in the ring buffer
1951                          * (for native mode).  We must update the consumer
1952          * index before issuing back-end I/O so there is
1953                          * no possibility that it will complete and a
1954                          * response be generated before we make room in 
1955                          * the queue for that response.
1956                          */
1957                         xbb->rings.common.req_cons +=
1958                             BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
1959                         xbb->reqs_received++;
1960
1961                         cur_size = xbb_count_sects(ring_req);
1962                         cur_sector = ring_req->sector_number + cur_size;
1963                         reqlist->next_contig_sector = cur_sector;
1964                         cur_operation = ring_req->operation;
1965                 }
1966
1967                 /* Check for I/O to dispatch */
1968                 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1969                 if (reqlist == NULL) {
1970                         /*
1971                          * We're out of work to do, put the task queue to
1972                          * sleep.
1973                          */
1974                         break;
1975                 }
1976
1977                 /*
1978                  * Grab the first request off the queue and attempt
1979                  * to dispatch it.
1980                  */
1981                 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1982
1983                 retval = xbb_dispatch_io(xbb, reqlist);
1984                 if (retval != 0) {
1985                         /*
1986                          * xbb_dispatch_io() returns non-zero only when
1987                          * there is a resource shortage.  If that's the
1988                          * case, re-queue this request on the head of the
1989                          * queue, and go to sleep until we have more
1990                          * resources.
1991                          */
1992                         STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1993                                            reqlist, links);
1994                         break;
1995                 } else {
1996                         /*
1997                          * If we still have anything on the queue after
1998                          * removing the head entry, that is because we
1999                          * met one of the criteria to create a new
2000                          * request list (outlined above), and we'll call
2001                          * that a forced dispatch for statistical purposes.
2002                          *
2003                          * Otherwise, if there is only one element on the
2004                          * queue, we coalesced everything available on
2005                          * the ring and we'll call that a normal dispatch.
2006                          */
2007                         reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
2008
2009                         if (reqlist != NULL)
2010                                 xbb->forced_dispatch++;
2011                         else
2012                                 xbb->normal_dispatch++;
2013
2014                         xbb->total_dispatch++;
2015                 }
2016         }
2017 }
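
/*
 * Illustrative sketch (not part of the original source): the coalescing
 * test from xbb_run_queue() above, restated as a predicate.  A new ring
 * request is appended to the current request list only when coalescing is
 * enabled, the request continues the previous I/O's sector sequence and
 * operation, and the combined segment count stays within bounds.  The
 * function name is hypothetical.
 */
static int
example_can_coalesce(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
                     blkif_request_t *ring_req, uint64_t cur_sector,
                     int cur_operation)
{
        if (reqlist == NULL || xbb->no_coalesce_reqs != 0)
                return (0);
        if (ring_req->sector_number != cur_sector)
                return (0);
        if (ring_req->operation != cur_operation)
                return (0);
        if (ring_req->nr_segments + reqlist->nr_segments >
            xbb->max_reqlist_segments)
                return (0);
        return (1);
}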
2018
2019 /**
2020  * Interrupt handler bound to the shared ring's event channel.
2021  *
2022  * \param arg  Callback argument registered during event channel
2023  *             binding - the xbb_softc for this instance.
2024  */
2025 static void
2026 xbb_intr(void *arg)
2027 {
2028         struct xbb_softc *xbb;
2029
2030         /* Defer to kernel thread. */
2031         xbb = (struct xbb_softc *)arg;
2032         taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); 
2033 }
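
/*
 * Illustrative sketch (not part of the original source): one way the
 * io_task/io_taskqueue pair referenced by xbb_intr() can be initialized.
 * The driver's real attach path is not shown in this excerpt and may use
 * different flags, priorities, and naming.
 */
static int
example_setup_io_taskqueue(struct xbb_softc *xbb)
{
        /* Run xbb_run_queue() whenever the task is enqueued. */
        TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);

        xbb->io_taskqueue = taskqueue_create("xbb example", M_NOWAIT,
                                             taskqueue_thread_enqueue,
                                             &xbb->io_taskqueue);
        if (xbb->io_taskqueue == NULL)
                return (ENOMEM);

        taskqueue_start_threads(&xbb->io_taskqueue, /*count*/1, PWAIT,
                                "xbb example taskq");
        return (0);
}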
2034
2035 /*----------------------------- Backend Handlers -----------------------------*/
2036 /**
2037  * Backend handler for character device access.
2038  *
2039  * \param xbb        Per-instance xbb configuration structure.
2040  * \param reqlist    Allocated internal request list structure.
2041  * \param operation  BIO_* I/O operation code.
2042  * \param bio_flags  Additional bio_flag data to pass to any generated
2043  *                   bios (e.g. BIO_ORDERED).
2044  *
2045  * \return  0 for success, errno codes for failure.
2046  */
2047 static int
2048 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2049                  int operation, int bio_flags)
2050 {
2051         struct xbb_dev_data *dev_data;
2052         struct bio          *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
2053         struct xbb_xen_req  *nreq;
2054         off_t                bio_offset;
2055         struct bio          *bio;
2056         struct xbb_sg       *xbb_sg;
2057         u_int                nbio;
2058         u_int                bio_idx;
2059         u_int                nseg;
2060         u_int                seg_idx;
2061         int                  error;
2062
2063         dev_data   = &xbb->backend.dev;
2064         bio_offset = (off_t)reqlist->starting_sector_number
2065                    << xbb->sector_size_shift;
2066         error      = 0;
2067         nbio       = 0;
2068         bio_idx    = 0;
2069
2070         if (operation == BIO_FLUSH) {
2071                 nreq = STAILQ_FIRST(&reqlist->contig_req_list);
2072                 bio = g_new_bio();
2073                 if (unlikely(bio == NULL)) {
2074                         DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
2075                         error = ENOMEM;
2076                         return (error);
2077                 }
2078
2079                 bio->bio_cmd     = BIO_FLUSH;
2080                 bio->bio_flags  |= BIO_ORDERED;
2081                 bio->bio_dev     = dev_data->cdev;
2082                 bio->bio_offset  = 0;
2083                 bio->bio_data    = NULL;
2084                 bio->bio_done    = xbb_bio_done;
2085                 bio->bio_caller1 = nreq;
2086                 bio->bio_pblkno  = 0;
2087
2088                 nreq->pendcnt    = 1;
2089
2090                 (*dev_data->csw->d_strategy)(bio);
2091
2092                 return (0);
2093         }
2094
2095         xbb_sg = xbb->xbb_sgs;
2096         bio    = NULL;
2097         nseg = reqlist->nr_segments;
2098
2099         for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2100
2101                 /*
2102                  * KVA will not be contiguous, so any additional
2103                  * I/O will need to be represented in a new bio.
2104                  */
2105                 if ((bio != NULL)
2106                  && (xbb_sg->first_sect != 0)) {
2107                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2108                                 printf("%s: Discontiguous I/O request "
2109                                        "from domain %d ends on "
2110                                        "non-sector boundary\n",
2111                                        __func__, xbb->otherend_id);
2112                                 error = EINVAL;
2113                                 goto fail_free_bios;
2114                         }
2115                         bio = NULL;
2116                 }
2117
2118                 if (bio == NULL) {
2119                         /*
2120                          * Make sure that the start of this bio is
2121                          * aligned to a device sector.
2122                          */
2123                         if ((bio_offset & (xbb->sector_size - 1)) != 0){
2124                                 printf("%s: Misaligned I/O request "
2125                                        "from domain %d\n", __func__,
2126                                        xbb->otherend_id);
2127                                 error = EINVAL;
2128                                 goto fail_free_bios;
2129                         }
2130
2131                         bio = bios[nbio++] = g_new_bio();
2132                         if (unlikely(bio == NULL)) {
2133                                 error = ENOMEM;
2134                                 goto fail_free_bios;
2135                         }
2136                         bio->bio_cmd     = operation;
2137                         bio->bio_flags  |= bio_flags;
2138                         bio->bio_dev     = dev_data->cdev;
2139                         bio->bio_offset  = bio_offset;
2140                         bio->bio_data    = xbb_reqlist_ioaddr(reqlist, seg_idx,
2141                                                 xbb_sg->first_sect);
2142                         bio->bio_done    = xbb_bio_done;
2143                         bio->bio_caller1 = reqlist;
2144                         bio->bio_pblkno  = bio_offset >> xbb->sector_size_shift;
2145                 }
2146
2147                 bio->bio_length += xbb_sg->nsect << 9;
2148                 bio->bio_bcount  = bio->bio_length;
2149                 bio_offset      += xbb_sg->nsect << 9;
2150
2151                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2152
2153                         if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2154                                 printf("%s: Discontiguous I/O request "
2155                                        "from domain %d ends on "
2156                                        "non-sector boundary\n",
2157                                        __func__, xbb->otherend_id);
2158                                 error = EINVAL;
2159                                 goto fail_free_bios;
2160                         }
2161                         /*
2162                          * KVA will not be contiguous, so any additional
2163                          * I/O will need to be represented in a new bio.
2164                          */
2165                         bio = NULL;
2166                 }
2167         }
2168
2169         reqlist->pendcnt = nbio;
2170
2171         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2172         {
2173 #ifdef XBB_USE_BOUNCE_BUFFERS
2174                 vm_offset_t kva_offset;
2175
2176                 kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
2177                            - (vm_offset_t)reqlist->bounce;
2178                 if (operation == BIO_WRITE) {
2179                         memcpy(bios[bio_idx]->bio_data,
2180                                (uint8_t *)reqlist->kva + kva_offset,
2181                                bios[bio_idx]->bio_bcount);
2182                 }
2183 #endif
2184                 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2185         }
2186
2187         return (error);
2188
2189 fail_free_bios:
2190         for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2191                 if (bios[bio_idx] != NULL)
                         g_destroy_bio(bios[bio_idx]);
2192         
2193         return (error);
2194 }
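
/*
 * Illustrative sketch (not part of the original source): the unit
 * conversions used by the dispatch routines above.  Ring segments are
 * expressed in 512 byte sectors, while the backing store may use a larger
 * sector size whose log2 is cached in sector_size_shift.  The helper
 * names are hypothetical.
 */
static off_t
example_sectors_to_byte_offset(struct xbb_softc *xbb, uint64_t sector_number)
{
        /* Device sectors to a byte offset, as used for bio_offset above. */
        return ((off_t)sector_number << xbb->sector_size_shift);
}

static int
example_512b_to_device_sectors(struct xbb_softc *xbb, int nr_512b_sectors)
{
        /* 512 byte sectors to device sectors, as in xbb_dispatch_io(). */
        return ((nr_512b_sectors << 9) >> xbb->sector_size_shift);
}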
2195
2196 /**
2197  * Backend handler for file access.
2198  *
2199  * \param xbb        Per-instance xbb configuration structure.
2200  * \param reqlist    Allocated internal request list.
2201  * \param operation  BIO_* I/O operation code.
2202  * \param flags      Additional bio_flag data to pass to any generated bios
2203  *                   (e.g. BIO_ORDERED).
2204  *
2205  * \return  0 for success, errno codes for failure.
2206  */
2207 static int
2208 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2209                   int operation, int flags)
2210 {
2211         struct xbb_file_data *file_data;
2212         u_int                 seg_idx;
2213         u_int                 nseg;
2214         off_t                 sectors_sent;
2215         struct uio            xuio;
2216         struct xbb_sg        *xbb_sg;
2217         struct iovec         *xiovec;
2218 #ifdef XBB_USE_BOUNCE_BUFFERS
2219         void                **p_vaddr;
2220         int                   saved_uio_iovcnt;
2221 #endif /* XBB_USE_BOUNCE_BUFFERS */
2222         int                   vfs_is_locked;
2223         int                   error;
2224
2225         file_data = &xbb->backend.file;
2226         sectors_sent = 0;
2227         error = 0;
2228         bzero(&xuio, sizeof(xuio));
2229
2230         switch (operation) {
2231         case BIO_READ:
2232                 xuio.uio_rw = UIO_READ;
2233                 break;
2234         case BIO_WRITE:
2235                 xuio.uio_rw = UIO_WRITE;
2236                 break;
2237         case BIO_FLUSH: {
2238                 struct mount *mountpoint;
2239
2240                 vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2241
2242                 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2243
2244                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2245                 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2246                 VOP_UNLOCK(xbb->vn, 0);
2247
2248                 vn_finished_write(mountpoint);
2249
2250                 VFS_UNLOCK_GIANT(vfs_is_locked);
2251
2252                 goto bailout_send_response;
2253                 /* NOTREACHED */
2254         }
2255         default:
2256                 panic("invalid operation %d", operation);
2257                 /* NOTREACHED */
2258         }
2259         xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2260                         << xbb->sector_size_shift;
2261         xuio.uio_segflg = UIO_SYSSPACE;
2262         xuio.uio_iov = file_data->xiovecs;
2263         xuio.uio_iovcnt = 0;
2264         xbb_sg = xbb->xbb_sgs;
2265         nseg = reqlist->nr_segments;
2266
2267         for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2268
2269                 /*
2270                  * If the first sector is not 0, the KVA will
2271                  * not be contiguous and we'll need to go on
2272                  * to another segment.
2273                  */
2274                 if (xbb_sg->first_sect != 0)
2275                         xiovec = NULL;
2276
2277                 if (xiovec == NULL) {
2278                         xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2279                         xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2280                             seg_idx, xbb_sg->first_sect);
2281 #ifdef XBB_USE_BOUNCE_BUFFERS
2282                         /*
2283                          * Store the address of the incoming
2284                          * buffer at this particular offset
2285                          * as well, so we can do the copy
2286                          * later without having to do more
2287                          * work to recalculate this address.
2288                          */
2289                         p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
2290                         *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
2291                             xbb_sg->first_sect);
2292 #endif /* XBB_USE_BOUNCE_BUFFERS */
2293                         xiovec->iov_len = 0;
2294                         xuio.uio_iovcnt++;
2295                 }
2296
2297                 xiovec->iov_len += xbb_sg->nsect << 9;
2298
2299                 xuio.uio_resid += xbb_sg->nsect << 9;
2300
2301                 /*
2302                  * If the last sector is not the full page
2303                  * size count, the next segment will not be
2304                  * contiguous in KVA and we need a new iovec.
2305                  */
2306                 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2307                         xiovec = NULL;
2308         }
2309
2310         xuio.uio_td = curthread;
2311
2312 #ifdef XBB_USE_BOUNCE_BUFFERS
2313         saved_uio_iovcnt = xuio.uio_iovcnt;
2314
2315         if (operation == BIO_WRITE) {
2316                 /* Copy the write data to the local buffer. */
2317                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2318                      xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
2319                      seg_idx++, xiovec++, p_vaddr++) {
2320
2321                         memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
2322                 }
2323         } else {
2324                 /*
2325                  * We only need to save off the iovecs in the case of a
2326                  * read, because the copy for the read happens after the
2327                  * VOP_READ().  (The uio will get modified in that call
2328                  * sequence.)
2329                  */
2330                 memcpy(file_data->saved_xiovecs, xuio.uio_iov,
2331                        xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
2332         }
2333 #endif /* XBB_USE_BOUNCE_BUFFERS */
2334
2335         vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2336         switch (operation) {
2337         case BIO_READ:
2338
2339                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2340
2341                 /*
2342                  * UFS pays attention to IO_DIRECT for reads.  If the
2343                  * DIRECTIO option is configured into the kernel, it calls
2344                  * ffs_rawread().  But that only works for single-segment
2345                  * uios with user space addresses.  In our case, with a
2346                  * kernel uio, it still reads into the buffer cache, but it
2347                  * will just try to release the buffer from the cache later
2348                  * on in ffs_read().
2349                  *
2350                  * ZFS does not pay attention to IO_DIRECT for reads.
2351                  *
2352                  * UFS does not pay attention to IO_SYNC for reads.
2353                  *
2354                  * ZFS pays attention to IO_SYNC (which translates into the
2355                  * Solaris define FRSYNC for zfs_read()) for reads.  It
2356                  * attempts to sync the file before reading.
2357                  *
2358                  * So, to attempt to provide some barrier semantics in the
2359                  * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.  
2360                  */
2361                 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 
2362                                  (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2363
2364                 VOP_UNLOCK(xbb->vn, 0);
2365                 break;
2366         case BIO_WRITE: {
2367                 struct mount *mountpoint;
2368
2369                 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2370
2371                 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2372
2373                 /*
2374                  * UFS pays attention to IO_DIRECT for writes.  The write
2375                  * is done asynchronously.  (Normally the write would just
2376                  * get put into the cache.)
2377                  *
2378                  * UFS pays attention to IO_SYNC for writes.  It will
2379                  * attempt to write the buffer out synchronously if that
2380                  * flag is set.
2381                  *
2382                  * ZFS does not pay attention to IO_DIRECT for writes.
2383                  *
2384                  * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2385                  * for writes.  It will flush the transaction from the
2386                  * cache before returning.
2387                  *
2388                  * So if we've got the BIO_ORDERED flag set, we want
2389                  * IO_SYNC in either the UFS or ZFS case.
2390                  */
2391                 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2392                                   IO_SYNC : 0, file_data->cred);
2393                 VOP_UNLOCK(xbb->vn, 0);
2394
2395                 vn_finished_write(mountpoint);
2396
2397                 break;
2398         }
2399         default:
2400                 panic("invalid operation %d", operation);
2401                 /* NOTREACHED */
2402         }
2403         VFS_UNLOCK_GIANT(vfs_is_locked);
2404
2405 #ifdef XBB_USE_BOUNCE_BUFFERS
2406         /* We only need to copy here for read operations */
2407         if (operation == BIO_READ) {
2408
2409                 for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
2410                      xiovec = file_data->saved_xiovecs;
2411                      seg_idx < saved_uio_iovcnt; seg_idx++,
2412                      xiovec++, p_vaddr++) {
2413
2414                         /*
2415                          * Note that we have to use the copy of the 
2416                          * io vector we made above.  uiomove() modifies
2417                          * the uio and its referenced vector as uiomove
2418                          * performs the copy, so we can't rely on any
2419                          * state from the original uio.
2420                          */
2421                         memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
2422                 }
2423         }
2424 #endif /* XBB_USE_BOUNCE_BUFFERS */
2425
2426 bailout_send_response:
2427
2428         if (error != 0)
2429                 reqlist->status = BLKIF_RSP_ERROR;
2430
2431         xbb_complete_reqlist(xbb, reqlist);
2432
2433         return (0);
2434 }
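
/*
 * Illustrative sketch (not part of the original source): the kernel uio
 * setup and vnode read sequence performed by xbb_dispatch_file() above,
 * reduced to a single contiguous buffer.  Giant handling (VFS_LOCK_GIANT),
 * bounce buffers, and the BIO_ORDERED ioflag selection are omitted.  The
 * function name is hypothetical.
 */
static int
example_file_read(struct xbb_softc *xbb, void *buf, size_t len, off_t offset)
{
        struct uio   auio;
        struct iovec aiov;
        int          error;

        aiov.iov_base = buf;
        aiov.iov_len  = len;

        bzero(&auio, sizeof(auio));
        auio.uio_iov    = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = offset;
        auio.uio_resid  = len;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_rw     = UIO_READ;
        auio.uio_td     = curthread;

        vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
        error = VOP_READ(xbb->vn, &auio, /*ioflag*/0, xbb->backend.file.cred);
        VOP_UNLOCK(xbb->vn, 0);

        return (error);
}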
2435
2436 /*--------------------------- Backend Configuration --------------------------*/
2437 /**
2438  * Close and cleanup any backend device/file specific state for this
2439  * block back instance. 
2440  *
2441  * \param xbb  Per-instance xbb configuration structure.
2442  */
2443 static void
2444 xbb_close_backend(struct xbb_softc *xbb)
2445 {
2446         DROP_GIANT();
2447         DPRINTF("closing dev=%s\n", xbb->dev_name);
2448         if (xbb->vn) {
2449                 int flags = FREAD;
2450                 int vfs_is_locked = 0;
2451
2452                 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2453                         flags |= FWRITE;
2454
2455                 switch (xbb->device_type) {
2456                 case XBB_TYPE_DISK:
2457                         if (xbb->backend.dev.csw) {
2458                                 dev_relthread(xbb->backend.dev.cdev,
2459                                               xbb->backend.dev.dev_ref);
2460                                 xbb->backend.dev.csw  = NULL;
2461                                 xbb->backend.dev.cdev = NULL;
2462                         }
2463                         break;
2464                 case XBB_TYPE_FILE:
2465                         vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
2466                         break;
2467                 case XBB_TYPE_NONE:
2468                 default:
2469                         panic("Unexpected backend type.");
2470                         break;
2471                 }
2472
2473                 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2474                 xbb->vn = NULL;
2475
2476                 switch (xbb->device_type) {
2477                 case XBB_TYPE_DISK:
2478                         break;
2479                 case XBB_TYPE_FILE:
2480                         VFS_UNLOCK_GIANT(vfs_is_locked);
2481                         if (xbb->backend.file.cred != NULL) {
2482                                 crfree(xbb->backend.file.cred);
2483                                 xbb->backend.file.cred = NULL;
2484                         }
2485                         break;
2486                 case XBB_TYPE_NONE:
2487                 default:
2488                         panic("Unexpected backend type.");
2489                         break;
2490                 }
2491         }
2492         PICKUP_GIANT();
2493 }
2494
2495 /**
2496  * Open a character device to be used for backend I/O.
2497  *
2498  * \param xbb  Per-instance xbb configuration structure.
2499  *
2500  * \return  0 for success, errno codes for failure.
2501  */
2502 static int
2503 xbb_open_dev(struct xbb_softc *xbb)
2504 {
2505         struct vattr   vattr;
2506         struct cdev   *dev;
2507         struct cdevsw *devsw;
2508         int            error;
2509
2510         xbb->device_type = XBB_TYPE_DISK;
2511         xbb->dispatch_io = xbb_dispatch_dev;
2512         xbb->backend.dev.cdev = xbb->vn->v_rdev;
2513         xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2514                                              &xbb->backend.dev.dev_ref);
2515         if (xbb->backend.dev.csw == NULL)
2516                 panic("Unable to retrieve device switch");
2517
2518         error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2519         if (error) {
2520                 xenbus_dev_fatal(xbb->dev, error, "error getting "
2521                                  "vnode attributes for device %s",
2522                                  xbb->dev_name);
2523                 return (error);
2524         }
2525
2526
2527         dev = xbb->vn->v_rdev;
2528         devsw = dev->si_devsw;
2529         if (!devsw->d_ioctl) {
2530                 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2531                                  "device %s!", xbb->dev_name);
2532                 return (ENODEV);
2533         }
2534
2535         error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2536                                (caddr_t)&xbb->sector_size, FREAD,
2537                                curthread);
2538         if (error) {
2539                 xenbus_dev_fatal(xbb->dev, error,
2540                                  "error calling ioctl DIOCGSECTORSIZE "
2541                                  "for device %s", xbb->dev_name);
2542                 return (error);
2543         }
2544
2545         error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2546                                (caddr_t)&xbb->media_size, FREAD,
2547                                curthread);
2548         if (error) {
2549                 xenbus_dev_fatal(xbb->dev, error,
2550                                  "error calling ioctl DIOCGMEDIASIZE "
2551                                  "for device %s", xbb->dev_name);
2552                 return (error);
2553         }
2554
2555         return (0);
2556 }
2557
2558 /**
2559  * Open a file to be used for backend I/O.
2560  *
2561  * \param xbb  Per-instance xbb configuration structure.
2562  *
2563  * \return  0 for success, errno codes for failure.
2564  */
2565 static int
2566 xbb_open_file(struct xbb_softc *xbb)
2567 {
2568         struct xbb_file_data *file_data;
2569         struct vattr          vattr;
2570         int                   error;
2571
2572         file_data = &xbb->backend.file;
2573         xbb->device_type = XBB_TYPE_FILE;
2574         xbb->dispatch_io = xbb_dispatch_file;
2575         error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2576         if (error != 0) {
2577                 xenbus_dev_fatal(xbb->dev, error,
2578                                  "error calling VOP_GETATTR() "
2579                                  "for file %s", xbb->dev_name);
2580                 return (error);
2581         }
2582
2583         /*
2584          * Verify that we have the ability to upgrade to exclusive
2585          * access on this file so we can trap errors at open instead
2586          * of reporting them during first access.
2587          */
2588         if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2589                 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2590                 if (xbb->vn->v_iflag & VI_DOOMED) {
2591                         error = EBADF;
2592                         xenbus_dev_fatal(xbb->dev, error,
2593                                          "error locking file %s",
2594                                          xbb->dev_name);
2595
2596                         return (error);
2597                 }
2598         }
2599
2600         file_data->cred = crhold(curthread->td_ucred);
2601         xbb->media_size = vattr.va_size;
2602
2603         /*
2604          * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2605          * With ZFS, it is 131072 bytes.  Block sizes that large don't work
2606          * with disklabel and UFS on FreeBSD at least.  Large block sizes
2607          * may not work with other OSes as well.  So just export a sector
2608          * size of 512 bytes, which should work with any OS or
2609          * application.  Since our backing is a file, any block size will
2610          * work fine for the backing store.
2611          */
2612 #if 0
2613         xbb->sector_size = vattr.va_blocksize;
2614 #endif
2615         xbb->sector_size = 512;
2616
2617         /*
2618          * Sanity check.  The media size has to be at least one
2619          * sector long.
2620          */
2621         if (xbb->media_size < xbb->sector_size) {
2622                 error = EINVAL;
2623                 xenbus_dev_fatal(xbb->dev, error,
2624                                  "file %s size %ju < block size %u",
2625                                  xbb->dev_name,
2626                                  (uintmax_t)xbb->media_size,
2627                                  xbb->sector_size);
2628         }
2629         return (error);
2630 }
2631
2632 /**
2633  * Open the backend provider for this connection.
2634  *
2635  * \param xbb  Per-instance xbb configuration structure.
2636  *
2637  * \return  0 for success, errno codes for failure.
2638  */
2639 static int
2640 xbb_open_backend(struct xbb_softc *xbb)
2641 {
2642         struct nameidata nd;
2643         int              flags;
2644         int              error;
2645         int              vfs_is_locked;
2646
2647         flags = FREAD;
2648         error = 0;
2649
2650         DPRINTF("opening dev=%s\n", xbb->dev_name);
2651
2652         if (rootvnode == NULL) {
2653                 xenbus_dev_fatal(xbb->dev, ENOENT,
2654                                  "Root file system not mounted");
2655                 return (ENOENT);
2656         }
2657
2658         if ((xbb->flags & XBBF_READ_ONLY) == 0)
2659                 flags |= FWRITE;
2660
2661         if (!curthread->td_proc->p_fd->fd_cdir) {
2662                 curthread->td_proc->p_fd->fd_cdir = rootvnode;
2663                 VREF(rootvnode);
2664         }
2665         if (!curthread->td_proc->p_fd->fd_rdir) {
2666                 curthread->td_proc->p_fd->fd_rdir = rootvnode;
2667                 VREF(rootvnode);
2668         }
2669         if (!curthread->td_proc->p_fd->fd_jdir) {
2670                 curthread->td_proc->p_fd->fd_jdir = rootvnode;
2671                 VREF(rootvnode);
2672         }
2673
2674  again:
2675         NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
2676         error = vn_open(&nd, &flags, 0, NULL);
2677         if (error) {
2678                 /*
2679                  * Prepending /dev/ is the only reasonable guess we can
2680                  * make if the user doesn't give us a fully qualified path.
2681                  * If they want to specify a file, they need to specify the
2682                  * full path.
2683                  */
2684                 if (xbb->dev_name[0] != '/') {
2685                         char *dev_path = "/dev/";
2686                         char *dev_name;
2687
2688                         /* Try adding device path at beginning of name */
2689                         dev_name = malloc(strlen(xbb->dev_name)
2690                                         + strlen(dev_path) + 1,
2691                                           M_XENBLOCKBACK, M_NOWAIT);
2692                         if (dev_name) {
2693                                 sprintf(dev_name, "%s%s", dev_path,
2694                                         xbb->dev_name);
2695                                 free(xbb->dev_name, M_XENBLOCKBACK);
2696                                 xbb->dev_name = dev_name;
2697                                 goto again;
2698                         }
2699                 }
2700                 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2701                                  xbb->dev_name);
2702                 return (error);
2703         }
2704
2705         vfs_is_locked = NDHASGIANT(&nd);
2706
2707         NDFREE(&nd, NDF_ONLY_PNBUF);
2708                 
2709         xbb->vn = nd.ni_vp;
2710
2711         /* We only support disks and files. */
2712         if (vn_isdisk(xbb->vn, &error)) {
2713                 error = xbb_open_dev(xbb);
2714         } else if (xbb->vn->v_type == VREG) {
2715                 error = xbb_open_file(xbb);
2716         } else {
2717                 error = EINVAL;
2718                 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2719                                  "or file", xbb->dev_name);
2720         }
2721         VOP_UNLOCK(xbb->vn, 0);
2722         VFS_UNLOCK_GIANT(vfs_is_locked);
2723
2724         if (error != 0) {
2725                 xbb_close_backend(xbb);
2726                 return (error);
2727         }
2728
2729         xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2730         xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2731
2732         DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2733                 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2734                 xbb->dev_name, xbb->sector_size, xbb->media_size);
2735
2736         return (0);
2737 }
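
/*
 * Illustrative note (not part of the original source): with the default
 * 512 byte sector size, fls(512) == 10, so sector_size_shift == 9, and a
 * 1 GiB backing store (1073741824 bytes) yields media_num_sectors ==
 * 1073741824 >> 9 == 2097152.
 */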
2738
2739 /*------------------------ Inter-Domain Communication ------------------------*/
2740 /**
2741  * Free dynamically allocated KVA or pseudo-physical address allocations.
2742  *
2743  * \param xbb  Per-instance xbb configuration structure.
2744  */
2745 static void
2746 xbb_free_communication_mem(struct xbb_softc *xbb)
2747 {
2748         if (xbb->kva != 0) {
2749 #ifndef XENHVM
2750                 kmem_free(kernel_map, xbb->kva, xbb->kva_size);
2751 #else
2752                 if (xbb->pseudo_phys_res != NULL) {
2753                         bus_release_resource(xbb->dev, SYS_RES_MEMORY,
2754                                              xbb->pseudo_phys_res_id,
2755                                              xbb->pseudo_phys_res);
2756                         xbb->pseudo_phys_res = NULL;
2757                 }
2758 #endif
2759         }
2760         xbb->kva = 0;
2761         xbb->gnt_base_addr = 0;
2762         if (xbb->kva_free != NULL) {
2763                 free(xbb->kva_free, M_XENBLOCKBACK);
2764                 xbb->kva_free = NULL;
2765         }
2766 }
2767
2768 /**
2769  * Cleanup all inter-domain communication mechanisms.
2770  *
2771  * \param xbb  Per-instance xbb configuration structure.
2772  */
2773 static int
2774 xbb_disconnect(struct xbb_softc *xbb)
2775 {
2776         struct gnttab_unmap_grant_ref  ops[XBB_MAX_RING_PAGES];
2777         struct gnttab_unmap_grant_ref *op;
2778         u_int                          ring_idx;
2779         int                            error;
2780
2781         DPRINTF("\n");
2782
2783         if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
2784                 return (0);
2785
2786         if (xbb->irq != 0) {
2787                 unbind_from_irqhandler(xbb->irq);
2788                 xbb->irq = 0;
2789         }
2790
2791         mtx_unlock(&xbb->lock);
2792         taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); 
2793         mtx_lock(&xbb->lock);
2794
2795         /*
2796          * No new interrupts can generate work, but we must wait
2797          * for all currently active requests to drain.
2798          */
2799         if (xbb->active_request_count != 0)
2800                 return (EAGAIN);
2801         
2802         for (ring_idx = 0, op = ops;
2803              ring_idx < xbb->ring_config.ring_pages;
2804              ring_idx++, op++) {
2805
2806                 op->host_addr    = xbb->ring_config.gnt_addr
2807                                  + (ring_idx * PAGE_SIZE);
2808                 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2809                 op->handle       = xbb->ring_config.handle[ring_idx];
2810         }
2811
2812         error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2813                                           xbb->ring_config.ring_pages);
2814         if (error != 0)
2815                 panic("Grant table op failed (%d)", error);
2816
2817         xbb_free_communication_mem(xbb);
2818
2819         if (xbb->requests != NULL) {
2820                 free(xbb->requests, M_XENBLOCKBACK);
2821                 xbb->requests = NULL;
2822         }
2823
2824         if (xbb->request_lists != NULL) {
2825                 struct xbb_xen_reqlist *reqlist;
2826                 int i;
2827
2828                 /* There is one request list for every allocated request. */
2829                 for (i = 0, reqlist = xbb->request_lists;
2830                      i < xbb->max_requests; i++, reqlist++){
2831 #ifdef XBB_USE_BOUNCE_BUFFERS
2832                         if (reqlist->bounce != NULL) {
2833                                 free(reqlist->bounce, M_XENBLOCKBACK);
2834                                 reqlist->bounce = NULL;
2835                         }
2836 #endif
2837                         if (reqlist->gnt_handles != NULL) {
2838                                 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2839                                 reqlist->gnt_handles = NULL;
2840                         }
2841                 }
2842                 free(xbb->request_lists, M_XENBLOCKBACK);
2843                 xbb->request_lists = NULL;
2844         }
2845
2846         xbb->flags &= ~XBBF_RING_CONNECTED;
2847         return (0);
2848 }
2849
2850 /**
2851  * Map shared memory ring into domain local address space, initialize
2852  * ring control structures, and bind an interrupt to the event channel
2853  * used to notify us of ring changes.
2854  *
2855  * \param xbb  Per-instance xbb configuration structure.
2856  */
2857 static int
2858 xbb_connect_ring(struct xbb_softc *xbb)
2859 {
2860         struct gnttab_map_grant_ref  gnts[XBB_MAX_RING_PAGES];
2861         struct gnttab_map_grant_ref *gnt;
2862         u_int                        ring_idx;
2863         int                          error;
2864
2865         if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2866                 return (0);
2867
2868         /*
2869          * Kva for our ring is at the tail of the region of kva allocated
2870          * by xbb_alloc_communication_mem().
2871          */
2872         xbb->ring_config.va = xbb->kva
2873                             + (xbb->kva_size
2874                              - (xbb->ring_config.ring_pages * PAGE_SIZE));
2875         xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2876                                   + (xbb->kva_size
2877                                    - (xbb->ring_config.ring_pages * PAGE_SIZE));
2878
2879         for (ring_idx = 0, gnt = gnts;
2880              ring_idx < xbb->ring_config.ring_pages;
2881              ring_idx++, gnt++) {
2882
2883                 gnt->host_addr = xbb->ring_config.gnt_addr
2884                                + (ring_idx * PAGE_SIZE);
2885                 gnt->flags     = GNTMAP_host_map;
2886                 gnt->ref       = xbb->ring_config.ring_ref[ring_idx];
2887                 gnt->dom       = xbb->otherend_id;
2888         }
2889
2890         error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2891                                           xbb->ring_config.ring_pages);
2892         if (error)
2893                 panic("blkback: Ring page grant table op failed (%d)", error);
2894
2895         for (ring_idx = 0, gnt = gnts;
2896              ring_idx < xbb->ring_config.ring_pages;
2897              ring_idx++, gnt++) {
2898                 if (gnt->status != 0) {
2899                         xbb->ring_config.va = 0;
2900                         xenbus_dev_fatal(xbb->dev, EACCES,
2901                                          "Ring shared page mapping failed. "
2902                                          "Status %d.", gnt->status);
2903                         return (EACCES);
2904                 }
2905                 xbb->ring_config.handle[ring_idx]   = gnt->handle;
2906                 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2907         }
2908
2909         /* Initialize the ring based on ABI. */
2910         switch (xbb->abi) {
2911         case BLKIF_PROTOCOL_NATIVE:
2912         {
2913                 blkif_sring_t *sring;
2914                 sring = (blkif_sring_t *)xbb->ring_config.va;
2915                 BACK_RING_INIT(&xbb->rings.native, sring,
2916                                xbb->ring_config.ring_pages * PAGE_SIZE);
2917                 break;
2918         }
2919         case BLKIF_PROTOCOL_X86_32:
2920         {
2921                 blkif_x86_32_sring_t *sring_x86_32;
2922                 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2923                 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2924                                xbb->ring_config.ring_pages * PAGE_SIZE);
2925                 break;
2926         }
2927         case BLKIF_PROTOCOL_X86_64:
2928         {
2929                 blkif_x86_64_sring_t *sring_x86_64;
2930                 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2931                 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2932                                xbb->ring_config.ring_pages * PAGE_SIZE);
2933                 break;
2934         }
2935         default:
2936                 panic("Unexpected blkif protocol ABI.");
2937         }
2938
2939         xbb->flags |= XBBF_RING_CONNECTED;
2940
2941         error =
2942             bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
2943                                                   xbb->ring_config.evtchn,
2944                                                   device_get_nameunit(xbb->dev),
2945                                                   xbb_intr, /*arg*/xbb,
2946                                                   INTR_TYPE_BIO | INTR_MPSAFE,
2947                                                   &xbb->irq);
2948         if (error) {
2949                 (void)xbb_disconnect(xbb);
2950                 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2951                 return (error);
2952         }
2953
2954         DPRINTF("rings connected!\n");
2955
2956         return (0);
2957 }
2958
2959 /* Needed to make bit_alloc() macro work */
2960 #define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK,      \
2961                                    M_NOWAIT|M_ZERO)
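
/*
 * Illustrative sketch, not part of the driver proper: with the macro above
 * in place, an allocation such as
 *
 *      xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
 *
 * expands into a malloc(9) call charged to M_XENBLOCKBACK with
 * M_NOWAIT|M_ZERO, which is why the caller below must check for a NULL
 * return.
 */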
2962
2963 /**
2964  * Size KVA and pseudo-physical address allocations based on negotiated
2965  * values for the size and number of I/O requests, and the size of our
2966  * communication ring.
2967  *
2968  * \param xbb  Per-instance xbb configuration structure.
2969  *
2970  * These address spaces are used to dynamically map pages in the
2971  * front-end's domain into our own.
2972  */
2973 static int
2974 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2975 {
2976         xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2977         xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2978         xbb->kva_size = xbb->reqlist_kva_size +
2979                         (xbb->ring_config.ring_pages * PAGE_SIZE);
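
        /*
         * Illustrative layout of the region sized above (sketch only):
         *
         *   kva                                              kva + kva_size
         *   |<- reqlist_kva_pages * PAGE_SIZE ->|<- ring_pages * PAGE_SIZE ->|
         *   | per-request segment mappings,     | shared ring, mapped by     |
         *   | tracked by the kva_free bitmap    | xbb_connect_ring()         |
         */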
2980
2981         xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages);
2982         if (xbb->kva_free == NULL)
2983                 return (ENOMEM);
2984
2985         DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
2986                 device_get_nameunit(xbb->dev), xbb->kva_size,
2987                 xbb->reqlist_kva_size);
2988 #ifndef XENHVM
2989         xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
2990         if (xbb->kva == 0)
2991                 return (ENOMEM);
2992         xbb->gnt_base_addr = xbb->kva;
2993 #else /* XENHVM */
2994         /*
2995          * Reserve a range of pseudo physical memory that we can map
2996          * into kva.  These pages will only be backed by machine
2997          * pages ("real memory") during the lifetime of front-end requests
2998          * via grant table operations.
2999          */
3000         xbb->pseudo_phys_res_id = 0;
3001         xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
3002                                                   &xbb->pseudo_phys_res_id,
3003                                                   0, ~0, xbb->kva_size,
3004                                                   RF_ACTIVE);
3005         if (xbb->pseudo_phys_res == NULL) {
3006                 xbb->kva = 0;
3007                 return (ENOMEM);
3008         }
3009         xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
3010         xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
3011 #endif /* XENHVM */
3012
3013         DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
3014                 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
3015                 (uintmax_t)xbb->gnt_base_addr); 
3016         return (0);
3017 }
3018
3019 /**
3020  * Collect front-end information from the XenStore.
3021  *
3022  * \param xbb  Per-instance xbb configuration structure.
3023  */
3024 static int
3025 xbb_collect_frontend_info(struct xbb_softc *xbb)
3026 {
3027         char        protocol_abi[64];
3028         const char *otherend_path;
3029         int         error;
3030         u_int       ring_idx;
3031
3032         otherend_path = xenbus_get_otherend_path(xbb->dev);
3033
3034         /*
3035          * Protocol defaults valid even if all negotiation fails.
3036          */
3037         xbb->ring_config.ring_pages = 1;
3038         xbb->max_requests           = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
3039         xbb->max_request_segments   = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
3040         xbb->max_request_size       = xbb->max_request_segments * PAGE_SIZE;
3041
3042         /*
3043          * Mandatory data (used in all versions of the protocol) first.
3044          */
3045         error = xs_gather(XST_NIL, otherend_path,
3046                           "ring-ref", "%" PRIu32,
3047                           &xbb->ring_config.ring_ref[0],
3048                           "event-channel", "%" PRIu32,
3049                           &xbb->ring_config.evtchn,
3050                           NULL);
3051         if (error != 0) {
3052                 xenbus_dev_fatal(xbb->dev, error,
3053                                  "Unable to retrieve ring information from "
3054                                  "frontend %s.  Unable to connect.",
3055                                  xenbus_get_otherend_path(xbb->dev));
3056                 return (error);
3057         }
3058
3059         /*
3060          * These fields are initialized to legacy protocol defaults
3061          * so we only need to fail if reading the updated value succeeds
3062          * and the new value is outside of its allowed range.
3063          *
3064          * \note xs_gather() returns on the first encountered error, so
3065  *       we must use independent calls in order to guarantee
3066  *       we don't miss information in a sparsely populated front-end
3067          *       tree.
3068          */
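
        /*
         * Illustrative example only (all values are hypothetical): a
         * front-end advertising a two-page ring might populate its tree as
         *
         *   <otherend_path>/ring-ref      = "8"
         *   <otherend_path>/ring-ref1     = "9"
         *   <otherend_path>/ring-pages    = "2"
         *   <otherend_path>/event-channel = "11"
         *   <otherend_path>/protocol      = XEN_IO_PROTO_ABI_NATIVE
         *
         * hence each optional node below is probed with its own xs_scanf()
         * call.
         */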
3069         (void)xs_scanf(XST_NIL, otherend_path,
3070                        "ring-pages", NULL, "%u",
3071                        &xbb->ring_config.ring_pages);
3072
3073         (void)xs_scanf(XST_NIL, otherend_path,
3074                        "max-requests", NULL, "%u",
3075                        &xbb->max_requests);
3076
3077         (void)xs_scanf(XST_NIL, otherend_path,
3078                        "max-request-segments", NULL, "%u",
3079                        &xbb->max_request_segments);
3080
3081         (void)xs_scanf(XST_NIL, otherend_path,
3082                        "max-request-size", NULL, "%u",
3083                        &xbb->max_request_size);
3084
3085         if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
3086                 xenbus_dev_fatal(xbb->dev, EINVAL,
3087                                  "Front-end specified ring-pages of %u "
3088                                  "exceeds backend limit of %zu.  "
3089                                  "Unable to connect.",
3090                                  xbb->ring_config.ring_pages,
3091                                  XBB_MAX_RING_PAGES);
3092                 return (EINVAL);
3093         } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
3094                 xenbus_dev_fatal(xbb->dev, EINVAL,
3095                                  "Front-end specified max_requests of %u "
3096                                  "exceeds backend limit of %u.  "
3097                                  "Unable to connect.",
3098                                  xbb->max_requests,
3099                                  XBB_MAX_REQUESTS);
3100                 return (EINVAL);
3101         } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
3102                 xenbus_dev_fatal(xbb->dev, EINVAL,
3103                                  "Front-end specified max_request_segments "
3104                                  "of %u exceeds backend limit of %u.  "
3105                                  "Unable to connect.",
3106                                  xbb->max_request_segments,
3107                                  XBB_MAX_SEGMENTS_PER_REQUEST);
3108                 return (EINVAL);
3109         } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
3110                 xenbus_dev_fatal(xbb->dev, EINVAL,
3111                                  "Front-end specified max_request_size "
3112                                  "of %u exceeds backend limit of %u.  "
3113                                  "Unable to connect.",
3114                                  xbb->max_request_size,
3115                                  XBB_MAX_REQUEST_SIZE);
3116                 return (EINVAL);
3117         }
3118
3119         /* If using a multi-page ring, pull in the remaining references. */
3120         for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
3121                 char ring_ref_name[] = "ring-refXX";
3122
3123                 snprintf(ring_ref_name, sizeof(ring_ref_name),
3124                          "ring-ref%u", ring_idx);
3125                 error = xs_scanf(XST_NIL, otherend_path,
3126                                  ring_ref_name, NULL, "%" PRIu32,
3127                                  &xbb->ring_config.ring_ref[ring_idx]);
3128                 if (error != 0) {
3129                         xenbus_dev_fatal(xbb->dev, error,
3130                                          "Failed to retrieve grant reference "
3131                                          "for page %u of shared ring.  Unable "
3132                                          "to connect.", ring_idx);
3133                         return (error);
3134                 }
3135         }
3136
3137         error = xs_gather(XST_NIL, otherend_path,
3138                           "protocol", "%63s", protocol_abi,
3139                           NULL); 
3140         if (error != 0
3141          || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
3142                 /*
3143                  * Assume native if the frontend has not
3144                  * published ABI data, or if the ABI it published
3145                  * matches our own.
3146                  */
3147                 xbb->abi = BLKIF_PROTOCOL_NATIVE;
3148         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
3149
3150                 xbb->abi = BLKIF_PROTOCOL_X86_32;
3151         } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
3152
3153                 xbb->abi = BLKIF_PROTOCOL_X86_64;
3154         } else {
3155
3156                 xenbus_dev_fatal(xbb->dev, EINVAL,
3157                                  "Unknown protocol ABI (%s) published by "
3158                                  "frontend.  Unable to connect.", protocol_abi);
3159                 return (EINVAL);
3160         }
3161         return (0);
3162 }
3163
3164 /**
3165  * Allocate per-request data structures given request size and number
3166  * information negotiated with the front-end.
3167  *
3168  * \param xbb  Per-instance xbb configuration structure.
3169  */
3170 static int
3171 xbb_alloc_requests(struct xbb_softc *xbb)
3172 {
3173         struct xbb_xen_req *req;
3174         struct xbb_xen_req *last_req;
3175
3176         /*
3177          * Allocate request bookkeeping data structures.
3178          */
3179         xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3180                                M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3181         if (xbb->requests == NULL) {
3182                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3183                                   "Unable to allocate request structures");
3184                 return (ENOMEM);
3185         }
3186
3187         req      = xbb->requests;
3188         last_req = &xbb->requests[xbb->max_requests - 1];
3189         STAILQ_INIT(&xbb->request_free_stailq);
3190         while (req <= last_req) {
3191                 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3192                 req++;
3193         }
3194         return (0);
3195 }
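
/*
 * Illustrative only, assuming the I/O paths elsewhere in this file use the
 * standard queue(3) macros to manage the free list built above:
 *
 *      req = STAILQ_FIRST(&xbb->request_free_stailq);
 *      if (req != NULL)
 *              STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
 *      ...
 *      STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
 */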
3196
3197 static int
3198 xbb_alloc_request_lists(struct xbb_softc *xbb)
3199 {
3200         int i;
3201         struct xbb_xen_reqlist *reqlist;
3202
3203         /*
3204          * If no requests can be merged, we need 1 request list per
3205          * in-flight request.
3206          */
3207         xbb->request_lists = malloc(xbb->max_requests *
3208                 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3209         if (xbb->request_lists == NULL) {
3210                 xenbus_dev_fatal(xbb->dev, ENOMEM, 
3211                                   "Unable to allocate request list structures");
3212                 return (ENOMEM);
3213         }
3214
3215         STAILQ_INIT(&xbb->reqlist_free_stailq);
3216         STAILQ_INIT(&xbb->reqlist_pending_stailq);
3217         for (i = 0; i < xbb->max_requests; i++) {
3218                 int seg;
3219
3220                 reqlist      = &xbb->request_lists[i];
3221
3222                 reqlist->xbb = xbb;
3223
3224 #ifdef XBB_USE_BOUNCE_BUFFERS
3225                 reqlist->bounce = malloc(xbb->max_reqlist_size,
3226                                          M_XENBLOCKBACK, M_NOWAIT);
3227                 if (reqlist->bounce == NULL) {
3228                         xenbus_dev_fatal(xbb->dev, ENOMEM, 
3229                                          "Unable to allocate request "
3230                                          "bounce buffers");
3231                         return (ENOMEM);
3232                 }
3233 #endif /* XBB_USE_BOUNCE_BUFFERS */
3234
3235                 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3236                                               sizeof(*reqlist->gnt_handles),
3237                                               M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3238                 if (reqlist->gnt_handles == NULL) {
3239                         xenbus_dev_fatal(xbb->dev, ENOMEM,
3240                                           "Unable to allocate request "
3241                                           "grant references");
3242                         return (ENOMEM);
3243                 }
3244
3245                 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3246                         reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3247
3248                 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3249         }
3250         return (0);
3251 }
3252
3253 /**
3254  * Supply information about the physical device to the frontend
3255  * via XenBus.
3256  *
3257  * \param xbb  Per-instance xbb configuration structure.
3258  */
3259 static int
3260 xbb_publish_backend_info(struct xbb_softc *xbb)
3261 {
3262         struct xs_transaction xst;
3263         const char           *our_path;
3264         const char           *leaf;
3265         int                   error;
3266
3267         our_path = xenbus_get_node(xbb->dev);
3268         while (1) {
3269                 error = xs_transaction_start(&xst);
3270                 if (error != 0) {
3271                         xenbus_dev_fatal(xbb->dev, error,
3272                                          "Error publishing backend info "
3273                                          "(start transaction)");
3274                         return (error);
3275                 }
3276
3277                 leaf = "sectors";
3278                 error = xs_printf(xst, our_path, leaf,
3279                                   "%"PRIu64, xbb->media_num_sectors);
3280                 if (error != 0)
3281                         break;
3282
3283                 /* XXX Support all VBD attributes here. */
3284                 leaf = "info";
3285                 error = xs_printf(xst, our_path, leaf, "%u",
3286                                   xbb->flags & XBBF_READ_ONLY
3287                                 ? VDISK_READONLY : 0);
3288                 if (error != 0)
3289                         break;
3290
3291                 leaf = "sector-size";
3292                 error = xs_printf(xst, our_path, leaf, "%u",
3293                                   xbb->sector_size);
3294                 if (error != 0)
3295                         break;
3296
3297                 error = xs_transaction_end(xst, 0);
3298                 if (error == 0) {
3299                         return (0);
3300                 } else if (error != EAGAIN) {
3301                         xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3302                         return (error);
3303                 }
3304         }
3305
3306         xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3307                         our_path, leaf);
3308         xs_transaction_end(xst, 1);
3309         return (error);
3310 }
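
/*
 * Illustrative sketch (the values shown are hypothetical): after a
 * successful transaction the nodes written above might read
 *
 *   <our_path>/sectors     = "2097152"
 *   <our_path>/info        = "0"     (VDISK_READONLY for read-only exports)
 *   <our_path>/sector-size = "512"
 */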
3311
3312 /**
3313  * Connect to our blkfront peer now that it has completed publishing
3314  * its configuration into the XenStore.
3315  *
3316  * \param xbb  Per-instance xbb configuration structure.
3317  */
3318 static void
3319 xbb_connect(struct xbb_softc *xbb)
3320 {
3321         int                   error;
3322
3323         if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
3324                 return;
3325
3326         if (xbb_collect_frontend_info(xbb) != 0)
3327                 return;
3328
3329         xbb->flags &= ~XBBF_SHUTDOWN;
3330
3331         /*
3332          * We limit the maximum number of reqlist segments to the maximum
3333          * number of segments in the ring, or our absolute maximum,
3334          * whichever is smaller.
3335          */
3336         xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3337                 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3338
3339         /*
3340          * The maximum size is simply a function of the number of segments
3341          * we can handle.
3342          */
3343         xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3344
3345         /* Allocate resources whose size depends on front-end configuration. */
3346         error = xbb_alloc_communication_mem(xbb);
3347         if (error != 0) {
3348                 xenbus_dev_fatal(xbb->dev, error,
3349                                  "Unable to allocate communication memory");
3350                 return;
3351         }
3352
3353         error = xbb_alloc_requests(xbb);
3354         if (error != 0) {
3355                 /* Specific errors are reported by xbb_alloc_requests(). */
3356                 return;
3357         }
3358
3359         error = xbb_alloc_request_lists(xbb);
3360         if (error != 0) {
3361                 /* Specific errors are reported by xbb_alloc_request_lists(). */
3362                 return;
3363         }
3364
3365         /*
3366          * Connect communication channel.
3367          */
3368         error = xbb_connect_ring(xbb);
3369         if (error != 0) {
3370                 /* Specific errors are reported by xbb_connect_ring(). */
3371                 return;
3372         }
3373         
3374         if (xbb_publish_backend_info(xbb) != 0) {
3375                 /*
3376                  * If we can't publish our data, we cannot participate
3377                  * in this connection, and waiting for a front-end state
3378                  * change will not help the situation.
3379                  */
3380                 (void)xbb_disconnect(xbb);
3381                 return;
3382         }
3383
3384         /* Ready for I/O. */
3385         xenbus_set_state(xbb->dev, XenbusStateConnected);
3386 }
3387
3388 /*-------------------------- Device Teardown Support -------------------------*/
3389 /**
3390  * Perform device shutdown functions.
3391  *
3392  * \param xbb  Per-instance xbb configuration structure.
3393  *
3394  * Mark this instance as shutting down, wait for any active I/O on the
3395  * backend device/file to drain, disconnect from the front-end, and notify
3396  * any waiters (e.g. a thread invoking our detach method) that detach can
3397  * now proceed.
3398  */
3399 static int
3400 xbb_shutdown(struct xbb_softc *xbb)
3401 {
3402         int error;
3403
3404         DPRINTF("\n");
3405
3406         /*
3407          * Due to the need to drop our mutex during some
3408          * xenbus operations, it is possible for two threads
3409          * to attempt to close out shutdown processing at
3410          * the same time.  Tell the caller that hits this
3411          * race to try again later.
3412          */
3413         if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3414                 return (EAGAIN);
3415
3416         DPRINTF("\n");
3417
3418         /* Indicate shutdown is in progress. */
3419         xbb->flags |= XBBF_SHUTDOWN;
3420
3421         /* Disconnect from the front-end. */
3422         error = xbb_disconnect(xbb);
3423         if (error != 0) {
3424                 /*
3425                  * Requests still outstanding.  We'll be called again
3426                  * once they complete.
3427                  */
3428                 KASSERT(error == EAGAIN,
3429                         ("%s: Unexpected xbb_disconnect() failure %d",
3430                          __func__, error));
3431
3432                 return (error);
3433         }
3434
3435         DPRINTF("\n");
3436
3437         xbb->flags |= XBBF_IN_SHUTDOWN;
3438         mtx_unlock(&xbb->lock);
3439
3440         if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3441                 xenbus_set_state(xbb->dev, XenbusStateClosing);
3442
3443         mtx_lock(&xbb->lock);
3444         xbb->flags &= ~XBBF_IN_SHUTDOWN;
3445
3446         /* Indicate to xbb_detach() that it is safe to proceed. */
3447         wakeup(xbb);
3448
3449         return (0);
3450 }
3451
3452 /**
3453  * Report an attach-time error to the console and Xen, and clean up
3454  * this instance by forcing immediate detach processing.
3455  *
3456  * \param xbb  Per-instance xbb configuration structure.
3457  * \param err  Errno describing the error.
3458  * \param fmt  Printf style format and arguments
3459  */
3460 static void
3461 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3462 {
3463         va_list ap;
3464         va_list ap_hotplug;
3465
3466         va_start(ap, fmt);
3467         va_copy(ap_hotplug, ap);
3468         xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3469                   "hotplug-error", fmt, ap_hotplug);
3470         va_end(ap_hotplug);
3471         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3472                   "hotplug-status", "error");
3473
3474         xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3475         va_end(ap);
3476
3477         xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3478                   "online", "0");
3479         xbb_detach(xbb->dev);
3480 }
3481
3482 /*---------------------------- NewBus Entrypoints ----------------------------*/
3483 /**
3484  * Inspect a XenBus device and claim it if it is of the appropriate type.
3485  * 
3486  * \param dev  NewBus device object representing a candidate XenBus device.
3487  *
3488  * \return  0 for success, errno codes for failure.
3489  */
3490 static int
3491 xbb_probe(device_t dev)
3492 {
3493  
3494         if (!strcmp(xenbus_get_type(dev), "vbd")) {
3495                 device_set_desc(dev, "Backend Virtual Block Device");
3496                 device_quiet(dev);
3497                 return (0);
3498         }
3499
3500         return (ENXIO);
3501 }
3502
3503 /**
3504  * Setup sysctl variables to control various Block Back parameters.
3505  *
3506  * \param xbb  Xen Block Back softc.
3507  *
3508  */
3509 static void
3510 xbb_setup_sysctl(struct xbb_softc *xbb)
3511 {
3512         struct sysctl_ctx_list *sysctl_ctx = NULL;
3513         struct sysctl_oid      *sysctl_tree = NULL;
3514         
3515         sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3516         if (sysctl_ctx == NULL)
3517                 return;
3518
3519         sysctl_tree = device_get_sysctl_tree(xbb->dev);
3520         if (sysctl_tree == NULL)
3521                 return;
3522
3523         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3524                        "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3525                        "fake the flush command");
3526
3527         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3528                        "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3529                        "send a real flush for N flush requests");
3530
3531         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3532                        "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3533                        "Don't coalesce contiguous requests");
3534
3535         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3536                          "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3537                          "how many I/O requests we have received");
3538
3539         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3540                          "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3541                          "how many I/O requests have been completed");
3542
3543         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3544                          "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3545                          "how many I/O dispatches were forced");
3546
3547         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3548                          "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3549                          "how many I/O dispatches were normal");
3550
3551         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3552                          "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3553                          "total number of I/O dispatches");
3554
3555         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3556                          "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3557                          "how many times we have run out of KVA");
3558
3559         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3560                          "request_shortages", CTLFLAG_RW,
3561                          &xbb->request_shortages,
3562                          "how many times we have run out of requests");
3563
3564         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3565                         "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3566                         "maximum outstanding requests (negotiated)");
3567
3568         SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3569                         "max_request_segments", CTLFLAG_RD,
3570                         &xbb->max_request_segments, 0,
3571                         "maximum number of pages per requests (negotiated)");
3572 }
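
/*
 * Usage sketch, assuming the default dev.<driver>.<unit> sysctl layout and a
 * first instance named xbbd0:
 *
 *      # sysctl dev.xbbd.0.total_dispatch dev.xbbd.0.kva_shortages
 *      # sysctl dev.xbbd.0.disable_flush=1
 */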
3573
3574 /**
3575  * Attach to a XenBus device that has been claimed by our probe routine.
3576  *
3577  * \param dev  NewBus device object representing this Xen Block Back instance.
3578  *
3579  * \return  0 for success, errno codes for failure.
3580  */
3581 static int
3582 xbb_attach(device_t dev)
3583 {
3584         struct xbb_softc        *xbb;
3585         int                      error;
3586
3587         DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3588
3589         /*
3590          * Basic initialization.
3591          * After this block it is safe to call xbb_detach()
3592          * to clean up any allocated data for this instance.
3593          */
3594         xbb = device_get_softc(dev);
3595         xbb->dev = dev;
3596         xbb->otherend_id = xenbus_get_otherend_id(dev);
3597         TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3598         mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3599
3600         /*
3601          * Publish protocol capabilities for consumption by the
3602          * front-end.
3603          */
3604         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3605                           "feature-barrier", "1");
3606         if (error) {
3607                 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3608                                   xenbus_get_node(xbb->dev));
3609                 return (error);
3610         }
3611
3612         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3613                           "feature-flush-cache", "1");
3614         if (error) {
3615                 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3616                                   xenbus_get_node(xbb->dev));
3617                 return (error);
3618         }
3619
3620         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3621                           "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
3622         if (error) {
3623                 xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
3624                                   xenbus_get_node(xbb->dev));
3625                 return (error);
3626         }
3627
3628         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3629                           "max-requests", "%u", XBB_MAX_REQUESTS);
3630         if (error) {
3631                 xbb_attach_failed(xbb, error, "writing %s/max-requests",
3632                                   xenbus_get_node(xbb->dev));
3633                 return (error);
3634         }
3635
3636         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3637                           "max-request-segments", "%u",
3638                           XBB_MAX_SEGMENTS_PER_REQUEST);
3639         if (error) {
3640                 xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
3641                                   xenbus_get_node(xbb->dev));
3642                 return (error);
3643         }
3644
3645         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3646                           "max-request-size", "%u",
3647                           XBB_MAX_REQUEST_SIZE);
3648         if (error) {
3649                 xbb_attach_failed(xbb, error, "writing %s/max-request-size",
3650                                   xenbus_get_node(xbb->dev));
3651                 return (error);
3652         }
3653
3654         /* Collect physical device information. */
3655         error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
3656                           "device-type", NULL, &xbb->dev_type,
3657                           NULL);
3658         if (error != 0)
3659                 xbb->dev_type = NULL;
3660
3661         error = xs_gather(XST_NIL, xenbus_get_node(dev),
3662                           "mode", NULL, &xbb->dev_mode,
3663                           "params", NULL, &xbb->dev_name,
3664                           NULL);
3665         if (error != 0) {
3666                 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3667                                   xenbus_get_node(dev));
3668                 return (ENXIO);
3669         }
3670
3671         /* Parse fopen style mode flags. */
3672         if (strchr(xbb->dev_mode, 'w') == NULL)
3673                 xbb->flags |= XBBF_READ_ONLY;
3674
3675         /*
3676          * Verify the physical device is present and can support
3677          * the desired I/O mode.
3678          */
3679         DROP_GIANT();
3680         error = xbb_open_backend(xbb);
3681         PICKUP_GIANT();
3682         if (error != 0) {
3683                 xbb_attach_failed(xbb, error, "Unable to open %s",
3684                                   xbb->dev_name);
3685                 return (ENXIO);
3686         }
3687
3688         /* Use devstat(9) for recording statistics. */
3689         xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3690                                            xbb->sector_size,
3691                                            DEVSTAT_ALL_SUPPORTED,
3692                                            DEVSTAT_TYPE_DIRECT
3693                                          | DEVSTAT_TYPE_IF_OTHER,
3694                                            DEVSTAT_PRIORITY_OTHER);
3695
3696         xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3697                                               xbb->sector_size,
3698                                               DEVSTAT_ALL_SUPPORTED,
3699                                               DEVSTAT_TYPE_DIRECT
3700                                             | DEVSTAT_TYPE_IF_OTHER,
3701                                               DEVSTAT_PRIORITY_OTHER);
3702         /*
3703          * Setup sysctl variables.
3704          */
3705         xbb_setup_sysctl(xbb);
3706
3707         /*
3708          * Create a taskqueue for doing work that must occur from a
3709          * thread context.
3710          */
3711         xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
3712                                              taskqueue_thread_enqueue,
3713                                              /*context*/&xbb->io_taskqueue);
3714         if (xbb->io_taskqueue == NULL) {
3715                 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3716                 return (ENOMEM);
3717         }
3718
3719         taskqueue_start_threads(&xbb->io_taskqueue,
3720                                 /*num threads*/1,
3721                                 /*priority*/PWAIT,
3722                                 /*thread name*/
3723                                 "%s taskq", device_get_nameunit(dev));
3724
3725         /* Update hot-plug status to satisfy xend. */
3726         error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3727                           "hotplug-status", "connected");
3728         if (error) {
3729                 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3730                                   xenbus_get_node(xbb->dev));
3731                 return (error);
3732         }
3733
3734         /* Tell the front end that we are ready to connect. */
3735         xenbus_set_state(dev, XenbusStateInitWait);
3736
3737         return (0);
3738 }
3739
3740 /**
3741  * Detach from a block back device instance.
3742  *
3743  * \param dev  NewBus device object representing this Xen Block Back instance.
3744  *
3745  * \return  0 for success, errno codes for failure.
3746  * 
3747  * \note A block back device may be detached at any time in its life-cycle,
3748  *       including part way through the attach process.  For this reason,
3749  *       initialization order and the initialization state checks in this
3750  *       routine must be carefully coupled so that attach time failures
3751  *       are gracefully handled.
3752  */
3753 static int
3754 xbb_detach(device_t dev)
3755 {
3756         struct xbb_softc *xbb;
3757
3758         DPRINTF("\n");
3759
3760         xbb = device_get_softc(dev);
3761         mtx_lock(&xbb->lock);
3762         while (xbb_shutdown(xbb) == EAGAIN) {
3763                 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3764                        "xbb_shutdown", 0);
3765         }
3766         mtx_unlock(&xbb->lock);
3767
3768         DPRINTF("\n");
3769
3770         if (xbb->io_taskqueue != NULL)
3771                 taskqueue_free(xbb->io_taskqueue);
3772
3773         if (xbb->xbb_stats != NULL)
3774                 devstat_remove_entry(xbb->xbb_stats);
3775
3776         if (xbb->xbb_stats_in != NULL)
3777                 devstat_remove_entry(xbb->xbb_stats_in);
3778
3779         xbb_close_backend(xbb);
3780
3781         if (xbb->dev_mode != NULL) {
3782                 free(xbb->dev_mode, M_XENBUS);
3783                 xbb->dev_mode = NULL;
3784         }
3785
3786         if (xbb->dev_type != NULL) {
3787                 free(xbb->dev_type, M_XENBUS);
3788                 xbb->dev_type = NULL;
3789         }
3790
3791         if (xbb->dev_name != NULL) {
3792                 free(xbb->dev_name, M_XENBUS);
3793                 xbb->dev_name = NULL;
3794         }
3795
3796         mtx_destroy(&xbb->lock);
3797         return (0);
3798 }
3799
3800 /**
3801  * Prepare this block back device for suspension of this VM.
3802  * 
3803  * \param dev  NewBus device object representing this Xen Block Back instance.
3804  *
3805  * \return  0 for success, errno codes for failure.
3806  */
3807 static int
3808 xbb_suspend(device_t dev)
3809 {
3810 #ifdef NOT_YET
3811         struct xbb_softc *sc = device_get_softc(dev);
3812
3813         /* Prevent new requests from being issued until we fix things up. */
3814         mtx_lock(&sc->xb_io_lock);
3815         sc->connected = BLKIF_STATE_SUSPENDED;
3816         mtx_unlock(&sc->xb_io_lock);
3817 #endif
3818
3819         return (0);
3820 }
3821
3822 /**
3823  * Perform any processing required to recover from a suspended state.
3824  * 
3825  * \param dev  NewBus device object representing this Xen Block Back instance.
3826  *
3827  * \return  0 for success, errno codes for failure.
3828  */
3829 static int
3830 xbb_resume(device_t dev)
3831 {
3832         return (0);
3833 }
3834
3835 /**
3836  * Handle state changes expressed via the XenStore by our front-end peer.
3837  *
3838  * \param dev             NewBus device object representing this Xen
3839  *                        Block Back instance.
3840  * \param frontend_state  The new state of the front-end.
3843  */
3844 static void
3845 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3846 {
3847         struct xbb_softc *xbb = device_get_softc(dev);
3848
3849         DPRINTF("frontend_state=%s, xbb_state=%s\n",
3850                 xenbus_strstate(frontend_state),
3851                 xenbus_strstate(xenbus_get_state(xbb->dev)));
3852
3853         switch (frontend_state) {
3854         case XenbusStateInitialising:
3855                 break;
3856         case XenbusStateInitialised:
3857         case XenbusStateConnected:
3858                 xbb_connect(xbb);
3859                 break;
3860         case XenbusStateClosing:
3861         case XenbusStateClosed:
3862                 mtx_lock(&xbb->lock);
3863                 xbb_shutdown(xbb);
3864                 mtx_unlock(&xbb->lock);
3865                 if (frontend_state == XenbusStateClosed)
3866                         xenbus_set_state(xbb->dev, XenbusStateClosed);
3867                 break;
3868         default:
3869                 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3870                                  frontend_state);
3871                 break;
3872         }
3873 }
3874
3875 /*---------------------------- NewBus Registration ---------------------------*/
3876 static device_method_t xbb_methods[] = {
3877         /* Device interface */
3878         DEVMETHOD(device_probe,         xbb_probe),
3879         DEVMETHOD(device_attach,        xbb_attach),
3880         DEVMETHOD(device_detach,        xbb_detach),
3881         DEVMETHOD(device_shutdown,      bus_generic_shutdown),
3882         DEVMETHOD(device_suspend,       xbb_suspend),
3883         DEVMETHOD(device_resume,        xbb_resume),
3884
3885         /* Xenbus interface */
3886         DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3887
3888         { 0, 0 }
3889 };
3890
3891 static driver_t xbb_driver = {
3892         "xbbd",
3893         xbb_methods,
3894         sizeof(struct xbb_softc),
3895 };
3896 devclass_t xbb_devclass;
3897
3898 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);
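
/*
 * DRIVER_MODULE(9) registers the xbbd driver on the xenbusb_back bus, so an
 * instance is probed and attached for each "vbd" device node the back-end
 * bus enumerates (see xbb_probe() above).
 */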