1 /*
2  * XenBSD block device driver
3  *
4  * Copyright (c) 2009 Scott Long, Yahoo!
5  * Copyright (c) 2009 Frank Suchomel, Citrix
6  * Copyright (c) 2009 Doug F. Rabson, Citrix
7  * Copyright (c) 2005 Kip Macy
8  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
9  * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
10  *
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a copy
13  * of this software and associated documentation files (the "Software"), to
14  * deal in the Software without restriction, including without limitation the
15  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
16  * sell copies of the Software, and to permit persons to whom the Software is
17  * furnished to do so, subject to the following conditions:
18  *
19  * The above copyright notice and this permission notice shall be included in
20  * all copies or substantial portions of the Software.
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  */
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/kernel.h>
37 #include <vm/vm.h>
38 #include <vm/pmap.h>
39
40 #include <sys/bio.h>
41 #include <sys/bus.h>
42 #include <sys/conf.h>
43 #include <sys/module.h>
44
45 #include <machine/bus.h>
46 #include <sys/rman.h>
47 #include <machine/resource.h>
48 #include <machine/intr_machdep.h>
49 #include <machine/vmparam.h>
50 #include <sys/bus_dma.h>
51
52 #include <machine/_inttypes.h>
53 #include <machine/xen/xen-os.h>
54 #include <machine/xen/xenvar.h>
55 #include <machine/xen/xenfunc.h>
56
57 #include <xen/hypervisor.h>
58 #include <xen/xen_intr.h>
59 #include <xen/evtchn.h>
60 #include <xen/gnttab.h>
61 #include <xen/interface/grant_table.h>
62 #include <xen/interface/io/protocols.h>
63 #include <xen/xenbus/xenbusvar.h>
64
65 #include <geom/geom_disk.h>
66
67 #include <dev/xen/blkfront/block.h>
68
69 #include "xenbus_if.h"
70
71 /* prototypes */
72 static void xb_free_command(struct xb_command *cm);
73 static void xb_startio(struct xb_softc *sc);
74 static void blkfront_connect(struct xb_softc *);
75 static void blkfront_closing(device_t);
76 static int blkfront_detach(device_t);
77 static int setup_blkring(struct xb_softc *);
78 static void blkif_int(void *);
79 static void blkfront_initialize(struct xb_softc *);
80 static int blkif_completion(struct xb_command *);
81 static void blkif_free(struct xb_softc *);
82 static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int);
83
84 MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");
85
86 #define GRANT_INVALID_REF 0
87
88 /* Control whether runtime update of vbds is enabled. */
89 #define ENABLE_VBD_UPDATE 0
90
91 #if ENABLE_VBD_UPDATE
92 static void vbd_update(void);
93 #endif
94
95 #define BLKIF_STATE_DISCONNECTED 0
96 #define BLKIF_STATE_CONNECTED    1
97 #define BLKIF_STATE_SUSPENDED    2
98
99 #ifdef notyet
100 static char *blkif_state_name[] = {
101         [BLKIF_STATE_DISCONNECTED] = "disconnected",
102         [BLKIF_STATE_CONNECTED]    = "connected",
103         [BLKIF_STATE_SUSPENDED]    = "closed",
104 };
105
106 static char * blkif_status_name[] = {
107         [BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
108         [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
109         [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
110         [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
111 };
112 #endif
113
114 #if 0
115 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
116 #else
117 #define DPRINTK(fmt, args...) 
118 #endif
119
120 static int blkif_open(struct disk *dp);
121 static int blkif_close(struct disk *dp);
122 static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
123 static int blkif_queue_request(struct xb_softc *sc, struct xb_command *cm);
124 static void xb_strategy(struct bio *bp);
125
126 // In order to quiesce the device during kernel dumps, outstanding requests to
127 // DOM0 for disk reads/writes need to be accounted for.
128 static  int     xb_dump(void *, void *, vm_offset_t, off_t, size_t);
129
130 /* XXX move to xb_vbd.c when VBD update support is added */
131 #define MAX_VBDS 64
132
133 #define XBD_SECTOR_SIZE         512     /* XXX: assume for now */
134 #define XBD_SECTOR_SHFT         9
135
136 /*
137  * Translate Linux major/minor to an appropriate name and unit
138  * number. For HVM guests, this allows us to use the same drive names
139  * with blkfront as the emulated drives, easing transition slightly.
140  */
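/*
 * For example, vdevice 0x810 encodes Linux major 8, minor 16, which the
 * table below maps to "da" unit 1 (base 0 + (16 >> 4)); a vdevice with
 * bit 28 set, such as (1 << 28) | (5 << 8), uses the extended ID scheme
 * and maps directly to "xbd" unit 5.
 */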
141 static void
142 blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
143 {
144         static struct vdev_info {
145                 int major;
146                 int shift;
147                 int base;
148                 const char *name;
149         } info[] = {
150                 {3,     6,      0,      "ad"},  /* ide0 */
151                 {22,    6,      2,      "ad"},  /* ide1 */
152                 {33,    6,      4,      "ad"},  /* ide2 */
153                 {34,    6,      6,      "ad"},  /* ide3 */
154                 {56,    6,      8,      "ad"},  /* ide4 */
155                 {57,    6,      10,     "ad"},  /* ide5 */
156                 {88,    6,      12,     "ad"},  /* ide6 */
157                 {89,    6,      14,     "ad"},  /* ide7 */
158                 {90,    6,      16,     "ad"},  /* ide8 */
159                 {91,    6,      18,     "ad"},  /* ide9 */
160
161                 {8,     4,      0,      "da"},  /* scsi disk0 */
162                 {65,    4,      16,     "da"},  /* scsi disk1 */
163                 {66,    4,      32,     "da"},  /* scsi disk2 */
164                 {67,    4,      48,     "da"},  /* scsi disk3 */
165                 {68,    4,      64,     "da"},  /* scsi disk4 */
166                 {69,    4,      80,     "da"},  /* scsi disk5 */
167                 {70,    4,      96,     "da"},  /* scsi disk6 */
168                 {71,    4,      112,    "da"},  /* scsi disk7 */
169                 {128,   4,      128,    "da"},  /* scsi disk8 */
170                 {129,   4,      144,    "da"},  /* scsi disk9 */
171                 {130,   4,      160,    "da"},  /* scsi disk10 */
172                 {131,   4,      176,    "da"},  /* scsi disk11 */
173                 {132,   4,      192,    "da"},  /* scsi disk12 */
174                 {133,   4,      208,    "da"},  /* scsi disk13 */
175                 {134,   4,      224,    "da"},  /* scsi disk14 */
176                 {135,   4,      240,    "da"},  /* scsi disk15 */
177
178                 {202,   4,      0,      "xbd"}, /* xbd */
179
180                 {0,     0,      0,      NULL},
181         };
182         int major = vdevice >> 8;
183         int minor = vdevice & 0xff;
184         int i;
185
186         if (vdevice & (1 << 28)) {
187                 *unit = (vdevice & ((1 << 28) - 1)) >> 8;
188                 *name = "xbd";
                return;
189         }
190
191         for (i = 0; info[i].major; i++) {
192                 if (info[i].major == major) {
193                         *unit = info[i].base + (minor >> info[i].shift);
194                         *name = info[i].name;
195                         return;
196                 }
197         }
198
199         *unit = minor >> 4;
200         *name = "xbd";
201 }
202
203 int
204 xlvbd_add(struct xb_softc *sc, blkif_sector_t sectors,
205     int vdevice, uint16_t vdisk_info, unsigned long sector_size)
206 {
207         int     unit, error = 0;
208         const char *name;
209
210         blkfront_vdevice_to_unit(vdevice, &unit, &name);
211
212         sc->xb_unit = unit;
213
214         if (strcmp(name, "xbd"))
215                 device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit);
216
217         sc->xb_disk = disk_alloc();
218         sc->xb_disk->d_unit = sc->xb_unit;
219         sc->xb_disk->d_open = blkif_open;
220         sc->xb_disk->d_close = blkif_close;
221         sc->xb_disk->d_ioctl = blkif_ioctl;
222         sc->xb_disk->d_strategy = xb_strategy;
223         sc->xb_disk->d_dump = xb_dump;
224         sc->xb_disk->d_name = name;
225         sc->xb_disk->d_drv1 = sc;
226         sc->xb_disk->d_sectorsize = sector_size;
227
228         sc->xb_disk->d_mediasize = sectors * sector_size;
229         sc->xb_disk->d_maxsize = sc->max_request_size;
230         sc->xb_disk->d_flags = 0;
231         disk_create(sc->xb_disk, DISK_VERSION_00);
232
233         return error;
234 }
235
236 /************************ end VBD support *****************/
237
238 /*
239  * Read/write routine for a buffer.  Finds the proper unit, places it on
240  * the sortq, and kicks the controller.
241  */
242 static void
243 xb_strategy(struct bio *bp)
244 {
245         struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;
246
247         /* bogus disk? */
248         if (sc == NULL) {
249                 bp->bio_error = EINVAL;
250                 bp->bio_flags |= BIO_ERROR;
251                 bp->bio_resid = bp->bio_bcount;
252                 biodone(bp);
253                 return;
254         }
255
256         /*
257          * Place it in the queue of disk activities for this disk
258          */
259         mtx_lock(&sc->xb_io_lock);
260
261         xb_enqueue_bio(sc, bp);
262         xb_startio(sc);
263
264         mtx_unlock(&sc->xb_io_lock);
265         return;
266 }
267
268 static void
269 xb_bio_complete(struct xb_softc *sc, struct xb_command *cm)
270 {
271         struct bio *bp;
272
273         bp = cm->bp;
274
275         if ( unlikely(cm->status != BLKIF_RSP_OKAY) ) {
276                 disk_err(bp, "disk error", -1, 0);
277                 printf(" status: %x\n", cm->status);
278                 bp->bio_flags |= BIO_ERROR;
279         }
280
281         if (bp->bio_flags & BIO_ERROR)
282                 bp->bio_error = EIO;
283         else
284                 bp->bio_resid = 0;
285
286         xb_free_command(cm);
287         biodone(bp);
288 }
289
290 // Quiesce the disk writes for a dump file before allowing the next buffer.
291 static void
292 xb_quiesce(struct xb_softc *sc)
293 {
294         int             mtd;
295
296         // While there are outstanding requests
297         while (!TAILQ_EMPTY(&sc->cm_busy)) {
298                 RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, mtd);
299                 if (mtd) {
300                         /* Received request completions, update queue. */
301                         blkif_int(sc);
302                 }
303                 if (!TAILQ_EMPTY(&sc->cm_busy)) {
304                         /*
305                          * Still pending requests, wait for the disk i/o
306                          * to complete.
307                          */
308                         HYPERVISOR_yield();
309                 }
310         }
311 }
312
313 /* Kernel dump function for a paravirtualized disk device */
314 static void
315 xb_dump_complete(struct xb_command *cm)
316 {
317
318         xb_enqueue_complete(cm);
319 }
320
321 static int
322 xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
323         size_t length)
324 {
325         struct  disk    *dp = arg;
326         struct xb_softc *sc = (struct xb_softc *) dp->d_drv1;
327         struct xb_command *cm;
328         size_t          chunk;
329         int             sbp;
330         int             rc = 0;
331
332         if (length <= 0)
333                 return (rc);
334
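        /*
         * Overall flow: drain outstanding I/O, queue the dump data in
         * max_request_size chunks, kick the back-end once, then poll the
         * ring until every chunk has completed.
         */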
335         xb_quiesce(sc); /* All quiet on the western front. */
336
337         /*
338          * If this lock is held, then this module is failing, and a
339          * successful kernel dump is highly unlikely anyway.
340          */
341         mtx_lock(&sc->xb_io_lock);
342
343         /* Split the dump block into max_request_size chunks as needed */
344         for (sbp=0; length > 0; sbp++) {
345                 cm = xb_dequeue_free(sc);
346                 if (cm == NULL) {
347                         mtx_unlock(&sc->xb_io_lock);
348                         device_printf(sc->xb_dev, "dump: no more commands?\n");
349                         return (EBUSY);
350                 }
351
352                 if (gnttab_alloc_grant_references(sc->max_request_segments,
353                                                   &cm->gref_head) != 0) {
354                         xb_free_command(cm);
355                         mtx_unlock(&sc->xb_io_lock);
356                         device_printf(sc->xb_dev, "no more grant allocs?\n");
357                         return (EBUSY);
358                 }
359
360                 chunk = length > sc->max_request_size
361                       ? sc->max_request_size : length;
362                 cm->data = virtual;
363                 cm->datalen = chunk;
364                 cm->operation = BLKIF_OP_WRITE;
365                 cm->sector_number = offset / dp->d_sectorsize;
366                 cm->cm_complete = xb_dump_complete;
367
368                 xb_enqueue_ready(cm);
369
370                 length -= chunk;
371                 offset += chunk;
372                 virtual = (char *) virtual + chunk;
373         }
374
375         /* Tell DOM0 to do the I/O */
376         xb_startio(sc);
377         mtx_unlock(&sc->xb_io_lock);
378
379         /* Poll for the completion. */
380         xb_quiesce(sc); /* All quiet on the eastern front */
381
382         /* If there were any errors, bail out... */
383         while ((cm = xb_dequeue_complete(sc)) != NULL) {
384                 if (cm->status != BLKIF_RSP_OKAY) {
385                         device_printf(sc->xb_dev,
386                             "Dump I/O failed at sector %jd\n",
387                             (intmax_t)cm->sector_number);
388                         rc = EIO;
389                 }
390                 xb_free_command(cm);
391         }
392
393         return (rc);
394 }
395
396
397 static int
398 blkfront_probe(device_t dev)
399 {
400
401         if (!strcmp(xenbus_get_type(dev), "vbd")) {
402                 device_set_desc(dev, "Virtual Block Device");
403                 device_quiet(dev);
404                 return (0);
405         }
406
407         return (ENXIO);
408 }
409
410 /*
411  * Setup supplies the backend directory and virtual device.  We place an
412  * event channel and shared frame entries, and watch the backend to wait
413  * until it is ready.
414  */
415 static int
416 blkfront_attach(device_t dev)
417 {
418         struct xb_softc *sc;
419         const char *name;
420         int error;
421         int vdevice;
422         int i;
423         int unit;
424
425         /* FIXME: Use dynamic device id if this is not set. */
426         error = xs_scanf(XST_NIL, xenbus_get_node(dev),
427             "virtual-device", NULL, "%i", &vdevice);
428         if (error) {
429                 xenbus_dev_fatal(dev, error, "reading virtual-device");
430                 device_printf(dev, "Couldn't determine virtual device.\n");
431                 return (error);
432         }
433
434         blkfront_vdevice_to_unit(vdevice, &unit, &name);
435         if (!strcmp(name, "xbd"))
436                 device_set_unit(dev, unit);
437
438         sc = device_get_softc(dev);
439         mtx_init(&sc->xb_io_lock, "blkfront i/o lock", NULL, MTX_DEF);
440         xb_initq_free(sc);
441         xb_initq_busy(sc);
442         xb_initq_ready(sc);
443         xb_initq_complete(sc);
444         xb_initq_bio(sc);
445         for (i = 0; i < XBF_MAX_RING_PAGES; i++)
446                 sc->ring_ref[i] = GRANT_INVALID_REF;
447
448         sc->xb_dev = dev;
449         sc->vdevice = vdevice;
450         sc->connected = BLKIF_STATE_DISCONNECTED;
451
452         /* Wait for backend device to publish its protocol capabilities. */
453         xenbus_set_state(dev, XenbusStateInitialising);
454
455         return (0);
456 }
457
458 static int
459 blkfront_suspend(device_t dev)
460 {
461         struct xb_softc *sc = device_get_softc(dev);
462         int retval;
463         int saved_state;
464
465         /* Prevent new requests being issued until we fix things up. */
466         mtx_lock(&sc->xb_io_lock);
467         saved_state = sc->connected;
468         sc->connected = BLKIF_STATE_SUSPENDED;
469
470         /* Wait for outstanding I/O to drain. */
471         retval = 0;
472         while (TAILQ_EMPTY(&sc->cm_busy) == 0) {
473                 if (msleep(&sc->cm_busy, &sc->xb_io_lock,
474                            PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) {
475                         retval = EBUSY;
476                         break;
477                 }
478         }
479         mtx_unlock(&sc->xb_io_lock);
480
481         if (retval != 0)
482                 sc->connected = saved_state;
483
484         return (retval);
485 }
486
487 static int
488 blkfront_resume(device_t dev)
489 {
490         struct xb_softc *sc = device_get_softc(dev);
491
492         DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));
493
494         blkif_free(sc);
495         blkfront_initialize(sc);
496         return (0);
497 }
498
499 static void
500 blkfront_initialize(struct xb_softc *sc)
501 {
502         const char *otherend_path;
503         const char *node_path;
504         int error;
505         int i;
506
507         if (xenbus_get_state(sc->xb_dev) != XenbusStateInitialising) {
508                 /* Initialization has already been performed. */
509                 return;
510         }
511
512         /*
513          * Protocol defaults valid even if negotiation for a
514          * setting fails.
515          */
516         sc->ring_pages = 1;
517         sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
518         sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
519         sc->max_request_size = (sc->max_request_segments - 1) * PAGE_SIZE;
520         sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
521
522         /*
523          * Protocol negotiation.
524          *
525          * \note xs_gather() returns on the first encountered error, so
526  *       we must use independent calls in order to guarantee
527  *       we don't miss information in a sparsely populated back-end
528          *       tree.
529          */
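        /*
         * Each back-end limit read below is clamped to the corresponding
         * front-end XBF_* maximum before the negotiated values are written
         * back to our own XenStore node further down.
         */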
530         otherend_path = xenbus_get_otherend_path(sc->xb_dev);
531         node_path = xenbus_get_node(sc->xb_dev);
532         (void)xs_scanf(XST_NIL, otherend_path,
533                        "max-ring-pages", NULL, "%" PRIu32,
534                        &sc->ring_pages);
535
536         (void)xs_scanf(XST_NIL, otherend_path,
537                        "max-requests", NULL, "%" PRIu32,
538                        &sc->max_requests);
539
540         (void)xs_scanf(XST_NIL, otherend_path,
541                        "max-request-segments", NULL, "%" PRIu32,
542                        &sc->max_request_segments);
543
544         (void)xs_scanf(XST_NIL, otherend_path,
545                        "max-request-size", NULL, "%" PRIu32,
546                        &sc->max_request_size);
547
548         if (sc->ring_pages > XBF_MAX_RING_PAGES) {
549                 device_printf(sc->xb_dev, "Back-end specified ring-pages of "
550                               "%u limited to front-end limit of %zu.\n",
551                               sc->ring_pages, XBF_MAX_RING_PAGES);
552                 sc->ring_pages = XBF_MAX_RING_PAGES;
553         }
554
555         if (sc->max_requests > XBF_MAX_REQUESTS) {
556                 device_printf(sc->xb_dev, "Back-end specified max_requests of "
557                               "%u limited to front-end limit of %u.\n",
558                               sc->max_requests, XBF_MAX_REQUESTS);
559                 sc->max_requests = XBF_MAX_REQUESTS;
560         }
561
562         if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
563                 device_printf(sc->xb_dev, "Back-end specified "
564                               "max_request_segments of %u limited to "
565                               "front-end limit of %u.\n",
566                               sc->max_request_segments,
567                               XBF_MAX_SEGMENTS_PER_REQUEST);
568                 sc->max_request_segments = XBF_MAX_SEGMENTS_PER_REQUEST;
569         }
570
571         if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
572                 device_printf(sc->xb_dev, "Back-end specified "
573                               "max_request_size of %u limited to front-end "
574                               "limit of %u.\n", sc->max_request_size,
575                               XBF_MAX_REQUEST_SIZE);
576                 sc->max_request_size = XBF_MAX_REQUEST_SIZE;
577         }
578         sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);
579
580         /* Allocate datastructures based on negotiated values. */
581         error = bus_dma_tag_create(NULL,                /* parent */
582                                    512, PAGE_SIZE,      /* algnmnt, boundary */
583                                    BUS_SPACE_MAXADDR,   /* lowaddr */
584                                    BUS_SPACE_MAXADDR,   /* highaddr */
585                                    NULL, NULL,          /* filter, filterarg */
586                                    sc->max_request_size,        /* maxsize */
587                                    sc->max_request_segments,    /* nsegments */
588                                    PAGE_SIZE,           /* maxsegsize */
589                                    BUS_DMA_ALLOCNOW,    /* flags */
590                                    busdma_lock_mutex,   /* lockfunc */
591                                    &sc->xb_io_lock,     /* lockarg */
592                                    &sc->xb_io_dmat);
593         if (error != 0) {
594                 xenbus_dev_fatal(sc->xb_dev, error,
595                                  "Cannot allocate parent DMA tag\n");
596                 return;
597         }
598
599         /* Per-transaction data allocation. */
600         sc->shadow = malloc(sizeof(*sc->shadow) * sc->max_requests,
601                             M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
602         if (sc->shadow == NULL) {
603                 bus_dma_tag_destroy(sc->xb_io_dmat);
604                 xenbus_dev_fatal(sc->xb_dev, error,
605                                  "Cannot allocate request structures\n");
606                 return;
607         }
608
609         for (i = 0; i < sc->max_requests; i++) {
610                 struct xb_command *cm;
611
612                 cm = &sc->shadow[i];
613                 cm->sg_refs = malloc(sizeof(grant_ref_t)
614                                    * sc->max_request_segments,
615                                      M_XENBLOCKFRONT, M_NOWAIT);
616                 if (cm->sg_refs == NULL)
617                         break;
618                 cm->id = i;
619                 cm->cm_sc = sc;
620                 if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
621                         break;
622                 xb_free_command(cm);
623         }
624
625         if (setup_blkring(sc) != 0)
626                 return;
627
628         error = xs_printf(XST_NIL, node_path,
629                          "ring-pages","%u", sc->ring_pages);
630         if (error) {
631                 xenbus_dev_fatal(sc->xb_dev, error,
632                                  "writing %s/ring-pages",
633                                  node_path);
634                 return;
635         }
636
637         error = xs_printf(XST_NIL, node_path,
638                          "max-requests","%u", sc->max_requests);
639         if (error) {
640                 xenbus_dev_fatal(sc->xb_dev, error,
641                                  "writing %s/max-requests",
642                                  node_path);
643                 return;
644         }
645
646         error = xs_printf(XST_NIL, node_path,
647                          "max-request-segments","%u", sc->max_request_segments);
648         if (error) {
649                 xenbus_dev_fatal(sc->xb_dev, error,
650                                  "writing %s/max-request-segments",
651                                  node_path);
652                 return;
653         }
654
655         error = xs_printf(XST_NIL, node_path,
656                          "max-request-size","%u", sc->max_request_size);
657         if (error) {
658                 xenbus_dev_fatal(sc->xb_dev, error,
659                                  "writing %s/max-request-size",
660                                  node_path);
661                 return;
662         }
663
664         error = xs_printf(XST_NIL, node_path, "event-channel",
665                           "%u", irq_to_evtchn_port(sc->irq));
666         if (error) {
667                 xenbus_dev_fatal(sc->xb_dev, error,
668                                  "writing %s/event-channel",
669                                  node_path);
670                 return;
671         }
672
673         error = xs_printf(XST_NIL, node_path,
674                           "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
675         if (error) {
676                 xenbus_dev_fatal(sc->xb_dev, error,
677                                  "writing %s/protocol",
678                                  node_path);
679                 return;
680         }
681
682         xenbus_set_state(sc->xb_dev, XenbusStateInitialised);
683 }
684
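/*
 * Allocate the shared ring, grant the back-end access to each ring page,
 * publish the ring references in XenStore, and bind an event channel for
 * completion interrupts.
 */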
685 static int 
686 setup_blkring(struct xb_softc *sc)
687 {
688         blkif_sring_t *sring;
689         uintptr_t sring_page_addr;
690         int error;
691         int i;
692
693         sring = malloc(sc->ring_pages * PAGE_SIZE, M_XENBLOCKFRONT,
694                        M_NOWAIT|M_ZERO);
695         if (sring == NULL) {
696                 xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring");
697                 return (ENOMEM);
698         }
699         SHARED_RING_INIT(sring);
700         FRONT_RING_INIT(&sc->ring, sring, sc->ring_pages * PAGE_SIZE);
701
702         for (i = 0, sring_page_addr = (uintptr_t)sring;
703              i < sc->ring_pages;
704              i++, sring_page_addr += PAGE_SIZE) {
705
706                 error = xenbus_grant_ring(sc->xb_dev,
707                     (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->ring_ref[i]);
708                 if (error) {
709                         xenbus_dev_fatal(sc->xb_dev, error,
710                                          "granting ring_ref(%d)", i);
711                         return (error);
712                 }
713         }
714         error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
715                           "ring-ref","%u", sc->ring_ref[0]);
716         if (error) {
717                 xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
718                                  xenbus_get_node(sc->xb_dev));
719                 return (error);
720         }
721         for (i = 1; i < sc->ring_pages; i++) {
722                 char ring_ref_name[] = "ring_refXX";
723
724                 snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
725                 error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
726                                  ring_ref_name, "%u", sc->ring_ref[i]);
727                 if (error) {
728                         xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
729                                          xenbus_get_node(sc->xb_dev),
730                                          ring_ref_name);
731                         return (error);
732                 }
733         }
734
735         error = bind_listening_port_to_irqhandler(
736             xenbus_get_otherend_id(sc->xb_dev),
737             "xbd", (driver_intr_t *)blkif_int, sc,
738             INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq);
739         if (error) {
740                 xenbus_dev_fatal(sc->xb_dev, error,
741                     "bind_evtchn_to_irqhandler failed");
742                 return (error);
743         }
744
745         return (0);
746 }
747
748 /**
749  * Callback received when the backend's state changes.
750  */
751 static void
752 blkfront_backend_changed(device_t dev, XenbusState backend_state)
753 {
754         struct xb_softc *sc = device_get_softc(dev);
755
756         DPRINTK("backend_state=%d\n", backend_state);
757
758         switch (backend_state) {
759         case XenbusStateUnknown:
760         case XenbusStateInitialising:
761         case XenbusStateReconfigured:
762         case XenbusStateReconfiguring:
763         case XenbusStateClosed:
764                 break;
765
766         case XenbusStateInitWait:
767         case XenbusStateInitialised:
768                 blkfront_initialize(sc);
769                 break;
770
771         case XenbusStateConnected:
772                 blkfront_initialize(sc);
773                 blkfront_connect(sc);
774                 break;
775
776         case XenbusStateClosing:
777                 if (sc->users > 0)
778                         xenbus_dev_error(dev, -EBUSY,
779                                          "Device in use; refusing to close");
780                 else
781                         blkfront_closing(dev);
782                 break;  
783         }
784 }
785
786 /* 
787 ** Invoked when the backend is finally 'ready' (and has published
788 ** the details about the physical device - #sectors, size, etc). 
789 */
790 static void 
791 blkfront_connect(struct xb_softc *sc)
792 {
793         device_t dev = sc->xb_dev;
794         unsigned long sectors, sector_size;
795         unsigned int binfo;
796         int err, feature_barrier;
797
798         if( (sc->connected == BLKIF_STATE_CONNECTED) || 
799             (sc->connected == BLKIF_STATE_SUSPENDED) )
800                 return;
801
802         DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));
803
804         err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
805                         "sectors", "%lu", &sectors,
806                         "info", "%u", &binfo,
807                         "sector-size", "%lu", &sector_size,
808                         NULL);
809         if (err) {
810                 xenbus_dev_fatal(dev, err,
811                     "reading backend fields at %s",
812                     xenbus_get_otherend_path(dev));
813                 return;
814         }
815         err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
816                         "feature-barrier", "%d", &feature_barrier,
817                         NULL);
818         if (err == 0 && feature_barrier != 0)
819                 sc->xb_flags |= XB_BARRIER;
820
821         if (sc->xb_disk == NULL) {
822                 device_printf(dev, "%juMB <%s> at %s",
823                     (uintmax_t) sectors / (1048576 / sector_size),
824                     device_get_desc(dev),
825                     xenbus_get_node(dev));
826                 bus_print_child_footer(device_get_parent(dev), dev);
827
828                 xlvbd_add(sc, sectors, sc->vdevice, binfo, sector_size);
829         }
830
831         (void)xenbus_set_state(dev, XenbusStateConnected); 
832
833         /* Kick pending requests. */
834         mtx_lock(&sc->xb_io_lock);
835         sc->connected = BLKIF_STATE_CONNECTED;
836         xb_startio(sc);
837         sc->xb_flags |= XB_READY;
838         mtx_unlock(&sc->xb_io_lock);
839 }
840
841 /**
842  * Handle the change of state of the backend to Closing.  We must delete our
843  * device-layer structures now, to ensure that writes are flushed through to
844  * the backend.  Once this is done, we can switch to Closed in
845  * acknowledgement.
846  */
847 static void
848 blkfront_closing(device_t dev)
849 {
850         struct xb_softc *sc = device_get_softc(dev);
851
852         xenbus_set_state(dev, XenbusStateClosing);
853
854         DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));
855
856         if (sc->xb_disk != NULL) {
857                 disk_destroy(sc->xb_disk);
858                 sc->xb_disk = NULL;
859         }
860
861         xenbus_set_state(dev, XenbusStateClosed); 
862 }
863
864
865 static int
866 blkfront_detach(device_t dev)
867 {
868         struct xb_softc *sc = device_get_softc(dev);
869
870         DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));
871
872         blkif_free(sc);
873         mtx_destroy(&sc->xb_io_lock);
874
875         return 0;
876 }
877
878
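/*
 * Push any requests queued in the private ring index out to the shared
 * ring and notify the back-end via its event channel if it is waiting.
 */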
879 static inline void 
880 flush_requests(struct xb_softc *sc)
881 {
882         int notify;
883
884         RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->ring, notify);
885
886         if (notify)
887                 notify_remote_via_irq(sc->irq);
888 }
889
890 static void
891 blkif_restart_queue_callback(void *arg)
892 {
893         struct xb_softc *sc = arg;
894
895         mtx_lock(&sc->xb_io_lock);
896
897         xb_startio(sc);
898
899         mtx_unlock(&sc->xb_io_lock);
900 }
901
902 static int
903 blkif_open(struct disk *dp)
904 {
905         struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;
906
907         if (sc == NULL) {
908                 printf("xb%d: not found\n", dp->d_unit);
909                 return (ENXIO);
910         }
911
912         sc->xb_flags |= XB_OPEN;
913         sc->users++;
914         return (0);
915 }
916
917 static int
918 blkif_close(struct disk *dp)
919 {
920         struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;
921
922         if (sc == NULL)
923                 return (ENXIO);
924         sc->xb_flags &= ~XB_OPEN;
925         if (--(sc->users) == 0) {
926                 /* Check whether we have been instructed to close.  We will
927                    have ignored this request initially, as the device was
928                    still mounted. */
929                 device_t dev = sc->xb_dev;
930                 XenbusState state =
931                         xenbus_read_driver_state(xenbus_get_otherend_path(dev));
932
933                 if (state == XenbusStateClosing)
934                         blkfront_closing(dev);
935         }
936         return (0);
937 }
938
939 static int
940 blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
941 {
942         struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;
943
944         if (sc == NULL)
945                 return (ENXIO);
946
947         return (ENOTTY);
948 }
949
950 static void
951 xb_free_command(struct xb_command *cm)
952 {
953
954         KASSERT((cm->cm_flags & XB_ON_XBQ_MASK) == 0,
955             ("Freeing command that is still on a queue\n"));
956
957         cm->cm_flags = 0;
958         cm->bp = NULL;
959         cm->cm_complete = NULL;
960         xb_enqueue_free(cm);
961 }
962
963 /*
964  * blkif_queue_request
965  *
966  * request block io
967  * 
968  * id: for guest use only.
969  * operation: BLKIF_OP_{READ,WRITE,PROBE}
970  * buffer: buffer to read/write into. this should be a
971  *   virtual address in the guest os.
972  */
973 static struct xb_command *
974 xb_bio_command(struct xb_softc *sc)
975 {
976         struct xb_command *cm;
977         struct bio *bp;
978
979         if (unlikely(sc->connected != BLKIF_STATE_CONNECTED))
980                 return (NULL);
981
982         bp = xb_dequeue_bio(sc);
983         if (bp == NULL)
984                 return (NULL);
985
986         if ((cm = xb_dequeue_free(sc)) == NULL) {
987                 xb_requeue_bio(sc, bp);
988                 return (NULL);
989         }
990
991         if (gnttab_alloc_grant_references(sc->max_request_segments,
992             &cm->gref_head) != 0) {
993                 gnttab_request_free_callback(&sc->callback,
994                         blkif_restart_queue_callback, sc,
995                         sc->max_request_segments);
996                 xb_requeue_bio(sc, bp);
997                 xb_enqueue_free(cm);
998                 sc->xb_flags |= XB_FROZEN;
999                 return (NULL);
1000         }
1001
1002         cm->bp = bp;
1003         cm->data = bp->bio_data;
1004         cm->datalen = bp->bio_bcount;
1005         cm->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
1006             BLKIF_OP_WRITE;
1007         cm->sector_number = (blkif_sector_t)bp->bio_pblkno;
1008
1009         return (cm);
1010 }
1011
1012 static int
1013 blkif_queue_request(struct xb_softc *sc, struct xb_command *cm)
1014 {
1015         int     error;
1016
1017         error = bus_dmamap_load(sc->xb_io_dmat, cm->map, cm->data, cm->datalen,
1018             blkif_queue_cb, cm, 0);
1019         if (error == EINPROGRESS) {
1020                 printf("EINPROGRESS\n");
1021                 sc->xb_flags |= XB_FROZEN;
1022                 cm->cm_flags |= XB_CMD_FROZEN;
1023                 return (0);
1024         }
1025
1026         return (error);
1027 }
1028
1029 static void
1030 blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
1031 {
1032         struct xb_softc *sc;
1033         struct xb_command *cm;
1034         blkif_request_t *ring_req;
1035         struct blkif_request_segment *sg;
1036         struct blkif_request_segment *last_block_sg;
1037         grant_ref_t *sg_ref;
1038         vm_paddr_t buffer_ma;
1039         uint64_t fsect, lsect;
1040         int ref;
1041         int op;
1042         int block_segs;
1043
1044         cm = arg;
1045         sc = cm->cm_sc;
1046
1047 //printf("%s: Start\n", __func__);
1048         if (error) {
1049                 printf("error %d in blkif_queue_cb\n", error);
1050                 cm->bp->bio_error = EIO;
1051                 biodone(cm->bp);
1052                 xb_free_command(cm);
1053                 return;
1054         }
1055
1056         /* Fill out a communications ring structure. */
1057         ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
1058         sc->ring.req_prod_pvt++;
1059         ring_req->id = cm->id;
1060         ring_req->operation = cm->operation;
1061         ring_req->sector_number = cm->sector_number;
1062         ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
1063         ring_req->nr_segments = nsegs;
1064         cm->nseg = nsegs;
1065
1066         block_segs    = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
1067         sg            = ring_req->seg;
1068         last_block_sg = sg + block_segs;
1069         sg_ref        = cm->sg_refs;
1070
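        /*
         * The first BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK segments are carried
         * in the request itself; any remaining segments spill into additional
         * ring slots, each holding up to
         * BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK entries.
         */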
1071         while (1) {
1072
1073                 while (sg < last_block_sg) {
1074                         buffer_ma = segs->ds_addr;
1075                         fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
1076                         lsect = fsect + (segs->ds_len  >> XBD_SECTOR_SHFT) - 1;
1077
1078                         KASSERT(lsect <= 7, ("XEN disk driver data cannot "
1079                                 "cross a page boundary"));
1080
1081                         /* install a grant reference. */
1082                         ref = gnttab_claim_grant_reference(&cm->gref_head);
1083
1084                         /*
1085                          * GNTTAB_LIST_END == 0xffffffff, but it is private
1086                          * to gnttab.c.
1087                          */
1088                         KASSERT(ref != ~0, ("grant_reference failed"));
1089
1090                         gnttab_grant_foreign_access_ref(
1091                                 ref,
1092                                 xenbus_get_otherend_id(sc->xb_dev),
1093                                 buffer_ma >> PAGE_SHIFT,
1094                                 ring_req->operation == BLKIF_OP_WRITE);
1095
1096                         *sg_ref = ref;
1097                         *sg = (struct blkif_request_segment) {
1098                                 .gref       = ref,
1099                                 .first_sect = fsect, 
1100                                 .last_sect  = lsect };
1101                         sg++;
1102                         sg_ref++;
1103                         segs++;
1104                         nsegs--;
1105                 }
1106                 block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
1107                 if (block_segs == 0)
1108                         break;
1109
1110                 sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
1111                 sc->ring.req_prod_pvt++;
1112                 last_block_sg = sg + block_segs;
1113         }
1114
1115         if (cm->operation == BLKIF_OP_READ)
1116                 op = BUS_DMASYNC_PREREAD;
1117         else if (cm->operation == BLKIF_OP_WRITE)
1118                 op = BUS_DMASYNC_PREWRITE;
1119         else
1120                 op = 0;
1121         bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
1122
1123         gnttab_free_grant_references(cm->gref_head);
1124
1125         xb_enqueue_busy(cm);
1126
1127         /*
1128          * This flag means that we're probably executing in the busdma swi
1129          * instead of in the startio context, so an explicit flush is needed.
1130          */
1131         if (cm->cm_flags & XB_CMD_FROZEN)
1132                 flush_requests(sc);
1133
1134 //printf("%s: Done\n", __func__);
1135         return;
1136 }
1137
1138 /*
1139  * Dequeue buffers and place them in the shared communication ring.
1140  * Return when no more requests can be accepted or all buffers have 
1141  * been queued.
1142  *
1143  * Signal XEN once the ring has been filled out.
1144  */
1145 static void
1146 xb_startio(struct xb_softc *sc)
1147 {
1148         struct xb_command *cm;
1149         int error, queued = 0;
1150
1151         mtx_assert(&sc->xb_io_lock, MA_OWNED);
1152
1153         if (sc->connected != BLKIF_STATE_CONNECTED)
1154                 return;
1155
1156         while (RING_FREE_REQUESTS(&sc->ring) >= sc->max_request_blocks) {
1157                 if (sc->xb_flags & XB_FROZEN)
1158                         break;
1159
1160                 cm = xb_dequeue_ready(sc);
1161
1162                 if (cm == NULL)
1163                     cm = xb_bio_command(sc);
1164
1165                 if (cm == NULL)
1166                         break;
1167
1168                 if ((error = blkif_queue_request(sc, cm)) != 0) {
1169                         printf("blkif_queue_request returned %d\n", error);
1170                         break;
1171                 }
1172                 queued++;
1173         }
1174
1175         if (queued != 0) 
1176                 flush_requests(sc);
1177 }
1178
1179 static void
1180 blkif_int(void *xsc)
1181 {
1182         struct xb_softc *sc = xsc;
1183         struct xb_command *cm;
1184         blkif_response_t *bret;
1185         RING_IDX i, rp;
1186         int op;
1187
1188         mtx_lock(&sc->xb_io_lock);
1189
1190         if (unlikely(sc->connected == BLKIF_STATE_DISCONNECTED)) {
1191                 mtx_unlock(&sc->xb_io_lock);
1192                 return;
1193         }
1194
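        /*
         * Consume all pending responses.  RING_FINAL_CHECK_FOR_RESPONSES()
         * re-enables response notification and re-checks the ring, closing
         * the race with the back-end posting a response after our final pass.
         */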
1195  again:
1196         rp = sc->ring.sring->rsp_prod;
1197         rmb(); /* Ensure we see queued responses up to 'rp'. */
1198
1199         for (i = sc->ring.rsp_cons; i != rp;) {
1200                 bret = RING_GET_RESPONSE(&sc->ring, i);
1201                 cm   = &sc->shadow[bret->id];
1202
1203                 xb_remove_busy(cm);
1204                 i += blkif_completion(cm);
1205
1206                 if (cm->operation == BLKIF_OP_READ)
1207                         op = BUS_DMASYNC_POSTREAD;
1208                 else if (cm->operation == BLKIF_OP_WRITE)
1209                         op = BUS_DMASYNC_POSTWRITE;
1210                 else
1211                         op = 0;
1212                 bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
1213                 bus_dmamap_unload(sc->xb_io_dmat, cm->map);
1214
1215                 /*
1216                  * If commands are completing then resources are probably
1217                  * being freed as well.  It's a cheap assumption even when
1218                  * wrong.
1219                  */
1220                 sc->xb_flags &= ~XB_FROZEN;
1221
1222                 /*
1223                  * Directly call the i/o complete routine to save an
1224                  * indirection in the common case.
1225                  */
1226                 cm->status = bret->status;
1227                 if (cm->bp)
1228                         xb_bio_complete(sc, cm);
1229                 else if (cm->cm_complete)
1230                         (cm->cm_complete)(cm);
1231                 else
1232                         xb_free_command(cm);
1233         }
1234
1235         sc->ring.rsp_cons = i;
1236
1237         if (i != sc->ring.req_prod_pvt) {
1238                 int more_to_do;
1239                 RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, more_to_do);
1240                 if (more_to_do)
1241                         goto again;
1242         } else {
1243                 sc->ring.sring->rsp_event = i + 1;
1244         }
1245
1246         xb_startio(sc);
1247
1248         if (unlikely(sc->connected == BLKIF_STATE_SUSPENDED))
1249                 wakeup(&sc->cm_busy);
1250
1251         mtx_unlock(&sc->xb_io_lock);
1252 }
1253
1254 static void 
1255 blkif_free(struct xb_softc *sc)
1256 {
1257         uint8_t *sring_page_ptr;
1258         int i;
1259         
1260         /* Prevent new requests being issued until we fix things up. */
1261         mtx_lock(&sc->xb_io_lock);
1262         sc->connected = BLKIF_STATE_DISCONNECTED; 
1263         mtx_unlock(&sc->xb_io_lock);
1264
1265         /* Free resources associated with old device channel. */
1266         if (sc->ring.sring != NULL) {
1267                 sring_page_ptr = (uint8_t *)sc->ring.sring;
1268                 for (i = 0; i < sc->ring_pages; i++) {
1269                         if (sc->ring_ref[i] != GRANT_INVALID_REF) {
1270                                 gnttab_end_foreign_access_ref(sc->ring_ref[i]);
1271                                 sc->ring_ref[i] = GRANT_INVALID_REF;
1272                         }
1273                         sring_page_ptr += PAGE_SIZE;
1274                 }
1275                 free(sc->ring.sring, M_XENBLOCKFRONT);
1276                 sc->ring.sring = NULL;
1277         }
1278
1279         if (sc->shadow) {
1280
1281                 for (i = 0; i < sc->max_requests; i++) {
1282                         struct xb_command *cm;
1283
1284                         cm = &sc->shadow[i];
1285                         if (cm->sg_refs != NULL) {
1286                                 free(cm->sg_refs, M_XENBLOCKFRONT);
1287                                 cm->sg_refs = NULL;
1288                         }
1289
1290                         bus_dmamap_destroy(sc->xb_io_dmat, cm->map);
1291                 }
1292                 free(sc->shadow, M_XENBLOCKFRONT);
1293                 sc->shadow = NULL;
1294
1295                 bus_dma_tag_destroy(sc->xb_io_dmat);
1296                 
1297                 xb_initq_free(sc);
1298                 xb_initq_ready(sc);
1299                 xb_initq_complete(sc);
1300         }
1301                 
1302         if (sc->irq) {
1303                 unbind_from_irqhandler(sc->irq);
1304                 sc->irq = 0;
1305         }
1306 }
1307
1308 static int
1309 blkif_completion(struct xb_command *s)
1310 {
1311 //printf("%s: Req %p(%d)\n", __func__, s, s->nseg);
1312         gnttab_end_foreign_access_references(s->nseg, s->sg_refs);
1313         return (BLKIF_SEGS_TO_BLOCKS(s->nseg));
1314 }
1315
1316 /* ** Driver registration ** */
1317 static device_method_t blkfront_methods[] = { 
1318         /* Device interface */ 
1319         DEVMETHOD(device_probe,         blkfront_probe), 
1320         DEVMETHOD(device_attach,        blkfront_attach), 
1321         DEVMETHOD(device_detach,        blkfront_detach), 
1322         DEVMETHOD(device_shutdown,      bus_generic_shutdown), 
1323         DEVMETHOD(device_suspend,       blkfront_suspend), 
1324         DEVMETHOD(device_resume,        blkfront_resume), 
1325  
1326         /* Xenbus interface */
1327         DEVMETHOD(xenbus_otherend_changed, blkfront_backend_changed),
1328
1329         { 0, 0 } 
1330 }; 
1331
1332 static driver_t blkfront_driver = { 
1333         "xbd", 
1334         blkfront_methods, 
1335         sizeof(struct xb_softc),                      
1336 }; 
1337 devclass_t blkfront_devclass; 
1338  
1339 DRIVER_MODULE(xbd, xenbusb_front, blkfront_driver, blkfront_devclass, 0, 0);