/*
 * XenBSD block device driver
 *
 * Copyright (c) 2009 Scott Long, Yahoo!
 * Copyright (c) 2009 Frank Suchomel, Citrix
 * Copyright (c) 2009 Doug F. Rabson, Citrix
 * Copyright (c) 2005 Kip Macy
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/module.h>

#include <machine/bus.h>
#include <sys/rman.h>
#include <machine/resource.h>
#include <machine/intr_machdep.h>
#include <machine/vmparam.h>
#include <sys/bus_dma.h>

#include <machine/_inttypes.h>
#include <machine/xen/xen-os.h>
#include <machine/xen/xenfunc.h>

#include <xen/hypervisor.h>
#include <xen/xen_intr.h>
#include <xen/evtchn.h>
#include <xen/gnttab.h>
#include <xen/interface/grant_table.h>
#include <xen/interface/io/protocols.h>
#include <xen/xenbus/xenbusvar.h>

#include <geom/geom_disk.h>

#include <dev/xen/blkfront/block.h>

#include "xenbus_if.h"

/* prototypes */
static void xb_free_command(struct xb_command *cm);
static void xb_startio(struct xb_softc *sc);
static void blkfront_connect(struct xb_softc *);
static void blkfront_closing(device_t);
static int blkfront_detach(device_t);
static int setup_blkring(struct xb_softc *);
static void blkif_int(void *);
static void blkfront_initialize(struct xb_softc *);
#if 0
static void blkif_recover(struct xb_softc *);
#endif
static int blkif_completion(struct xb_command *);
static void blkif_free(struct xb_softc *, int);
static void blkif_queue_cb(void *, bus_dma_segment_t *, int, int);

MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data");

#define GRANT_INVALID_REF 0

/* Control whether runtime update of vbds is enabled. */
#define ENABLE_VBD_UPDATE 0

#if ENABLE_VBD_UPDATE
static void vbd_update(void);
#endif

#define BLKIF_STATE_DISCONNECTED 0
#define BLKIF_STATE_CONNECTED    1
#define BLKIF_STATE_SUSPENDED    2

#ifdef notyet
static char *blkif_state_name[] = {
        [BLKIF_STATE_DISCONNECTED] = "disconnected",
        [BLKIF_STATE_CONNECTED]    = "connected",
        [BLKIF_STATE_SUSPENDED]    = "closed",
};

static char *blkif_status_name[] = {
        [BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
        [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
        [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
        [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
};
#endif

#if 0
#define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args)
#else
#define DPRINTK(fmt, args...)
#endif

static int blkif_open(struct disk *dp);
static int blkif_close(struct disk *dp);
static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td);
static int blkif_queue_request(struct xb_softc *sc, struct xb_command *cm);
static void xb_strategy(struct bio *bp);

/*
 * In order to quiesce the device during kernel dumps, outstanding requests
 * to DOM0 for disk reads/writes need to be accounted for.
 */
static  int     xb_dump(void *, void *, vm_offset_t, off_t, size_t);

/* XXX move to xb_vbd.c when VBD update support is added */
#define MAX_VBDS 64

#define XBD_SECTOR_SIZE         512     /* XXX: assume for now */
#define XBD_SECTOR_SHFT         9

/*
 * Translate Linux major/minor to an appropriate name and unit
 * number.  For HVM guests, this allows us to use the same drive names
 * with blkfront as the emulated drives, easing transition slightly.
 */
static void
blkfront_vdevice_to_unit(int vdevice, int *unit, const char **name)
{
        static struct vdev_info {
                int major;
                int shift;
                int base;
                const char *name;
        } info[] = {
                {3,     6,      0,      "ad"},  /* ide0 */
                {22,    6,      2,      "ad"},  /* ide1 */
                {33,    6,      4,      "ad"},  /* ide2 */
                {34,    6,      6,      "ad"},  /* ide3 */
                {56,    6,      8,      "ad"},  /* ide4 */
                {57,    6,      10,     "ad"},  /* ide5 */
                {88,    6,      12,     "ad"},  /* ide6 */
                {89,    6,      14,     "ad"},  /* ide7 */
                {90,    6,      16,     "ad"},  /* ide8 */
                {91,    6,      18,     "ad"},  /* ide9 */

                {8,     4,      0,      "da"},  /* scsi disk0 */
                {65,    4,      16,     "da"},  /* scsi disk1 */
                {66,    4,      32,     "da"},  /* scsi disk2 */
                {67,    4,      48,     "da"},  /* scsi disk3 */
                {68,    4,      64,     "da"},  /* scsi disk4 */
                {69,    4,      80,     "da"},  /* scsi disk5 */
                {70,    4,      96,     "da"},  /* scsi disk6 */
                {71,    4,      112,    "da"},  /* scsi disk7 */
                {128,   4,      128,    "da"},  /* scsi disk8 */
                {129,   4,      144,    "da"},  /* scsi disk9 */
                {130,   4,      160,    "da"},  /* scsi disk10 */
                {131,   4,      176,    "da"},  /* scsi disk11 */
                {132,   4,      192,    "da"},  /* scsi disk12 */
                {133,   4,      208,    "da"},  /* scsi disk13 */
                {134,   4,      224,    "da"},  /* scsi disk14 */
                {135,   4,      240,    "da"},  /* scsi disk15 */

                {202,   4,      0,      "xbd"}, /* xbd */

                {0,     0,      0,      NULL},
        };
        int major = vdevice >> 8;
        int minor = vdevice & 0xff;
        int i;

        if (vdevice & (1 << 28)) {
                *unit = (vdevice & ((1 << 28) - 1)) >> 8;
                *name = "xbd";
                return;
        }

        for (i = 0; info[i].major; i++) {
                if (info[i].major == major) {
                        *unit = info[i].base + (minor >> info[i].shift);
                        *name = info[i].name;
                        return;
                }
        }

        *unit = minor >> 4;
        *name = "xbd";
}
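
/*
 * Illustrative only (not compiled in): how the table above resolves a
 * few sample vdevice encodings.  The harness function name is
 * hypothetical.
 */
#if 0
static void
blkfront_vdevice_to_unit_example(void)
{
        int unit;
        const char *name;

        /* Major 8 (scsi disk0), minor 0x41: unit = 0 + (0x41 >> 4) = 4. */
        blkfront_vdevice_to_unit(0x0841, &unit, &name);                 /* "da", 4 */

        /* Major 202 (xbd), minor 16: unit = 0 + (16 >> 4) = 1. */
        blkfront_vdevice_to_unit((202 << 8) | 16, &unit, &name);        /* "xbd", 1 */

        /* Extended ID: bit 28 set, unit taken directly from bits 8-27. */
        blkfront_vdevice_to_unit((1 << 28) | (5 << 8), &unit, &name);   /* "xbd", 5 */
}
#endif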

int
xlvbd_add(struct xb_softc *sc, blkif_sector_t sectors,
    int vdevice, uint16_t vdisk_info, unsigned long sector_size)
{
        int     unit, error = 0;
        const char *name;

        blkfront_vdevice_to_unit(vdevice, &unit, &name);

        sc->xb_unit = unit;

        if (strcmp(name, "xbd"))
                device_printf(sc->xb_dev, "attaching as %s%d\n", name, unit);

        sc->xb_disk = disk_alloc();
        sc->xb_disk->d_unit = sc->xb_unit;
        sc->xb_disk->d_open = blkif_open;
        sc->xb_disk->d_close = blkif_close;
        sc->xb_disk->d_ioctl = blkif_ioctl;
        sc->xb_disk->d_strategy = xb_strategy;
        sc->xb_disk->d_dump = xb_dump;
        sc->xb_disk->d_name = name;
        sc->xb_disk->d_drv1 = sc;
        sc->xb_disk->d_sectorsize = sector_size;

        sc->xb_disk->d_mediasize = sectors * sector_size;
        sc->xb_disk->d_maxsize = sc->max_request_size;
        sc->xb_disk->d_flags = 0;
        disk_create(sc->xb_disk, DISK_VERSION_00);

        return (error);
}

/************************ end VBD support *****************/

/*
 * Read/write routine for a buffer.  Finds the proper unit, places it on
 * the sort queue, and kicks the controller.
 */
static void
xb_strategy(struct bio *bp)
{
        struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1;

        /* bogus disk? */
        if (sc == NULL) {
                bp->bio_error = EINVAL;
                bp->bio_flags |= BIO_ERROR;
                bp->bio_resid = bp->bio_bcount;
                biodone(bp);
                return;
        }

        /*
         * Place it in the queue of disk activities for this disk
         */
        mtx_lock(&sc->xb_io_lock);

        xb_enqueue_bio(sc, bp);
        xb_startio(sc);

        mtx_unlock(&sc->xb_io_lock);
        return;
}

static void
xb_bio_complete(struct xb_softc *sc, struct xb_command *cm)
{
        struct bio *bp;

        bp = cm->bp;

        if (unlikely(cm->status != BLKIF_RSP_OKAY)) {
                disk_err(bp, "disk error", -1, 0);
                printf(" status: %x\n", cm->status);
                bp->bio_flags |= BIO_ERROR;
        }

        if (bp->bio_flags & BIO_ERROR)
                bp->bio_error = EIO;
        else
                bp->bio_resid = 0;

        xb_free_command(cm);
        biodone(bp);
}

/* Quiesce the disk writes for a dump file before allowing the next buffer. */
static void
xb_quiesce(struct xb_softc *sc)
{
        int             mtd;

        /* While there are outstanding requests... */
        while (!TAILQ_EMPTY(&sc->cm_busy)) {
                RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, mtd);
                if (mtd) {
                        /* Received request completions, update queue. */
                        blkif_int(sc);
                }
                if (!TAILQ_EMPTY(&sc->cm_busy)) {
                        /*
                         * Still pending requests, wait for the disk i/o
                         * to complete.
                         */
                        HYPERVISOR_yield();
                }
        }
}

/* Kernel dump function for a paravirtualized disk device */
static void
xb_dump_complete(struct xb_command *cm)
{

        xb_enqueue_complete(cm);
}

static int
xb_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
        size_t length)
{
        struct  disk    *dp = arg;
        struct xb_softc *sc = (struct xb_softc *) dp->d_drv1;
        struct xb_command *cm;
        size_t          chunk;
        int             sbp;
        int             rc = 0;

        if (length <= 0)
                return (rc);

        xb_quiesce(sc); /* All quiet on the western front. */

        /*
         * If this lock is held, then this module is failing, and a
         * successful kernel dump is highly unlikely anyway.
         */
        mtx_lock(&sc->xb_io_lock);

        /* Split the 64KB block as needed */
        for (sbp = 0; length > 0; sbp++) {
                cm = xb_dequeue_free(sc);
                if (cm == NULL) {
                        mtx_unlock(&sc->xb_io_lock);
                        device_printf(sc->xb_dev, "dump: no more commands?\n");
                        return (EBUSY);
                }

                if (gnttab_alloc_grant_references(sc->max_request_segments,
                                                  &cm->gref_head) != 0) {
                        xb_free_command(cm);
                        mtx_unlock(&sc->xb_io_lock);
                        device_printf(sc->xb_dev, "no more grant allocs?\n");
                        return (EBUSY);
                }

                chunk = length > sc->max_request_size
                      ? sc->max_request_size : length;
                cm->data = virtual;
                cm->datalen = chunk;
                cm->operation = BLKIF_OP_WRITE;
                cm->sector_number = offset / dp->d_sectorsize;
                cm->cm_complete = xb_dump_complete;

                xb_enqueue_ready(cm);

                length -= chunk;
                offset += chunk;
                virtual = (char *) virtual + chunk;
        }

        /* Tell DOM0 to do the I/O */
        xb_startio(sc);
        mtx_unlock(&sc->xb_io_lock);

        /* Poll for the completion. */
        xb_quiesce(sc); /* All quiet on the eastern front */

        /* If there were any errors, bail out... */
        while ((cm = xb_dequeue_complete(sc)) != NULL) {
                if (cm->status != BLKIF_RSP_OKAY) {
                        device_printf(sc->xb_dev,
                            "Dump I/O failed at sector %jd\n",
                            (intmax_t)cm->sector_number);
                        rc = EIO;
                }
                xb_free_command(cm);
        }

        return (rc);
}
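
/*
 * Example of the chunking above (illustrative values): with a
 * max_request_size of 40960 bytes, a 64KB dump buffer is split into
 * two commands of 40960 and 24576 bytes; xb_startio() then pushes
 * both to the ring and xb_quiesce() polls for their completion.
 */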

static int
blkfront_probe(device_t dev)
{

        if (!strcmp(xenbus_get_type(dev), "vbd")) {
                device_set_desc(dev, "Virtual Block Device");
                device_quiet(dev);
                return (0);
        }

        return (ENXIO);
}

/*
 * Setup supplies the backend directory and virtual device.  We place an
 * event channel and shared frame entries, and watch the backend state to
 * learn when it is ready.
 */
static int
blkfront_attach(device_t dev)
{
        struct xb_softc *sc;
        const char *name;
        int error;
        int vdevice;
        int i;
        int unit;

        /* FIXME: Use dynamic device id if this is not set. */
        error = xs_scanf(XST_NIL, xenbus_get_node(dev),
            "virtual-device", NULL, "%i", &vdevice);
        if (error) {
                xenbus_dev_fatal(dev, error, "reading virtual-device");
                device_printf(dev, "Couldn't determine virtual device.\n");
                return (error);
        }

        blkfront_vdevice_to_unit(vdevice, &unit, &name);
        if (!strcmp(name, "xbd"))
                device_set_unit(dev, unit);

        sc = device_get_softc(dev);
        mtx_init(&sc->xb_io_lock, "blkfront i/o lock", NULL, MTX_DEF);
        xb_initq_free(sc);
        xb_initq_busy(sc);
        xb_initq_ready(sc);
        xb_initq_complete(sc);
        xb_initq_bio(sc);
        for (i = 0; i < XBF_MAX_RING_PAGES; i++)
                sc->ring_ref[i] = GRANT_INVALID_REF;

        sc->xb_dev = dev;
        sc->vdevice = vdevice;
        sc->connected = BLKIF_STATE_DISCONNECTED;

        /* Front end dir is a number, which is used as the id. */
        sc->handle = strtoul(strrchr(xenbus_get_node(dev),'/')+1, NULL, 0);

        /* Wait for backend device to publish its protocol capabilities. */
        xenbus_set_state(dev, XenbusStateInitialising);

        return (0);
}

static int
blkfront_suspend(device_t dev)
{
        struct xb_softc *sc = device_get_softc(dev);

        /* Prevent new requests being issued until we fix things up. */
        mtx_lock(&sc->xb_io_lock);
        sc->connected = BLKIF_STATE_SUSPENDED;
        mtx_unlock(&sc->xb_io_lock);

        return (0);
}

static int
blkfront_resume(device_t dev)
{
#if 0
        struct xb_softc *sc = device_get_softc(dev);

        DPRINTK("blkfront_resume: %s\n", xenbus_get_node(dev));

/* XXX This can't work!!! */
        blkif_free(sc, 1);
        blkfront_initialize(sc);
        if (sc->connected == BLKIF_STATE_SUSPENDED)
                blkif_recover(sc);
#endif
        return (0);
}

static void
blkfront_initialize(struct xb_softc *sc)
{
        const char *otherend_path;
        const char *node_path;
        int error;
        int i;

        if (xenbus_get_state(sc->xb_dev) != XenbusStateInitialising)
                return;

        /*
         * Protocol defaults valid even if negotiation for a
         * setting fails.
         */
        sc->ring_pages = 1;
        sc->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
        sc->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
        sc->max_request_size = (sc->max_request_segments - 1) * PAGE_SIZE;
        sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);

        /*
         * Protocol negotiation.
         *
         * \note xs_gather() returns on the first encountered error, so
         *       we must use independent calls in order to guarantee
         *       we don't miss information in a sparsely populated back-end
         *       tree.
         */
        otherend_path = xenbus_get_otherend_path(sc->xb_dev);
        node_path = xenbus_get_node(sc->xb_dev);
        (void)xs_scanf(XST_NIL, otherend_path,
                       "max-ring-pages", NULL, "%" PRIu32,
                       &sc->ring_pages);

        (void)xs_scanf(XST_NIL, otherend_path,
                       "max-requests", NULL, "%" PRIu32,
                       &sc->max_requests);

        (void)xs_scanf(XST_NIL, otherend_path,
                       "max-request-segments", NULL, "%" PRIu32,
                       &sc->max_request_segments);

        (void)xs_scanf(XST_NIL, otherend_path,
                       "max-request-size", NULL, "%" PRIu32,
                       &sc->max_request_size);

        if (sc->ring_pages > XBF_MAX_RING_PAGES) {
                device_printf(sc->xb_dev, "Back-end specified ring-pages of "
                              "%u limited to front-end limit of %zu.\n",
                              sc->ring_pages, XBF_MAX_RING_PAGES);
                sc->ring_pages = XBF_MAX_RING_PAGES;
        }

        if (sc->max_requests > XBF_MAX_REQUESTS) {
                device_printf(sc->xb_dev, "Back-end specified max_requests of "
                              "%u limited to front-end limit of %u.\n",
                              sc->max_requests, XBF_MAX_REQUESTS);
                sc->max_requests = XBF_MAX_REQUESTS;
        }

        if (sc->max_request_segments > XBF_MAX_SEGMENTS_PER_REQUEST) {
                device_printf(sc->xb_dev, "Back-end specified "
                              "max_request_segments of %u limited to "
                              "front-end limit of %u.\n",
                              sc->max_request_segments,
                              XBF_MAX_SEGMENTS_PER_REQUEST);
                sc->max_request_segments = XBF_MAX_SEGMENTS_PER_REQUEST;
        }

        if (sc->max_request_size > XBF_MAX_REQUEST_SIZE) {
                device_printf(sc->xb_dev, "Back-end specified "
                              "max_request_size of %u limited to front-end "
                              "limit of %u.\n", sc->max_request_size,
                              XBF_MAX_REQUEST_SIZE);
                sc->max_request_size = XBF_MAX_REQUEST_SIZE;
        }
        sc->max_request_blocks = BLKIF_SEGS_TO_BLOCKS(sc->max_request_segments);

        /* Allocate data structures based on negotiated values. */
        error = bus_dma_tag_create(NULL,                /* parent */
                                   512, PAGE_SIZE,      /* algnmnt, boundary */
                                   BUS_SPACE_MAXADDR,   /* lowaddr */
                                   BUS_SPACE_MAXADDR,   /* highaddr */
                                   NULL, NULL,          /* filter, filterarg */
                                   sc->max_request_size,
                                   sc->max_request_segments,
                                   PAGE_SIZE,           /* maxsegsize */
                                   BUS_DMA_ALLOCNOW,    /* flags */
                                   busdma_lock_mutex,   /* lockfunc */
                                   &sc->xb_io_lock,     /* lockarg */
                                   &sc->xb_io_dmat);
        if (error != 0) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "Cannot allocate parent DMA tag\n");
                return;
        }

        /* Per-transaction data allocation. */
        sc->shadow = malloc(sizeof(*sc->shadow) * sc->max_requests,
                            M_XENBLOCKFRONT, M_NOWAIT|M_ZERO);
        if (sc->shadow == NULL) {
                xenbus_dev_fatal(sc->xb_dev, ENOMEM,
                                 "Cannot allocate request structures\n");
                return;
        }

        for (i = 0; i < sc->max_requests; i++) {
                struct xb_command *cm;

                cm = &sc->shadow[i];
                cm->sg_refs = malloc(sizeof(grant_ref_t)
                                   * sc->max_request_segments,
                                     M_XENBLOCKFRONT, M_NOWAIT);
                if (cm->sg_refs == NULL)
                        break;
                cm->id = i;
                cm->cm_sc = sc;
                if (bus_dmamap_create(sc->xb_io_dmat, 0, &cm->map) != 0)
                        break;
                xb_free_command(cm);
        }

        if (setup_blkring(sc) != 0)
                return;

        error = xs_printf(XST_NIL, node_path,
                         "ring-pages", "%u", sc->ring_pages);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/ring-pages",
                                 node_path);
                return;
        }

        error = xs_printf(XST_NIL, node_path,
                         "max-requests", "%u", sc->max_requests);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/max-requests",
                                 node_path);
                return;
        }

        error = xs_printf(XST_NIL, node_path,
                         "max-request-segments", "%u", sc->max_request_segments);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/max-request-segments",
                                 node_path);
                return;
        }

        error = xs_printf(XST_NIL, node_path,
                         "max-request-size", "%u", sc->max_request_size);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/max-request-size",
                                 node_path);
                return;
        }

        error = xs_printf(XST_NIL, node_path, "event-channel",
                          "%u", irq_to_evtchn_port(sc->irq));
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/event-channel",
                                 node_path);
                return;
        }

        error = xs_printf(XST_NIL, node_path,
                          "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                                 "writing %s/protocol",
                                 node_path);
                return;
        }

        xenbus_set_state(sc->xb_dev, XenbusStateInitialised);
}

static int
setup_blkring(struct xb_softc *sc)
{
        blkif_sring_t *sring;
        uintptr_t sring_page_addr;
        int error;
        int i;

        sring = malloc(sc->ring_pages * PAGE_SIZE, M_XENBLOCKFRONT,
                       M_NOWAIT|M_ZERO);
        if (sring == NULL) {
                xenbus_dev_fatal(sc->xb_dev, ENOMEM, "allocating shared ring");
                return (ENOMEM);
        }
        SHARED_RING_INIT(sring);
        FRONT_RING_INIT(&sc->ring, sring, sc->ring_pages * PAGE_SIZE);

        for (i = 0, sring_page_addr = (uintptr_t)sring;
             i < sc->ring_pages;
             i++, sring_page_addr += PAGE_SIZE) {

                error = xenbus_grant_ring(sc->xb_dev,
                    (vtomach(sring_page_addr) >> PAGE_SHIFT), &sc->ring_ref[i]);
                if (error) {
                        xenbus_dev_fatal(sc->xb_dev, error,
                                         "granting ring_ref(%d)", i);
                        return (error);
                }
        }
        error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
                          "ring-ref", "%u", sc->ring_ref[0]);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error, "writing %s/ring-ref",
                                 xenbus_get_node(sc->xb_dev));
                return (error);
        }
        for (i = 1; i < sc->ring_pages; i++) {
                char ring_ref_name[] = "ring-refXX";

                snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
                error = xs_printf(XST_NIL, xenbus_get_node(sc->xb_dev),
                                 ring_ref_name, "%u", sc->ring_ref[i]);
                if (error) {
                        xenbus_dev_fatal(sc->xb_dev, error, "writing %s/%s",
                                         xenbus_get_node(sc->xb_dev),
                                         ring_ref_name);
                        return (error);
                }
        }

        error = bind_listening_port_to_irqhandler(
            xenbus_get_otherend_id(sc->xb_dev),
            "xbd", (driver_intr_t *)blkif_int, sc,
            INTR_TYPE_BIO | INTR_MPSAFE, &sc->irq);
        if (error) {
                xenbus_dev_fatal(sc->xb_dev, error,
                    "bind_evtchn_to_irqhandler failed");
                return (error);
        }

        return (0);
}
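
/*
 * Illustrative XenStore view (example values only): after
 * blkfront_initialize() and setup_blkring() have run, the front-end's
 * device node contains entries along these lines:
 *
 *      ring-pages = "1"
 *      max-requests = "32"
 *      max-request-segments = "11"
 *      max-request-size = "40960"
 *      ring-ref = "8"
 *      event-channel = "11"
 *      protocol = "x86_64-abi"
 *
 * Multi-page rings additionally publish ring-ref1 ... ring-refN, as
 * written in the loop above.
 */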

/**
 * Callback received when the backend's state changes.
 */
static int
blkfront_backend_changed(device_t dev, XenbusState backend_state)
{
        struct xb_softc *sc = device_get_softc(dev);

        DPRINTK("backend_state=%d\n", backend_state);

        switch (backend_state) {
        case XenbusStateUnknown:
        case XenbusStateInitialising:
        case XenbusStateReconfigured:
        case XenbusStateReconfiguring:
        case XenbusStateClosed:
                break;

        case XenbusStateInitWait:
                blkfront_initialize(sc);
                break;

        case XenbusStateInitialised:
        case XenbusStateConnected:
                blkfront_initialize(sc);
                blkfront_connect(sc);
                break;

        case XenbusStateClosing:
                if (sc->users > 0)
                        xenbus_dev_error(dev, -EBUSY,
                                         "Device in use; refusing to close");
                else
                        blkfront_closing(dev);
                break;
        }

        return (0);
}

/*
** Invoked when the backend is finally 'ready' (and has produced the
** details about the physical device - #sectors, size, etc).
*/
static void
blkfront_connect(struct xb_softc *sc)
{
        device_t dev = sc->xb_dev;
        unsigned long sectors, sector_size;
        unsigned long feature_barrier;
        unsigned int binfo;
        int err;

        if ((sc->connected == BLKIF_STATE_CONNECTED) ||
            (sc->connected == BLKIF_STATE_SUSPENDED))
                return;

        DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev));

        err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
                        "sectors", "%lu", &sectors,
                        "info", "%u", &binfo,
                        "sector-size", "%lu", &sector_size,
                        NULL);
        if (err) {
                xenbus_dev_fatal(dev, err,
                    "reading backend fields at %s",
                    xenbus_get_otherend_path(dev));
                return;
        }
        err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
                        "feature-barrier", "%lu", &feature_barrier,
                        NULL);
        if (err == 0 && feature_barrier != 0)
                sc->xb_flags |= XB_BARRIER;

        device_printf(dev, "%juMB <%s> at %s",
            (uintmax_t) sectors / (1048576 / sector_size),
            device_get_desc(dev),
            xenbus_get_node(dev));
        bus_print_child_footer(device_get_parent(dev), dev);

        xlvbd_add(sc, sectors, sc->vdevice, binfo, sector_size);

        (void)xenbus_set_state(dev, XenbusStateConnected);

        /* Kick pending requests. */
        mtx_lock(&sc->xb_io_lock);
        sc->connected = BLKIF_STATE_CONNECTED;
        xb_startio(sc);
        sc->xb_flags |= XB_READY;
        mtx_unlock(&sc->xb_io_lock);
}

/**
 * Handle the change of state of the backend to Closing.  We must delete our
 * device-layer structures now, to ensure that writes are flushed through to
 * the backend.  Once this is done, we can switch to Closed in
 * acknowledgement.
 */
static void
blkfront_closing(device_t dev)
{
        struct xb_softc *sc = device_get_softc(dev);

        xenbus_set_state(dev, XenbusStateClosing);

        DPRINTK("blkfront_closing: %s removed\n", xenbus_get_node(dev));

        if (sc->xb_disk != NULL) {
                disk_destroy(sc->xb_disk);
                sc->xb_disk = NULL;
        }

        xenbus_set_state(dev, XenbusStateClosed);
}

static int
blkfront_detach(device_t dev)
{
        struct xb_softc *sc = device_get_softc(dev);

        DPRINTK("blkfront_remove: %s removed\n", xenbus_get_node(dev));

        blkif_free(sc, 0);
        mtx_destroy(&sc->xb_io_lock);

        return (0);
}

static inline void
flush_requests(struct xb_softc *sc)
{
        int notify;

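        /*
         * Publish our request producer index; the macro reports in
         * 'notify' whether the backend actually needs an event (it is
         * skipped when the backend is already polling the ring).
         */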
        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->ring, notify);

        if (notify)
                notify_remote_via_irq(sc->irq);
}

static void
blkif_restart_queue_callback(void *arg)
{
        struct xb_softc *sc = arg;

        mtx_lock(&sc->xb_io_lock);

        xb_startio(sc);

        mtx_unlock(&sc->xb_io_lock);
}

static int
blkif_open(struct disk *dp)
{
        struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;

        if (sc == NULL) {
                printf("xb%d: not found", dp->d_unit);
                return (ENXIO);
        }

        sc->xb_flags |= XB_OPEN;
        sc->users++;
        return (0);
}

static int
blkif_close(struct disk *dp)
{
        struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;

        if (sc == NULL)
                return (ENXIO);
        sc->xb_flags &= ~XB_OPEN;
        if (--(sc->users) == 0) {
                /*
                 * Check whether we have been instructed to close.  We will
                 * have ignored this request initially, as the device was
                 * still mounted.
                 */
                device_t dev = sc->xb_dev;
                XenbusState state =
                        xenbus_read_driver_state(xenbus_get_otherend_path(dev));

                if (state == XenbusStateClosing)
                        blkfront_closing(dev);
        }
        return (0);
}

static int
blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td)
{
        struct xb_softc *sc = (struct xb_softc *)dp->d_drv1;

        if (sc == NULL)
                return (ENXIO);

        return (ENOTTY);
}

static void
xb_free_command(struct xb_command *cm)
{

        KASSERT((cm->cm_flags & XB_ON_XBQ_MASK) == 0,
            ("Freeing command that is still on a queue\n"));

        cm->cm_flags = 0;
        cm->bp = NULL;
        cm->cm_complete = NULL;
        xb_enqueue_free(cm);
}

/*
 * blkif_queue_request
 *
 * request block I/O
 *
 * id: for guest use only.
 * operation: BLKIF_OP_{READ,WRITE,PROBE}
 * buffer: buffer to read/write into.  This should be a
 *   virtual address in the guest OS.
 */
static struct xb_command *
xb_bio_command(struct xb_softc *sc)
{
        struct xb_command *cm;
        struct bio *bp;

        if (unlikely(sc->connected != BLKIF_STATE_CONNECTED))
                return (NULL);

        bp = xb_dequeue_bio(sc);
        if (bp == NULL)
                return (NULL);

        if ((cm = xb_dequeue_free(sc)) == NULL) {
                xb_requeue_bio(sc, bp);
                return (NULL);
        }

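        /*
         * Grant references are a finite, shared resource.  If none are
         * available, freeze the request queue, requeue the bio, and ask
         * gnttab to invoke blkif_restart_queue_callback() once enough
         * references are free again.
         */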
        if (gnttab_alloc_grant_references(sc->max_request_segments,
            &cm->gref_head) != 0) {
                gnttab_request_free_callback(&sc->callback,
                        blkif_restart_queue_callback, sc,
                        sc->max_request_segments);
                xb_requeue_bio(sc, bp);
                xb_enqueue_free(cm);
                sc->xb_flags |= XB_FROZEN;
                return (NULL);
        }

        cm->bp = bp;
        cm->data = bp->bio_data;
        cm->datalen = bp->bio_bcount;
        cm->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ :
            BLKIF_OP_WRITE;
        cm->sector_number = (blkif_sector_t)bp->bio_pblkno;

        return (cm);
}

static int
blkif_queue_request(struct xb_softc *sc, struct xb_command *cm)
{
        int     error;

        error = bus_dmamap_load(sc->xb_io_dmat, cm->map, cm->data, cm->datalen,
            blkif_queue_cb, cm, 0);
        if (error == EINPROGRESS) {
                printf("EINPROGRESS\n");
                sc->xb_flags |= XB_FROZEN;
                cm->cm_flags |= XB_CMD_FROZEN;
                return (0);
        }

        return (error);
}

static void
blkif_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
        struct xb_softc *sc;
        struct xb_command *cm;
        blkif_request_t *ring_req;
        struct blkif_request_segment *sg;
        struct blkif_request_segment *last_block_sg;
        grant_ref_t *sg_ref;
        vm_paddr_t buffer_ma;
        uint64_t fsect, lsect;
        int ref;
        int op;
        int block_segs;

        cm = arg;
        sc = cm->cm_sc;

        if (error) {
                printf("error %d in blkif_queue_cb\n", error);
                cm->bp->bio_error = EIO;
                biodone(cm->bp);
                xb_free_command(cm);
                return;
        }

        /* Fill out a communications ring structure. */
        ring_req = RING_GET_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
        sc->ring.req_prod_pvt++;
        ring_req->id = cm->id;
        ring_req->operation = cm->operation;
        ring_req->sector_number = cm->sector_number;
        ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk;
        ring_req->nr_segments = nsegs;
        cm->nseg = nsegs;

        block_segs    = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
        sg            = ring_req->seg;
        last_block_sg = sg + block_segs;
        sg_ref        = cm->sg_refs;

        while (1) {

                while (sg < last_block_sg) {
                        buffer_ma = segs->ds_addr;
                        fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT;
                        lsect = fsect + (segs->ds_len  >> XBD_SECTOR_SHFT) - 1;

                        KASSERT(lsect <= 7, ("XEN disk driver data cannot "
                                "cross a page boundary"));

                        /* install a grant reference. */
                        ref = gnttab_claim_grant_reference(&cm->gref_head);

                        /*
                         * GNTTAB_LIST_END == 0xffffffff, but it is private
                         * to gnttab.c.
                         */
                        KASSERT(ref != ~0, ("grant_reference failed"));

                        gnttab_grant_foreign_access_ref(
                                ref,
                                xenbus_get_otherend_id(sc->xb_dev),
                                buffer_ma >> PAGE_SHIFT,
                                ring_req->operation == BLKIF_OP_WRITE);

                        *sg_ref = ref;
                        *sg = (struct blkif_request_segment) {
                                .gref       = ref,
                                .first_sect = fsect,
                                .last_sect  = lsect };
                        sg++;
                        sg_ref++;
                        segs++;
                        nsegs--;
                }
                block_segs = MIN(nsegs, BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
                if (block_segs == 0)
                        break;

                sg = BLKRING_GET_SG_REQUEST(&sc->ring, sc->ring.req_prod_pvt);
                sc->ring.req_prod_pvt++;
                last_block_sg = sg + block_segs;
        }

        if (cm->operation == BLKIF_OP_READ)
                op = BUS_DMASYNC_PREREAD;
        else if (cm->operation == BLKIF_OP_WRITE)
                op = BUS_DMASYNC_PREWRITE;
        else
                op = 0;
        bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);

        gnttab_free_grant_references(cm->gref_head);

        xb_enqueue_busy(cm);

        /*
         * This flag means that we're probably executing in the busdma swi
         * instead of in the startio context, so an explicit flush is needed.
         */
        if (cm->cm_flags & XB_CMD_FROZEN)
                flush_requests(sc);

        return;
}
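
/*
 * Sector-range arithmetic example (illustrative): a DMA segment that
 * starts 0x400 bytes into its page with a length of 0x800 bytes yields
 * fsect = 0x400 >> 9 = 2 and lsect = 2 + (0x800 >> 9) - 1 = 5; i.e.,
 * sectors 2 through 5 of the eight 512-byte sectors in a 4KB page,
 * which satisfies the "lsect <= 7" assertion above.
 */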

/*
 * Dequeue buffers and place them in the shared communication ring.
 * Return when no more requests can be accepted or all buffers have
 * been queued.
 *
 * Signal XEN once the ring has been filled out.
 */
static void
xb_startio(struct xb_softc *sc)
{
        struct xb_command *cm;
        int error, queued = 0;

        mtx_assert(&sc->xb_io_lock, MA_OWNED);

        while (RING_FREE_REQUESTS(&sc->ring) >= sc->max_request_blocks) {
                if (sc->xb_flags & XB_FROZEN)
                        break;

                cm = xb_dequeue_ready(sc);

                if (cm == NULL)
                        cm = xb_bio_command(sc);

                if (cm == NULL)
                        break;

                if ((error = blkif_queue_request(sc, cm)) != 0) {
                        printf("blkif_queue_request returned %d\n", error);
                        break;
                }
                queued++;
        }

        if (queued != 0)
                flush_requests(sc);
}

static void
blkif_int(void *xsc)
{
        struct xb_softc *sc = xsc;
        struct xb_command *cm;
        blkif_response_t *bret;
        RING_IDX i, rp;
        int op;

        mtx_lock(&sc->xb_io_lock);

        if (unlikely(sc->connected != BLKIF_STATE_CONNECTED)) {
                mtx_unlock(&sc->xb_io_lock);
                return;
        }

 again:
        rp = sc->ring.sring->rsp_prod;
        rmb(); /* Ensure we see queued responses up to 'rp'. */

        for (i = sc->ring.rsp_cons; i != rp;) {
                bret = RING_GET_RESPONSE(&sc->ring, i);
                cm   = &sc->shadow[bret->id];

                xb_remove_busy(cm);
                i += blkif_completion(cm);

                if (cm->operation == BLKIF_OP_READ)
                        op = BUS_DMASYNC_POSTREAD;
                else if (cm->operation == BLKIF_OP_WRITE)
                        op = BUS_DMASYNC_POSTWRITE;
                else
                        op = 0;
                bus_dmamap_sync(sc->xb_io_dmat, cm->map, op);
                bus_dmamap_unload(sc->xb_io_dmat, cm->map);

                /*
                 * If commands are completing then resources are probably
                 * being freed as well.  It's a cheap assumption even when
                 * wrong.
                 */
                sc->xb_flags &= ~XB_FROZEN;

                /*
                 * Directly call the i/o complete routine to save an
                 * indirection in the common case.
                 */
                cm->status = bret->status;
                if (cm->bp)
                        xb_bio_complete(sc, cm);
                else if (cm->cm_complete)
                        (cm->cm_complete)(cm);
                else
                        xb_free_command(cm);
        }

        sc->ring.rsp_cons = i;

        if (i != sc->ring.req_prod_pvt) {
                int more_to_do;
                RING_FINAL_CHECK_FOR_RESPONSES(&sc->ring, more_to_do);
                if (more_to_do)
                        goto again;
        } else {
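                /*
                 * All outstanding requests have been answered; ask the
                 * backend for an event as soon as it posts a response
                 * past this point.
                 */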
                sc->ring.sring->rsp_event = i + 1;
        }

        xb_startio(sc);

        mtx_unlock(&sc->xb_io_lock);
}

static void
blkif_free(struct xb_softc *sc, int suspend)
{
        uint8_t *sring_page_ptr;
        int i;

        /* Prevent new requests being issued until we fix things up. */
        mtx_lock(&sc->xb_io_lock);
        sc->connected = suspend ?
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        mtx_unlock(&sc->xb_io_lock);

        /* Free resources associated with old device channel. */
        if (sc->ring.sring != NULL) {
                sring_page_ptr = (uint8_t *)sc->ring.sring;
                for (i = 0; i < sc->ring_pages; i++) {
                        if (sc->ring_ref[i] != GRANT_INVALID_REF) {
                                gnttab_end_foreign_access_ref(sc->ring_ref[i]);
                                sc->ring_ref[i] = GRANT_INVALID_REF;
                        }
                        sring_page_ptr += PAGE_SIZE;
                }
                free(sc->ring.sring, M_XENBLOCKFRONT);
                sc->ring.sring = NULL;
        }

        if (sc->shadow) {

                for (i = 0; i < sc->max_requests; i++) {
                        struct xb_command *cm;

                        cm = &sc->shadow[i];
                        if (cm->sg_refs != NULL) {
                                free(cm->sg_refs, M_XENBLOCKFRONT);
                                cm->sg_refs = NULL;
                        }

                        bus_dmamap_destroy(sc->xb_io_dmat, cm->map);
                }
                free(sc->shadow, M_XENBLOCKFRONT);
                sc->shadow = NULL;
        }

        if (sc->irq) {
                unbind_from_irqhandler(sc->irq);
                sc->irq = 0;
        }
}

static int
blkif_completion(struct xb_command *s)
{
        gnttab_end_foreign_access_references(s->nseg, s->sg_refs);
        return (BLKIF_SEGS_TO_BLOCKS(s->nseg));
}

#if 0
static void
blkif_recover(struct xb_softc *sc)
{
        /*
         * XXX The whole concept of not quiescing and completing all i/o
         * during suspend, and then hoping to recover and replay the
         * resulting abandoned I/O during resume, is laughable.  At best,
         * it invalidates the i/o ordering rules required by just about
         * every filesystem, and at worst it'll corrupt data.  The code
         * has been removed until further notice.
         */
}
#endif

/* ** Driver registration ** */
static device_method_t blkfront_methods[] = {
        /* Device interface */
        DEVMETHOD(device_probe,         blkfront_probe),
        DEVMETHOD(device_attach,        blkfront_attach),
        DEVMETHOD(device_detach,        blkfront_detach),
        DEVMETHOD(device_shutdown,      bus_generic_shutdown),
        DEVMETHOD(device_suspend,       blkfront_suspend),
        DEVMETHOD(device_resume,        blkfront_resume),

        /* Xenbus interface */
        DEVMETHOD(xenbus_otherend_changed, blkfront_backend_changed),

        { 0, 0 }
};

static driver_t blkfront_driver = {
        "xbd",
        blkfront_methods,
        sizeof(struct xb_softc),
};
devclass_t blkfront_devclass;

DRIVER_MODULE(xbd, xenbusb_front, blkfront_driver, blkfront_devclass, 0, 0);