bitkeeper revision 1.879.1.1 (408f7ae5PHe1i2motf-Iulpr3dEVhQ)
author kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Wed, 28 Apr 2004 09:35:33 +0000 (09:35 +0000)
committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Wed, 28 Apr 2004 09:35:33 +0000 (09:35 +0000)
Further modifications towards the new block-device drivers for the new I/O
model.

24 files changed:
.rootkeys
tools/xend/lib/domain_controller.h
tools/xend/lib/main.py
tools/xend/lib/utils.c
tools/xend/setup.py
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c [deleted file]
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h [deleted file]
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h [new file with mode: 0644]
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c [new file with mode: 0644]
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c
xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h
xenolinux-2.4.26-sparse/mm/vmalloc.c

index 74cc58825a1f30d923c820543e7013e0ac3ec722..f391d811f692683f104df0d253347ea574ffe946 100644
--- a/.rootkeys
+++ b/.rootkeys
 4087cf0dlv1Dw4MAbeRStPPG8IvPPg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
 40880cc6hHg6s2cPHbqPNQxENefjoQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
 4075806dI5kfeMD5RV-DA0PYoThx_w xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile
-4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c
-4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h
+4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h
+4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
 4075806dibjCcfuXv6CINMhxWTw3jQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
 3e5a4e65iHEuC5sjFhj42XALYbLVRw xenolinux-2.4.26-sparse/arch/xen/drivers/block/Makefile
 3e5a4e65pP5spJErBW69pJxSSdK9RA xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c
index 14f970dd046dcf8dc5e73f374e0dc7e292e82c8a..eec8402e5fa5c504d60139d51879f9db23a10ef1 100644
--- a/tools/xend/lib/domain_controller.h
+++ b/tools/xend/lib/domain_controller.h
@@ -56,14 +56,90 @@ typedef struct {
 #define CMSG_BLKIF_BE           1  /* Block-device backend  */
 #define CMSG_BLKIF_FE           2  /* Block-device frontend */
 
+
+/******************************************************************************
+ * CONSOLE DEFINITIONS
+ */
+
 /*
  * Subtypes for console messages.
  */
 #define CMSG_CONSOLE_DATA       0
 
+
+/******************************************************************************
+ * BLOCK-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED   0
+
+/* Messages from guest to domain controller. */
+#define CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED     32
+#define CMSG_BLKIF_FE_INTERFACE_UP              33
+#define CMSG_BLKIF_FE_INTERFACE_DOWN            34
+
+/* These are used by both front-end and back-end drivers. */
+#define blkif_vdev_t   u16
+#define blkif_pdev_t   u16
+#define blkif_sector_t u64
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED:
+ *  Notify a guest about a status change on one of its block interfaces.
+ *  If the interface is DESTROYED or DOWN then the interface is disconnected:
+ *   1. The shared-memory frame is available for reuse.
+ *   2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define BLKIF_INTERFACE_STATUS_DESTROYED 0 /* Interface doesn't exist.      */
+#define BLKIF_INTERFACE_STATUS_DOWN      1 /* Interface exists but is down. */
+#define BLKIF_INTERFACE_STATUS_UP        2 /* Interface exists and is up.   */
+typedef struct {
+    unsigned int handle;
+    unsigned int status;
+    unsigned int evtchn; /* status == BLKIF_INTERFACE_STATUS_UP */
+} blkif_fe_interface_status_changed_t;
+
 /*
- * Subtypes for block-device messages.
+ * CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED:
+ *  Notify the domain controller that the front-end driver is DOWN or UP.
+ *  When the driver goes DOWN, the controller will send no more
+ *  status-change notifications. When the driver comes UP, the controller
+ *  will send a notification for each interface that currently exists.
+ *  If the driver goes DOWN while interfaces are still UP, the domain
+ *  controller will automatically take the interfaces DOWN.
  */
+#define BLKIF_DRIVER_STATUS_DOWN         0
+#define BLKIF_DRIVER_STATUS_UP           1
+typedef struct {
+    unsigned int status; /* BLKIF_DRIVER_STATUS_??? */
+} blkif_fe_driver_status_changed_t;
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_UP:
+ *  If successful, the domain controller will acknowledge with a STATUS_UP
+ *  message.
+ */
+typedef struct {
+    unsigned int  handle;
+    unsigned long shmem_frame;
+} blkif_fe_interface_up_t;
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_DOWN:
+ *  If successful, the domain controller will acknowledge with a STATUS_DOWN
+ *  message.
+ */
+typedef struct {
+    unsigned int handle;
+} blkif_fe_interface_down_t;
+
+
+/******************************************************************************
+ * BLOCK-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
 #define CMSG_BLKIF_BE_CREATE      0  /* Create a new block-device interface. */
 #define CMSG_BLKIF_BE_DESTROY     1  /* Destroy a block-device interface.    */
 #define CMSG_BLKIF_BE_VBD_CREATE  2  /* Create a new VBD for an interface.   */
@@ -71,14 +147,13 @@ typedef struct {
 #define CMSG_BLKIF_BE_VBD_GROW    4  /* Append an extent to a given VBD.     */
 #define CMSG_BLKIF_BE_VBD_SHRINK  5  /* Remove last extent from a given VBD. */
 
+/* Messages to domain controller. */
+#define CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED 32
+
 /*
- * Message request/response defintions for block-device messages.
+ * Message request/response definitions for block-device messages.
  */
 
-#define blkif_vdev_t   u16
-#define blkif_pdev_t   u16
-#define blkif_sector_t u64
-
 typedef struct {
     blkif_pdev_t   device;
     blkif_sector_t sector_start;
@@ -86,21 +161,36 @@ typedef struct {
 } blkif_extent_t;
 
 /* Non-specific 'okay' return. */
-#define BLKIF_STATUS_OKAY                0
+#define BLKIF_BE_STATUS_OKAY                0
 /* Non-specific 'error' return. */
-#define BLKIF_STATUS_ERROR               1
+#define BLKIF_BE_STATUS_ERROR               1
 /* The following are specific error returns. */
-#define BLKIF_STATUS_INTERFACE_EXISTS    2
-#define BLKIF_STATUS_INTERFACE_NOT_FOUND 3
+#define BLKIF_BE_STATUS_INTERFACE_EXISTS    2
+#define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define BLKIF_BE_STATUS_VBD_EXISTS          4
+#define BLKIF_BE_STATUS_VBD_NOT_FOUND       5
+#define BLKIF_BE_STATUS_OUT_OF_MEMORY       6
+#define BLKIF_BE_STATUS_EXTENT_NOT_FOUND    7
+#define BLKIF_BE_STATUS_MAPPING_ERROR       8
 
 /* This macro can be used to create an array of descriptive error strings. */
-#define BLKIF_STATUS_ERRORS {    \
-    "Okay",                      \
-    "Non-specific error",        \
-    "Interface already exists",  \
-    "Interface not found" }
+#define BLKIF_BE_STATUS_ERRORS {   \
+    "Okay",                        \
+    "Non-specific error",          \
+    "Interface already exists",    \
+    "Interface not found",         \
+    "VBD already exists",          \
+    "VBD not found",               \
+    "Out of memory",               \
+    "Extent not found for VBD",    \
+    "Could not map domain memory" }
 
-/* CMSG_BLKIF_CREATE */
+/*
+ * CMSG_BLKIF_BE_CREATE:
+ *  When the driver sends a successful response then the interface is fully
+ *  set up. The controller will send an UP notification to the front-end
+ *  driver.
+ */
 typedef struct { 
     /* IN */
     domid_t        domid;             /* Domain attached to new interface.   */
@@ -109,18 +199,23 @@ typedef struct {
     unsigned long  shmem_frame;       /* Page cont. shared comms window.     */
     /* OUT */
     unsigned int   status;
-} blkif_create_t; 
+} blkif_be_create_t; 
 
-/* CMSG_BLKIF_DESTROY */
+/*
+ * CMSG_BLKIF_BE_DESTROY:
+ *  When the driver sends a successful response then the interface is fully
+ *  torn down. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
 typedef struct { 
     /* IN */
     domid_t        domid;             /* Identify interface to be destroyed. */
     unsigned int   blkif_handle;      /* ...ditto...                         */
     /* OUT */
     unsigned int   status;
-} blkif_destroy_t; 
+} blkif_be_destroy_t; 
 
-/* CMSG_BLKIF_VBD_CREATE */
+/* CMSG_BLKIF_BE_VBD_CREATE */
 typedef struct { 
     /* IN */
     domid_t        domid;             /* Identify blkdev interface.          */
@@ -129,9 +224,9 @@ typedef struct {
     int            readonly;          /* Non-zero -> VBD isn't writeable.    */
     /* OUT */
     unsigned int   status;
-} blkif_vbd_create_t; 
+} blkif_be_vbd_create_t; 
 
-/* CMSG_BLKIF_VBD_DESTROY */
+/* CMSG_BLKIF_BE_VBD_DESTROY */
 typedef struct {
     /* IN */
     domid_t        domid;             /* Identify blkdev interface.          */
@@ -139,9 +234,9 @@ typedef struct {
     blkif_vdev_t   vdevice;           /* Interface-specific id of the VBD.   */
     /* OUT */
     unsigned int   status;
-} blkif_vbd_destroy_t; 
+} blkif_be_vbd_destroy_t; 
 
-/* CMSG_BLKIF_VBD_GROW */
+/* CMSG_BLKIF_BE_VBD_GROW */
 typedef struct { 
     /* IN */
     domid_t        domid;             /* Identify blkdev interface.          */
@@ -150,9 +245,9 @@ typedef struct {
     blkif_extent_t extent;            /* Physical extent to append to VBD.   */
     /* OUT */
     unsigned int   status;
-} blkif_vbd_grow_t; 
+} blkif_be_vbd_grow_t; 
 
-/* CMSG_BLKIF_VBD_SHRINK */
+/* CMSG_BLKIF_BE_VBD_SHRINK */
 typedef struct { 
     /* IN */
     domid_t        domid;             /* Identify blkdev interface.          */
@@ -160,6 +255,16 @@ typedef struct {
     blkif_vdev_t   vdevice;           /* Interface-specific id of the VBD.   */
     /* OUT */
     unsigned int   status;
-} blkif_vbd_shrink_t; 
+} blkif_be_vbd_shrink_t; 
+
+/*
+ * CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED:
+ *  Notify the domain controller that the back-end driver is DOWN or UP.
+ *  If the driver goes DOWN while interfaces are still UP, the domain
+ *  controller will automatically send DOWN notifications.
+ */
+typedef struct {
+    unsigned int status; /* BLKIF_DRIVER_STATUS_??? */
+} blkif_be_driver_status_changed_t;
 
 #endif /* __DOMAIN_CONTROLLER_H__ */
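
The BLKIF_BE_STATUS_* codes above are numbered consecutively so that the BLKIF_BE_STATUS_ERRORS macro can initialise a lookup table of descriptive strings, as its comment suggests. A stand-alone sketch of that intended usage, with the definitions copied from the header and a made-up main() as the harness:

#include <stdio.h>

#define BLKIF_BE_STATUS_OKAY                0
#define BLKIF_BE_STATUS_ERROR               1
#define BLKIF_BE_STATUS_INTERFACE_EXISTS    2
#define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3
#define BLKIF_BE_STATUS_VBD_EXISTS          4
#define BLKIF_BE_STATUS_VBD_NOT_FOUND       5
#define BLKIF_BE_STATUS_OUT_OF_MEMORY       6
#define BLKIF_BE_STATUS_EXTENT_NOT_FOUND    7
#define BLKIF_BE_STATUS_MAPPING_ERROR       8

/* Expands to a brace-enclosed initialiser, one string per status code. */
#define BLKIF_BE_STATUS_ERRORS {   \
    "Okay",                        \
    "Non-specific error",          \
    "Interface already exists",    \
    "Interface not found",         \
    "VBD already exists",          \
    "VBD not found",               \
    "Out of memory",               \
    "Extent not found for VBD",    \
    "Could not map domain memory" }

static const char *blkif_be_status_errstr[] = BLKIF_BE_STATUS_ERRORS;

int main(void)
{
    unsigned int status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
    printf("status %u: %s\n", status, blkif_be_status_errstr[status]);
    return 0;
}

Indexing is only safe for codes 0..8, so a real consumer should bounds-check the status value first.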
index 4b243b330726000de3009786b8934481756daba5..4dd26ca8c578e28ad712486beb9ad3ad3da62b5f 100755
--- a/tools/xend/lib/main.py
+++ b/tools/xend/lib/main.py
@@ -44,6 +44,14 @@ def daemon_loop():
     # notifications.
     notifier = xend.utils.notifier()
 
+    # The DOM0 control interface is not set up via the management interface.
+    # Note that console messages don't come our way (actually, only driver
+    # back-ends should use the DOM0 control interface). We therefore don't
+    # need to set up console structures.
+    port = xend.utils.port(0)
+    notifier.bind(port.local_port)
+    xend.main.control_list[port.local_port] = (port, 0, 0, 0)
+
     ##
     ## MAIN LOOP
     ## 
index 4883ec1a462d7685c7cee943c9e2704e4d41fa16..c28d682ec9a9a39f4fee4768a491f5f6c704310c 100644
--- a/tools/xend/lib/utils.c
+++ b/tools/xend/lib/utils.c
@@ -22,6 +22,8 @@
 #include <signal.h>
 #include <xc.h>
 
+#include <asm-xen/proc_cmd.h>
+
 #include <hypervisor-if.h>
 #include "domain_controller.h"
 
@@ -684,8 +686,23 @@ static PyObject *xu_port_new(PyObject *self, PyObject *args)
         goto fail2;
     }
 
-    if ( xc_evtchn_bind_interdomain(xup->xc_handle, 
-                                    DOMID_SELF, dom, &port1, &port2) != 0 )
+    if ( dom == 0ULL )
+    {
+        /*
+         * The control-interface event channel for DOM0 is already set up.
+         * We use an ioctl to discover the port at our end of the channel.
+         */
+        port1 = ioctl(xup->xc_handle, IOCTL_PRIVCMD_INITDOMAIN_EVTCHN, NULL);
+        port2 = -1; /* We don't need the remote end of the DOM0 link. */
+        if ( port1 < 0 )
+        {
+            PyErr_SetString(port_error, "Could not open channel to DOM0");
+            goto fail3;
+        }
+    }
+    else if ( xc_evtchn_bind_interdomain(xup->xc_handle, 
+                                         DOMID_SELF, dom, 
+                                         &port1, &port2) != 0 )
     {
         PyErr_SetString(port_error, "Could not open channel to domain");
         goto fail3;
@@ -744,7 +761,8 @@ static void xu_port_dealloc(PyObject *self)
 {
     xu_port_object *xup = (xu_port_object *)self;
     unmap_control_interface(xup->mem_fd, xup->interface);
-    (void)xc_evtchn_close(xup->xc_handle, DOMID_SELF, xup->local_port);
+    if ( xup->remote_dom != 0ULL )
+        (void)xc_evtchn_close(xup->xc_handle, DOMID_SELF, xup->local_port);
     (void)xc_interface_close(xup->xc_handle);
     (void)close(xup->mem_fd);
     PyObject_Del(self);
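
For DOM0, xu_port_new() above does not create a fresh event channel; it discovers the port of the pre-built control-interface channel via IOCTL_PRIVCMD_INITDOMAIN_EVTCHN. A minimal user-space sketch of that query; the /proc/xen/privcmd path and the plain open() are assumptions here (in utils.c the ioctl is issued on the descriptor behind xc_handle):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm-xen/proc_cmd.h>   /* IOCTL_PRIVCMD_INITDOMAIN_EVTCHN */

int main(void)
{
    /* Device path is an assumption about how privcmd is exposed. */
    int fd = open("/proc/xen/privcmd", O_RDWR);
    if (fd < 0) {
        perror("open privcmd");
        return 1;
    }

    /* Returns the local port of the pre-built DOM0 control channel. */
    int port = ioctl(fd, IOCTL_PRIVCMD_INITDOMAIN_EVTCHN, NULL);
    if (port < 0)
        fprintf(stderr, "could not query DOM0 control-interface port\n");
    else
        printf("DOM0 control interface: local port %d\n", port);

    close(fd);
    return (port < 0) ? 1 : 0;
}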
index 1f39cb457257d3d7bd23480b9836ea6882a7a092..5567d7093c48952615d953d72f7a9297e06d26f6 100644
--- a/tools/xend/setup.py
+++ b/tools/xend/setup.py
@@ -4,7 +4,8 @@ from distutils.core import setup, Extension
 utils = Extension("utils",
                   extra_compile_args   = ["-fno-strict-aliasing"],
                   include_dirs         = ["../xc/lib",
-                                          "../../xen/include/hypervisor-ifs"],
+                                          "../../xen/include/hypervisor-ifs",
+                                          "../../xenolinux-sparse/include"],
                   library_dirs         = ["../xc/lib"],
                   libraries            = ["xc"],
                   sources              = ["lib/utils.c"])
index 489517293711e7e237d771ca8ac29c55164540b5..646f4855f35e18fc70a6c36df4b4e477074c16f8 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
@@ -34,7 +34,7 @@ typedef struct blkif_st {
     unsigned int     evtchn;
     int              irq;
     /* Comms information. */
-    blk_ring_t      *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+    blkif_ring_t    *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
     BLK_RING_IDX     blk_req_cons;  /* Request consumer. */
     BLK_RING_IDX     blk_resp_prod; /* Private version of response producer. */
     /* VBDs attached to this interface. */
@@ -44,13 +44,19 @@ typedef struct blkif_st {
     struct blkif_st *hash_next;
     struct list_head blkdev_list;
     spinlock_t       blk_ring_lock;
+    atomic_t         refcnt;
 } blkif_t;
 
-void blkif_create(blkif_create_t *create);
-void blkif_destroy(blkif_destroy_t *destroy);
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void __blkif_destroy(blkif_t *blkif);
 blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
-void blkif_get(blkif_t *blkif);
-void blkif_put(blkif_t *blkif);
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            __blkif_destroy(_b);                  \
+    } while (0)
 
 /* An entry in a list of xen_extents. */
 typedef struct _blkif_extent_le { 
@@ -60,25 +66,25 @@ typedef struct _blkif_extent_le {
 
 typedef struct _vbd { 
     blkif_vdev_t       vdevice;   /* what the domain refers to this vbd as */
-    unsigned char      mode;      /* VBD_MODE_{R,W} */
+    unsigned char      readonly;  /* Non-zero -> read-only */
     unsigned char      type;      /* XD_TYPE_xxx */
     blkif_extent_le_t *extents;   /* list of xen_extents making up this vbd */
     rb_node_t          rb;        /* for linking into R-B tree lookup struct */
 } vbd_t; 
 
-long vbd_create(blkif_vbd_create_t *create_params); 
-long vbd_grow(blkif_vbd_grow_t *grow_params); 
-long vbd_shrink(blkif_vbd_shrink_t *shrink_params);
-long vbd_destroy(blkif_vbd_destroy_t *delete_params); 
-
-void destroy_all_vbds(struct task_struct *p);
+void vbd_create(blkif_be_vbd_create_t *create); 
+void vbd_grow(blkif_be_vbd_grow_t *grow); 
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink);
+void vbd_destroy(blkif_be_vbd_destroy_t *delete); 
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds);
+void destroy_all_vbds(blkif_t *blkif);
 
 typedef struct {
     blkif_t       *blkif;
     unsigned long  id;
     atomic_t       pendcnt;
     unsigned short operation;
-    unsigned short status;
+    int            status;
 } pending_req_t;
 
 /* Describes a [partial] disk extent (part of a block io request) */
@@ -91,7 +97,10 @@ typedef struct {
 
 int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); 
 
-int blkif_be_controller_init(void);
+void blkif_interface_init(void);
+void blkif_ctrlif_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
 
 void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
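
The blkif_get()/blkif_put() macros above replace the old out-of-line functions: the creator holds the initial reference, each user takes another, and whichever blkif_put() drops the count to zero calls __blkif_destroy(). A compilable sketch of the same discipline, recast with C11 atomics since the kernel's atomic_t is unavailable in user space:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct blkif_st {
    atomic_int refcnt;
} blkif_t;

static void __blkif_destroy(blkif_t *blkif)
{
    printf("last reference dropped; tearing the interface down\n");
    free(blkif);
}

#define blkif_get(_b) (atomic_fetch_add(&(_b)->refcnt, 1))
#define blkif_put(_b)                                   \
    do {                                                \
        if ( atomic_fetch_sub(&(_b)->refcnt, 1) == 1 )  \
            __blkif_destroy(_b);                        \
    } while (0)

int main(void)
{
    blkif_t *blkif = malloc(sizeof(*blkif));
    atomic_init(&blkif->refcnt, 1);  /* creator holds the first reference */
    blkif_get(blkif);                /* e.g. an in-flight request         */
    blkif_put(blkif);                /* the request completes             */
    blkif_put(blkif);                /* creator's put reaches zero        */
    return 0;
}

atomic_fetch_sub() returns the previous value, so comparing it with 1 plays the role of atomic_dec_and_test() in the kernel version.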
 
index c7ef10c3ba59c62ef069bce2a57af7603175ab86..e1ed295ed39ff55806817f81e27d032a170c9e22 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
@@ -13,34 +13,34 @@ static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
     switch ( msg->subtype )
     {
     case CMSG_BLKIF_BE_CREATE:
-        if ( msg->length != sizeof(blkif_create_t) )
+        if ( msg->length != sizeof(blkif_be_create_t) )
             goto parse_error;
-        blkif_create((blkif_create_t *)&msg->msg[0]);
+        blkif_create((blkif_be_create_t *)&msg->msg[0]);
         break;        
     case CMSG_BLKIF_BE_DESTROY:
-        if ( msg->length != sizeof(blkif_destroy_t) )
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
             goto parse_error;
-        blkif_destroy((blkif_destroy_t *)&msg->msg[0]);
+        blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]);
         break;        
     case CMSG_BLKIF_BE_VBD_CREATE:
-        if ( msg->length != sizeof(blkif_vbd_create_t) )
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
             goto parse_error;
-        vbd_create((blkif_vbd_create_t *)&msg->msg[0]);
+        vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]);
         break;
     case CMSG_BLKIF_BE_VBD_DESTROY:
-        if ( msg->length != sizeof(blkif_vbd_destroy_t) )
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
             goto parse_error;
-        vbd_destroy((blkif_vbd_destroy_t *)&msg->msg[0]);
+        vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]);
         break;
     case CMSG_BLKIF_BE_VBD_GROW:
-        if ( msg->length != sizeof(blkif_vbd_grow_t) )
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
             goto parse_error;
-        vbd_grow((blkif_vbd_grow_t *)&msg->msg[0]);
+        vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]);
         break;
     case CMSG_BLKIF_BE_VBD_SHRINK:
-        if ( msg->length != sizeof(blkif_vbd_shrink_t) )
+        if ( msg->length != sizeof(blkif_be_vbd_shrink_t) )
             goto parse_error;
-        vbd_shrink((blkif_vbd_shrink_t *)&msg->msg[0]);
+        vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]);
         break;
     default:
         goto parse_error;
@@ -54,8 +54,7 @@ static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
     ctrl_if_send_response(msg);
 }
 
-int blkif_ctrlif_init(void)
+void blkif_ctrlif_init(void)
 {
     (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
-    return 0;
 }
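
Every arm of the switch in blkif_ctrlif_rx() follows one idiom: check msg->length against the size of the expected request type, then cast &msg->msg[0] and dispatch. A minimal user-space sketch of that pattern; the struct layouts below are simplified stand-ins, not the real wire format:

#include <stdio.h>

typedef struct { unsigned int subtype, length; char msg[60]; } ctrl_msg_t;
typedef struct { unsigned int handle, status; } blkif_be_destroy_t;

#define CMSG_BLKIF_BE_DESTROY 1

static void ctrlif_rx(ctrl_msg_t *msg)
{
    switch ( msg->subtype )
    {
    case CMSG_BLKIF_BE_DESTROY:
        /* Reject payloads whose size doesn't match the request type. */
        if ( msg->length != sizeof(blkif_be_destroy_t) )
            goto parse_error;
        printf("destroy handle %u\n",
               ((blkif_be_destroy_t *)&msg->msg[0])->handle);
        return;
    default:
        goto parse_error;
    }

 parse_error:
    printf("parse error: subtype %u, length %u\n",
           msg->subtype, msg->length);
}

int main(void)
{
    ctrl_msg_t msg = { CMSG_BLKIF_BE_DESTROY,
                       sizeof(blkif_be_destroy_t), { 0 } };
    ctrlif_rx(&msg);
    return 0;
}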
index 579795deb92d924349a5229246bbb2f65c0c8541..87925681da33e745768d02d6c11e94075559fcd4 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
 #define BLKIF_HASH(_d,_h) \
     (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1))
 
-static blkif_t *blkif_hash[BLKIF_HASHSZ];
+static kmem_cache_t *blkif_cachep;
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+static spinlock_t    blkif_hash_lock;
 
 blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
 {
-    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif != NULL) && 
-            (blkif->domid != domid) && 
-            (blkif->handle != handle) )
+    blkif_t      *blkif;
+    unsigned long flags;
+    
+    spin_lock_irqsave(&blkif_hash_lock, flags);
+    blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( blkif != NULL )
+    {
+        if ( (blkif->domid == domid) && (blkif->handle == handle) )
+        {
+            blkif_get(blkif);
+            break;
+        }
         blkif = blkif->hash_next;
+    }
+    spin_unlock_irqrestore(&blkif_hash_lock, flags);
+
     return blkif;
 }
 
-void blkif_create(blkif_create_t *create)
+void __blkif_destroy(blkif_t *blkif)
+{
+    free_irq(blkif->irq, NULL);
+    unbind_evtchn_from_irq(blkif->evtchn);
+    vfree(blkif->blk_ring_base);
+    destroy_all_vbds(blkif);
+    kmem_cache_free(blkif_cachep, blkif);    
+}
+
+void blkif_create(blkif_be_create_t *create)
 {
     domid_t       domid  = create->domid;
     unsigned int  handle = create->blkif_handle;
     unsigned int  evtchn = create->evtchn;
     unsigned long shmem_frame = create->shmem_frame;
+    unsigned long flags;
     blkif_t     **pblkif, *blkif;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
 
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( *pblkif == NULL )
+    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
     {
-        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
-            goto found_match;
-        pblkif = &(*pblkif)->hash_next;
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
+    {
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        goto fail1;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
+                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid);
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            create->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+        else
+            create->status = BLKIF_BE_STATUS_ERROR;
+        goto fail2;
     }
 
-    blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
     memset(blkif, 0, sizeof(*blkif));
-    blkif->domid       = domid;
-    blkif->handle      = handle;
-    blkif->evtchn      = evtchn;
-    blkif->irq         = bind_evtchn_to_irq(evtchn);
-    blkif->shmem_frame = shmem_frame;
-    blkif->shmem_vbase = ioremap(shmem_frame<<PAGE_SHIFT, PAGE_SIZE);
+    blkif->domid         = domid;
+    blkif->handle        = handle;
+    blkif->evtchn        = evtchn;
+    blkif->irq           = bind_evtchn_to_irq(evtchn);
+    blkif->shmem_frame   = shmem_frame;
+    blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
     spin_lock_init(&blkif->vbd_lock);
     spin_lock_init(&blkif->blk_ring_lock);
 
-    request_irq(irq, blkif_be_int, 0, "blkif-backend", blkif);
+    spin_lock_irqsave(&blkif_hash_lock, flags);
 
+    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+    while ( *pblkif != NULL )
+    {
+        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+        {
+            spin_unlock_irqrestore(&blkif_hash_lock, flags);
+            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+            goto fail3;
+        }
+        pblkif = &(*pblkif)->hash_next;
+    }
+
+    atomic_set(&blkif->refcnt, 1);
     blkif->hash_next = *pblkif;
     *pblkif = blkif;
 
-    create->status = BLKIF_STATUS_OKAY;
-    return;
+    spin_unlock_irqrestore(&blkif_hash_lock, flags);
 
- found_match:
-    create->status = BLKIF_STATUS_INTERFACE_EXISTS;
-    return;
+    request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif);
 
- evtchn_in_use:
-    unbind_evtchn_from_irq(evtchn); /* drop refcnt */
-    create->status = BLKIF_STATUS_ERROR;
+    create->status = BLKIF_BE_STATUS_OKAY;
     return;
+
+ fail3: unbind_evtchn_from_irq(evtchn);
+ fail2: kmem_cache_free(blkif_cachep, blkif);
+ fail1: vfree(vma->addr);
 }
 
-void blkif_destroy(blkif_destroy_t *destroy)
+void blkif_destroy(blkif_be_destroy_t *destroy)
 {
     domid_t       domid  = destroy->domid;
     unsigned int  handle = destroy->blkif_handle;
+    unsigned long flags;
     blkif_t     **pblkif, *blkif;
 
+    spin_lock_irqsave(&blkif_hash_lock, flags);
+
     pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
     while ( (blkif = *pblkif) != NULL )
     {
         if ( (blkif->domid == domid) && (blkif->handle == handle) )
-            goto found_match;
+        {
+            *pblkif = blkif->hash_next;
+            spin_unlock_irqrestore(&blkif_hash_lock, flags);
+            blkif_deschedule(blkif);
+            blkif_put(blkif);
+            destroy->status = BLKIF_BE_STATUS_OKAY;
+            return;
+        }
         pblkif = &blkif->hash_next;
     }
 
-    destroy->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
-    return;
+    spin_unlock_irqrestore(&blkif_hash_lock, flags);
 
- found_match:
-    free_irq(blkif->irq, NULL);
-    unbind_evtchn_from_irq(blkif->evtchn);
-    *pblkif = blkif->hash_next;
-    kmem_cache_free(blkif_cachep, blkif);
-    destroy->status = BLKIF_STATUS_OKAY;
+    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
 }
 
+void __init blkif_interface_init(void)
+{
+    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
+                                     0, 0, NULL, NULL);
+    memset(blkif_hash, 0, sizeof(blkif_hash));
+    spin_lock_init(&blkif_hash_lock);
+}
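
BLKIF_HASH() at the top of this file folds the 64-bit domid and the interface handle into a bucket index by XORing the low and high halves of the domid with the handle, then masking. A stand-alone sketch; BLKIF_HASHSZ is not visible in this hunk, so 256 is an assumed value (it must be a power of two for the mask to work):

#include <stdint.h>
#include <stdio.h>

#define BLKIF_HASHSZ 256        /* assumed; must be a power of two */
#define BLKIF_HASH(_d,_h) \
    (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1))

int main(void)
{
    uint64_t domid = 0x100000007ULL;  /* both 32-bit halves contribute */
    unsigned int handle = 3;
    printf("bucket = %d\n", BLKIF_HASH(domid, handle)); /* 7 ^ 1 ^ 3 = 5 */
    return 0;
}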
index 1e6190c3e64f72c53964eb9e691b87778e9192b7..886279825087e66728eb52623cf4a7b6b90344bf 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
 #define MAX_PENDING_REQS 64
 #define BATCH_PER_DOMAIN 16
 
+static struct vm_struct *mmap_vma;
+#define MMAP_PAGES_PER_SEGMENT \
+    ((BLKIF_MAX_SECTORS_PER_SEGMENT >> (PAGE_SHIFT-9)) + 1)
+#define MMAP_PAGES_PER_REQUEST \
+    (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
+#define MMAP_PAGES             \
+    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                        \
+    ((unsigned long)mmap_vma->addr +                 \
+     (((_req) * MMAP_PAGES_PER_REQUEST) +            \
+      ((_seg) * MMAP_PAGES_PER_SEGMENT)) * PAGE_SIZE)
+
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
  * 'pending_req' allocated to it. Each buffer_head that completes decrements 
@@ -46,22 +58,11 @@ static PEND_RING_IDX pending_prod, pending_cons;
 
 static kmem_cache_t *buffer_head_cachep;
 
-static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
-
-static int lock_buffer(blkif_t *blkif,
-                       unsigned long buffer,
-                       unsigned short size,
-                       int writeable_buffer);
-static void unlock_buffer(unsigned long buffer,
-                          unsigned short size,
-                          int writeable_buffer);
-
-static void io_schedule(unsigned long unused);
 static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif,
-                                 blk_ring_req_entry_t *req);
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
 static void make_response(blkif_t *blkif, unsigned long id, 
-                          unsigned short op, unsigned long st);
+                          unsigned short op, int st);
 
 
 /******************************************************************
@@ -108,8 +109,6 @@ static void add_to_blkdev_list_tail(blkif_t *blkif)
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
-
 static void io_schedule(unsigned long unused)
 {
     blkif_t          *blkif;
@@ -132,6 +131,8 @@ static void io_schedule(unsigned long unused)
     run_task_queue(&tq_disk);
 }
 
+static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
+
 static void maybe_trigger_io_schedule(void)
 {
     /*
@@ -155,28 +156,25 @@ static void maybe_trigger_io_schedule(void)
 static void end_block_io_op(struct buffer_head *bh, int uptodate)
 {
     pending_req_t *pending_req = bh->b_private;
+    unsigned long  flags;
 
     /* An error fails the entire request. */
     if ( !uptodate )
     {
         DPRINTK("Buffer not up-to-date at end of operation\n");
-        pending_req->status = 2;
+        pending_req->status = BLKIF_RSP_ERROR;
     }
 
-    unlock_buffer(virt_to_phys(bh->b_data), 
-                  bh->b_size, 
-                  (pending_req->operation==READ));
-    
     if ( atomic_dec_and_test(&pending_req->pendcnt) )
     {
+        int pending_idx = pending_req - pending_reqs;
+        vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
+                          MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
         make_response(pending_req->blkif, pending_req->id,
                       pending_req->operation, pending_req->status);
         blkif_put(pending_req->blkif);
-        spin_lock(&pend_prod_lock);
-        pending_ring[MASK_PEND_IDX(pending_prod)] = 
-            pending_req - pending_reqs;
-        pending_prod++;
-        spin_unlock(&pend_prod_lock);
+        spin_lock_irqsave(&pend_prod_lock, flags);
+        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+        spin_unlock_irqrestore(&pend_prod_lock, flags);
         maybe_trigger_io_schedule();
     }
 }
@@ -200,45 +198,10 @@ void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int lock_buffer(blkif_t *blkif,
-                       unsigned long buffer,
-                       unsigned short size,
-                       int writeable_buffer)
-{
-    unsigned long    pfn;
-
-    for ( pfn = buffer >> PAGE_SHIFT; 
-          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
-          pfn++ )
-    {
-    }
-
-    return 1;
-
- fail:
-    while ( pfn-- > (buffer >> PAGE_SHIFT) )
-    {        
-    }
-    return 0;
-}
-
-static void unlock_buffer(unsigned long buffer,
-                          unsigned short size,
-                          int writeable_buffer)
-{
-    unsigned long pfn;
-
-    for ( pfn = buffer >> PAGE_SHIFT; 
-          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
-          pfn++ )
-    {
-    }
-}
-
 static int do_block_io_op(blkif_t *blkif, int max_to_do)
 {
-    blk_ring_t *blk_ring = blkif->blk_ring_base;
-    blk_ring_req_entry_t *req;
+    blkif_ring_t *blk_ring = blkif->blk_ring_base;
+    blkif_request_t *req;
     BLK_RING_IDX i;
     int more_to_do = 0;
 
@@ -262,11 +225,15 @@ static int do_block_io_op(blkif_t *blkif, int max_to_do)
             dispatch_rw_block_io(blkif, req);
             break;
 
+        case BLKIF_OP_PROBE:
+            dispatch_probe(blkif, req);
+            break;
+
         default:
             DPRINTK("error: unknown block io operation [%d]\n",
                     blk_ring->ring[i].req.operation);
             make_response(blkif, blk_ring->ring[i].req.id, 
-                          blk_ring->ring[i].req.operation, 1);
+                          blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR);
             break;
         }
     }
@@ -275,24 +242,62 @@ static int do_block_io_op(blkif_t *blkif, int max_to_do)
     return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif,
-                                 blk_ring_req_entry_t *req)
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
+{
+    int      i, rc, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    pgprot_t prot;
+
+    /* Check that number of segments is sane. */
+    if ( unlikely(req->nr_segments == 0) || 
+         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+    {
+        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+        goto bad_descriptor;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
+    for ( i = 0; i < req->nr_segments; i++ )
+    {
+        if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) )
+            goto bad_descriptor;
+        if ( direct_remap_area_pages(&init_mm, 
+                                     MMAP_VADDR(pending_idx, i),
+                                     req->buffer_and_sects[i] & PAGE_MASK, 
+                                     PAGE_SIZE, prot, blkif->domid) != 0 )
+            goto bad_descriptor;
+    }
+
+    rc = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
+                   (req->nr_segments * PAGE_SIZE) / sizeof(vdisk_t));
+
+    vmfree_area_pages(MMAP_VADDR(pending_idx, 0), 
+                      MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
+    make_response(blkif, req->id, req->operation, rc);
+    return;
+
+ bad_descriptor:
+    vmfree_area_pages(MMAP_VADDR(pending_idx, 0),
+                      MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
+    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
 {
     extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
     struct buffer_head *bh;
-    int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
+    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
     unsigned short nr_sects;
     unsigned long buffer;
-    int i, tot_sects;
+    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
     pending_req_t *pending_req;
+    pgprot_t       prot;
 
     /* We map virtual scatter/gather segments to physical segments. */
     int new_segs, nr_psegs = 0;
-    phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
+    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
 
     /* Check that number of segments is sane. */
     if ( unlikely(req->nr_segments == 0) || 
-         unlikely(req->nr_segments > MAX_BLK_SEGS) )
+         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
     {
         DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
         goto bad_descriptor;
@@ -310,8 +315,11 @@ static void dispatch_rw_block_io(blkif_t *blkif,
         nr_sects = req->buffer_and_sects[i] &  0x1FF;
 
         if ( unlikely(nr_sects == 0) )
+            continue;
+
+        if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) )
         {
-            DPRINTK("zero-sized data request\n");
+            DPRINTK("Too many sectors in segment\n");
             goto bad_descriptor;
         }
 
@@ -333,29 +341,40 @@ static void dispatch_rw_block_io(blkif_t *blkif,
         }
   
         nr_psegs += new_segs;
-        ASSERT(nr_psegs <= MAX_BLK_SEGS*2);
+        ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2);
     }
 
+    /* Nonsensical zero-sized request? */
+    if ( unlikely(nr_psegs == 0) )
+        goto bad_descriptor;
+
+    if ( operation == READ )
+        prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
+    else
+        prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);
+
     for ( i = 0; i < nr_psegs; i++ )
     {
-        if ( unlikely(!lock_buffer(blkif, phys_seg[i].buffer, 
-                                   phys_seg[i].nr_sects << 9,
-                                   operation==READ)) )
+        unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 
+                            (phys_seg[i].nr_sects << 9) + 
+                            (PAGE_SIZE - 1)) & PAGE_MASK;
+        if ( direct_remap_area_pages(&init_mm, 
+                                     MMAP_VADDR(pending_idx, i),
+                                     phys_seg[i].buffer & PAGE_MASK, 
+                                     sz, prot, blkif->domid) != 0 )
         {
             DPRINTK("invalid buffer\n");
-            while ( i-- > 0 )
-                unlock_buffer(phys_seg[i].buffer, 
-                              phys_seg[i].nr_sects << 9,
-                              operation==READ);
+            vmfree_area_pages(MMAP_VADDR(pending_idx, 0), 
+                              MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
             goto bad_descriptor;
         }
     }
 
-    pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]];
+    pending_req = &pending_reqs[pending_idx];
     pending_req->blkif     = blkif;
     pending_req->id        = req->id;
     pending_req->operation = operation;
-    pending_req->status    = 0;
+    pending_req->status    = BLKIF_RSP_OKAY;
     atomic_set(&pending_req->pendcnt, nr_psegs);
 
     blkif_get(blkif);
@@ -371,11 +390,8 @@ static void dispatch_rw_block_io(blkif_t *blkif,
         bh->b_size          = phys_seg[i].nr_sects << 9;
         bh->b_dev           = phys_seg[i].dev;
         bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
-
-        /* SMH: we store a 'pseudo-virtual' bogus address in b_data since
-           later code will undo this transformation (i.e. +-PAGE_OFFSET). */
-        bh->b_data          = phys_to_virt(phys_seg[i].buffer);
+        bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) + 
+            (phys_seg[i].buffer & ~PAGE_MASK);
         /* SMH: bh_phys() uses the below field as a 'cheap' virt_to_phys */
         bh->b_page          = &mem_map[phys_seg[i].buffer>>PAGE_SHIFT]; 
         bh->b_end_io        = end_block_io_op;
@@ -391,10 +407,11 @@ static void dispatch_rw_block_io(blkif_t *blkif,
         submit_bh(operation, bh);
     }
 
+    pending_cons++;
     return;
 
  bad_descriptor:
-    make_response(blkif, req->id, req->operation, 1);
+    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
 } 
 
 
@@ -405,12 +422,13 @@ static void dispatch_rw_block_io(blkif_t *blkif,
 
 
 static void make_response(blkif_t *blkif, unsigned long id, 
-                          unsigned short op, unsigned long st)
+                          unsigned short op, int st)
 {
-    blk_ring_resp_entry_t *resp;
+    blkif_response_t *resp;
+    unsigned long     flags;
 
     /* Place on the response ring for the relevant domain. */ 
-    spin_lock(&blkif->blk_ring_lock);
+    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
     resp = &blkif->blk_ring_base->
         ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
     resp->id        = id;
@@ -418,48 +436,13 @@ static void make_response(blkif_t *blkif, unsigned long id,
     resp->status    = st;
     wmb();
     blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
-    spin_unlock(&blkif->blk_ring_lock);
+    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
 
     /* Kick the relevant domain. */
     notify_via_evtchn(blkif->evtchn);
 }
 
-static void blkif_debug_int(int irq, void *unused, struct pt_regs *regs)
-{
-#if 0
-    unsigned long flags;
-    struct task_struct *p;
-    blk_ring_t *blk_ring;
-    int i;
-
-    printk("Dumping block queue stats: nr_pending = %d"
-           " (prod=0x%08x,cons=0x%08x)\n",
-           NR_PENDING_REQS, pending_prod, pending_cons);
-
-    read_lock_irqsave(&tasklist_lock, flags);
-    for_each_domain ( p )
-    {
-        printk("Domain: %llu\n", blkif->domain);
-        blk_ring = blkif->blk_ring_base;
-        printk("  req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/"
-               "0x%08x on_list=%d\n",
-               blk_ring->req_prod, blkif->blk_req_cons,
-               blk_ring->resp_prod, blkif->blk_resp_prod,
-               __on_blkdev_list(p));
-    }
-    read_unlock_irqrestore(&tasklist_lock, flags);
-
-    for ( i = 0; i < MAX_PENDING_REQS; i++ )
-    {
-        printk("Pend%d: dom=%p, id=%08lx, cnt=%d, op=%d, status=%d\n",
-               i, pending_reqs[i].domain, pending_reqs[i].id,
-               atomic_read(&pending_reqs[i].pendcnt), 
-               pending_reqs[i].operation, pending_reqs[i].status);
-    }
-#endif
-}
-
-void unlink_blkdev_info(blkif_t *blkif)
+void blkif_deschedule(blkif_t *blkif)
 {
     unsigned long flags;
 
@@ -477,26 +460,29 @@ static int __init init_module(void)
 {
     int i;
 
+    blkif_interface_init();
+
+    if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL )
+    {
+        printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n");
+        return -ENOMEM;
+    }
+
     pending_cons = 0;
     pending_prod = MAX_PENDING_REQS;
     memset(pending_reqs, 0, sizeof(pending_reqs));
     for ( i = 0; i < MAX_PENDING_REQS; i++ )
         pending_ring[i] = i;
     
-    for ( i = 0; i < NR_CPUS; i++ )
-        completed_bhs[i] = NULL;
-        
     spin_lock_init(&io_schedule_list_lock);
     INIT_LIST_HEAD(&io_schedule_list);
 
-    if ( request_irq(bind_virq_to_irq(VIRQ_DEBUG), blkif_debug_int, 
-                     SA_SHIRQ, "blkif-backend-dbg", &blkif_debug_int) != 0 )
-        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
-
     buffer_head_cachep = kmem_cache_create(
         "buffer_head_cache", sizeof(struct buffer_head),
         0, SLAB_HWCACHE_ALIGN, NULL, NULL);
 
+    blkif_ctrlif_init();
+
     return 0;
 }
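
The mapping window reserved in init_module() is carved up statically: each pending request owns MMAP_PAGES_PER_REQUEST pages of the VM area, and each (possibly split) physical segment owns a fixed MMAP_PAGES_PER_SEGMENT-page sub-slot, so no allocation happens on the I/O path. A stand-alone sketch of the address arithmetic, mirroring the macros above with an illustrative base in place of mmap_vma->addr:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
#define BLKIF_MAX_SECTORS_PER_SEGMENT  16

#define MMAP_PAGES_PER_SEGMENT \
    ((BLKIF_MAX_SECTORS_PER_SEGMENT >> (PAGE_SHIFT-9)) + 1)
#define MMAP_PAGES_PER_REQUEST \
    (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
#define MMAP_VADDR(_base,_req,_seg)                 \
    ((_base) + (((_req) * MMAP_PAGES_PER_REQUEST) + \
                ((_seg) * MMAP_PAGES_PER_SEGMENT)) * PAGE_SIZE)

int main(void)
{
    unsigned long base = 0xf0000000UL;  /* stand-in for mmap_vma->addr */
    printf("pages per segment: %d\n", MMAP_PAGES_PER_SEGMENT);
    printf("pages per request: %d\n", MMAP_PAGES_PER_REQUEST);
    printf("req 0, seg 1 -> %#lx\n", MMAP_VADDR(base, 0, 1));
    printf("req 1, seg 0 -> %#lx\n", MMAP_VADDR(base, 1, 0));
    return 0;
}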
 
index bd6c40125cde83720ca36cac23fa886b9cb4a7ec..bc5390eeb9959bc0f42348d0b63b4b5d193895a2 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
@@ -8,7 +8,7 @@
 
 #include "common.h"
 
-void vbd_create(blkif_vbd_create_t *create) 
+void vbd_create(blkif_be_vbd_create_t *create) 
 {
     vbd_t       *vbd; 
     rb_node_t  **rb_p, *rb_parent = NULL;
@@ -18,9 +18,9 @@ void vbd_create(blkif_vbd_create_t *create)
     blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
     if ( unlikely(blkif == NULL) )
     {
-        DPRINTK("vbd_create attempted for non-existent blkif (%llu,&u)\n", 
+        DPRINTK("vbd_create attempted for non-existent blkif (%llu,%u)\n", 
                 create->domid, create->blkif_handle); 
-        create->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
         return;
     }
 
@@ -42,7 +42,7 @@ void vbd_create(blkif_vbd_create_t *create)
         else
         {
             DPRINTK("vbd_create attempted for already existing vbd\n");
-            create->status = BLKIF_STATUS_VBD_EXISTS;
+            create->status = BLKIF_BE_STATUS_VBD_EXISTS;
             goto out;
         }
     }
@@ -50,19 +50,19 @@ void vbd_create(blkif_vbd_create_t *create)
     if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_create: out of memory\n");
-        create->status = BLKIF_STATUS_OUT_OF_MEMORY;
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
         goto out;
     }
 
-    vbd->vdevice = vdevice; 
-    vbd->mode    = create->mode; 
-    vbd->type    = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
-    vbd->extents = NULL; 
+    vbd->vdevice  = vdevice; 
+    vbd->readonly = create->readonly;
+    vbd->type     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+    vbd->extents  = NULL; 
 
     rb_link_node(&vbd->rb, rb_parent, rb_p);
     rb_insert_color(&vbd->rb, &blkif->vbd_rb);
 
-    create->status = BLKIF_STATUS_OKAY;
+    create->status = BLKIF_BE_STATUS_OKAY;
 
  out:
     spin_unlock(&blkif->vbd_lock);
@@ -71,20 +71,20 @@ void vbd_create(blkif_vbd_create_t *create)
 
 
 /* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */
-void vbd_grow(blkif_vbd_grow_t *grow) 
+void vbd_grow(blkif_be_vbd_grow_t *grow) 
 {
-    blkif_t          *blkif;
-    xen_extent_le_t **px, *x; 
-    vbd_t            *vbd = NULL;
-    rb_node_t        *rb;
-    blkif_vdev_t      vdevice = grow->vdevice;
+    blkif_t            *blkif;
+    blkif_extent_le_t **px, *x; 
+    vbd_t              *vbd = NULL;
+    rb_node_t          *rb;
+    blkif_vdev_t        vdevice = grow->vdevice;
 
     blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle);
     if ( unlikely(blkif == NULL) )
     {
-        DPRINTK("vbd_grow attempted for non-existent blkif (%llu,&u)\n", 
+        DPRINTK("vbd_grow attempted for non-existent blkif (%llu,%u)\n", 
                 grow->domid, grow->blkif_handle); 
-        grow->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
+        grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
         return;
     }
 
@@ -105,28 +105,29 @@ void vbd_grow(blkif_vbd_grow_t *grow)
     if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
     {
         DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
-        grow->status = BLKIF_STATUS_VBD_NOT_FOUND;
+        grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
         goto out;
     } 
 
-    if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) == NULL) )
+    if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 
+                               GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_grow: out of memory\n");
-        grow->status = BLKIF_STATUS_OUT_OF_MEMORY;
+        grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
         goto out;
     }
  
     x->extent.device        = grow->extent.device; 
     x->extent.sector_start  = grow->extent.sector_start; 
     x->extent.sector_length = grow->extent.sector_length; 
-    x->next                 = (xen_extent_le_t *)NULL; 
+    x->next                 = (blkif_extent_le_t *)NULL; 
 
     for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) 
         continue;
 
     *px = x;
 
-    grow->status = BLKIF_STATUS_OKAY;
+    grow->status = BLKIF_BE_STATUS_OKAY;
 
  out:
     spin_unlock(&blkif->vbd_lock);
@@ -134,20 +135,20 @@ void vbd_grow(blkif_vbd_grow_t *grow)
 }
 
 
-void vbd_shrink(blkif_vbd_shrink_t *shrink)
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink)
 {
-    blkif_t          *blkif;
-    xen_extent_le_t **px, *x; 
-    vbd_t            *vbd = NULL;
-    rb_node_t        *rb;
-    blkif_vdev_t      vdevice = shrink->vdevice;
+    blkif_t            *blkif;
+    blkif_extent_le_t **px, *x; 
+    vbd_t              *vbd = NULL;
+    rb_node_t          *rb;
+    blkif_vdev_t        vdevice = shrink->vdevice;
 
     blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle);
     if ( unlikely(blkif == NULL) )
     {
-        DPRINTK("vbd_shrink attempted for non-existent blkif (%llu,&u)\n", 
+        DPRINTK("vbd_shrink attempted for non-existent blkif (%llu,%u)\n", 
                 shrink->domid, shrink->blkif_handle); 
-        shrink->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
+        shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
         return;
     }
 
@@ -167,13 +168,13 @@ void vbd_shrink(blkif_vbd_shrink_t *shrink)
 
     if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
     {
-        shrink->status = BLKIF_STATUS_VBD_NOT_FOUND;
+        shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
         goto out;
     }
 
     if ( unlikely(vbd->extents == NULL) )
     {
-        shrink->status = BLKIF_STATUS_EXTENT_NOT_FOUND;
+        shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
         goto out;
     }
 
@@ -185,7 +186,7 @@ void vbd_shrink(blkif_vbd_shrink_t *shrink)
     *px = x->next;
     kfree(x);
 
-    shrink->status = BLKIF_STATUS_OKAY;
+    shrink->status = BLKIF_BE_STATUS_OKAY;
 
  out:
     spin_unlock(&blkif->vbd_lock);
@@ -193,20 +194,20 @@ void vbd_shrink(blkif_vbd_shrink_t *shrink)
 }
 
 
-void vbd_destroy(blkif_vbd_destroy_t *destroy) 
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy) 
 {
-    blkif_t         *blkif;
-    vbd_t           *vbd;
-    rb_node_t       *rb;
-    xen_extent_le_t *x, *t;
-    blkif_vdev_t     vdevice = destroy->vdevice;
+    blkif_t           *blkif;
+    vbd_t             *vbd;
+    rb_node_t         *rb;
+    blkif_extent_le_t *x, *t;
+    blkif_vdev_t       vdevice = destroy->vdevice;
 
     blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
     if ( unlikely(blkif == NULL) )
     {
-        DPRINTK("vbd_destroy attempted for non-existent blkif (%llu,&u)\n", 
+        DPRINTK("vbd_destroy attempted for non-existent blkif (%llu,%u)\n", 
                 destroy->domid, destroy->blkif_handle); 
-        destroy->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
         return;
     }
 
@@ -224,7 +225,7 @@ void vbd_destroy(blkif_vbd_destroy_t *destroy)
             goto found;
     }
 
-    destroy->status = BLKIF_STATUS_VBD_NOT_FOUND;
+    destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
     goto out;
 
  found:
@@ -249,7 +250,7 @@ void destroy_all_vbds(blkif_t *blkif)
 {
     vbd_t *vbd;
     rb_node_t *rb;
-    xen_extent_le_t *x, *t;
+    blkif_extent_le_t *x, *t;
 
     spin_lock(&blkif->vbd_lock);
 
@@ -273,51 +274,30 @@ void destroy_all_vbds(blkif_t *blkif)
 }
 
 
-static int vbd_probe_single(xen_disk_info_t *xdi, 
-                            vbd_t *vbd, 
-                            struct task_struct *p)
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
 {
-    xen_extent_le_t *x; 
-    xen_disk_t cur_disk; 
+    blkif_extent_le_t *x; 
 
-    if ( xdi->count == xdi->max )
-    {
-        DPRINTK("vbd_probe_devices: out of space for probe.\n"); 
-        return -ENOMEM; 
-    }
-
-    cur_disk.device = vbd->vdevice; 
-    cur_disk.info   = vbd->type;
-    if ( !VBD_CAN_WRITE(vbd) )
-        cur_disk.info |= XD_FLAG_RO; 
-    cur_disk.capacity = 0ULL;
+    vbd_info->device = vbd->vdevice; 
+    vbd_info->info   = vbd->type;
+    if ( vbd->readonly )
+        vbd_info->info |= VDISK_FLAG_RO; 
+    vbd_info->capacity = 0ULL;
     for ( x = vbd->extents; x != NULL; x = x->next )
-        cur_disk.capacity += x->extent.nr_sectors; 
-    cur_disk.domain = p->domain; 
+        vbd_info->capacity += x->extent.sector_length; 
         
-    /* Now copy into relevant part of user-space buffer */
-    if( copy_to_user(&xdi->disks[xdi->count], 
-                     &cur_disk, 
-                     sizeof(xen_disk_t)) )
-    { 
-        DPRINTK("vbd_probe_devices: copy_to_user failed\n");
-        return -EFAULT;
-    } 
-        
-    xdi->count++; 
-
     return 0;
 }
 
 
-static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p)
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
 {
-    int rc = 0;
+    int rc = 0, nr_vbds = 0;
     rb_node_t *rb;
 
-    spin_lock(&p->vbd_lock);
+    spin_lock(&blkif->vbd_lock);
 
-    if ( (rb = p->vbd_rb.rb_node) == NULL )
+    if ( (rb = blkif->vbd_rb.rb_node) == NULL )
         goto out;
 
  new_subtree:
@@ -328,7 +308,10 @@ static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p)
     for ( ; ; )
     {
         /* STEP 2. Dealt with left subtree. Now process current node. */
-        if ( (rc = vbd_probe_single(xdi, rb_entry(rb, vbd_t, rb), p)) != 0 )
+        if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds], 
+                                    rb_entry(rb, vbd_t, rb))) != 0 )
+            goto out;
+        if ( ++nr_vbds == max_vbds )
             goto out;
 
         /* STEP 3. Process right subtree, if any. */
@@ -355,146 +338,22 @@ static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p)
     }
 
  out:
-    spin_unlock(&p->vbd_lock);
-    return rc;  
-}
-
-
-/*
- * Return information about the VBDs available for a given domain, or for all 
- * domains; in the general case the 'domain' argument will be 0 which means 
- * "information about the caller"; otherwise the 'domain' argument will 
- * specify either a given domain, or all domains ("VBD_PROBE_ALL") -- both of 
- * these cases require the caller to be privileged.
- */
-long vbd_probe(vbd_probe_t *probe) 
-{
-    struct task_struct *p = NULL; 
-    unsigned long flags;
-    long ret = 0;  
-
-    if ( probe->domain != 0 )
-    { 
-        /* We can only probe for ourselves (unless we're privileged). */
-        if( (probe->domain != current->domain) && !IS_PRIV(current) )
-            return -EPERM; 
-
-        if ( (probe->domain != VBD_PROBE_ALL) &&
-             ((p = find_domain_by_id(probe->domain)) == NULL) )
-        {
-            DPRINTK("vbd_probe attempted for non-existent domain %llu\n", 
-                    probe->domain); 
-            return -EINVAL; 
-        }
-    }
-    else
-    { 
-        /* Default is to probe for ourselves. */
-        p = current; 
-        get_task_struct(p); /* to mirror final put_task_struct */
-    }
-
-    if ( probe->domain == VBD_PROBE_ALL )
-    { 
-        read_lock_irqsave(&tasklist_lock, flags);
-        for_each_domain ( p )
-        {
-            if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
-            { 
-                read_unlock_irqrestore(&tasklist_lock, flags);
-                goto out; 
-            }
-        }
-        read_unlock_irqrestore(&tasklist_lock, flags);
-    } 
-    else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
-        goto out; 
-
- out: 
-    if ( ret != 0 )
-        DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 
-    if ( p != NULL )
-        put_task_struct(p); 
-    return ret; 
-}
-
-
-long vbd_info(vbd_info_t *info) 
-{
-    struct task_struct *p; 
-    xen_extent_le_t *x; 
-    xen_extent_t *extents; 
-    vbd_t *vbd = NULL;
-    rb_node_t *rb;
-    long ret = 0;  
-   
-    if ( (info->domain != current->domain) && !IS_PRIV(current) )
-        return -EPERM; 
-
-    if ( (p = find_domain_by_id(info->domain)) == NULL )
-    {
-        DPRINTK("vbd_info attempted for non-existent domain %llu\n", 
-                info->domain); 
-        return -EINVAL; 
-    }
-
-    spin_lock(&p->vbd_lock);
-
-    rb = p->vbd_rb.rb_node;
-    while ( rb != NULL )
-    {
-        vbd = rb_entry(rb, vbd_t, rb);
-        if ( info->vdevice < vbd->vdevice )
-            rb = rb->rb_left;
-        else if ( info->vdevice > vbd->vdevice )
-            rb = rb->rb_right;
-        else
-            break;
-    }
-
-    if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != info->vdevice) )
-    {
-        DPRINTK("vbd_info attempted on non-existent VBD.\n"); 
-        ret = -EINVAL; 
-        goto out; 
-    }
-
-    info->mode     = vbd->mode;
-    info->nextents = 0; 
-
-    extents = info->extents;
-    for ( x = vbd->extents; x != NULL; x = x->next )
-    {
-        if ( info->nextents == info->maxextents )
-            break;
-        if ( copy_to_user(extents, &x->extent, sizeof(xen_extent_t)) )
-        {
-            DPRINTK("vbd_info: copy_to_user failed\n");
-            ret = -EFAULT;
-            goto out; 
-        } 
-        extents++;
-        info->nextents++;
-    }
-
- out: 
-    spin_unlock(&p->vbd_lock);
-    put_task_struct(p); 
-    return ret; 
+    spin_unlock(&blkif->vbd_lock);
+    return (rc == 0) ? nr_vbds : rc;  
 }
 
 
-int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
 {
-    xen_extent_le_t *x; 
-    vbd_t *vbd;
-    rb_node_t *rb;
-    xen_sector_t sec_off;
-    unsigned long nr_secs;
+    blkif_extent_le_t *x; 
+    vbd_t             *vbd;
+    rb_node_t         *rb;
+    blkif_sector_t     sec_off;
+    unsigned long      nr_secs;
 
-    spin_lock(&p->vbd_lock);
+    spin_lock(&blkif->vbd_lock);
 
-    rb = p->vbd_rb.rb_node;
+    rb = blkif->vbd_rb.rb_node;
     while ( rb != NULL )
     {
         vbd = rb_entry(rb, vbd_t, rb);
@@ -507,42 +366,41 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
     }
 
     DPRINTK("vbd_translate; domain %llu attempted to access "
-            "non-existent VBD.\n", p->domain); 
+            "non-existent VBD.\n", blkif->domid);
 
-    spin_unlock(&p->vbd_lock);
+    spin_unlock(&blkif->vbd_lock);
     return -ENODEV; 
 
  found:
 
-    if ( ((operation == READ) && !VBD_CAN_READ(vbd)) ||
-         ((operation == WRITE) && !VBD_CAN_WRITE(vbd)) )
+    if ( (operation == WRITE) && vbd->readonly )
     {
-        spin_unlock(&p->vbd_lock);
+        spin_unlock(&blkif->vbd_lock);
         return -EACCES; 
     }
 
     /*
-     * Now iterate through the list of xen_extents, working out which should 
+     * Now iterate through the list of blkif_extents, working out which should 
      * be used to perform the translation.
      */
     sec_off = pseg->sector_number; 
     nr_secs = pseg->nr_sects;
     for ( x = vbd->extents; x != NULL; x = x->next )
     { 
-        if ( sec_off < x->extent.nr_sectors )
+        if ( sec_off < x->extent.sector_length )
         {
             pseg->dev = x->extent.device; 
-            pseg->sector_number = x->extent.start_sector + sec_off;
-            if ( unlikely((sec_off + nr_secs) > x->extent.nr_sectors) )
+            pseg->sector_number = x->extent.sector_start + sec_off;
+            if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) )
                 goto overrun;
             spin_unlock(&p->vbd_lock);
             return 1;
         } 
-        sec_off -= x->extent.nr_sectors;
+        sec_off -= x->extent.sector_length;
     }
 
     DPRINTK("vbd_translate: end of vbd.\n");
-    spin_unlock(&p->vbd_lock);
+    spin_unlock(&blkif->vbd_lock);
     return -EACCES; 
 
     /*
@@ -554,7 +412,7 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
  overrun:
 
     /* Adjust length of first chunk to run to end of first extent. */
-    pseg[0].nr_sects = x->extent.nr_sectors - sec_off;
+    pseg[0].nr_sects = x->extent.sector_length - sec_off;
 
     /* Set second chunk buffer and length to start where first chunk ended. */
     pseg[1].buffer   = pseg[0].buffer + (pseg[0].nr_sects << 9);
@@ -562,7 +420,7 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
 
     /* Now move to the next extent. Check it exists and is long enough! */
     if ( unlikely((x = x->next) == NULL) || 
-         unlikely(x->extent.nr_sectors < pseg[1].nr_sects) )
+         unlikely(x->extent.sector_length < pseg[1].nr_sects) )
     {
         DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
         spin_unlock(&p->vbd_lock);
@@ -571,8 +429,8 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
 
     /* Store the real device and start sector for the second chunk. */
     pseg[1].dev           = x->extent.device;
-    pseg[1].sector_number = x->extent.start_sector;
+    pseg[1].sector_number = x->extent.sector_start;
     
-    spin_unlock(&p->vbd_lock);
+    spin_unlock(&blkif->vbd_lock);
     return 2;
 }
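
The overrun path above splits a request that crosses an extent boundary into
two physical segments; a worked example of the arithmetic, using hypothetical
extent sizes (not taken from the source):

    /*
     * Hypothetical VBD: extent A = 100 sectors, extent B = 50 sectors.
     * Guest request: sector_number = 90, nr_sects = 20.
     *
     *   sec_off = 90 < 100           -> request starts in extent A
     *   90 + 20 > 100                -> overrun into the next extent
     *   pseg[0].nr_sects = 100 - 90  = 10   (tail of extent A)
     *   pseg[1].nr_sects = 20 - 10   = 10   (head of extent B)
     *   pseg[1].buffer   = pseg[0].buffer + (10 << 9)
     *
     * The return value 2 tells the caller that two physical segments were
     * produced; a request satisfied by a single extent returns 1.
     */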
index f6e8a4d5c8fa6b99faf71eded29da92851f62986..5db2b48a51a4630c793674d907d98636bedead34 100644 (file)
 #define BLKIF_OP_WRITE     1
 #define BLKIF_OP_PROBE     2
 
-/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */
+/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */
 #define BLKIF_RING_SIZE        64
 
 /*
  * Maximum scatter/gather segments per request.
- * This is carefully chosen so that sizeof(blk_ring_t) <= PAGE_SIZE.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
  * NB. This could be 12 if the ring indexes weren't stored in the same page.
  */
-#define BLKIF_REQUEST_MAX_SEGMENTS 11
+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+
+#define BLKIF_MAX_SECTORS_PER_SEGMENT  16
 
 typedef struct {
     unsigned char  operation;        /* BLKIF_OP_???                         */
-    unsigned char  nr_segments;      /* number of segments (<= MAX_BLK_SEGS) */
+    unsigned char  nr_segments;      /* number of segments                   */
     blkif_vdev_t   device;           /* only for read/write requests         */
     unsigned long  id;               /* private guest value, echoed in resp  */
-    xen_sector_t   sector_number;    /* start sector idx on disk (r/w only)  */
-    /* Least 9 bits is 'nr_sects'. High 23 bits is the address.      */
-    unsigned long  buffer_and_sects[MAX_BLK_SEGS];
+    blkif_sector_t sector_number;    /* start sector idx on disk (r/w only)  */
+    /* Least-significant 9 bits are 'nr_sects'; high 23 bits are the address. */
+    /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */
+    unsigned long  buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } blkif_request_t;
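
Each buffer_and_sects[] entry packs a sector-aligned buffer machine address
together with a sector count in the low 9 bits. A minimal sketch of
encode/decode helpers a driver might use (helper names are illustrative, not
from the source):

    #define BLKIF_SECTS_MASK  ((1UL << 9) - 1)    /* low 9 bits: nr_sects */

    /* 'buffer_ma' must be 512-byte aligned, so its low 9 bits are zero. */
    static inline unsigned long blkif_pack_seg(unsigned long buffer_ma,
                                               unsigned int nr_sects)
    {
        return buffer_ma | nr_sects;
    }

    static inline unsigned long blkif_seg_buffer(unsigned long seg)
    {
        return seg & ~BLKIF_SECTS_MASK;           /* high bits: address   */
    }

    static inline unsigned int blkif_seg_sects(unsigned long seg)
    {
        return (unsigned int)(seg & BLKIF_SECTS_MASK);
    }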
 
 typedef struct {
@@ -59,8 +62,8 @@ typedef unsigned int BLKIF_RING_IDX;
 #define MASK_BLKIF_IDX(_i) ((_i)&(BLKIF_RING_SIZE-1))
 
 typedef struct {
-    BLKIF_RING_IDX req_prod;  /* Request producer. Updated by guest OS. */
-    BLKIF_RING_IDX resp_prod; /* Response producer. Updated by Xen.     */
+    BLKIF_RING_IDX req_prod;  /* Request producer. Updated by front-end. */
+    BLKIF_RING_IDX resp_prod; /* Response producer. Updated by back-end. */
     union {
         blkif_request_t  req;
         blkif_response_t resp;
@@ -103,7 +106,7 @@ typedef struct {
 typedef struct {
     blkif_vdev_t   device;       /* Device number (opaque 16 bit value). */
     unsigned short info;         /* Device type and flags (VDISK_*).     */
-    xen_sector_t   capacity;     /* Size in terms of 512-byte sectors.   */
+    blkif_sector_t capacity;     /* Size in terms of 512-byte sectors.   */
 } vdisk_t;
 
 #endif /* __SHARED_BLKIF_H__ */
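
The BLKIF_RING_IDX producer/consumer counters are free-running and are masked
only when indexing into the ring, so occupancy tests reduce to a subtraction.
A sketch of the idiom, assuming only the definitions in this header (the
helper itself is illustrative):

    /* The ring is full when the producer is a whole ring ahead. */
    #define BLKIF_RING_FULL(prod, cons) (((prod) - (cons)) == BLKIF_RING_SIZE)

    /* Enqueue one request: single producer, caller holds its own lock. */
    static void blkif_ring_push(blkif_ring_t *ring, BLKIF_RING_IDX *req_prod,
                                const blkif_request_t *req)
    {
        ring->ring[MASK_BLKIF_IDX(*req_prod)].req = *req;
        (*req_prod)++;               /* private producer index   */
        ring->req_prod = *req_prod;  /* publish to the other end */
    }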
index 35986ca54a6a7b903d139eb6cedbc9494392cbc1..b0d27cf698233acfbf41a20f9d47f08d19753bad 100644 (file)
@@ -1,3 +1,3 @@
 O_TARGET := drv.o
-obj-y := block.o vbd.o
+obj-y := main.o vbd.o
 include $(TOPDIR)/Rules.make
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c
deleted file mode 100644 (file)
index d00dd98..0000000
+++ /dev/null
@@ -1,625 +0,0 @@
-/******************************************************************************
- * block.c
- * 
- * Xenolinux virtual block-device driver.
- * 
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- */
-
-#include "block.h"
-#include <linux/blk.h>
-#include <linux/cdrom.h>
-#include <linux/tqueue.h>
-#include <linux/sched.h>
-#include <scsi/scsi.h>
-
-#include <linux/interrupt.h>
-
-typedef unsigned char byte; /* from linux/ide.h */
-
-#define STATE_ACTIVE    0
-#define STATE_SUSPENDED 1
-#define STATE_CLOSED    2
-static unsigned int state = STATE_SUSPENDED;
-
-/* Dynamically-mapped IRQs. */
-static int xlblk_response_irq, xlblk_update_irq;
-
-static blk_ring_t *blk_ring;
-static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */
-static BLK_RING_IDX req_prod;  /* Private request producer.         */
-
-/* We plug the I/O ring if the driver is suspended or if the ring is full. */
-#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \
-                      (state != STATE_ACTIVE))
-
-
-/*
- * Request queues with outstanding work, but ring is currently full.
- * We need no special lock here, as we always access this with the
- * io_request_lock held. We only need a small maximum list.
- */
-#define MAX_PENDING 8
-static request_queue_t *pending_queues[MAX_PENDING];
-static int nr_pending;
-
-static kdev_t        sg_dev;
-static int           sg_operation = -1;
-static unsigned long sg_next_sect;
-#define DISABLE_SCATTERGATHER() (sg_operation = -1)
-
-static inline void signal_requests_to_xen(void)
-{
-    block_io_op_t op; 
-
-    DISABLE_SCATTERGATHER();
-    blk_ring->req_prod = req_prod;
-
-    op.cmd = BLOCK_IO_OP_SIGNAL; 
-    HYPERVISOR_block_io_op(&op);
-    return;
-}
-
-
-/*
- * xlblk_update_int/update-vbds_task - handle VBD update events from Xen
- * 
- * Schedule a task for keventd to run, which will update the VBDs and perform 
- * the corresponding updates to our view of VBD state, so the XenoLinux will 
- * respond to changes / additions / deletions to the set of VBDs automatically.
- */
-static struct tq_struct update_tq;
-static void update_vbds_task(void *unused)
-{ 
-    xlvbd_update_vbds();
-}
-static void xlblk_update_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-    update_tq.routine = update_vbds_task;
-    schedule_task(&update_tq);
-}
-
-
-int xen_block_open(struct inode *inode, struct file *filep)
-{
-    short xldev = inode->i_rdev; 
-    struct gendisk *gd = get_gendisk(xldev);
-    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
-    short minor = MINOR(xldev); 
-
-    if ( gd->part[minor].nr_sects == 0 )
-    { 
-        /*
-         * Device either doesn't exist, or has zero capacity; we use a few
-         * cheesy heuristics to return the relevant error code
-         */
-        if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
-             ((minor & (gd->max_p - 1)) != 0) )
-        { 
-            /*
-             * We have a real device, but no such partition, or we just have a
-             * partition number so guess this is the problem.
-             */
-            return -ENXIO;     /* no such device or address */
-        }
-        else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
-        {
-            /* This is a removable device => assume that media is missing. */ 
-            return -ENOMEDIUM; /* media not present (this is a guess) */
-        } 
-        else
-        { 
-            /* Just go for the general 'no such device' error. */
-            return -ENODEV;    /* no such device */
-        }
-    }
-    
-    /* Update of usage count is protected by per-device semaphore. */
-    disk->usage++;
-
-    return 0;
-}
-
-
-int xen_block_release(struct inode *inode, struct file *filep)
-{
-    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
-
-    /*
-     * When usage drops to zero it may allow more VBD updates to occur.
-     * Update of usage count is protected by a per-device semaphore.
-     */
-    if ( --disk->usage == 0 )
-    {
-        update_tq.routine = update_vbds_task;
-        schedule_task(&update_tq);
-    }
-
-    return 0;
-}
-
-
-int xen_block_ioctl(struct inode *inode, struct file *filep,
-                          unsigned command, unsigned long argument)
-{
-    kdev_t dev = inode->i_rdev;
-    struct hd_geometry *geo = (struct hd_geometry *)argument;
-    struct gendisk *gd;     
-    struct hd_struct *part; 
-    int i;
-
-    /* NB. No need to check permissions. That is done for us. */
-    
-    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
-                  command, (long) argument, dev); 
-  
-    gd = get_gendisk(dev);
-    part = &gd->part[MINOR(dev)]; 
-
-    switch ( command )
-    {
-    case BLKGETSIZE:
-        DPRINTK_IOCTL("   BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); 
-        return put_user(part->nr_sects, (unsigned long *) argument);
-
-    case BLKGETSIZE64:
-        DPRINTK_IOCTL("   BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
-                      (u64)part->nr_sects * 512);
-        return put_user((u64)part->nr_sects * 512, (u64 *) argument);
-
-    case BLKRRPART:                               /* re-read partition table */
-        DPRINTK_IOCTL("   BLKRRPART: %x\n", BLKRRPART);
-        return xen_block_revalidate(dev);
-
-    case BLKSSZGET:
-        return hardsect_size[MAJOR(dev)][MINOR(dev)]; 
-
-    case BLKBSZGET:                                        /* get block size */
-        DPRINTK_IOCTL("   BLKBSZGET: %x\n", BLKBSZGET);
-        break;
-
-    case BLKBSZSET:                                        /* set block size */
-        DPRINTK_IOCTL("   BLKBSZSET: %x\n", BLKBSZSET);
-        break;
-
-    case BLKRASET:                                         /* set read-ahead */
-        DPRINTK_IOCTL("   BLKRASET: %x\n", BLKRASET);
-        break;
-
-    case BLKRAGET:                                         /* get read-ahead */
-        DPRINTK_IOCTL("   BLKRAFET: %x\n", BLKRAGET);
-        break;
-
-    case HDIO_GETGEO:
-        /* note: these values are complete garbage */
-        DPRINTK_IOCTL("   HDIO_GETGEO: %x\n", HDIO_GETGEO);
-        if (!argument) return -EINVAL;
-        if (put_user(0x00,  (unsigned long *) &geo->start)) return -EFAULT;
-        if (put_user(0xff,  (byte *)&geo->heads)) return -EFAULT;
-        if (put_user(0x3f,  (byte *)&geo->sectors)) return -EFAULT;
-        if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT;
-        return 0;
-
-    case HDIO_GETGEO_BIG: 
-        /* note: these values are complete garbage */
-        DPRINTK_IOCTL("   HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
-        if (!argument) return -EINVAL;
-        if (put_user(0x00,  (unsigned long *) &geo->start))  return -EFAULT;
-        if (put_user(0xff,  (byte *)&geo->heads))   return -EFAULT;
-        if (put_user(0x3f,  (byte *)&geo->sectors)) return -EFAULT;
-        if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT;
-        return 0;
-
-    case CDROMMULTISESSION:
-        DPRINTK("FIXME: support multisession CDs later\n");
-        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
-            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
-        return 0;
-
-    case SCSI_IOCTL_GET_BUS_NUMBER:
-        DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in Xen blkdev");
-        return -ENOSYS;
-
-    default:
-        printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", command);
-        return -ENOSYS;
-    }
-    
-    return 0;
-}
-
-/* check media change: should probably do something here in some cases :-) */
-int xen_block_check(kdev_t dev)
-{
-    DPRINTK("xen_block_check\n");
-    return 0;
-}
-
-int xen_block_revalidate(kdev_t dev)
-{
-    struct block_device *bd;
-    struct gendisk *gd;
-    xl_disk_t *disk;
-    unsigned long capacity;
-    int i, rc = 0;
-    
-    if ( (bd = bdget(dev)) == NULL )
-        return -EINVAL;
-
-    /*
-     * Update of partition info, and check of usage count, is protected
-     * by the per-block-device semaphore.
-     */
-    down(&bd->bd_sem);
-
-    if ( ((gd = get_gendisk(dev)) == NULL) ||
-         ((disk = xldev_to_xldisk(dev)) == NULL) ||
-         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
-    {
-        rc = -EINVAL;
-        goto out;
-    }
-
-    if ( disk->usage > 1 )
-    {
-        rc = -EBUSY;
-        goto out;
-    }
-
-    /* Only reread partition table if VBDs aren't mapped to partitions. */
-    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
-    {
-        for ( i = gd->max_p - 1; i >= 0; i-- )
-        {
-            invalidate_device(dev+i, 1);
-            gd->part[MINOR(dev+i)].start_sect = 0;
-            gd->part[MINOR(dev+i)].nr_sects   = 0;
-            gd->sizes[MINOR(dev+i)]           = 0;
-        }
-
-        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
-    }
-
- out:
-    up(&bd->bd_sem);
-    bdput(bd);
-    return rc;
-}
-
-
-/*
- * hypervisor_request
- *
- * request block io 
- * 
- * id: for guest use only.
- * operation: XEN_BLOCK_{READ,WRITE,PROBE,VBD*}
- * buffer: buffer to read/write into. this should be a
- *   virtual address in the guest os.
- */
-static int hypervisor_request(unsigned long   id,
-                              int             operation,
-                              char *          buffer,
-                              unsigned long   sector_number,
-                              unsigned short  nr_sectors,
-                              kdev_t          device)
-{
-    unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); 
-    struct gendisk *gd;
-    blk_ring_req_entry_t *req;
-    struct buffer_head *bh;
-
-    if ( unlikely(nr_sectors >= (1<<9)) )
-        BUG();
-    if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
-        BUG();
-
-    if ( unlikely(state == STATE_CLOSED) )
-        return 1;
-
-    switch ( operation )
-    {
-
-    case XEN_BLOCK_READ:
-    case XEN_BLOCK_WRITE:
-        gd = get_gendisk(device); 
-
-        /*
-         * Update the sector_number we'll pass down as appropriate; note that
-         * we could sanity check that resulting sector will be in this
-         * partition, but this will happen in xen anyhow.
-         */
-        sector_number += gd->part[MINOR(device)].start_sect;
-
-        /*
-         * If this unit doesn't consist of virtual (i.e., Xen-specified)
-         * partitions then we clear the partn bits from the device number.
-         */
-        if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & 
-               GENHD_FL_VIRT_PARTNS) )
-            device &= ~(gd->max_p - 1);
-
-        if ( (sg_operation == operation) &&
-             (sg_dev == device) &&
-             (sg_next_sect == sector_number) )
-        {
-            req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req;
-            bh = (struct buffer_head *)id;
-            bh->b_reqnext = (struct buffer_head *)req->id;
-            req->id = id;
-            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
-            if ( ++req->nr_segments < MAX_BLK_SEGS )
-                sg_next_sect += nr_sectors;
-            else
-                DISABLE_SCATTERGATHER();
-            return 0;
-        }
-        else if ( RING_PLUGGED )
-        {
-            return 1;
-        }
-        else
-        {
-            sg_operation = operation;
-            sg_dev       = device;
-            sg_next_sect = sector_number + nr_sectors;
-        }
-        break;
-
-    default:
-        panic("unknown op %d\n", operation);
-    }
-
-    /* Fill out a communications ring structure. */
-    req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req;
-    req->id            = id;
-    req->operation     = operation;
-    req->sector_number = (xen_sector_t)sector_number;
-    req->device        = device; 
-    req->nr_segments   = 1;
-    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
-    req_prod++;
-
-    return 0;
-}
-
-
-/*
- * do_xlblk_request
- *  read a block; request is in a request queue
- */
-void do_xlblk_request(request_queue_t *rq)
-{
-    struct request *req;
-    struct buffer_head *bh, *next_bh;
-    int rw, nsect, full, queued = 0;
-
-    DPRINTK("xlblk.c::do_xlblk_request\n"); 
-
-    while ( !rq->plugged && !list_empty(&rq->queue_head))
-    {
-        if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) 
-            goto out;
-  
-        DPRINTK("do_xlblk_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
-                req, req->cmd, req->sector,
-                req->current_nr_sectors, req->nr_sectors, req->bh);
-
-        rw = req->cmd;
-        if ( rw == READA )
-            rw = READ;
-        if ( unlikely((rw != READ) && (rw != WRITE)) )
-            panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
-
-        req->errors = 0;
-
-        bh = req->bh;
-        while ( bh != NULL )
-        {
-            next_bh = bh->b_reqnext;
-            bh->b_reqnext = NULL;
-
-            full = hypervisor_request(
-                (unsigned long)bh,
-                (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, 
-                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
-
-            if ( full )
-            { 
-                bh->b_reqnext = next_bh;
-                pending_queues[nr_pending++] = rq;
-                if ( unlikely(nr_pending >= MAX_PENDING) )
-                    BUG();
-                goto out; 
-            }
-
-            queued++;
-
-            /* Dequeue the buffer head from the request. */
-            nsect = bh->b_size >> 9;
-            bh = req->bh = next_bh;
-            
-            if ( bh != NULL )
-            {
-                /* There's another buffer head to do. Update the request. */
-                req->hard_sector += nsect;
-                req->hard_nr_sectors -= nsect;
-                req->sector = req->hard_sector;
-                req->nr_sectors = req->hard_nr_sectors;
-                req->current_nr_sectors = bh->b_size >> 9;
-                req->buffer = bh->b_data;
-            }
-            else
-            {
-                /* That was the last buffer head. Finalise the request. */
-                if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
-                    BUG();
-                blkdev_dequeue_request(req);
-                end_that_request_last(req);
-            }
-        }
-    }
-
- out:
-    if ( queued != 0 ) signal_requests_to_xen();
-}
-
-
-static void kick_pending_request_queues(void)
-{
-    /* We kick pending request queues if the ring is reasonably empty. */
-    if ( (nr_pending != 0) && 
-         ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) )
-    {
-        /* Attempt to drain the queue, but bail if the ring becomes full. */
-        while ( (nr_pending != 0) && !RING_PLUGGED )
-            do_xlblk_request(pending_queues[--nr_pending]);
-    }
-}
-
-
-static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-    BLK_RING_IDX i; 
-    unsigned long flags; 
-    struct buffer_head *bh, *next_bh;
-    
-    if ( unlikely(state == STATE_CLOSED) )
-        return;
-    
-    spin_lock_irqsave(&io_request_lock, flags);     
-
-    for ( i = resp_cons; i != blk_ring->resp_prod; i++ )
-    {
-        blk_ring_resp_entry_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp;
-        switch ( bret->operation )
-        {
-        case XEN_BLOCK_READ:
-        case XEN_BLOCK_WRITE:
-            if ( unlikely(bret->status != 0) )
-                DPRINTK("Bad return from blkdev data request: %lx\n",
-                        bret->status);
-            for ( bh = (struct buffer_head *)bret->id; 
-                  bh != NULL; 
-                  bh = next_bh )
-            {
-                next_bh = bh->b_reqnext;
-                bh->b_reqnext = NULL;
-                bh->b_end_io(bh, !bret->status);
-            }
-            break;
-     
-        default:
-            BUG();
-        }
-    }
-    
-    resp_cons = i;
-
-    kick_pending_request_queues();
-
-    spin_unlock_irqrestore(&io_request_lock, flags);
-}
-
-
-static void reset_xlblk_interface(void)
-{
-    block_io_op_t op; 
-
-    nr_pending = 0;
-
-    op.cmd = BLOCK_IO_OP_RESET;
-    if ( HYPERVISOR_block_io_op(&op) != 0 )
-        printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n");
-
-    op.cmd = BLOCK_IO_OP_RING_ADDRESS;
-    (void)HYPERVISOR_block_io_op(&op);
-
-    set_fixmap(FIX_BLKRING_BASE, op.u.ring_mfn << PAGE_SHIFT);
-    blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE);
-    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
-
-    wmb();
-    state = STATE_ACTIVE;
-}
-
-
-int __init xlblk_init(void)
-{
-    int error; 
-
-    reset_xlblk_interface();
-
-    xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV);
-    xlblk_update_irq   = bind_virq_to_irq(VIRQ_VBD_UPD);
-
-    error = request_irq(xlblk_response_irq, xlblk_response_int, 
-                        SA_SAMPLE_RANDOM, "blkdev", NULL);
-    if ( error )
-    {
-        printk(KERN_ALERT "Could not allocate receive interrupt\n");
-        goto fail;
-    }
-
-    error = request_irq(xlblk_update_irq, xlblk_update_int,
-                        0, "blkdev", NULL);
-
-    if ( error )
-    {
-        printk(KERN_ALERT "Could not allocate block update interrupt\n");
-        goto fail;
-    }
-
-    (void)xlvbd_init();
-
-    return 0;
-
- fail:
-    return error;
-}
-
-
-static void __exit xlblk_cleanup(void)
-{
-    xlvbd_cleanup();
-    free_irq(xlblk_response_irq, NULL);
-    free_irq(xlblk_update_irq, NULL);
-    unbind_virq_from_irq(VIRQ_BLKDEV);
-    unbind_virq_from_irq(VIRQ_VBD_UPD);
-}
-
-
-#ifdef MODULE
-module_init(xlblk_init);
-module_exit(xlblk_cleanup);
-#endif
-
-
-void blkdev_suspend(void)
-{
-    state = STATE_SUSPENDED;
-    wmb();
-
-    while ( resp_cons != blk_ring->req_prod )
-    {
-        barrier();
-        current->state = TASK_INTERRUPTIBLE;
-        schedule_timeout(1);
-    }
-
-    wmb();
-    state = STATE_CLOSED;
-    wmb();
-
-    clear_fixmap(FIX_BLKRING_BASE);
-}
-
-
-void blkdev_resume(void)
-{
-    reset_xlblk_interface();
-    spin_lock_irq(&io_request_lock);
-    kick_pending_request_queues();
-    spin_unlock_irq(&io_request_lock);
-}
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h
deleted file mode 100644 (file)
index e41e039..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/******************************************************************************
- * block.h
- * 
- * Shared definitions between all levels of XenoLinux Virtual block devices.
- */
-
-#ifndef __XEN_DRIVERS_BLOCK_H__
-#define __XEN_DRIVERS_BLOCK_H__
-
-#include <linux/config.h>
-#include <linux/module.h>
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-
-#include <linux/fs.h>
-#include <linux/hdreg.h>
-#include <linux/blkdev.h>
-#include <linux/major.h>
-
-#include <asm/hypervisor-ifs/hypervisor-if.h>
-#include <asm/hypervisor-ifs/vbd.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-
-#if 0
-#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
-#if 0
-#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK_IOCTL(_f, _a...) ((void)0)
-#endif
-
-/* Private gendisk->flags[] values. */
-#define GENHD_FL_XEN        2 /* Is unit a Xen block device?  */
-#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */
-
-/*
- * We have one of these per vbd, whether ide, scsi or 'other'.
- * They hang in an array off the gendisk structure. We may end up putting
- * all kinds of interesting stuff here :-)
- */
-typedef struct xl_disk {
-    int usage;
-} xl_disk_t;
-
-extern int xen_control_msg(int operration, char *buffer, int size);
-extern int xen_block_open(struct inode *inode, struct file *filep);
-extern int xen_block_release(struct inode *inode, struct file *filep);
-extern int xen_block_ioctl(struct inode *inode, struct file *filep,
-                                 unsigned command, unsigned long argument);
-extern int xen_block_check(kdev_t dev);
-extern int xen_block_revalidate(kdev_t dev);
-extern void do_xlblk_request (request_queue_t *rq); 
-
-extern void xlvbd_update_vbds(void);
-
-static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev)
-{
-    struct gendisk *gd = get_gendisk(xldev);
-    
-    if ( gd == NULL ) 
-        return NULL;
-    
-    return (xl_disk_t *)gd->real_devices + 
-        (MINOR(xldev) >> gd->minor_shift);
-}
-
-
-/* Virtual block-device subsystem. */
-extern int  xlvbd_init(void);
-extern void xlvbd_cleanup(void); 
-
-#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h
new file mode 100644 (file)
index 0000000..2d4415b
--- /dev/null
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/frontend/common.h
+ * 
+ * Shared definitions between all levels of XenoLinux Virtual block devices.
+ */
+
+#ifndef __XEN_DRIVERS_COMMON_H__
+#define __XEN_DRIVERS_COMMON_H__
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/major.h>
+
+#include <asm/hypervisor-ifs/hypervisor-if.h>
+#include <asm/hypervisor-ifs/vbd.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#include "../blkif.h"
+
+#if 0
+#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a )
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a )
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
+/* Private gendisk->flags[] values. */
+#define GENHD_FL_XEN        2 /* Is unit a Xen block device?  */
+#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'.
+ * They hang in an array off the gendisk structure. We may end up putting
+ * all kinds of interesting stuff here :-)
+ */
+typedef struct xl_disk {
+    int usage;
+} xl_disk_t;
+
+extern int blkif_open(struct inode *inode, struct file *filep);
+extern int blkif_release(struct inode *inode, struct file *filep);
+extern int blkif_ioctl(struct inode *inode, struct file *filep,
+                                 unsigned command, unsigned long argument);
+extern int blkif_check(kdev_t dev);
+extern int blkif_revalidate(kdev_t dev);
+extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp);
+extern void do_blkif_request (request_queue_t *rq); 
+
+extern void xlvbd_update_vbds(void);
+
+static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev)
+{
+    struct gendisk *gd = get_gendisk(xldev);
+    
+    if ( gd == NULL ) 
+        return NULL;
+    
+    return (xl_disk_t *)gd->real_devices + 
+        (MINOR(xldev) >> gd->minor_shift);
+}
+
+
+/* Virtual block-device subsystem. */
+extern int  xlvbd_init(void);
+extern void xlvbd_cleanup(void); 
+
+#endif /* __XEN_DRIVERS_COMMON_H__ */
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
new file mode 100644 (file)
index 0000000..b0c524f
--- /dev/null
@@ -0,0 +1,702 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/frontend/main.c
+ * 
+ * Xenolinux virtual block-device driver.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ */
+
+#include "common.h"
+#include <linux/blk.h>
+#include <linux/cdrom.h>
+#include <linux/tqueue.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <scsi/scsi.h>
+#include <asm/ctrl_if.h>
+
+typedef unsigned char byte; /* from linux/ide.h */
+
+#define BLKIF_STATE_CLOSED    0
+#define BLKIF_STATE_DOWN      1
+#define BLKIF_STATE_UP        2
+static unsigned int blkif_state = BLKIF_STATE_CLOSED;
+static unsigned int blkif_evtchn, blkif_irq;
+
+static struct tq_struct blkif_statechange_tq;
+
+static int blkif_control_rsp_valid;
+static blkif_response_t blkif_control_rsp;
+
+static blkif_ring_t *blk_ring;
+static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */
+static BLK_RING_IDX req_prod;  /* Private request producer.         */
+
+/* We plug the I/O ring if the driver is suspended or if the ring is full. */
+#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \
+                      (blkif_state != BLKIF_STATE_UP))
+
+
+/*
+ * Request queues with outstanding work, but ring is currently full.
+ * We need no special lock here, as we always access this with the
+ * io_request_lock held. We only need a small maximum list.
+ */
+#define MAX_PENDING 8
+static request_queue_t *pending_queues[MAX_PENDING];
+static int nr_pending;
+
+static kdev_t        sg_dev;
+static int           sg_operation = -1;
+static unsigned long sg_next_sect;
+#define DISABLE_SCATTERGATHER() (sg_operation = -1)
+
+static inline void flush_requests(void)
+{
+    DISABLE_SCATTERGATHER();
+    blk_ring->req_prod = req_prod;
+    notify_via_evtchn(blkif_evtchn);
+}
+
+
+/*
+ * update_vbds_task - handle VBD update events.
+ *  Schedule a task for keventd to run, which will update the VBDs and perform 
+ *  the corresponding updates to our view of VBD state.
+ */
+static struct tq_struct update_tq;
+static void update_vbds_task(void *unused)
+{ 
+    xlvbd_update_vbds();
+}
+
+
+int blkif_open(struct inode *inode, struct file *filep)
+{
+    short xldev = inode->i_rdev; 
+    struct gendisk *gd = get_gendisk(xldev);
+    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
+    short minor = MINOR(xldev); 
+
+    if ( gd->part[minor].nr_sects == 0 )
+    { 
+        /*
+         * Device either doesn't exist, or has zero capacity; we use a few
+         * cheesy heuristics to return the relevant error code
+         */
+        if ( (gd->sizes[minor >> gd->minor_shift] != 0) ||
+             ((minor & (gd->max_p - 1)) != 0) )
+        { 
+            /*
+             * We have a real device, but no such partition, or we just have a
+             * partition number so guess this is the problem.
+             */
+            return -ENXIO;     /* no such device or address */
+        }
+        else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE )
+        {
+            /* This is a removable device => assume that media is missing. */ 
+            return -ENOMEDIUM; /* media not present (this is a guess) */
+        } 
+        else
+        { 
+            /* Just go for the general 'no such device' error. */
+            return -ENODEV;    /* no such device */
+        }
+    }
+    
+    /* Update of usage count is protected by per-device semaphore. */
+    disk->usage++;
+
+    return 0;
+}
+
+
+int blkif_release(struct inode *inode, struct file *filep)
+{
+    xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev);
+
+    /*
+     * When usage drops to zero it may allow more VBD updates to occur.
+     * Update of usage count is protected by a per-device semaphore.
+     */
+    if ( --disk->usage == 0 )
+    {
+        update_tq.routine = update_vbds_task;
+        schedule_task(&update_tq);
+    }
+
+    return 0;
+}
+
+
+int blkif_ioctl(struct inode *inode, struct file *filep,
+                          unsigned command, unsigned long argument)
+{
+    kdev_t dev = inode->i_rdev;
+    struct hd_geometry *geo = (struct hd_geometry *)argument;
+    struct gendisk *gd;     
+    struct hd_struct *part; 
+    int i;
+
+    /* NB. No need to check permissions. That is done for us. */
+    
+    DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+                  command, (long) argument, dev); 
+  
+    gd = get_gendisk(dev);
+    part = &gd->part[MINOR(dev)]; 
+
+    switch ( command )
+    {
+    case BLKGETSIZE:
+        DPRINTK_IOCTL("   BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); 
+        return put_user(part->nr_sects, (unsigned long *) argument);
+
+    case BLKGETSIZE64:
+        DPRINTK_IOCTL("   BLKGETSIZE64: %x %llx\n", BLKGETSIZE64,
+                      (u64)part->nr_sects * 512);
+        return put_user((u64)part->nr_sects * 512, (u64 *) argument);
+
+    case BLKRRPART:                               /* re-read partition table */
+        DPRINTK_IOCTL("   BLKRRPART: %x\n", BLKRRPART);
+        return blkif_revalidate(dev);
+
+    case BLKSSZGET:
+        return hardsect_size[MAJOR(dev)][MINOR(dev)]; 
+
+    case BLKBSZGET:                                        /* get block size */
+        DPRINTK_IOCTL("   BLKBSZGET: %x\n", BLKBSZGET);
+        break;
+
+    case BLKBSZSET:                                        /* set block size */
+        DPRINTK_IOCTL("   BLKBSZSET: %x\n", BLKBSZSET);
+        break;
+
+    case BLKRASET:                                         /* set read-ahead */
+        DPRINTK_IOCTL("   BLKRASET: %x\n", BLKRASET);
+        break;
+
+    case BLKRAGET:                                         /* get read-ahead */
+        DPRINTK_IOCTL("   BLKRAFET: %x\n", BLKRAGET);
+        break;
+
+    case HDIO_GETGEO:
+        /* note: these values are complete garbage */
+        DPRINTK_IOCTL("   HDIO_GETGEO: %x\n", HDIO_GETGEO);
+        if (!argument) return -EINVAL;
+        if (put_user(0x00,  (unsigned long *) &geo->start)) return -EFAULT;
+        if (put_user(0xff,  (byte *)&geo->heads)) return -EFAULT;
+        if (put_user(0x3f,  (byte *)&geo->sectors)) return -EFAULT;
+        if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT;
+        return 0;
+
+    case HDIO_GETGEO_BIG: 
+        /* note: these values are complete garbage */
+        DPRINTK_IOCTL("   HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
+        if (!argument) return -EINVAL;
+        if (put_user(0x00,  (unsigned long *) &geo->start))  return -EFAULT;
+        if (put_user(0xff,  (byte *)&geo->heads))   return -EFAULT;
+        if (put_user(0x3f,  (byte *)&geo->sectors)) return -EFAULT;
+        if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT;
+        return 0;
+
+    case CDROMMULTISESSION:
+        DPRINTK("FIXME: support multisession CDs later\n");
+        for ( i = 0; i < sizeof(struct cdrom_multisession); i++ )
+            if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT;
+        return 0;
+
+    case SCSI_IOCTL_GET_BUS_NUMBER:
+        DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif");
+        return -ENOSYS;
+
+    default:
+        printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command);
+        return -ENOSYS;
+    }
+    
+    return 0;
+}
+
+/* check media change: should probably do something here in some cases :-) */
+int blkif_check(kdev_t dev)
+{
+    DPRINTK("blkif_check\n");
+    return 0;
+}
+
+int blkif_revalidate(kdev_t dev)
+{
+    struct block_device *bd;
+    struct gendisk *gd;
+    xl_disk_t *disk;
+    unsigned long capacity;
+    int i, rc = 0;
+    
+    if ( (bd = bdget(dev)) == NULL )
+        return -EINVAL;
+
+    /*
+     * Update of partition info, and check of usage count, is protected
+     * by the per-block-device semaphore.
+     */
+    down(&bd->bd_sem);
+
+    if ( ((gd = get_gendisk(dev)) == NULL) ||
+         ((disk = xldev_to_xldisk(dev)) == NULL) ||
+         ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) )
+    {
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ( disk->usage > 1 )
+    {
+        rc = -EBUSY;
+        goto out;
+    }
+
+    /* Only reread partition table if VBDs aren't mapped to partitions. */
+    if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) )
+    {
+        for ( i = gd->max_p - 1; i >= 0; i-- )
+        {
+            invalidate_device(dev+i, 1);
+            gd->part[MINOR(dev+i)].start_sect = 0;
+            gd->part[MINOR(dev+i)].nr_sects   = 0;
+            gd->sizes[MINOR(dev+i)]           = 0;
+        }
+
+        grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity);
+    }
+
+ out:
+    up(&bd->bd_sem);
+    bdput(bd);
+    return rc;
+}
+
+
+/*
+ * blkif_queue_request
+ *
+ * request block io 
+ * 
+ * id: for guest use only.
+ * operation: BLKIF_OP_{READ,WRITE,PROBE}
+ * buffer: buffer to read/write into. this should be a
+ *   virtual address in the guest os.
+ */
+static int blkif_queue_request(unsigned long   id,
+                               int             operation,
+                               char *          buffer,
+                               unsigned long   sector_number,
+                               unsigned short  nr_sectors,
+                               kdev_t          device)
+{
+    unsigned long       buffer_ma = phys_to_machine(virt_to_phys(buffer)); 
+    struct gendisk     *gd;
+    blkif_request_t    *req;
+    struct buffer_head *bh;
+
+    if ( unlikely(nr_sectors >= (1<<9)) )
+        BUG();
+    if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
+        BUG();
+
+    if ( unlikely(blkif_state != BLKIF_STATE_UP) )
+        return 1;
+
+    switch ( operation )
+    {
+
+    case BLKIF_OP_READ:
+    case BLKIF_OP_WRITE:
+        gd = get_gendisk(device); 
+
+        /*
+         * Update the sector_number we'll pass down as appropriate; note that
+         * we could sanity check that resulting sector will be in this
+         * partition, but this will happen in driver backend anyhow.
+         */
+        sector_number += gd->part[MINOR(device)].start_sect;
+
+        /*
+         * If this unit doesn't consist of virtual partitions then we clear 
+         * the partn bits from the device number.
+         */
+        if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & 
+               GENHD_FL_VIRT_PARTNS) )
+            device &= ~(gd->max_p - 1);
+
+        if ( (sg_operation == operation) &&
+             (sg_dev == device) &&
+             (sg_next_sect == sector_number) )
+        {
+            req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req;
+            bh = (struct buffer_head *)id;
+            bh->b_reqnext = (struct buffer_head *)req->id;
+            req->id = id;
+            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
+            if ( ++req->nr_segments < MAX_BLK_SEGS )
+                sg_next_sect += nr_sectors;
+            else
+                DISABLE_SCATTERGATHER();
+            return 0;
+        }
+        else if ( RING_PLUGGED )
+        {
+            return 1;
+        }
+        else
+        {
+            sg_operation = operation;
+            sg_dev       = device;
+            sg_next_sect = sector_number + nr_sectors;
+        }
+        break;
+
+    default:
+        panic("unknown op %d\n", operation);
+    }
+
+    /* Fill out a communications ring structure. */
+    req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req;
+    req->id            = id;
+    req->operation     = operation;
+    req->sector_number = (blkif_sector_t)sector_number;
+    req->device        = device; 
+    req->nr_segments   = 1;
+    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
+    req_prod++;
+
+    return 0;
+}
+
+
+/*
+ * do_blkif_request
+ *  read a block; request is in a request queue
+ */
+void do_blkif_request(request_queue_t *rq)
+{
+    struct request *req;
+    struct buffer_head *bh, *next_bh;
+    int rw, nsect, full, queued = 0;
+
+    DPRINTK("Entered do_blkif_request\n"); 
+
+    while ( !rq->plugged && !list_empty(&rq->queue_head))
+    {
+        if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) 
+            goto out;
+  
+        DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n",
+                req, req->cmd, req->sector,
+                req->current_nr_sectors, req->nr_sectors, req->bh);
+
+        rw = req->cmd;
+        if ( rw == READA )
+            rw = READ;
+        if ( unlikely((rw != READ) && (rw != WRITE)) )
+            panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
+
+        req->errors = 0;
+
+        bh = req->bh;
+        while ( bh != NULL )
+        {
+            next_bh = bh->b_reqnext;
+            bh->b_reqnext = NULL;
+
+            full = blkif_queue_request(
+                (unsigned long)bh,
+                (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, 
+                bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
+
+            if ( full )
+            { 
+                bh->b_reqnext = next_bh;
+                pending_queues[nr_pending++] = rq;
+                if ( unlikely(nr_pending >= MAX_PENDING) )
+                    BUG();
+                goto out; 
+            }
+
+            queued++;
+
+            /* Dequeue the buffer head from the request. */
+            nsect = bh->b_size >> 9;
+            bh = req->bh = next_bh;
+            
+            if ( bh != NULL )
+            {
+                /* There's another buffer head to do. Update the request. */
+                req->hard_sector += nsect;
+                req->hard_nr_sectors -= nsect;
+                req->sector = req->hard_sector;
+                req->nr_sectors = req->hard_nr_sectors;
+                req->current_nr_sectors = bh->b_size >> 9;
+                req->buffer = bh->b_data;
+            }
+            else
+            {
+                /* That was the last buffer head. Finalise the request. */
+                if ( unlikely(end_that_request_first(req, 1, "XenBlk")) )
+                    BUG();
+                blkdev_dequeue_request(req);
+                end_that_request_last(req);
+            }
+        }
+    }
+
+ out:
+    if ( queued != 0 )
+        flush_requests();
+}
+
+
+static void kick_pending_request_queues(void)
+{
+    /* We kick pending request queues if the ring is reasonably empty. */
+    if ( (nr_pending != 0) && 
+         ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) )
+    {
+        /* Attempt to drain the queue, but bail if the ring becomes full. */
+        while ( (nr_pending != 0) && !RING_PLUGGED )
+            do_blkif_request(pending_queues[--nr_pending]);
+    }
+}
+
+
+static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    BLK_RING_IDX i; 
+    unsigned long flags; 
+    struct buffer_head *bh, *next_bh;
+    
+    if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) )
+        return;
+    
+    spin_lock_irqsave(&io_request_lock, flags);     
+
+    for ( i = resp_cons; i != blk_ring->resp_prod; i++ )
+    {
+        blkif_response_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp;
+        switch ( bret->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            if ( unlikely(bret->status != BLKIF_RSP_OKAY) )
+                DPRINTK("Bad return from blkdev data request: %lx\n",
+                        bret->status);
+            for ( bh = (struct buffer_head *)bret->id; 
+                  bh != NULL; 
+                  bh = next_bh )
+            {
+                next_bh = bh->b_reqnext;
+                bh->b_reqnext = NULL;
+                bh->b_end_io(bh, !bret->status);
+            }
+            break;
+        case BLKIF_OP_PROBE:
+            memcpy(&blkif_control_rsp, bret, sizeof(*bret));
+            blkif_control_rsp_valid = 1;
+            break;
+        default:
+            BUG();
+        }
+    }
+    
+    resp_cons = i;
+
+    kick_pending_request_queues();
+
+    spin_unlock_irqrestore(&io_request_lock, flags);
+}
+
+
+void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
+{
+    unsigned long flags;
+
+ retry:
+    while ( (req_prod - resp_cons) == BLK_RING_SIZE )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    spin_lock_irqsave(&io_request_lock, flags);
+    if ( (req_prod - resp_cons) == BLK_RING_SIZE )
+    {
+        spin_unlock_irqrestore(&io_request_lock, flags);
+        goto retry;
+    }
+
+    DISABLE_SCATTERGATHER();
+    memcpy(&blk_ring->ring[MASK_BLK_IDX(req_prod)].req, req, sizeof(*req));
+    req_prod++;
+    flush_requests();
+
+    spin_unlock_irqrestore(&io_request_lock, flags);
+
+    while ( !blkif_control_rsp_valid )
+    {
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    memcpy(rsp, &blkif_control_rsp, sizeof(*rsp));
+    blkif_control_rsp_valid = 0;
+}
+
+
+static void blkif_bringup_phase1(void *unused)
+{
+    ctrl_msg_t              cmsg;
+    blkif_fe_interface_up_t up;
+
+    /* Move from CLOSED to DOWN state. */
+    blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
+    blkif_state  = BLKIF_STATE_DOWN;
+
+    /* Construct an interface-UP message for the domain controller. */
+    cmsg.type      = CMSG_BLKIF_FE;
+    cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_UP;
+    cmsg.length    = sizeof(blkif_fe_interface_up_t);
+    up.handle      = 0;
+    up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
+    memcpy(cmsg.msg, &up, sizeof(up));
+
+    /* Tell the controller to bring up the interface. */
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+static void blkif_bringup_phase2(void *unused)
+{
+    /* Move from DOWN to UP state. */
+    blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
+    (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
+    blkif_state = BLKIF_STATE_UP;
+
+    /* Probe for discs that are attached to the interface. */
+    xlvbd_init();
+
+    /* Kick pending requests. */
+    spin_lock_irq(&io_request_lock);
+    kick_pending_request_queues();
+    spin_unlock_irq(&io_request_lock);
+}
+
+static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
+{
+    if ( status->handle != 0 )
+    {
+        printk(KERN_WARNING "Status change on unsupported blkif %d\n",
+               status->handle);
+        return;
+    }
+
+    switch ( status->status )
+    {
+    case BLKIF_INTERFACE_STATUS_DESTROYED:
+        printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n",
+               blkif_state);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_DOWN:
+        if ( blkif_state != BLKIF_STATE_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected blkif-DOWN message in state %d\n",
+                   blkif_state);
+            break;
+        }
+        blkif_statechange_tq.routine = blkif_bringup_phase1;
+        schedule_task(&blkif_statechange_tq);
+        break;
+
+    case BLKIF_INTERFACE_STATUS_UP:
+        if ( blkif_state == BLKIF_STATE_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected blkif-UP message in state %d\n",
+                   blkif_state);
+            break;
+        }
+        blkif_evtchn = status->evtchn;
+        blkif_statechange_tq.routine = blkif_bringup_phase2;
+        schedule_task(&blkif_statechange_tq);
+        break;
+
+    default:
+        printk(KERN_WARNING "Status change to unknown value %d\n", 
+               status->status);
+        break;
+    }
+}
+
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->subtype )
+    {
+    case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED:
+        if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) )
+            goto parse_error;
+        blkif_status_change((blkif_fe_interface_status_changed_t *)
+                            &msg->msg[0]);
+        break;        
+#if 0
+    case CMSG_BLKIF_FE_VBD_STATUS_CHANGED:
+        update_tq.routine = update_vbds_task;
+        schedule_task(&update_tq);
+        break;
+#endif
+    default:
+        goto parse_error;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+
+int __init xlblk_init(void)
+{
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx);
+    return 0;
+}
+
+
+static void __exit xlblk_cleanup(void)
+{
+    /* XXX FIXME */
+    BUG();
+}
+
+
+#ifdef MODULE
+module_init(xlblk_init);
+module_exit(xlblk_cleanup);
+#endif
+
+
+void blkdev_suspend(void)
+{
+    /* XXX FIXME */
+    BUG();
+}
+
+
+void blkdev_resume(void)
+{
+    /* XXX FIXME */
+    BUG();
+}
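
A summary of the front-end bring-up driven by blkif_status_change() above (a
reading aid, not source code):

    /*
     * Controller message         Front-end action                 State
     * ------------------         ----------------                 -----
     * STATUS_DOWN (in CLOSED)    phase1: allocate shared ring,    CLOSED->DOWN
     *                            send CMSG_BLKIF_FE_INTERFACE_UP
     * STATUS_UP   (in DOWN)      phase2: bind event channel/IRQ,  DOWN->UP
     *                            probe VBDs, kick pending queues
     * STATUS_DESTROYED           unexpected here; logged only
     */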
index e08b976c5641a6938ea07cda4fa1d282166c3924..944bf7eace85b8f300a3d741789ee9b4c5c52f1a 100644 (file)
@@ -1,13 +1,13 @@
 /******************************************************************************
- * vbd.c
+ * arch/xen/drivers/blkif/frontend/vbd.c
  * 
- * Xenolinux virtual block-device driver (xvd).
+ * Xenolinux virtual block-device driver.
  * 
  * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
  * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
  */
 
-#include "block.h"
+#include "common.h"
 #include <linux/blk.h>
 
 /*
@@ -43,54 +43,59 @@ static int xlvbd_blksize_size[256];
 static int xlvbd_hardsect_size[256];
 static int xlvbd_max_sectors[256];
 
-/* Information from Xen about our VBDs. */
+/* Information about our VBDs. */
 #define MAX_VBDS 64
 static int nr_vbds;
-static xen_disk_t *vbd_info;
+static vdisk_t *vbd_info;
 
 static struct block_device_operations xlvbd_block_fops = 
 {
-    open:               xen_block_open,
-    release:            xen_block_release,
-    ioctl:              xen_block_ioctl,
-    check_media_change: xen_block_check,
-    revalidate:         xen_block_revalidate,
+    open:               blkif_open,
+    release:            blkif_release,
+    ioctl:              blkif_ioctl,
+    check_media_change: blkif_check,
+    revalidate:         blkif_revalidate,
 };
 
-static int xlvbd_get_vbd_info(xen_disk_t *disk_info)
+static int xlvbd_get_vbd_info(vdisk_t *disk_info)
 {
-    int error;
-    block_io_op_t op; 
-
-    /* Probe for disk information. */
-    memset(&op, 0, sizeof(op)); 
-    op.cmd = BLOCK_IO_OP_VBD_PROBE; 
-    op.u.probe_params.domain    = 0; 
-    op.u.probe_params.xdi.max   = MAX_VBDS;
-    op.u.probe_params.xdi.disks = disk_info;
-    op.u.probe_params.xdi.count = 0;
-
-    if ( (error = HYPERVISOR_block_io_op(&op)) != 0 )
+    vdisk_t         *buf = (vdisk_t *)__get_free_page(GFP_KERNEL);
+    blkif_request_t  req;
+    blkif_response_t rsp;
+    int              nr;
+
+    memset(&req, 0, sizeof(req));
+    req.operation   = BLKIF_OP_PROBE;
+    req.nr_segments = 1;
+    req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
+
+    blkif_control_send(&req, &rsp);
+
+    if ( rsp.status <= 0 )
     {
-        printk(KERN_ALERT "Could not probe disks (%d)\n", error);
+        printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status);
+        free_page((unsigned long)buf);
         return -1;
     }
 
-    return op.u.probe_params.xdi.count;
+    if ( (nr = rsp.status) > MAX_VBDS )
+        nr = MAX_VBDS;
+    memcpy(disk_info, buf, nr * sizeof(vdisk_t));
+    free_page((unsigned long)buf);
+
+    return nr;
 }
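
The probe above ships a whole page as a single segment; with PAGE_SIZE = 4096
the encoding works out as follows (illustrative arithmetic only):

    /*
     *   req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
     *
     * PAGE_SIZE/512 = 8, so the page's machine address (512-byte aligned)
     * is ORed with a sector count of 8 in the low 9 bits. The back end
     * fills the page with vdisk_t records and returns the record count in
     * rsp.status (<= 0 on failure).
     */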
 
 /*
  * xlvbd_init_device - initialise a VBD device
- * @disk:              a xen_disk_t describing the VBD
+ * @disk:              a vdisk_t describing the VBD
  *
- * Takes a xen_disk_t * that describes a VBD the domain has access to.
+ * Takes a vdisk_t * that describes a VBD the domain has access to.
  * Performs appropriate initialisation and registration of the device.
  *
  * Care needs to be taken when making re-entrant calls to ensure that
  * corruption does not occur.  Also, devices that are in use should not have
  * their details updated.  This is the caller's responsibility.
  */
-static int xlvbd_init_device(xen_disk_t *xd)
+static int xlvbd_init_device(vdisk_t *xd)
 {
     int device = xd->device;
     int major  = MAJOR(device); 
@@ -181,11 +186,11 @@ static int xlvbd_init_device(xen_disk_t *xd)
             read_ahead[major]    = 8;
         }
 
-        blk_init_queue(BLK_DEFAULT_QUEUE(major), do_xlblk_request);
+        blk_init_queue(BLK_DEFAULT_QUEUE(major), do_blkif_request);
 
         /*
          * Turn off barking 'headactive' mode. We dequeue buffer heads as
-         * soon as we pass them down to Xen.
+         * soon as we pass them to the back-end driver.
          */
         blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0);
 
@@ -431,12 +436,12 @@ static int xlvbd_remove_device(int device)
 void xlvbd_update_vbds(void)
 {
     int i, j, k, old_nr, new_nr;
-    xen_disk_t *old_info, *new_info, *merged_info;
+    vdisk_t *old_info, *new_info, *merged_info;
 
     old_info = vbd_info;
     old_nr   = nr_vbds;
 
-    new_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+    new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
     if ( unlikely((new_nr = xlvbd_get_vbd_info(new_info)) < 0) )
     {
         kfree(new_info);
@@ -448,7 +453,7 @@ void xlvbd_update_vbds(void)
      * old list and new list do not overlap at all, and we cannot yet destroy
      * VBDs in the old list because the usage counts are busy.
      */
-    merged_info = kmalloc((old_nr + new_nr) * sizeof(xen_disk_t), GFP_KERNEL);
+    merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL);
 
     /* @i tracks old list; @j tracks new list; @k tracks merged list. */
     i = j = k = 0;
@@ -458,13 +463,13 @@ void xlvbd_update_vbds(void)
         if ( old_info[i].device < new_info[j].device )
         {
             if ( xlvbd_remove_device(old_info[i].device) != 0 )
-                memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
             i++;
         }
         else if ( old_info[i].device > new_info[j].device )
         {
             if ( xlvbd_init_device(&new_info[j]) == 0 )
-                memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
             j++;
         }
         else
@@ -472,9 +477,9 @@ void xlvbd_update_vbds(void)
             if ( ((old_info[i].capacity == new_info[j].capacity) &&
                   (old_info[i].info == new_info[j].info)) ||
                  (xlvbd_remove_device(old_info[i].device) != 0) )
-                memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+                memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
             else if ( xlvbd_init_device(&new_info[j]) == 0 )
-                memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+                memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
             i++; j++;
         }
     }
@@ -482,13 +487,13 @@ void xlvbd_update_vbds(void)
     for ( ; i < old_nr; i++ )
     {
         if ( xlvbd_remove_device(old_info[i].device) != 0 )
-            memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+            memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t));
     }
 
     for ( ; j < new_nr; j++ )
     {
         if ( xlvbd_init_device(&new_info[j]) == 0 )
-            memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+            memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t));
     }
 
     vbd_info = merged_info;
@@ -500,12 +505,12 @@ void xlvbd_update_vbds(void)
 
 
 /*
- * Set up all the linux device goop for the virtual block devices (vbd's) that 
- * xen tells us about. Note that although from xen's pov VBDs are addressed 
- * simply an opaque 16-bit device number, the domain creation tools 
+ * Set up all the linux device goop for the virtual block devices (vbd's) that
+ * we know about. Note that although from the backend driver's p.o.v. VBDs are
+ * addressed simply as an opaque 16-bit device number, the domain creation tools 
  * conventionally allocate these numbers to correspond to those used by 'real' 
  * linux -- this is just for convenience as it means e.g. that the same 
- * /etc/fstab can be used when booting with or without xen.
+ * /etc/fstab can be used when booting with or without Xen.
  */
 int __init xlvbd_init(void)
 {
@@ -537,7 +542,7 @@ int __init xlvbd_init(void)
         xlvbd_max_sectors[i]   = 128;
     }
 
-    vbd_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+    vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL);
     nr_vbds  = xlvbd_get_vbd_info(vbd_info);
 
     if ( nr_vbds < 0 )
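
The update path above is a three-way merge of the old and new VBD lists, both
sorted by device number. Below is a minimal stand-alone sketch of that merge
pattern -- simplified vdisk_t fields and stub init/remove hooks standing in
for xlvbd_init_device()/xlvbd_remove_device(), not the driver's actual code:

/* Simplified stand-in for vdisk_t; entries are sorted by device number. */
typedef struct { unsigned short device; unsigned long capacity; } vdisk_t;

static int init_device(vdisk_t *d)          { return 0; /* stub: 0 = ok */ }
static int remove_device(unsigned short d)  { return 0; /* stub: 0 = ok */ }

/* Merge @old and @new into @merged; returns the merged entry count. */
static int merge_vbds(vdisk_t *old, int old_nr,
                      vdisk_t *new, int new_nr, vdisk_t *merged)
{
    int i = 0, j = 0, k = 0;
    while ( (i < old_nr) && (j < new_nr) )
    {
        if ( old[i].device < new[j].device )
        {
            /* Device vanished: keep it only if removal fails (still busy). */
            if ( remove_device(old[i].device) != 0 )
                merged[k++] = old[i];
            i++;
        }
        else if ( old[i].device > new[j].device )
        {
            /* New device: keep it only if initialisation succeeds. */
            if ( init_device(&new[j]) == 0 )
                merged[k++] = new[j];
            j++;
        }
        else
        {
            /* Present in both lists: keep the old entry unless it changed. */
            if ( (old[i].capacity == new[j].capacity) ||
                 (remove_device(old[i].device) != 0) )
                merged[k++] = old[i];
            else if ( init_device(&new[j]) == 0 )
                merged[k++] = new[j];
            i++; j++;
        }
    }
    /* Drain whichever list still has entries. */
    for ( ; i < old_nr; i++ )
        if ( remove_device(old[i].device) != 0 )
            merged[k++] = old[i];
    for ( ; j < new_nr; j++ )
        if ( init_device(&new[j]) == 0 )
            merged[k++] = new[j];
    return k;
}

Keeping un-removable (busy) devices in the merged list means a later update
can retry them once their usage counts drop.
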
index b59f3e8a841f3045362557eb5cfbf328af0697ff..e6fc3aed0587a043b9ed67318f63fca9c304deee 100644 (file)
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
@@ -36,7 +36,7 @@ static struct proc_dir_entry *privcmd_intf;
 static int privcmd_ioctl(struct inode *inode, struct file *file,
                          unsigned int cmd, unsigned long data)
 {
-    int ret = 0;
+    int ret = -ENOSYS;
 
     switch ( cmd )
     {
@@ -62,10 +62,14 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
     }
     break;
 
-    default:
-        ret = -EINVAL;
-       break;
-       }
+    case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN:
+    {
+        extern int initdom_ctrlif_domcontroller_port;
+        ret = initdom_ctrlif_domcontroller_port;
+    }
+    break;
+    }
+
     return ret;
 }
 
@@ -85,7 +89,7 @@ static int __init init_module(void)
     {
         privcmd_intf->owner      = THIS_MODULE;
         privcmd_intf->nlink      = 1;
-       privcmd_intf->proc_fops  = &privcmd_file_ops;
+        privcmd_intf->proc_fops  = &privcmd_file_ops;
     }
 
     return 0;
index 7d59ad2e1638da738b6c00fa71bf9dbf0afbc1a3..715f707eb091c3510a6d69ed8e96d6ec07c0633e 100644 (file)
--- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
 #include <asm/ctrl_if.h>
 #include <asm/evtchn.h>
 
+/*
+ * Only used by the initial domain, which must create its own control-interface
+ * event channel. This value is picked up by the user-space domain controller
+ * via an ioctl.
+ */
+int initdom_ctrlif_domcontroller_port = -1;
+
 static int        ctrl_if_evtchn;
 static int        ctrl_if_irq;
 static spinlock_t ctrl_if_lock;
@@ -276,9 +283,6 @@ void ctrl_if_unregister_receiver(u8 type, ctrl_msg_handler_t hnd)
 
 void ctrl_if_suspend(void)
 {
-    if ( start_info.flags & SIF_INITDOMAIN )
-        return;
-
     free_irq(ctrl_if_irq, NULL);
     unbind_evtchn_from_irq(ctrl_if_evtchn);
 }
@@ -286,7 +290,21 @@ void ctrl_if_suspend(void)
 void ctrl_if_resume(void)
 {
     if ( start_info.flags & SIF_INITDOMAIN )
-        return;
+    {
+        /*
+         * The initial domain must create its own domain-controller link.
+         * The controller is probably not running at this point, but will
+         * pick up its end via the IOCTL_PRIVCMD_INITDOMAIN_EVTCHN ioctl.
+         */
+        evtchn_op_t op;
+        op.cmd = EVTCHNOP_bind_interdomain;
+        op.u.bind_interdomain.dom1 = DOMID_SELF;
+        op.u.bind_interdomain.dom2 = DOMID_SELF;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            BUG();
+        start_info.domain_controller_evtchn = op.u.bind_interdomain.port1;
+        initdom_ctrlif_domcontroller_port   = op.u.bind_interdomain.port2;
+    }
 
     ctrl_if_tx_resp_cons = 0;
     ctrl_if_rx_req_cons  = 0;
index 665357d4bcc6dfb4cab757bad06ed48664f4d460..4eeac0c4dda64be7b0bfb8b36506c7d41c9f5a08 100644 (file)
--- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c
@@ -5,7 +5,7 @@
  *
  * (C) Copyright 1995 1996 Linus Torvalds
  *
- * Modifications for Xenolinux (c) 2003 Keir Fraser
+ * Modifications for Xenolinux (c) 2003-2004 Keir Fraser
  */
 
 #include <linux/slab.h>
 #define direct_mk_pte_phys(physpage, pgprot) \
   __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
 
-static inline void direct_remap_area_pte(pte_t *pte, 
-                                         unsigned long address, 
-                                         unsigned long size,
-                                         unsigned long machine_addr, 
-                                         pgprot_t prot,
-                                         domid_t  domid)
+static inline int direct_remap_area_pte(pte_t *pte, 
+                                        unsigned long address, 
+                                        unsigned long size,
+                                        unsigned long machine_addr, 
+                                        pgprot_t prot,
+                                        domid_t  domid)
 {
     unsigned long end;
+#define MAX_DIRECTMAP_MMU_QUEUE 64
+    mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v;
 
-    mmu_update_t *u, *v;
-    u = v = vmalloc(3*PAGE_SIZE); /* plenty */
+    address &= ~PMD_MASK;
+    end = address + size;
+    if (end > PMD_SIZE)
+        end = PMD_SIZE;
+    if (address >= end)
+        BUG();
 
+ reset_buffer:
     /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */
+    v = &u[0];
     if ( domid != 0 )
     {
         v[0].val  = (unsigned long)(domid<<16) & ~0xFFFFUL;
@@ -53,12 +61,6 @@ static inline void direct_remap_area_pte(pte_t *pte,
         v += 2;
     }
 
-    address &= ~PMD_MASK;
-    end = address + size;
-    if (end > PMD_SIZE)
-        end = PMD_SIZE;
-    if (address >= end)
-        BUG();
     do {
         if (!pte_none(*pte)) {
             printk("direct_remap_area_pte: page already exists\n");
@@ -66,16 +68,21 @@ static inline void direct_remap_area_pte(pte_t *pte,
         }
         v->ptr = virt_to_machine(pte);
         v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
-        v++;
+        if ( ++v == &u[MAX_DIRECTMAP_MMU_QUEUE] )
+        {
+            if ( HYPERVISOR_mmu_update(u, MAX_DIRECTMAP_MMU_QUEUE) < 0 )
+                return -EFAULT;
+            goto reset_buffer;
+        }
         address += PAGE_SIZE;
         machine_addr += PAGE_SIZE;
         pte++;
     } while (address && (address < end));
 
     if ( ((v-u) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) )
-        printk(KERN_WARNING "Failed to ioremap %08lx->%08lx (%08lx)\n",
-               end-size, end, machine_addr-size);
-    vfree(u);
+        return -EFAULT;
+
+    return 0;
 }
 
 static inline int direct_remap_area_pmd(struct mm_struct *mm,
@@ -86,6 +93,7 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm,
                                         pgprot_t prot,
                                         domid_t  domid)
 {
+    int error = 0;
     unsigned long end;
 
     address &= ~PGDIR_MASK;
@@ -99,12 +107,14 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm,
         pte_t * pte = pte_alloc(mm, pmd, address);
         if (!pte)
             return -ENOMEM;
-        direct_remap_area_pte(pte, address, end - address, 
-                              address + machine_addr, prot, domid);
+        error = direct_remap_area_pte(pte, address, end - address, 
+                                      address + machine_addr, prot, domid);
+        if ( error )
+            break;
         address = (address + PMD_SIZE) & PMD_MASK;
         pmd++;
     } while (address && (address < end));
-    return 0;
+    return error;
 }
  
 int direct_remap_area_pages(struct mm_struct *mm,
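
The rewrite of direct_remap_area_pte() above replaces the old vmalloc'd
staging buffer with a fixed-size on-stack queue that is flushed to the
hypervisor whenever it fills. A minimal sketch of that batch-and-flush
pattern, with a hypothetical issue_updates() standing in for
HYPERVISOR_mmu_update():

#define QUEUE_SIZE 64

typedef struct { unsigned long ptr, val; } update_t;

/* Hypothetical batch back end: applies @n updates, returns <0 on failure. */
extern int issue_updates(update_t *q, int n);

static int remap_range(unsigned long *slots, int nr,
                       unsigned long machine_base, unsigned long flags)
{
    update_t q[QUEUE_SIZE], *v = q;
    int i;

    for ( i = 0; i < nr; i++ )
    {
        v->ptr = slots[i];
        v->val = (machine_base + ((unsigned long)i << 12)) | flags;
        /* Flush as soon as the queue fills, then refill from the front. */
        if ( ++v == &q[QUEUE_SIZE] )
        {
            if ( issue_updates(q, QUEUE_SIZE) < 0 )
                return -1;
            v = q;
        }
    }

    /* Flush whatever partial batch remains after the walk. */
    if ( (v != q) && (issue_updates(q, (int)(v - q)) < 0) )
        return -1;

    return 0;
}

The bounded queue avoids a vmalloc/vfree pair per call and keeps each
hypercall's argument list at a fixed, stack-friendly size.
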
index f1d2b77c2e21773e0328187ebcc8fd8b96f40abf..a02e2471ea7d53d3933e3f3ebaf7afc77cd7f4fe 100644 (file)
--- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
@@ -52,7 +52,7 @@ int ctrl_if_send_message_noblock(
  *     function returns.
  *  2. If @hnd is NULL then no callback is executed.
  */
-int ctrl_if_send_message(
+int ctrl_if_send_message_block(
     ctrl_msg_t *msg, 
     ctrl_msg_handler_t hnd, 
     unsigned long id, 
index c780f644c0d313764ff797e0c8ef1f43a4c577ff..162ba1fbed03cd752413b36b4a7690999cd52989 100644 (file)
--- a/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
@@ -47,6 +47,11 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
        return (pmd_t *) dir;
 }
 
+#define pte_same(a, b)         ((a).pte_low == (b).pte_low)
+#define pte_page(x)            (mem_map+((unsigned long)((pte_val(x) >> PAGE_SHIFT))))
+#define pte_none(x)            (!(x).pte_low)
+#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
+
 /*
  * A note on implementation of this atomic 'get-and-clear' operation.
  * This is actually very simple because XenoLinux can only run on a single
@@ -59,13 +64,9 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 static inline pte_t ptep_get_and_clear(pte_t *xp)
 {
     pte_t pte = *xp;
-    queue_l1_entry_update(xp, 0);
+    if ( !pte_none(pte) )
+        queue_l1_entry_update(xp, 0);
     return pte;
 }
 
-#define pte_same(a, b)         ((a).pte_low == (b).pte_low)
-#define pte_page(x)            (mem_map+((unsigned long)((pte_val(x) >> PAGE_SHIFT))))
-#define pte_none(x)            (!(x).pte_low)
-#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
-
 #endif /* _I386_PGTABLE_2LEVEL_H */
index 4ce2930daa037e35da9ae1ac5056752d1cd1eabf..d359b6eaa7ee8cb3ee5526c7063318cc6c5b298d 100644 (file)
--- a/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h
@@ -13,16 +13,21 @@ typedef struct privcmd_hypercall
     unsigned long arg[5];
 } privcmd_hypercall_t;
 
-typedef struct privcmd_blkmsg
-{
-    unsigned long op;
-    void         *buf;
-    int           buf_size;
-} privcmd_blkmsg_t;
-
-#define IOCTL_PRIVCMD_HYPERCALL        \
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL         \
     _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
-#define IOCTL_PRIVCMD_BLKMSG           \
-    _IOC(_IOC_NONE, 'P', 1, sizeof(privcmd_blkmsg_t))
+
+/*
+ * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN
+ * @arg: n/a
+ * Return: Port associated with domain-controller end of control event channel
+ *         for the initial domain.
+ */
+#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \
+    _IOC(_IOC_NONE, 'P', 1, 0)
 
 #endif /* __PROC_CMD_H__ */
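
For illustration, a user-space sketch of how the domain controller might
fetch the port. The /proc/xen/privcmd path and the port being handed back
directly as the ioctl return value are assumptions based on the proc entry
and handler added in dom0/core.c above, not something this header pins down:

/* Hypothetical caller of IOCTL_PRIVCMD_INITDOMAIN_EVTCHN. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "proc_cmd.h"   /* IOCTL_PRIVCMD_INITDOMAIN_EVTCHN */

int main(void)
{
    int fd, port;

    fd = open("/proc/xen/privcmd", O_RDWR);   /* assumed device path */
    if ( fd < 0 )
        return 1;

    /* The handler returns initdom_ctrlif_domcontroller_port; the initial
     * -1 (channel not yet created by ctrl_if_resume()) surfaces as an
     * ioctl error. */
    port = ioctl(fd, IOCTL_PRIVCMD_INITDOMAIN_EVTCHN, 0);
    printf("initial-domain controller port: %d\n", port);

    close(fd);
    return 0;
}
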
index 4d583b54a7c8ed233fb249e0072e99f62595b334..b030270b42ee50f2942932ddd1b73c0cb503a2f1 100644 (file)
--- a/xenolinux-2.4.26-sparse/mm/vmalloc.c
+++ b/xenolinux-2.4.26-sparse/mm/vmalloc.c
@@ -45,6 +45,10 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo
                        continue;
                if (pte_present(page)) {
                        struct page *ptpage = pte_page(page);
+#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
+                       if (pte_io(page))
+                               continue;
+#endif
                        if (VALID_PAGE(ptpage) && (!PageReserved(ptpage)))
                                __free_page(ptpage);
                        continue;
@@ -250,11 +254,6 @@ void __vfree(void * addr, int free_area_pages)
        for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
                if (tmp->addr == addr) {
                        *p = tmp->next;
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
-                       if (tmp->flags & VM_IOREMAP)
-                               zap_page_range(&init_mm, VMALLOC_VMADDR(tmp->addr), tmp->size);
-                       else
-#endif
                        if (free_area_pages)
                            vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
                        write_unlock(&vmlist_lock);