3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c
4083dc16z0jvZEH4PiVDbDRreaNp6w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/Makefile
4083dc16KQus88a4U3uCV6qVCA6_8Q xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile
+4087cf0dPeHOvzmZAazvwLslKEF93A xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h
+4087cf0da2cROOiybf9A-j4R_yHnjg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c
+4087cf0dvXL1PKX23t_LvO1wVPb7OA xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c
+4087cf0dkVF3I19gpT1cNubeJgQr7g xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c
+4087cf0dlv1Dw4MAbeRStPPG8IvPPg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c
4075806dI5kfeMD5RV-DA0PYoThx_w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/Makefile
4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.c
4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.h
4075806dibjCcfuXv6CINMhxWTw3jQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/vbd.c
4083dc16-Kd5y9psK_yk161sme5j5Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/Makefile
4083dc16UmHXxS9g_UFVnkUpN-oP2Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/Makefile
+4087cf0d5dudKw_DecIJgOhLlBF_0Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c
405853f2wg7JXZJNltspMwOZJklxgw xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/Makefile
405853f6nbeazrNyEWNHBuoSg2PiPA xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/vnetif.c
3e5a4e65lWzkiPXsZdzPt2RNnJGG1g xenolinux-2.4.26-sparse/arch/xen/kernel/Makefile
memset(builddomain, 0, sizeof(*builddomain));
- if ( (pm_handle = init_pfn_mapper()) < 0 )
+ if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 )
goto error_out;
if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
}
shared_info_frame = op.u.getdomaininfo.shared_info_frame;
- if ( (pm_handle = init_pfn_mapper()) < 0 )
+ if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 )
goto out;
/* Copy saved contents of shared-info page. No checking needed. */
goto out;
}
- if ( (pm_handle = init_pfn_mapper()) < 0 )
+ if ( (pm_handle = init_pfn_mapper((domid_t)domid)) < 0 )
goto out;
/* Is the suspend-record MFN actually valid for this domain? */
memset(builddomain, 0, sizeof(*builddomain));
- if ( (pm_handle = init_pfn_mapper()) < 0 )
+ if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 )
goto error_out;
if ( (page_array = malloc(tot_pages * sizeof(unsigned long))) == NULL )
#include "xc_private.h"
-int init_pfn_mapper(void)
+int init_pfn_mapper(domid_t domid)
{
- return open("/dev/mem", O_RDWR);
+ int fd = open("/dev/mem", O_RDWR);
+ if ( fd >= 0 )
+ {
+ (void)ioctl(fd, _IO('M', 1), (unsigned long)(domid>> 0)); /* low */
+ (void)ioctl(fd, _IO('M', 2), (unsigned long)(domid>>32)); /* high */
+ }
+ return fd;
}
int close_pfn_mapper(int pm_handle)
if ( mmu->idx == FIRST_MMU_UPDATE )
return 0;
- /* The first two requests set the correct subject domain. */
+ /* The first two requests set the correct subject domain (PTS and GPS). */
mmu->updates[0].val = (unsigned long)(mmu->subject<<16) & ~0xFFFFUL;
mmu->updates[0].ptr = (unsigned long)(mmu->subject<< 0) & ~0xFFFFUL;
mmu->updates[1].val = (unsigned long)(mmu->subject>>16) & ~0xFFFFUL;
mmu->updates[0].ptr |= MMU_EXTENDED_COMMAND;
mmu->updates[0].val |= MMUEXT_SET_SUBJECTDOM_L;
mmu->updates[1].ptr |= MMU_EXTENDED_COMMAND;
- mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+ mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H | SET_PAGETABLE_SUBJECTDOM;
hypercall.op = __HYPERVISOR_mmu_update;
hypercall.arg[0] = (unsigned long)mmu->updates;
/*
* PFN mapping.
*/
-int init_pfn_mapper(void);
+int init_pfn_mapper(domid_t domid);
int close_pfn_mapper(int pm_handle);
void *map_pfn_writeable(int pm_handle, unsigned long pfn);
void *map_pfn_readonly(int pm_handle, unsigned long pfn);
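A usage sketch of the updated mapper interface, for illustration only: the helper name is an assumption, the failure convention of map_pfn_writeable() is assumed, and unmapping of the returned page is elided.

static int example_zero_frame(domid_t dom, unsigned long pfn)
{
    int   pm_handle;
    void *page;

    if ( (pm_handle = init_pfn_mapper(dom)) < 0 )
        return -1;

    /* NB. NULL-on-failure is assumed for map_pfn_writeable() in this sketch. */
    if ( (page = map_pfn_writeable(pm_handle, pfn)) == NULL )
    {
        (void)close_pfn_mapper(pm_handle);
        return -1;
    }

    memset(page, 0, PAGE_SIZE);          /* scrub the mapped frame */

    (void)close_pfn_mapper(pm_handle);   /* unmap of 'page' elided */
    return 0;
}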
CONTROL_RING_IDX rx_req_prod, rx_resp_prod;
} control_if_t;
-#define CMSG_CONSOLE 0
-#define CMSG_CONSOLE_DATA 0
+/*
+ * Top-level command types.
+ */
+#define CMSG_CONSOLE 0 /* Console */
+#define CMSG_BLKIF_BE 1 /* Block-device backend */
+#define CMSG_BLKIF_FE 2 /* Block-device frontend */
+
+/*
+ * Subtypes for console messages.
+ */
+#define CMSG_CONSOLE_DATA 0
+
+/*
+ * Subtypes for block-device messages.
+ */
+#define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */
+#define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. */
+#define CMSG_BLKIF_BE_VBD_CREATE 2 /* Create a new VBD for an interface. */
+#define CMSG_BLKIF_BE_VBD_DESTROY 3 /* Delete a VBD from an interface. */
+#define CMSG_BLKIF_BE_VBD_GROW 4 /* Append an extent to a given VBD. */
+#define CMSG_BLKIF_BE_VBD_SHRINK 5 /* Remove last extent from a given VBD. */
+
+/*
+ * Message request/response definitions for block-device messages.
+ */
+
+typedef u16 blkif_vdev_t;
+typedef u16 blkif_pdev_t;
+typedef u64 blkif_sector_t;
+typedef struct {
+ blkif_pdev_t device;
+ blkif_sector_t sector_start;
+ blkif_sector_t sector_length;
+} blkif_extent_t;
+
+/* Non-specific 'okay' return. */
+#define BLKIF_STATUS_OKAY 0
+/* Non-specific 'error' return. */
+#define BLKIF_STATUS_ERROR 1
+/* The following are specific error returns. */
+#define BLKIF_STATUS_INTERFACE_EXISTS 2
+#define BLKIF_STATUS_INTERFACE_NOT_FOUND 3
+
+/* This macro can be used to create an array of descriptive error strings. */
+#define BLKIF_STATUS_ERRORS { \
+ "Okay", \
+ "Non-specific error", \
+ "Interface already exists", \
+ "Interface not found" }
+
+/* CMSG_BLKIF_BE_CREATE */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Domain attached to new interface. */
+ unsigned int blkif_handle; /* Domain-specific interface handle. */
+ unsigned int evtchn_port; /* Event channel for notifications. */
+ unsigned long shmem_frame; /* Page containing shared comms window. */
+ /* OUT */
+ unsigned int status;
+} blkif_create_t;
+
+/* CMSG_BLKIF_BE_DESTROY */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Identify interface to be destroyed. */
+ unsigned int blkif_handle; /* ...ditto... */
+ /* OUT */
+ unsigned int status;
+} blkif_destroy_t;
+
+/* CMSG_BLKIF_BE_VBD_CREATE */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Identify blkdev interface. */
+ unsigned int blkif_handle; /* ...ditto... */
+ blkif_vdev_t vdevice; /* Interface-specific id for this VBD. */
+ int readonly; /* Non-zero -> VBD isn't writeable. */
+ /* OUT */
+ unsigned int status;
+} blkif_vbd_create_t;
+
+/* CMSG_BLKIF_BE_VBD_DESTROY */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Identify blkdev interface. */
+ unsigned int blkif_handle; /* ...ditto... */
+ blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */
+ /* OUT */
+ unsigned int status;
+} blkif_vbd_destroy_t;
+
+/* CMSG_BLKIF_BE_VBD_GROW */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Identify blkdev interface. */
+ unsigned int blkif_handle; /* ...ditto... */
+ blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */
+ blkif_extent_t extent; /* Physical extent to append to VBD. */
+ /* OUT */
+ unsigned int status;
+} blkif_vbd_grow_t;
+
+/* CMSG_BLKIF_BE_VBD_SHRINK */
+typedef struct {
+ /* IN */
+ domid_t domid; /* Identify blkdev interface. */
+ unsigned int blkif_handle; /* ...ditto... */
+ blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */
+ /* OUT */
+ unsigned int status;
+} blkif_vbd_shrink_t;
#endif /* __DOMAIN_CONTROLLER_H__ */
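For illustration, a hedged sketch of how a controller might populate a backend 'create' request using the structures above. 'ctrl_send_blkif_be()' is a hypothetical transport helper, not something defined by this patch; only the structure layout and status codes come from the header.

static int example_create_blkif(domid_t domid, unsigned int evtchn_port,
                                unsigned long shmem_frame)
{
    blkif_create_t req;

    memset(&req, 0, sizeof(req));
    req.domid        = domid;        /* domain that will own the interface */
    req.blkif_handle = 0;            /* first interface for this domain    */
    req.evtchn_port  = evtchn_port;  /* event channel for notifications    */
    req.shmem_frame  = shmem_frame;  /* machine frame of shared ring page  */

    /* Hypothetical send-and-wait helper; fills in req.status on return. */
    ctrl_send_blkif_be(CMSG_BLKIF_BE_CREATE, &req, sizeof(req));

    return (req.status == BLKIF_STATUS_OKAY) ? 0 : -1;
}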
goto fail1;
}
+ /* Set the General-Purpose Subject whose page frame will be mapped. */
+ (void)ioctl(xup->mem_fd, _IO('M', 1), (unsigned long)(dom>> 0)); /* low */
+ (void)ioctl(xup->mem_fd, _IO('M', 2), (unsigned long)(dom>>32)); /* high */
+
if ( (xup->xc_handle = xc_interface_open()) == -1 )
{
PyErr_SetString(port_error, "Could not open Xen control interface");
static int alloc_l2_table(struct pfn_info *page);
static int alloc_l1_table(struct pfn_info *page);
-static int get_page_from_pagenr(unsigned long page_nr, int check_level);
+static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p);
static int get_page_and_type_from_pagenr(unsigned long page_nr,
u32 type,
- int check_level);
-#define CHECK_STRICT 0 /* Subject domain must own the page */
-#define CHECK_ANYDOM 1 /* Any domain may own the page (if subject is priv.) */
+ struct task_struct *p);
static void free_l2_table(struct pfn_info *page);
static void free_l1_table(struct pfn_info *page);
unsigned long deferred_ops;
unsigned long cr0;
domid_t subject_id;
- struct task_struct *subject_p;
+ /* General-Purpose Subject, Page-Table Subject */
+ struct task_struct *gps, *pts;
} percpu_info[NR_CPUS] __cacheline_aligned;
+/* Determine the current General-Purpose Subject or Page-Table Subject. */
+#define PTS (percpu_info[smp_processor_id()].pts ? : current)
+#define GPS (percpu_info[smp_processor_id()].gps ? : current)
+
/*
* init_frametable:
}
-static int get_page_from_pagenr(unsigned long page_nr, int check_level)
+static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p)
{
- struct task_struct *p = current;
struct pfn_info *page = &frame_table[page_nr];
- u32 y, x, nx;
if ( unlikely(!pfn_is_ram(page_nr)) )
{
return 0;
}
- /* Find the correct subject domain. */
- if ( unlikely(percpu_info[p->processor].subject_p != NULL) )
- p = percpu_info[p->processor].subject_p;
-
- /* Demote ANYDOM to STRICT if subject domain is not privileged. */
- if ( check_level == CHECK_ANYDOM && !IS_PRIV(p) )
- check_level = CHECK_STRICT;
-
- switch ( check_level )
+ if ( unlikely(!get_page(page, p)) )
{
- case CHECK_STRICT:
- if ( unlikely(!get_page(page, p)) )
- {
- MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr);
- return 0;
- }
- break;
- case CHECK_ANYDOM:
- y = page->count_and_flags;
- do {
- x = y;
- nx = x + 1;
- if ( unlikely((x & PGC_count_mask) == 0) ||
- unlikely((nx & PGC_count_mask) == 0) )
- {
- MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr);
- return 0;
- }
- }
- while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
- break;
+ MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr);
+ return 0;
}
return 1;
static int get_page_and_type_from_pagenr(unsigned long page_nr,
u32 type,
- int check_level)
+ struct task_struct *p)
{
struct pfn_info *page = &frame_table[page_nr];
- if ( unlikely(!get_page_from_pagenr(page_nr, check_level)) )
+ if ( unlikely(!get_page_from_pagenr(page_nr, p)) )
return 0;
if ( unlikely(!get_page_type(page, type)) )
if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
{
/* Make sure the mapped frame belongs to the correct domain. */
- if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e),
- CHECK_STRICT)) )
+ if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), PTS)) )
return 0;
/*
if ( l1v & _PAGE_RW )
{
if ( unlikely(!get_page_and_type_from_pagenr(
- pfn, PGT_writeable_page, CHECK_ANYDOM)) )
+ pfn, PGT_writeable_page, GPS)) )
return 0;
set_bit(_PGC_tlb_flush_on_type_change,
&frame_table[pfn].count_and_flags);
return 1;
}
- return get_page_from_pagenr(pfn, CHECK_ANYDOM);
+ return get_page_from_pagenr(pfn, GPS);
}
}
if ( unlikely(!get_page_and_type_from_pagenr(
- l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, CHECK_STRICT)) )
+ l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, PTS)) )
return get_linear_pagetable(l2e, pfn);
return 1;
page-frame_table) & PSH_shadowed) )
{
/*
- * Using 'current->mm' is safe and correct because page-table pages
- * are not shared across domains. Updates to such pages' types are
- * thus only done within the context of the owning domain. The one
- * exception is when destroying a domain; however, this is not a
- * problem as the currently-executing domain will not have this
- * MFN shadowed, and at domain end-of-day we explicitly unshadow
+ * Using 'current->mm' is safe and correct because page-table pages
+ * are not shared across domains. Updates to such pages' types are
+ * thus only done within the context of the owning domain. The one
+ * exception is when destroying a domain; however, this is not a
+ * problem as the currently-executing domain will not have this MFN
+ * shadowed, and at domain end-of-day we explicitly unshadow
* everything so that nothing will get left lying around.
*/
unshadow_table( page-frame_table, type );
case MMUEXT_PIN_L1_TABLE:
case MMUEXT_PIN_L2_TABLE:
okay = get_page_and_type_from_pagenr(
- pfn, (cmd == MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table :
- PGT_l1_page_table,
- CHECK_STRICT);
+ pfn,
+ (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table,
+ PTS);
if ( unlikely(!okay) )
{
MEM_LOG("Error while pinning pfn %08lx", pfn);
break;
case MMUEXT_UNPIN_TABLE:
- if ( unlikely(!(okay = get_page_from_pagenr(pfn, CHECK_STRICT))) )
+ if ( unlikely(!(okay = get_page_from_pagenr(pfn, PTS))) )
{
MEM_LOG("Page %08lx bad domain (dom=%p)",
ptr, page->u.domain);
break;
case MMUEXT_NEW_BASEPTR:
- okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table,
- CHECK_STRICT);
+ okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, current);
if ( likely(okay) )
{
invalidate_shadow_ldt();
break;
case MMUEXT_INVLPG:
- __flush_tlb_one(val & ~MMUEXT_CMD_MASK);
+ __flush_tlb_one(ptr);
break;
case MMUEXT_SET_LDT:
}
else
{
- if ( percpu_info[cpu].subject_p != NULL )
- put_task_struct(percpu_info[cpu].subject_p);
- percpu_info[cpu].subject_p = find_domain_by_id(
+ if ( percpu_info[cpu].gps != NULL )
+ put_task_struct(percpu_info[cpu].gps);
+ percpu_info[cpu].gps = find_domain_by_id(
percpu_info[cpu].subject_id);
- if ( percpu_info[cpu].subject_p == NULL )
+ percpu_info[cpu].pts = (val & SET_PAGETABLE_SUBJECTDOM) ?
+ percpu_info[cpu].gps : NULL;
+ if ( percpu_info[cpu].gps == NULL )
{
MEM_LOG("Unknown domain '%llu'", percpu_info[cpu].subject_id);
okay = 0;
* MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
*/
case MMU_NORMAL_PT_UPDATE:
- if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) )
+ if ( unlikely(!get_page_from_pagenr(pfn, PTS)) )
{
MEM_LOG("Could not get page for normal update");
break;
break;
case MMU_MACHPHYS_UPDATE:
- if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) )
+ if ( unlikely(!get_page_from_pagenr(pfn, GPS)) )
{
MEM_LOG("Could not get page for mach->phys update");
break;
if ( deferred_ops & DOP_RELOAD_LDT )
(void)map_ldt_shadow_page(0);
- if ( unlikely(percpu_info[cpu].subject_p != NULL) )
+ if ( unlikely(percpu_info[cpu].gps != NULL) )
{
- put_task_struct(percpu_info[cpu].subject_p);
- percpu_info[cpu].subject_p = NULL;
+ put_task_struct(percpu_info[cpu].gps);
+ percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
}
return rc;
#define NR_VIRQS 12
/*
- * MMU_XXX: specified in least 2 bits of 'ptr' field. These bits are masked
- * off to get the real 'ptr' value.
- * All requests specify relevent address in 'ptr'. This is either a
- * machine/physical address (MA), or linear/virtual address (VA).
- * Normal requests specify update value in 'value'.
- * Extended requests specify command in least 8 bits of 'value'. These bits
- * are masked off to get the real 'val' value. Except for MMUEXT_SET_LDT
- * which shifts the least bits out.
+ * MMU-UPDATE REQUESTS
+ *
+ * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
+ * ptr[1:0] specifies the appropriate MMU_* command.
+ *
+ * GPS (General-Purpose Subject)
+ * -----------------------------
+ * This domain must own all non-page-table pages that are involved in
+ * MMU updates. By default it is the domain that executes mmu_update(). If the
+ * caller has sufficient privilege then it can be changed by executing
+ * MMUEXT_SET_SUBJECTDOM_{L,H}.
+ *
+ * PTS (Page-Table Subject)
+ * ------------------------
+ * This domain must own all the page-table pages that are subject to MMU
+ * updates. By default it is the domain that executes mmu_update(). If the
+ * caller has sufficient privilege then it can be changed by executing
+ * MMUEXT_SET_SUBJECTDOM_H with val[14] (SET_PAGETABLE_SUBJECTDOM) set.
+ *
+ * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
+ * Updates an entry in a page table.
+ * ptr[:2] -- machine address of the page-table entry to modify [1]
+ * val -- value to write [2]
+ *
+ * ptr[1:0] == MMU_MACHPHYS_UPDATE:
+ * Updates an entry in the machine->pseudo-physical mapping table.
+ * ptr[:2] -- machine address within the frame whose mapping to modify [3]
+ * val -- value to write into the mapping entry
+ *
+ * ptr[1:0] == MMU_EXTENDED_COMMAND:
+ * val[7:0] -- MMUEXT_* command
+ *
+ * val[7:0] == MMUEXT_(UN)PIN_*_TABLE:
+ * ptr[:2] -- machine address of frame to be (un)pinned as a p.t. page [1]
+ *
+ * val[7:0] == MMUEXT_NEW_BASEPTR:
+ * ptr[:2] -- machine address of new page-table base to install in MMU [1]
+ *
+ * val[7:0] == MMUEXT_TLB_FLUSH:
+ * no additional arguments
+ *
+ * val[7:0] == MMUEXT_INVLPG:
+ * ptr[:2] -- linear address to be flushed from the TLB
+ *
+ * val[7:0] == MMUEXT_SET_LDT:
+ * ptr[:2] -- linear address of LDT base (NB. must be page-aligned)
+ * val[:8] -- number of entries in LDT
+ *
+ * val[7:0] == MMUEXT_SET_SUBJECTDOM_L:
+ * (ptr[31:15],val[31:15]) -- dom[31:0]
+ *
+ * val[7:0] == MMUEXT_SET_SUBJECTDOM_H:
+ * val[14] -- if TRUE then sets the PTS in addition to the GPS.
+ * (ptr[31:15],val[31:15]) -- dom[63:32]
+ * NB. This command must be immediately preceded by SET_SUBJECTDOM_L.
+ *
+ * Notes on constraints on the above arguments:
+ * [1] The page frame containing the machine address must belong to the PTS.
+ * [2] If the PTE is valid (i.e., bit 0 is set) then the specified page frame
+ * must belong to:
+ * (a) the PTS (if the PTE is part of a non-L1 table); or
+ * (b) the GPS (if the PTE is part of an L1 table).
+ * [3] The page frame containing the machine address must belong to the GPS.
*/
-/* A normal page-table update request. */
#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
-/* Update an entry in the machine->physical mapping table. */
#define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */
-/* An extended command. */
#define MMU_EXTENDED_COMMAND 3 /* least 8 bits of val demux further */
-/* Extended commands: */
#define MMUEXT_PIN_L1_TABLE 0 /* ptr = MA of frame to pin */
#define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */
#define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */
#define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */
#define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */
#define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */
-#define MMUEXT_INVLPG 7 /* ptr = NULL ; val = VA to invalidate */
+#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */
#define MMUEXT_SET_LDT 8 /* ptr = VA of table; val = # entries */
/* NB. MMUEXT_SET_SUBJECTDOM must consist of *_L followed immediately by *_H */
#define MMUEXT_SET_SUBJECTDOM_L 9 /* (ptr[31:15],val[31:15]) = dom[31:0] */
#define MMUEXT_SET_SUBJECTDOM_H 10 /* (ptr[31:15],val[31:15]) = dom[63:32] */
+#define SET_PAGETABLE_SUBJECTDOM (1<<14) /* OR into 'val' arg of SUBJECTDOM_H */
#define MMUEXT_CMD_MASK 255
#define MMUEXT_CMD_SHIFT 8
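A hedged sketch of the SET_SUBJECTDOM encoding documented above, mirroring the pattern already used elsewhere in this patch (flush_mmu_updates in the tools, direct_remap_area_pte in the guest); the helper name is illustrative only. Note that the two requests must be issued back-to-back and that only a sufficiently privileged domain may use them.

static int example_set_subject_domain(domid_t dom, int also_set_pts)
{
    mmu_update_t req[2];

    req[0].ptr = ((unsigned long)(dom <<  0) & ~0xFFFFUL) | MMU_EXTENDED_COMMAND;
    req[0].val = ((unsigned long)(dom << 16) & ~0xFFFFUL) | MMUEXT_SET_SUBJECTDOM_L;
    req[1].ptr = ((unsigned long)(dom >> 32) & ~0xFFFFUL) | MMU_EXTENDED_COMMAND;
    req[1].val = ((unsigned long)(dom >> 16) & ~0xFFFFUL) | MMUEXT_SET_SUBJECTDOM_H;
    if ( also_set_pts )
        req[1].val |= SET_PAGETABLE_SUBJECTDOM; /* change PTS as well as GPS */

    return HYPERVISOR_mmu_update(req, 2);
}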
O_TARGET := drv.o
-obj-y := main.o
+obj-y := main.o control.o interface.o vbd.o
include $(TOPDIR)/Rules.make
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/common.h
+ */
+
+#ifndef __VBLKIF__BACKEND__COMMON_H__
+#define __VBLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <asm/ctrl_if.h>
+#include <asm/io.h>
+
+#ifndef NDEBUG
+#define ASSERT(_p) \
+ if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned long shmem_frame;
+ unsigned int evtchn;
+ int irq;
+ /* Comms information. */
+ blk_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+ BLK_RING_IDX blk_req_cons; /* Request consumer. */
+ BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */
+ /* VBDs attached to this interface. */
+ rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */
+ spinlock_t vbd_lock; /* Protects VBD mapping. */
+ /* Private fields. */
+ struct blkif_st *hash_next; /* Next interface in the same hash bucket. */
+ struct list_head blkdev_list;
+ spinlock_t blk_ring_lock;
+} blkif_t;
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+void blkif_get(blkif_t *blkif);
+void blkif_put(blkif_t *blkif);
+void blkif_create(blkif_create_t *create);
+void blkif_destroy(blkif_destroy_t *destroy);
+
+/* An entry in a list of xen_extents. */
+typedef struct _blkif_extent_le {
+ blkif_extent_t extent; /* an individual extent */
+ struct _blkif_extent_le *next; /* and a pointer to the next */
+} blkif_extent_le_t;
+
+typedef struct _vbd {
+ blkif_vdev_t vdevice; /* what the domain refers to this vbd as */
+ unsigned char mode; /* VBD_MODE_{R,W} */
+ unsigned char type; /* XD_TYPE_xxx */
+ blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */
+ rb_node_t rb; /* for linking into R-B tree lookup struct */
+} vbd_t;
+
+long vbd_create(blkif_vbd_create_t *create_params);
+long vbd_grow(blkif_vbd_grow_t *grow_params);
+long vbd_shrink(blkif_vbd_shrink_t *shrink_params);
+long vbd_destroy(blkif_vbd_destroy_t *delete_params);
+
+void destroy_all_vbds(struct task_struct *p);
+
+typedef struct {
+ blkif_t *blkif;
+ unsigned long id;
+ atomic_t pendcnt;
+ unsigned short operation;
+ unsigned short status;
+} pending_req_t;
+
+/* Describes a [partial] disk extent (part of a block io request) */
+typedef struct {
+ unsigned short dev;
+ unsigned short nr_sects;
+ unsigned long buffer;
+ xen_sector_t sector_number;
+} phys_seg_t;
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation);
+
+int vblkif_be_controller_init(void);
+
+void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __VBLKIF__BACKEND__COMMON_H__ */
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/control.c
+ *
+ * Routines for interfacing with the control plane.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+ switch ( msg->subtype )
+ {
+ case CMSG_BLKIF_BE_CREATE:
+ if ( msg->length != sizeof(blkif_create_t) )
+ goto parse_error;
+ blkif_create((blkif_create_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_DESTROY:
+ if ( msg->length != sizeof(blkif_destroy_t) )
+ goto parse_error;
+ blkif_destroy((blkif_destroy_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_CREATE:
+ if ( msg->length != sizeof(blkif_vbd_create_t) )
+ goto parse_error;
+ vbd_create((blkif_vbd_create_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_DESTROY:
+ if ( msg->length != sizeof(blkif_vbd_destroy_t) )
+ goto parse_error;
+ vbd_destroy((blkif_vbd_destroy_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_GROW:
+ if ( msg->length != sizeof(blkif_vbd_grow_t) )
+ goto parse_error;
+ vbd_grow((blkif_vbd_grow_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_SHRINK:
+ if ( msg->length != sizeof(blkif_vbd_shrink_t) )
+ goto parse_error;
+ vbd_shrink((blkif_vbd_shrink_t *)&msg->msg[0]);
+ break;
+ default:
+ goto parse_error;
+ }
+
+ ctrl_if_send_response(msg);
+ return;
+
+ parse_error:
+ msg->length = 0;
+ ctrl_if_send_response(msg);
+}
+
+int vblkif_be_controller_init(void)
+{
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
+ return 0;
+}
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) \
+ (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static kmem_cache_t *blkif_cachep; /* blkif_t allocator; created at init. */
+static blkif_t *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+ blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( (blkif != NULL) &&
+ ((blkif->domid != domid) || (blkif->handle != handle)) )
+ blkif = blkif->hash_next;
+ return blkif;
+}
+
+void blkif_create(blkif_create_t *create)
+{
+ domid_t domid = create->domid;
+ unsigned int handle = create->blkif_handle;
+ unsigned int evtchn = create->evtchn_port;
+ unsigned long shmem_frame = create->shmem_frame;
+ blkif_t **pblkif, *blkif;
+
+ pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( *pblkif != NULL )
+ {
+ if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+ goto found_match;
+ pblkif = &(*pblkif)->hash_next;
+ }
+
+ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ blkif->handle = handle;
+ blkif->evtchn = evtchn;
+ blkif->irq = bind_evtchn_to_irq(evtchn);
+ blkif->shmem_frame = shmem_frame;
+ blkif->blk_ring_base = (blk_ring_t *)ioremap(shmem_frame<<PAGE_SHIFT, PAGE_SIZE);
+ spin_lock_init(&blkif->vbd_lock);
+ spin_lock_init(&blkif->blk_ring_lock);
+
+ (void)request_irq(blkif->irq, vblkif_be_int, 0, "vblkif-backend", blkif);
+
+ blkif->hash_next = *pblkif;
+ *pblkif = blkif;
+
+ create->status = BLKIF_STATUS_OKAY;
+ return;
+
+ found_match:
+ create->status = BLKIF_STATUS_INTERFACE_EXISTS;
+ return;
+
+ evtchn_in_use:
+ unbind_evtchn_from_irq(evtchn); /* drop refcnt */
+ create->status = BLKIF_STATUS_ERROR;
+ return;
+}
+
+void blkif_destroy(blkif_destroy_t *destroy)
+{
+ domid_t domid = destroy->domid;
+ unsigned int handle = destroy->blkif_handle;
+ blkif_t **pblkif, *blkif;
+
+ pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( (blkif = *pblkif) != NULL )
+ {
+ if ( (blkif->domid == domid) && (blkif->handle == handle) )
+ goto found_match;
+ pblkif = &blkif->hash_next;
+ }
+
+ destroy->status = BLKIF_STATUS_INTERFACE_NOT_FOUND;
+ return;
+
+ found_match:
+ free_irq(blkif->irq, NULL);
+ unbind_evtchn_from_irq(blkif->evtchn);
+ *pblkif = blkif->hash_next;
+ kmem_cache_free(blkif_cachep, blkif);
+ destroy->status = BLKIF_STATUS_OKAY;
+}
+
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/vblkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ *
+ * We can't allocate pending_req's in order, since they may complete out of
+ * order. We therefore maintain an allocation ring. This ring also indicates
+ * when enough work has been passed down -- at that point the allocation ring
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
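To make the allocation-ring comment above concrete, here is a hedged sketch of the alloc/free idiom used later in this file (in dispatch_rw_block_io() and end_block_io_op()); the helper names are illustrative only.

static pending_req_t *example_alloc_pending_req(void)
{
    /* Caller has already checked that NR_PENDING_REQS < MAX_PENDING_REQS. */
    return &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]];
}

static void example_free_pending_req(pending_req_t *req)
{
    spin_lock(&pend_prod_lock);
    pending_ring[MASK_PEND_IDX(pending_prod)] = req - pending_reqs;
    pending_prod++;
    spin_unlock(&pend_prod_lock);
}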
+
+static kmem_cache_t *buffer_head_cachep;
+
+static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
+
+static int lock_buffer(blkif_t *blkif,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+static void unlock_buffer(unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+
+static void io_schedule(unsigned long unused);
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blk_ring_req_entry_t *req);
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, unsigned long st);
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head io_schedule_list;
+static spinlock_t io_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+ return blkif->blkdev_list.next != NULL;
+}
+
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( !__on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&io_schedule_list_lock, flags);
+ if ( __on_blkdev_list(blkif) )
+ {
+ list_del(&blkif->blkdev_list);
+ blkif->blkdev_list.next = NULL;
+ blkif_put(blkif);
+ }
+ spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( __on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&io_schedule_list_lock, flags);
+ if ( !__on_blkdev_list(blkif) )
+ {
+ list_add_tail(&blkif->blkdev_list, &io_schedule_list);
+ blkif_get(blkif);
+ }
+ spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
+
+static void io_schedule(unsigned long unused)
+{
+ blkif_t *blkif;
+ struct list_head *ent;
+
+ /* Queue up a batch of requests. */
+ while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+ !list_empty(&io_schedule_list) )
+ {
+ ent = io_schedule_list.next;
+ blkif = list_entry(ent, blkif_t, blkdev_list);
+ blkif_get(blkif);
+ remove_from_blkdev_list(blkif);
+ if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+ add_to_blkdev_list_tail(blkif);
+ blkif_put(blkif);
+ }
+
+ /* Push the batch through to disc. */
+ run_task_queue(&tq_disk);
+}
+
+static void maybe_trigger_io_schedule(void)
+{
+ /*
+ * Needed so that two processes, who together make the following predicate
+ * true, don't both read stale values and evaluate the predicate
+ * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+ */
+ smp_mb();
+
+ if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&io_schedule_list) )
+ tasklet_schedule(&io_schedule_tasklet);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void end_block_io_op(struct buffer_head *bh, int uptodate)
+{
+ pending_req_t *pending_req = bh->b_private;
+
+ /* An error fails the entire request. */
+ if ( !uptodate )
+ {
+ DPRINTK("Buffer not up-to-date at end of operation\n");
+ pending_req->status = 2;
+ }
+
+ unlock_buffer(virt_to_phys(bh->b_data),
+ bh->b_size,
+ (pending_req->operation==READ));
+
+ if ( atomic_dec_and_test(&pending_req->pendcnt) )
+ {
+ make_response(pending_req->blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ blkif_put(pending_req->blkif);
+ spin_lock(&pend_prod_lock);
+ pending_ring[MASK_PEND_IDX(pending_prod)] =
+ pending_req - pending_reqs;
+ pending_prod++;
+ spin_unlock(&pend_prod_lock);
+ maybe_trigger_io_schedule();
+ }
+}
+
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+ blkif_t *blkif = dev_id;
+ add_to_blkdev_list_tail(blkif);
+ maybe_trigger_io_schedule();
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int lock_buffer(blkif_t *blkif,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
+{
+ unsigned long pfn;
+
+ for ( pfn = buffer >> PAGE_SHIFT;
+ pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ pfn++ )
+ {
+ }
+
+ return 1;
+
+ fail:
+ while ( pfn-- > (buffer >> PAGE_SHIFT) )
+ {
+ }
+ return 0;
+}
+
+static void unlock_buffer(unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
+{
+ unsigned long pfn;
+
+ for ( pfn = buffer >> PAGE_SHIFT;
+ pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ pfn++ )
+ {
+ }
+}
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+ blk_ring_t *blk_ring = blkif->blk_ring_base;
+ blk_ring_req_entry_t *req;
+ BLK_RING_IDX i;
+ int more_to_do = 0;
+
+ /* Take items off the comms ring, taking care not to overflow. */
+ for ( i = blkif->blk_req_cons;
+ (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) !=
+ BLK_RING_SIZE);
+ i++ )
+ {
+ if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+ {
+ more_to_do = 1;
+ break;
+ }
+
+ req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
+ switch ( req->operation )
+ {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ dispatch_rw_block_io(blkif, req);
+ break;
+
+ default:
+ DPRINTK("error: unknown block io operation [%d]\n",
+ blk_ring->ring[MASK_BLK_IDX(i)].req.operation);
+ make_response(blkif, blk_ring->ring[MASK_BLK_IDX(i)].req.id,
+ blk_ring->ring[MASK_BLK_IDX(i)].req.operation, 1);
+ break;
+ }
+ }
+
+ blkif->blk_req_cons = i;
+ return more_to_do;
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blk_ring_req_entry_t *req)
+{
+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
+ struct buffer_head *bh;
+ int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+ unsigned short nr_sects;
+ unsigned long buffer;
+ int i, tot_sects;
+ pending_req_t *pending_req;
+
+ /* We map virtual scatter/gather segments to physical segments. */
+ int new_segs, nr_psegs = 0;
+ phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
+
+ /* Check that number of segments is sane. */
+ if ( unlikely(req->nr_segments == 0) ||
+ unlikely(req->nr_segments > MAX_BLK_SEGS) )
+ {
+ DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+ goto bad_descriptor;
+ }
+
+ /*
+ * Check each address/size pair is sane, and convert into a
+ * physical device and block offset. Note that if the offset and size
+ * crosses a virtual extent boundary, we may end up with more
+ * physical scatter/gather segments than virtual segments.
+ */
+ for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
+ {
+ buffer = req->buffer_and_sects[i] & ~0x1FF;
+ nr_sects = req->buffer_and_sects[i] & 0x1FF;
+
+ if ( unlikely(nr_sects == 0) )
+ {
+ DPRINTK("zero-sized data request\n");
+ goto bad_descriptor;
+ }
+
+ phys_seg[nr_psegs].dev = req->device;
+ phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+ phys_seg[nr_psegs].buffer = buffer;
+ phys_seg[nr_psegs].nr_sects = nr_sects;
+
+ /* Translate the request into the relevant 'physical device' */
+ new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
+ if ( new_segs < 0 )
+ {
+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ req->sector_number + tot_sects,
+ req->sector_number + tot_sects + nr_sects,
+ req->device);
+ goto bad_descriptor;
+ }
+
+ nr_psegs += new_segs;
+ ASSERT(nr_psegs <= MAX_BLK_SEGS*2);
+ }
+
+ for ( i = 0; i < nr_psegs; i++ )
+ {
+ if ( unlikely(!lock_buffer(blkif, phys_seg[i].buffer,
+ phys_seg[i].nr_sects << 9,
+ operation==READ)) )
+ {
+ DPRINTK("invalid buffer\n");
+ while ( i-- > 0 )
+ unlock_buffer(phys_seg[i].buffer,
+ phys_seg[i].nr_sects << 9,
+ operation==READ);
+ goto bad_descriptor;
+ }
+ }
+
+ pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]];
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ pending_req->status = 0;
+ atomic_set(&pending_req->pendcnt, nr_psegs);
+
+ blkif_get(blkif);
+
+ /* Now we pass each segment down to the real blkdev layer. */
+ for ( i = 0; i < nr_psegs; i++ )
+ {
+ bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
+ if ( unlikely(bh == NULL) )
+ panic("bh is null\n");
+ memset(bh, 0, sizeof (struct buffer_head));
+
+ bh->b_size = phys_seg[i].nr_sects << 9;
+ bh->b_dev = phys_seg[i].dev;
+ bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
+
+ /* SMH: we store a 'pseudo-virtual' bogus address in b_data since
+ later code will undo this transformation (i.e. +-PAGE_OFFSET). */
+ bh->b_data = phys_to_virt(phys_seg[i].buffer);
+
+ /* SMH: bh_phys() uses the below field as a 'cheap' virt_to_phys */
+ bh->b_page = &mem_map[phys_seg[i].buffer>>PAGE_SHIFT];
+ bh->b_end_io = end_block_io_op;
+ bh->b_private = pending_req;
+
+ bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock);
+ if ( operation == WRITE )
+ bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);
+
+ atomic_set(&bh->b_count, 1);
+
+ /* Dispatch a single request. We'll flush it to disc later. */
+ submit_bh(operation, bh);
+ }
+
+ return;
+
+ bad_descriptor:
+ make_response(blkif, req->id, req->operation, 1);
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, unsigned long st)
+{
+ blk_ring_resp_entry_t *resp;
+
+ /* Place on the response ring for the relevant domain. */
+ spin_lock(&blkif->blk_ring_lock);
+ resp = &blkif->blk_ring_base->
+ ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
+ resp->id = id;
+ resp->operation = op;
+ resp->status = st;
+ wmb();
+ blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
+ spin_unlock(&blkif->blk_ring_lock);
+
+ /* Kick the relevant domain. */
+ notify_via_evtchn(blkif->evtchn);
+}
+
+static void blkif_debug_int(int irq, void *unused, struct pt_regs *regs)
+{
+#if 0
+ unsigned long flags;
+ struct task_struct *p;
+ blk_ring_t *blk_ring;
+ int i;
+
+ printk("Dumping block queue stats: nr_pending = %d"
+ " (prod=0x%08x,cons=0x%08x)\n",
+ NR_PENDING_REQS, pending_prod, pending_cons);
+
+ read_lock_irqsave(&tasklist_lock, flags);
+ for_each_domain ( p )
+ {
+ printk("Domain: %llu\n", blkif->domain);
+ blk_ring = blkif->blk_ring_base;
+ printk(" req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/"
+ "0x%08x on_list=%d\n",
+ blk_ring->req_prod, blkif->blk_req_cons,
+ blk_ring->resp_prod, blkif->blk_resp_prod,
+ __on_blkdev_list(p));
+ }
+ read_unlock_irqrestore(&tasklist_lock, flags);
+
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ {
+ printk("Pend%d: dom=%p, id=%08lx, cnt=%d, op=%d, status=%d\n",
+ i, pending_reqs[i].domain, pending_reqs[i].id,
+ atomic_read(&pending_reqs[i].pendcnt),
+ pending_reqs[i].operation, pending_reqs[i].status);
+ }
+#endif
+}
+
+void unlink_blkdev_info(blkif_t *blkif)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&io_schedule_list_lock, flags);
+ if ( __on_blkdev_list(blkif) )
+ {
+ list_del(&blkif->blkdev_list);
+ blkif->blkdev_list.next = (void *)0xdeadbeef;
+ blkif_put(blkif);
+ }
+ spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+static int __init init_module(void)
+{
+ int i;
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ memset(pending_reqs, 0, sizeof(pending_reqs));
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ completed_bhs[i] = NULL;
+
+ spin_lock_init(&io_schedule_list_lock);
+ INIT_LIST_HEAD(&io_schedule_list);
+
+ if ( request_irq(bind_virq_to_irq(VIRQ_DEBUG), blkif_debug_int,
+ SA_SHIRQ, "vblkif-backend-dbg", &blkif_debug_int) != 0 )
+ printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
+
+ buffer_head_cachep = kmem_cache_create(
+ "buffer_head_cache", sizeof(struct buffer_head),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+ (void)vblkif_be_controller_init();
+
+ return 0;
+}
+
+static void cleanup_module(void)
+{
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vblkif/backend/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+long __vbd_create(struct task_struct *p,
+ unsigned short vdevice,
+ unsigned char mode,
+ unsigned char type)
+{
+ vbd_t *vbd;
+ rb_node_t **rb_p, *rb_parent = NULL;
+ long ret = 0;
+
+ spin_lock(&p->vbd_lock);
+
+ rb_p = &p->vbd_rb.rb_node;
+ while ( *rb_p != NULL )
+ {
+ rb_parent = *rb_p;
+ vbd = rb_entry(rb_parent, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ {
+ rb_p = &rb_parent->rb_left;
+ }
+ else if ( vdevice > vbd->vdevice )
+ {
+ rb_p = &rb_parent->rb_right;
+ }
+ else
+ {
+ DPRINTK("vbd_create attempted for already existing vbd\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+ {
+ DPRINTK("vbd_create: out of memory\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vbd->vdevice = vdevice;
+ vbd->mode = mode;
+ vbd->type = type;
+ vbd->extents = NULL;
+
+ rb_link_node(&vbd->rb, rb_parent, rb_p);
+ rb_insert_color(&vbd->rb, &p->vbd_rb);
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ return ret;
+}
+
+
+long vbd_create(vbd_create_t *create)
+{
+ struct task_struct *p;
+ long rc;
+
+ if ( unlikely(!IS_PRIV(current)) )
+ return -EPERM;
+
+ if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) )
+ {
+ DPRINTK("vbd_create attempted for non-existent domain %llu\n",
+ create->domain);
+ return -EINVAL;
+ }
+
+ rc = __vbd_create(p, create->vdevice, create->mode,
+ XD_TYPE_DISK | XD_FLAG_VIRT);
+
+ put_task_struct(p);
+
+ return rc;
+}
+
+
+long __vbd_grow(struct task_struct *p,
+ unsigned short vdevice,
+ xen_extent_t *extent)
+{
+ xen_extent_le_t **px, *x;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ long ret = 0;
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+ {
+ DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) == NULL) )
+ {
+ DPRINTK("vbd_grow: out of memory\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ x->extent.device = extent->device;
+ x->extent.start_sector = extent->start_sector;
+ x->extent.nr_sectors = extent->nr_sectors;
+ x->next = (xen_extent_le_t *)NULL;
+
+ for ( px = &vbd->extents; *px != NULL; px = &(*px)->next )
+ continue;
+
+ *px = x;
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ return ret;
+}
+
+
+/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */
+long vbd_grow(vbd_grow_t *grow)
+{
+ struct task_struct *p;
+ long rc;
+
+ if ( unlikely(!IS_PRIV(current)) )
+ return -EPERM;
+
+ if ( unlikely((p = find_domain_by_id(grow->domain)) == NULL) )
+ {
+ DPRINTK("vbd_grow: attempted for non-existent domain %llu\n",
+ grow->domain);
+ return -EINVAL;
+ }
+
+ rc = __vbd_grow(p, grow->vdevice, &grow->extent);
+
+ put_task_struct(p);
+
+ return rc;
+}
+
+
+long vbd_shrink(vbd_shrink_t *shrink)
+{
+ struct task_struct *p;
+ xen_extent_le_t **px, *x;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ long ret = 0;
+
+ if ( !IS_PRIV(current) )
+ return -EPERM;
+
+ if ( (p = find_domain_by_id(shrink->domain)) == NULL )
+ {
+ DPRINTK("vbd_shrink attempted for non-existent domain %llu\n",
+ shrink->domain);
+ return -EINVAL;
+ }
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( shrink->vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( shrink->vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) ||
+ unlikely(vbd->vdevice != shrink->vdevice) ||
+ unlikely(vbd->extents == NULL) )
+ {
+ DPRINTK("vbd_shrink: attempt to remove non-existent extent.\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Find the last extent. We now know that there is at least one. */
+ for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+ continue;
+
+ x = *px;
+ *px = x->next;
+ kfree(x);
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ put_task_struct(p);
+ return ret;
+}
+
+
+long vbd_setextents(vbd_setextents_t *setextents)
+{
+ struct task_struct *p;
+ xen_extent_t e;
+ xen_extent_le_t *new_extents, *x, *t;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ int i;
+ long ret = 0;
+
+ if ( !IS_PRIV(current) )
+ return -EPERM;
+
+ if ( (p = find_domain_by_id(setextents->domain)) == NULL )
+ {
+ DPRINTK("vbd_setextents attempted for non-existent domain %llu\n",
+ setextents->domain);
+ return -EINVAL;
+ }
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( setextents->vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( setextents->vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) ||
+ unlikely(vbd->vdevice != setextents->vdevice) )
+ {
+ DPRINTK("vbd_setextents: attempt to modify non-existent VBD.\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Construct the new extent list. */
+ new_extents = NULL;
+ for ( i = setextents->nr_extents - 1; i >= 0; i-- )
+ {
+ if ( unlikely(copy_from_user(&e,
+ &setextents->extents[i],
+ sizeof(e)) != 0) )
+ {
+ DPRINTK("vbd_setextents: copy_from_user failed\n");
+ ret = -EFAULT;
+ goto free_and_out;
+ }
+
+ if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL))
+ == NULL) )
+ {
+ DPRINTK("vbd_setextents: out of memory\n");
+ ret = -ENOMEM;
+ goto free_and_out;
+ }
+
+ x->extent = e;
+ x->next = new_extents;
+
+ new_extents = x;
+ }
+
+ /* Delete the old extent list _after_ successfully creating the new. */
+ for ( x = vbd->extents; x != NULL; x = t )
+ {
+ t = x->next;
+ kfree(x);
+ }
+
+ /* Make the new list visible. */
+ vbd->extents = new_extents;
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ put_task_struct(p);
+ return ret;
+
+ free_and_out:
+ /* Failed part-way through the new list. Delete all that we managed. */
+ for ( x = new_extents; x != NULL; x = t )
+ {
+ t = x->next;
+ kfree(x);
+ }
+ goto out;
+}
+
+
+long vbd_delete(vbd_delete_t *delete)
+{
+ struct task_struct *p;
+ vbd_t *vbd;
+ rb_node_t *rb;
+ xen_extent_le_t *x, *t;
+
+ if( !IS_PRIV(current) )
+ return -EPERM;
+
+ if ( (p = find_domain_by_id(delete->domain)) == NULL )
+ {
+ DPRINTK("vbd_delete attempted for non-existent domain %llu\n",
+ delete->domain);
+ return -EINVAL;
+ }
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( delete->vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( delete->vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ goto found;
+ }
+
+ DPRINTK("vbd_delete attempted for non-existing VBD.\n");
+
+ spin_unlock(&p->vbd_lock);
+ put_task_struct(p);
+ return -EINVAL;
+
+ found:
+ rb_erase(rb, &p->vbd_rb);
+ x = vbd->extents;
+ kfree(vbd);
+
+ while ( x != NULL )
+ {
+ t = x->next;
+ kfree(x);
+ x = t;
+ }
+
+ spin_unlock(&p->vbd_lock);
+ put_task_struct(p);
+ return 0;
+}
+
+
+void destroy_all_vbds(struct task_struct *p)
+{
+ vbd_t *vbd;
+ rb_node_t *rb;
+ xen_extent_le_t *x, *t;
+
+ spin_lock(&p->vbd_lock);
+
+ while ( (rb = p->vbd_rb.rb_node) != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+
+ rb_erase(rb, &p->vbd_rb);
+ x = vbd->extents;
+ kfree(vbd);
+
+ while ( x != NULL )
+ {
+ t = x->next;
+ kfree(x);
+ x = t;
+ }
+ }
+
+ spin_unlock(&p->vbd_lock);
+}
+
+
+static int vbd_probe_single(xen_disk_info_t *xdi,
+ vbd_t *vbd,
+ struct task_struct *p)
+{
+ xen_extent_le_t *x;
+ xen_disk_t cur_disk;
+
+ if ( xdi->count == xdi->max )
+ {
+ DPRINTK("vbd_probe_devices: out of space for probe.\n");
+ return -ENOMEM;
+ }
+
+ cur_disk.device = vbd->vdevice;
+ cur_disk.info = vbd->type;
+ if ( !VBD_CAN_WRITE(vbd) )
+ cur_disk.info |= XD_FLAG_RO;
+ cur_disk.capacity = 0ULL;
+ for ( x = vbd->extents; x != NULL; x = x->next )
+ cur_disk.capacity += x->extent.nr_sectors;
+ cur_disk.domain = p->domain;
+
+ /* Now copy into relevant part of user-space buffer */
+ if( copy_to_user(&xdi->disks[xdi->count],
+ &cur_disk,
+ sizeof(xen_disk_t)) )
+ {
+ DPRINTK("vbd_probe_devices: copy_to_user failed\n");
+ return -EFAULT;
+ }
+
+ xdi->count++;
+
+ return 0;
+}
+
+
+static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p)
+{
+ int rc = 0;
+ rb_node_t *rb;
+
+ spin_lock(&p->vbd_lock);
+
+ if ( (rb = p->vbd_rb.rb_node) == NULL )
+ goto out;
+
+ new_subtree:
+ /* STEP 1. Find least node (it'll be left-most). */
+ while ( rb->rb_left != NULL )
+ rb = rb->rb_left;
+
+ for ( ; ; )
+ {
+ /* STEP 2. Dealt with left subtree. Now process current node. */
+ if ( (rc = vbd_probe_single(xdi, rb_entry(rb, vbd_t, rb), p)) != 0 )
+ goto out;
+
+ /* STEP 3. Process right subtree, if any. */
+ if ( rb->rb_right != NULL )
+ {
+ rb = rb->rb_right;
+ goto new_subtree;
+ }
+
+ /* STEP 4. Done both subtrees. Head back through ancestors. */
+ for ( ; ; )
+ {
+ /* We're done when we get back to the root node. */
+ if ( rb->rb_parent == NULL )
+ goto out;
+ /* If we are left of parent, then parent is next to process. */
+ if ( rb->rb_parent->rb_left == rb )
+ break;
+ /* If we are right of parent, then we climb to grandparent. */
+ rb = rb->rb_parent;
+ }
+
+ rb = rb->rb_parent;
+ }
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ return rc;
+}
+
+
+/*
+ * Return information about the VBDs available for a given domain, or for all
+ * domains; in the general case the 'domain' argument will be 0 which means
+ * "information about the caller"; otherwise the 'domain' argument will
+ * specify either a given domain, or all domains ("VBD_PROBE_ALL") -- both of
+ * these cases require the caller to be privileged.
+ */
+long vbd_probe(vbd_probe_t *probe)
+{
+ struct task_struct *p = NULL;
+ unsigned long flags;
+ long ret = 0;
+
+ if ( probe->domain != 0 )
+ {
+ /* We can only probe for ourselves (unless we're privileged). */
+ if( (probe->domain != current->domain) && !IS_PRIV(current) )
+ return -EPERM;
+
+ if ( (probe->domain != VBD_PROBE_ALL) &&
+ ((p = find_domain_by_id(probe->domain)) == NULL) )
+ {
+ DPRINTK("vbd_probe attempted for non-existent domain %llu\n",
+ probe->domain);
+ return -EINVAL;
+ }
+ }
+ else
+ {
+ /* Default is to probe for ourselves. */
+ p = current;
+ get_task_struct(p); /* to mirror final put_task_struct */
+ }
+
+ if ( probe->domain == VBD_PROBE_ALL )
+ {
+ read_lock_irqsave(&tasklist_lock, flags);
+ for_each_domain ( p )
+ {
+ if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+ {
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ goto out;
+ }
+ }
+ read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+ goto out;
+
+ out:
+ if ( ret != 0 )
+ DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret);
+ if ( p != NULL )
+ put_task_struct(p);
+ return ret;
+}
+
+
+long vbd_info(vbd_info_t *info)
+{
+ struct task_struct *p;
+ xen_extent_le_t *x;
+ xen_extent_t *extents;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ long ret = 0;
+
+ if ( (info->domain != current->domain) && !IS_PRIV(current) )
+ return -EPERM;
+
+ if ( (p = find_domain_by_id(info->domain)) == NULL )
+ {
+ DPRINTK("vbd_info attempted for non-existent domain %llu\n",
+ info->domain);
+ return -EINVAL;
+ }
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( info->vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( info->vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != info->vdevice) )
+ {
+ DPRINTK("vbd_info attempted on non-existent VBD.\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ info->mode = vbd->mode;
+ info->nextents = 0;
+
+ extents = info->extents;
+ for ( x = vbd->extents; x != NULL; x = x->next )
+ {
+ if ( info->nextents == info->maxextents )
+ break;
+ if ( copy_to_user(extents, &x->extent, sizeof(xen_extent_t)) )
+ {
+ DPRINTK("vbd_info: copy_to_user failed\n");
+ ret = -EFAULT;
+ goto out;
+ }
+ extents++;
+ info->nextents++;
+ }
+
+ out:
+ spin_unlock(&p->vbd_lock);
+ put_task_struct(p);
+ return ret;
+}
+
+
+int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation)
+{
+ xen_extent_le_t *x;
+ vbd_t *vbd;
+ rb_node_t *rb;
+ xen_sector_t sec_off;
+ unsigned long nr_secs;
+
+ spin_lock(&p->vbd_lock);
+
+ rb = p->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( pseg->dev < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( pseg->dev > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ goto found;
+ }
+
+ DPRINTK("vbd_translate; domain %llu attempted to access "
+ "non-existent VBD.\n", p->domain);
+
+ spin_unlock(&p->vbd_lock);
+ return -ENODEV;
+
+ found:
+
+ if ( ((operation == READ) && !VBD_CAN_READ(vbd)) ||
+ ((operation == WRITE) && !VBD_CAN_WRITE(vbd)) )
+ {
+ spin_unlock(&p->vbd_lock);
+ return -EACCES;
+ }
+
+ /*
+ * Now iterate through the list of xen_extents, working out which should
+ * be used to perform the translation.
+ */
+ sec_off = pseg->sector_number;
+ nr_secs = pseg->nr_sects;
+ for ( x = vbd->extents; x != NULL; x = x->next )
+ {
+ if ( sec_off < x->extent.nr_sectors )
+ {
+ pseg->dev = x->extent.device;
+ pseg->sector_number = x->extent.start_sector + sec_off;
+ if ( unlikely((sec_off + nr_secs) > x->extent.nr_sectors) )
+ goto overrun;
+ spin_unlock(&p->vbd_lock);
+ return 1;
+ }
+ sec_off -= x->extent.nr_sectors;
+ }
+
+ DPRINTK("vbd_translate: end of vbd.\n");
+ spin_unlock(&p->vbd_lock);
+ return -EACCES;
+
+ /*
+ * Here we deal with overrun onto the following extent. We don't deal with
+ * overrun of more than one boundary since each request is restricted to
+ * 2^9 512-byte sectors, so it should be trivial for control software to
+ * ensure that extents are large enough to prevent excessive overrun.
+ */
+ overrun:
+
+ /* Adjust length of first chunk to run to end of first extent. */
+ pseg[0].nr_sects = x->extent.nr_sectors - sec_off;
+
+ /* Set second chunk buffer and length to start where first chunk ended. */
+ pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9);
+ pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+ /* Now move to the next extent. Check it exists and is long enough! */
+ if ( unlikely((x = x->next) == NULL) ||
+ unlikely(x->extent.nr_sectors < pseg[1].nr_sects) )
+ {
+ DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+ spin_unlock(&p->vbd_lock);
+ return -EACCES;
+ }
+
+ /* Store the real device and start sector for the second chunk. */
+ pseg[1].dev = x->extent.device;
+ pseg[1].sector_number = x->extent.start_sector;
+
+ spin_unlock(&p->vbd_lock);
+ return 2;
+}
--- /dev/null
+/******************************************************************************
+ * arch/xen/drivers/vnetif/backend/main.c
+ *
+ * Back-end of the driver for virtual network interfaces. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/vnetif/frontend
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+static int __init init_module(void)
+{
+ return 0;
+}
+
+static void cleanup_module(void)
+{
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <asm/ctrl_if.h>
-#include <asm/hypervisor.h>
-#include <asm/hypervisor-ifs/event_channel.h>
+#include <asm/evtchn.h>
static int ctrl_if_evtchn;
static int ctrl_if_irq;
static void ctrl_if_notify_controller(void)
{
- evtchn_op_t evtchn_op;
- evtchn_op.cmd = EVTCHNOP_send;
- evtchn_op.u.send.local_port = ctrl_if_evtchn;
- (void)HYPERVISOR_event_channel_op(&evtchn_op);
+ notify_via_evtchn(ctrl_if_evtchn);
}
static void ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id)
u.ptr = MMU_EXTENDED_COMMAND;
u.ptr |= (unsigned long)&default_ldt[0];
u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT);
- HYPERVISOR_mmu_update(&u, 1);
+ if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) )
+ {
+ show_trace(NULL);
+ panic("Failed to install default LDT");
+ }
return;
}
}
#endif
idx = 0;
wmb(); /* Make sure index is cleared first to avoid double updates. */
- HYPERVISOR_mmu_update(update_queue, _idx);
+ if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) )
+ panic("Failed to execute MMU updates");
}
void _flush_page_update_queue(void)
unsigned long flags;
spin_lock_irqsave(&update_lock, flags);
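+ /* Extended MMU command: the virtual address to invalidate is carried in
+ * 'ptr' (alongside MMU_EXTENDED_COMMAND); 'val' holds the MMUEXT_INVLPG
+ * opcode. */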
update_queue[idx].ptr = MMU_EXTENDED_COMMAND;
- update_queue[idx].val = ptr & PAGE_MASK;
- update_queue[idx].val |= MMUEXT_INVLPG;
+ update_queue[idx].ptr |= ptr & PAGE_MASK;
+ update_queue[idx].val = MMUEXT_INVLPG;
increment_index();
spin_unlock_irqrestore(&update_lock, flags);
}
unsigned long address,
unsigned long size,
unsigned long machine_addr,
- pgprot_t prot)
+ pgprot_t prot,
+ domid_t domid)
{
unsigned long end;
+ mmu_update_t *u, *v;
+ u = v = vmalloc(3*PAGE_SIZE); /* plenty for a PMD's worth of PTE updates */
+ if ( unlikely(u == NULL) )
+ BUG(); /* no memory for the temporary update list */
+
+ /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */
+ if ( domid != 0 )
+ {
+ v[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL;
+ v[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL;
+ v[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL;
+ v[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL;
+ v[0].ptr |= MMU_EXTENDED_COMMAND;
+ v[0].val |= MMUEXT_SET_SUBJECTDOM_L;
+ v[1].ptr |= MMU_EXTENDED_COMMAND;
+ v[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+ v += 2;
+ }
+
address &= ~PMD_MASK;
end = address + size;
if (end > PMD_SIZE)
printk("direct_remap_area_pte: page already exists\n");
BUG();
}
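+ /* Queue an update that points this PTE at the target machine frame;
+ * the whole batch is applied in one hypercall below. */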
- set_pte(pte, pte_mkio(direct_mk_pte_phys(machine_addr, prot)));
+ v->ptr = virt_to_machine(pte);
+ v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
+ v++;
address += PAGE_SIZE;
machine_addr += PAGE_SIZE;
pte++;
} while (address && (address < end));
+
+ if ( ((v-u) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) )
+ printk(KERN_WARNING "Failed to ioremap %08lx->%08lx (%08lx)\n",
+ end-size, end, machine_addr-size);
+ vfree(u);
}
static inline int direct_remap_area_pmd(struct mm_struct *mm,
unsigned long address,
unsigned long size,
unsigned long machine_addr,
- pgprot_t prot)
+ pgprot_t prot,
+ domid_t domid)
{
unsigned long end;
if (!pte)
return -ENOMEM;
direct_remap_area_pte(pte, address, end - address,
- address + machine_addr, prot);
+ address + machine_addr, prot, domid);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
unsigned long address,
unsigned long machine_addr,
unsigned long size,
- pgprot_t prot)
+ pgprot_t prot,
+ domid_t domid)
{
int error = 0;
pgd_t * dir;
if (!pmd)
break;
error = direct_remap_area_pmd(mm, pmd, address, end - address,
- machine_addr + address, prot);
+ machine_addr + address, prot, domid);
if (error)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY |
_PAGE_ACCESSED | flags);
if (direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(addr),
- machine_addr, size, prot)) {
+ machine_addr, size, prot, 0)) {
vfree(addr);
return NULL;
}
#endif
}
+#if !defined(CONFIG_XEN)
static int mmap_mem(struct file * file, struct vm_area_struct * vma)
{
unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-#if defined(CONFIG_XEN) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
- if (!(start_info.flags & SIF_PRIVILEGED))
- return -ENXIO;
-
- /* DONTCOPY is essential for Xen as copy_page_range is broken. */
- vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset,
- vma->vm_end-vma->vm_start, vma->vm_page_prot))
- return -EAGAIN;
- return 0;
-#elif defined(CONFIG_XEN)
- return -ENXIO;
-#else
/*
* Accessing memory above the top the kernel knows about or
* through a file pointer that was marked O_SYNC will be
vma->vm_page_prot))
return -EAGAIN;
return 0;
-#endif
}
+#elif !defined(CONFIG_XEN_PRIVILEGED_GUEST)
+static int mmap_mem(struct file * file, struct vm_area_struct * vma)
+{
+ return -ENXIO;
+}
+#else
+static int mmap_mem(struct file * file, struct vm_area_struct * vma)
+{
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ domid_t domid;
+
+ if (!(start_info.flags & SIF_PRIVILEGED))
+ return -ENXIO;
+
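+ /* If no subject domain was set via ioctl, this maps I/O space (domid 0). */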
+ domid = file->private_data ? *(domid_t *)file->private_data : 0;
+
+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset,
+ vma->vm_end-vma->vm_start, vma->vm_page_prot,
+ domid))
+ return -EAGAIN;
+ return 0;
+}
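+/*
+ * A privileged guest may target another domain's memory: ioctl _IO('M',1)
+ * sets the low 32 bits of the subject domain ID and _IO('M',2) the high 32
+ * bits. Any subsequent mmap() of this file descriptor then maps memory owned
+ * by that domain rather than I/O space.
+ */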
+static int ioctl_mem(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
+{
+ if (file->private_data == NULL) {
+ file->private_data = kmalloc(sizeof(domid_t), GFP_KERNEL);
+ if (file->private_data == NULL)
+ return -ENOMEM;
+ }
+ switch (cmd) {
+ case _IO('M', 1): ((unsigned long *)file->private_data)[0] = arg; break;
+ case _IO('M', 2): ((unsigned long *)file->private_data)[1] = arg; break;
+ default: return -ENOSYS;
+ }
+ return 0;
+}
+static int release_mem(struct inode * inode, struct file * file)
+{
+ if (file->private_data != NULL)
+ kfree(file->private_data);
+ return 0;
+}
+#endif /* CONFIG_XEN */
/*
* This function reads the *virtual* memory as seen by the kernel.
goto out_up;
if (vma->vm_flags & VM_SHARED)
break;
-#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
- if (vma->vm_flags & VM_IO)
- break;
-#endif
count = vma->vm_end - addr;
if (count > size)
count = size;
unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
unsigned long size = vma->vm_end - vma->vm_start;
-#if defined(CONFIG_XEN)
- return -ENXIO;
-#endif
-
/*
* If the user is not attempting to mmap a high memory address then
* the standard mmap_mem mechanism will work. High memory addresses
write: write_mem,
mmap: mmap_mem,
open: open_mem,
+#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
+ release: release_mem,
+ ioctl: ioctl_mem,
+#endif
};
static struct file_operations kmem_fops = {
llseek: memory_lseek,
read: read_kmem,
write: write_kmem,
+#if !defined(CONFIG_XEN)
mmap: mmap_kmem,
+#endif
open: open_kmem,
};
break;
#if defined(CONFIG_ISA) || !defined(__mc68000__)
case 4:
-#if defined(CONFIG_XEN)
-#if defined(CONFIG_XEN_PRIVILEGED_GUEST)
- if (!(start_info.flags & SIF_PRIVILEGED))
-#endif
- return -ENXIO;
-#endif
filp->f_op = &port_fops;
break;
#endif
#ifndef __ASM_XEN__CTRL_IF_H__
#define __ASM_XEN__CTRL_IF_H__
+#include <linux/tqueue.h>
#include <asm/hypervisor.h>
typedef control_msg_t ctrl_msg_t;
#include <asm/hypervisor.h>
#include <asm/ptrace.h>
#include <asm/synch_bitops.h>
+#include <asm/hypervisor-ifs/event_channel.h>
/*
* LOW-LEVEL DEFINITIONS
synch_clear_bit(port, &s->evtchn_exception[0]);
}
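+/* Send a notification to the remote end of event channel 'port'. */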
+static inline void notify_via_evtchn(int port)
+{
+ evtchn_op_t op;
+ op.cmd = EVTCHNOP_send;
+ op.u.send.local_port = port;
+ (void)HYPERVISOR_event_channel_op(&op);
+}
+
/*
* CHARACTER-DEVICE DEFINITIONS
*/
: "=a" (ret) : "0" (__HYPERVISOR_mmu_update),
"b" (req), "c" (count) : "memory" );
- if ( unlikely(ret < 0) )
- {
- extern void show_trace(unsigned long *);
- show_trace(NULL);
- panic("Failed mmu update: %p, %d", req, count);
- }
-
return ret;
}
XEN_flush_page_update_queue();
}
+/*
+ * NB. The 'domid' field should be zero if mapping I/O space (non RAM).
+ * Otherwise it identifies the owner of the memory that is being mapped.
+ */
extern int direct_remap_area_pages(struct mm_struct *mm,
unsigned long address,
unsigned long machine_addr,
unsigned long size,
- pgprot_t prot);
+ pgprot_t prot,
+ domid_t domid);
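+/*
+ * For example, mmap_mem() in drivers/char/mem.c maps a foreign domain's
+ * memory into a privileged guest with:
+ *   direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset,
+ *                           vma->vm_end-vma->vm_start, vma->vm_page_prot,
+ *                           domid);
+ */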
#endif /* _I386_PGALLOC_H */