bitkeeper revision 1.879.3.1 (4097a6f8Q5eWNArSydr2Qh2tZnFF4w)
author mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net>
Tue, 4 May 2004 14:21:44 +0000 (14:21 +0000)
committer mwilli2@equilibrium.research.intel-research.net <mwilli2@equilibrium.research.intel-research.net>
Tue, 4 May 2004 14:21:44 +0000 (14:21 +0000)
Support for selectively granting IO resource privileges.  Domains
that access physical devices no longer need to be fully privileged.

tools/examples/xc_dom_create.py
tools/xc/lib/xc.h
tools/xc/lib/xc_linux_build.c
tools/xc/lib/xc_netbsd_build.c
tools/xc/py/Xc.c
xen/arch/i386/irq.c
xen/arch/i386/process.c
xen/arch/i386/traps.c
xen/common/physdev.c
xen/include/asm-i386/processor.h
xen/include/xen/sched.h

index b898443ecfea04e3ad4ce5d41b44432438ff3dae..e803737ef0c6594dbafb523a4c74579c81e7560e 100755 (executable)
@@ -239,12 +239,8 @@ def make_domain():
        xc.domain_destroy ( dom=id )
        sys.exit()
 
-    # will the domain have IO privileges?
-    if pci_device_list != []: io_priv = True
-    else:                     io_priv = False
-
     if restore:
-        ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1, io_priv=%d )' % (builder_fn, io_priv))
+        ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1 )' % builder_fn )
         if ret < 0:
             print "Error restoring domain"
             print "Return code = " + str(ret)
@@ -252,7 +248,7 @@ def make_domain():
             sys.exit()
     else:
 
-        ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=xend_response["remote_port"], io_priv=%d )' % (builder_fn, io_priv) )
+        ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=xend_response["remote_port"] )' % builder_fn )
         if ret < 0:
             print "Error building Linux guest OS: "
             print "Return code = " + str(ret)
index eb1b07da91f2485e96440154aa519c3e734a2079..9a0fab2257dc8ceb0f42a86290e7064cc778ae63 100644 (file)
@@ -74,15 +74,13 @@ int xc_linux_build(int xc_handle,
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
-                   unsigned int control_evtchn,
-                   int io_priv);
+                   unsigned int control_evtchn);
 
 int xc_netbsd_build(int xc_handle,
                     u64 domid,
                     const char *image_name,
                     const char *cmdline,
-                    unsigned int control_evtchn,
-                    int io_priv);
+                    unsigned int control_evtchn);
 
 int xc_bvtsched_global_set(int xc_handle,
                            unsigned long ctx_allow);
index 27bc6c6668186258145fcd8eb3690da9aaa72b05..f1bd182827bd3cb5c5f54504e854c679ba9078be 100644 (file)
@@ -73,8 +73,7 @@ static int setup_guestos(int xc_handle,
                          dom0_builddomain_t *builddomain, 
                          const char *cmdline,
                          unsigned long shared_info_frame,
-                         unsigned int control_evtchn,
-                         int io_priv)
+                         unsigned int control_evtchn)
 {
     l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
     l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
@@ -270,7 +269,7 @@ static int setup_guestos(int xc_handle,
     memset(start_info, 0, sizeof(*start_info));
     start_info->nr_pages     = nr_pages;
     start_info->shared_info  = shared_info_frame << PAGE_SHIFT;
-    start_info->flags        = io_priv ? SIF_PRIVILEGED : 0;
+    start_info->flags        = 0;
     start_info->pt_base      = vpt_start;
     start_info->nr_pt_frames = nr_pt_pages;
     start_info->mfn_list     = vphysmap_start;
@@ -383,8 +382,7 @@ int xc_linux_build(int xc_handle,
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
-                   unsigned int control_evtchn,
-                   int io_priv)
+                   unsigned int control_evtchn)
 {
     dom0_op_t launch_op, op;
     int initrd_fd = -1;
@@ -442,7 +440,7 @@ int xc_linux_build(int xc_handle,
                        &vstartinfo_start, &vkern_entry,
                        &launch_op.u.builddomain, cmdline,
                        op.u.getdomaininfo.shared_info_frame,
-                       control_evtchn, io_priv) < 0 )
+                       control_evtchn) < 0 )
     {
         ERROR("Error constructing guest OS");
         goto error_out;
index cac444bd80c63c59092ef107ddc072a304f204ff..8793a512f296905584fe342b08e7ac1fc0eb5d8d 100644 (file)
@@ -62,8 +62,7 @@ static int setup_guestos(int xc_handle,
                          dom0_builddomain_t *builddomain, 
                          const char *cmdline,
                          unsigned long shared_info_frame,
-                         unsigned int control_evtchn,
-                         int io_priv)
+                         unsigned int control_evtchn)
 {
     l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
     l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
@@ -177,7 +176,7 @@ static int setup_guestos(int xc_handle,
     start_info->mod_len     = symtab_len;
     start_info->nr_pages    = tot_pages;
     start_info->shared_info = shared_info_frame << PAGE_SHIFT;
-    start_info->flags       = io_priv ? SIF_PRIVILEGED : 0;
+    start_info->flags       = 0;
     start_info->domain_controller_evtchn = control_evtchn;
     strncpy(start_info->cmd_line, cmdline, MAX_CMDLINE);
     start_info->cmd_line[MAX_CMDLINE-1] = '\0';
@@ -214,8 +213,7 @@ int xc_netbsd_build(int xc_handle,
                     u64 domid,
                     const char *image_name,
                     const char *cmdline,
-                    unsigned int control_evtchn,
-                    int io_priv)
+                    unsigned int control_evtchn)
 {
     dom0_op_t launch_op, op;
     unsigned long load_addr;
@@ -265,7 +263,7 @@ int xc_netbsd_build(int xc_handle,
                        &virt_startinfo_addr,
                        &load_addr, &launch_op.u.builddomain, cmdline,
                        op.u.getdomaininfo.shared_info_frame,
-                       control_evtchn, io_priv) < 0 )
+                       control_evtchn) < 0 )
     {
         ERROR("Error constructing guest OS");
         goto error_out;
index 322a20b411b4a90343f811faede2f381609b605a..6453281a612be6d349375211c87e8c9c2f430845 100644 (file)
@@ -228,19 +228,18 @@ static PyObject *pyxc_linux_build(PyObject *self,
 
     u64   dom;
     char *image, *ramdisk = NULL, *cmdline = "";
-    int   control_evtchn, io_priv = 0;
+    int   control_evtchn;
 
     static char *kwd_list[] = { "dom", "control_evtchn", 
-                                "image", "ramdisk", "cmdline", "io_priv",
-                               NULL };
+                                "image", "ramdisk", "cmdline", NULL };
 
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ssi", kwd_list, 
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ss", kwd_list, 
                                       &dom, &control_evtchn, 
-                                      &image, &ramdisk, &cmdline, &io_priv) )
+                                      &image, &ramdisk, &cmdline) )
         return NULL;
 
     if ( xc_linux_build(xc->xc_handle, dom, image, 
-                        ramdisk, cmdline, control_evtchn, io_priv) != 0 )
+                        ramdisk, cmdline, control_evtchn) != 0 )
         return PyErr_SetFromErrno(xc_error);
     
     Py_INCREF(zero);
@@ -255,19 +254,18 @@ static PyObject *pyxc_netbsd_build(PyObject *self,
 
     u64   dom;
     char *image, *ramdisk = NULL, *cmdline = "";
-    int   control_evtchn, io_priv = 0;
+    int   control_evtchn;
 
     static char *kwd_list[] = { "dom", "control_evtchn",
-                                "image", "ramdisk", "cmdline", "io_priv",
-                               NULL };
+                                "image", "ramdisk", "cmdline", NULL };
 
     if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ssi", kwd_list, 
                                       &dom, &control_evtchn,
-                                      &image, &ramdisk, &cmdline, &io_priv) )
+                                      &image, &ramdisk, &cmdline) )
         return NULL;
 
     if ( xc_netbsd_build(xc->xc_handle, dom, image, 
-                         cmdline, control_evtchn, io_priv) != 0 )
+                         cmdline, control_evtchn) != 0 )
         return PyErr_SetFromErrno(xc_error);
     
     Py_INCREF(zero);
@@ -1162,8 +1160,7 @@ static PyMethodDef pyxc_methods[] = {
       " dom     [long]:     Identifier of domain to build into.\n"
       " image   [str]:      Name of kernel image file. May be gzipped.\n"
       " ramdisk [str, n/a]: Name of ramdisk file, if any.\n"
-      " cmdline [str, n/a]: Kernel parameters, if any.\n"
-      " io_priv [boolean]:  Does the domain have IO privileges?\n\n"
+      " cmdline [str, n/a]: Kernel parameters, if any.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "netbsd_build", 
@@ -1172,15 +1169,14 @@ static PyMethodDef pyxc_methods[] = {
       "Build a new NetBSD guest OS.\n"
       " dom     [long]:     Identifier of domain to build into.\n"
       " image   [str]:      Name of kernel image file. May be gzipped.\n"
-      " cmdline [str, n/a]: Kernel parameters, if any.\n"
-      " io_priv [boolean]:  Does the domain have IO privileges?\n\n"
+      " cmdline [str, n/a]: Kernel parameters, if any.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "bvtsched_global_set",
       (PyCFunction)pyxc_bvtsched_global_set,
       METH_VARARGS | METH_KEYWORDS, "\n"
       "Set global tuning parameters for Borrowed Virtual Time scheduler.\n"
-      " ctx_allow [int]: Minimal guaranteed quantum (I think!).\n\n"
+      " ctx_allow [int]: Minimal guaranteed quantum.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "bvtsched_global_get",
@@ -1195,10 +1191,10 @@ static PyMethodDef pyxc_methods[] = {
       METH_VARARGS | METH_KEYWORDS, "\n"
       "Set per-domain tuning parameters for Borrowed Virtual Time scheduler.\n"
       " dom    [long]: Identifier of domain to be tuned.\n"
-      " mcuadv [int]:  Internal BVT parameter.\n"
-      " warp   [int]:  Internal BVT parameter.\n"
-      " warpl  [int]:  Internal BVT parameter.\n"
-      " warpu  [int]:  Internal BVT parameter.\n\n"
+      " mcuadv [int]:  Proportional to the inverse of the domain's weight.\n"
+      " warp   [int]:  How far to warp domain's EVT on unblock.\n"
+      " warpl  [int]:  How long the domain can run warped.\n"
+      " warpu  [int]:  How long before the domain can warp again.\n\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "bvtsched_domain_get",
index 4c50bb592d65c7aa8a2b4687ccde2235e0da16f2..d3eaf6af1291bf0d1f2ec635b209f9695c82359b 100644 (file)
@@ -1001,7 +1001,7 @@ int pirq_guest_bind(struct task_struct *p, int irq, int will_share)
     irq_guest_action_t *action;
     int rc = 0;
 
-    if ( !IS_PRIV(p) )
+    if ( !IS_CAPABLE_PHYSDEV(p) )
         return -EPERM;
 
     spin_lock_irqsave(&desc->lock, flags);
index ea5c51d17636615d4799ab2bdbafb84f3c8aefce..29c4fde8cba3f5441bc08b059e5a0ed0950df8dd 100644 (file)
@@ -227,12 +227,14 @@ void new_thread(struct task_struct *p,
                        : /* no output */ \
                        :"r" (thread->debugreg[register]))
 
+
 void switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
     struct thread_struct *next = &next_p->thread;
     struct tss_struct *tss = init_tss + smp_processor_id();
     execution_context_t *stack_ec = get_execution_context();
-
+    int i;
+    
     __cli();
 
     /* Switch guest general-register state. */
@@ -280,6 +282,58 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         }
     }
 
+    if ( ( prev_p->io_bitmap != NULL ) || ( next_p->io_bitmap != NULL ) ) {
+        if ( next_p->io_bitmap != NULL ) {
+            /* Copy in the appropriate parts of the IO bitmap.  We use the
+             * selector to copy only the interesting parts of the bitmap. */
+
+            u64 old_sel = ~0ULL; /* IO bitmap selector for previous task. */
+
+            if ( prev_p->io_bitmap != NULL)
+            {
+                old_sel = prev_p->io_bitmap_sel;
+
+                /* Replace any areas of the IO bitmap that had bits cleared. */
+                for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
+                    if ( !test_bit(i, &prev_p->io_bitmap_sel) )
+                        memcpy(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
+                               &next_p->io_bitmap[i * IOBMP_SELBIT_LWORDS],
+                               IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
+            }
+
+            /* Copy in any regions of the new task's bitmap that have bits
+             * clear and we haven't already dealt with. */
+            for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
+            {
+                if ( test_bit(i, &old_sel)
+                     && !test_bit(i, &next_p->io_bitmap_sel) )
+                    memcpy(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
+                           &next_p->io_bitmap[i * IOBMP_SELBIT_LWORDS],
+                           IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
+            }
+
+            tss->bitmap = IO_BITMAP_OFFSET;
+
+       } else {
+            /* In this case, we're switching FROM a task with IO port access,
+             * to a task that doesn't use the IO bitmap.  We set any TSS bits
+             * that might have been cleared, ready for future use. */
+            for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
+                if ( !test_bit(i, &prev_p->io_bitmap_sel) )
+                    memset(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
+                           0xFF, IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
+
+            /*
+             * A bitmap offset pointing outside of the TSS limit
+             * causes a controllable fault if the domain tries to
+             * use a port IO instruction.  The offset is made valid
+             * again on a switch to a domain that has an IO bitmap.
+             */
+            tss->bitmap = INVALID_IO_BITMAP_OFFSET;
+       }
+    }
+    
+    
     /* Switch page tables. */
     write_ptbase(&next_p->mm);
     tlb_clocktick();
index d50b101f3acd71b57da107cae4e96187c59da02c..f362faa05faf21d19229d554ece5216f313ac501 100644 (file)
@@ -657,7 +657,7 @@ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
 
 void set_tss_desc(unsigned int n, void *addr)
 {
-    _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 235, 0x89);
+    _set_tssldt_desc(gdt_table+__TSS(n), (int)addr, 8299, 0x89);
 }
 
 void __init trap_init(void)
index b57c6b564b6781f0759f870dfae4c2cced9370ac..6375fd48f0b5df3c4dff5f45327338a9c76516ef 100644 (file)
@@ -126,7 +126,7 @@ int physdev_pci_access_modify(
 {
     struct task_struct *p;
     struct pci_dev *pdev;
-    int rc = 0;
+    int i, j, rc = 0;
  
     if ( !IS_PRIV(current) )
         BUG();
@@ -146,7 +146,7 @@ int physdev_pci_access_modify(
         return -ESRCH;
 
     /* Make the domain privileged. */
-    set_bit(PF_PRIVILEGED, &p->flags);
+    set_bit(PF_PHYSDEV, &p->flags);
 
     /* Grant write access to the specified device. */
     if ( (pdev = pci_find_slot(bus, PCI_DEVFN(dev, func))) == NULL )
@@ -164,6 +164,55 @@ int physdev_pci_access_modify(
     if ( pdev->hdr_type != PCI_HEADER_TYPE_NORMAL )
         INFO("XXX can't give access to bridge devices yet\n");
 
+    /* Now, setup access to the IO ports and memory regions for the device. */
+
+    if ( p->io_bitmap == NULL )
+    {
+        p->io_bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+        if ( p->io_bitmap == NULL )
+        {
+            rc = -ENOMEM;
+            goto out;
+        }
+        memset(p->io_bitmap, 0xFF, IO_BITMAP_BYTES);
+
+        p->io_bitmap_sel = ~0ULL;
+    }
+
+    for ( i = 0; i < DEVICE_COUNT_RESOURCE; i++ )
+    {
+        struct resource *r = &pdev->resource[i];
+        
+        if ( r->flags & IORESOURCE_IO )
+        {
+            /* Give the domain access to the IO ports it needs.  Currently,
+             * this will allow all processes in that domain access to those
+             * ports as well.  This will do for now, since driver domains don't
+             * run untrusted processes! */
+            INFO("Giving domain %llu IO resources (%lx - %lx) "
+                 "for device %s\n", dom, r->start, r->end, pdev->slot_name);
+            for ( j = r->start; j < r->end + 1; j++ )
+            {
+                clear_bit(j, p->io_bitmap);
+                /* Record that we cleared a bit using bit n of the selector:
+                 * n = (j / (4 bytes in a word * 8 bits in a byte))
+                 *     / number of words per selector bit
+                 */
+                clear_bit((j / (8 * 4)) / IOBMP_SELBIT_LWORDS,
+                          &p->io_bitmap_sel);
+            }
+        }
+        else if ( r->flags & IORESOURCE_MEM )
+        {
+            /* allow domain to map IO memory for this device */
+            INFO("Giving domain %llu memory resources (%lx - %lx) "
+                 "for device %s\n", dom, r->start, r->end, pdev->slot_name);
+            for ( j = r->start; j < r->end + 1; j += PAGE_SIZE )
+                SHARE_PFN_WITH_DOMAIN(frame_table + (j >> PAGE_SHIFT), p);
+        }
+    }
+
+
  out:
     put_task_struct(p);
     return rc;
@@ -180,8 +229,8 @@ inline static int check_dev_acc (struct task_struct *p,
 
     *pdev = NULL;
 
-    if ( !IS_PRIV(p) )
-        return -EPERM; /* no pci acces permission */
+     if ( !IS_CAPABLE_PHYSDEV(p) )
+         return -EPERM; /* no pci access permission */
 
     if ( bus > PCI_BUSMAX || dev > PCI_DEVMAX || func > PCI_FUNCMAX )
         return -EINVAL;
@@ -651,5 +700,7 @@ void physdev_init_dom0(struct task_struct *p)
                    dev->slot_name);
         }
     }
+
+    set_bit(PF_PHYSDEV, &p->flags);
 }
 
index 26f64d1f9f36c264b2998b0c3156722e6d28affa..2968e2e4e9dfb9806a0b1caac40ac72b04e210d5 100644 (file)
@@ -287,9 +287,12 @@ extern unsigned int mca_pentium_flag;
 #define TASK_UNMAPPED_BASE     (TASK_SIZE / 3)
 
 /*
- * Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
+ * Size of io_bitmap in longwords:
+ * For Xen we support the full 8kbyte IO bitmap but use the io_bitmap_sel field
+ * of the task_struct to avoid a full 8kbyte copy when switching to / from
+ * domains with bits cleared.
  */
-#define IO_BITMAP_SIZE 32
+#define IO_BITMAP_SIZE 2048
 #define IO_BITMAP_BYTES (IO_BITMAP_SIZE * 4)
 #define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap)
 #define INVALID_IO_BITMAP_OFFSET 0x8000
@@ -429,7 +432,7 @@ long set_fast_trap(struct task_struct *p, int idx);
        0,0,0,0,0,0, /* ds,fs,gs */                             \
        0,0, /* ldt */                                          \
        0, INVALID_IO_BITMAP_OFFSET, /* tace, bitmap */         \
-       {~0, } /* ioperm */                                     \
+       { [0 ... IO_BITMAP_SIZE] = ~0UL }, /* ioperm */         \
 }
 
 struct mm_struct {
index 14f4019ac1a9223a2086b42bedf9dee5f26e4ab7..c022b4a4078afcdecdb260d8382401b4e9e5a192 100644 (file)
@@ -44,11 +44,13 @@ extern struct mm_struct init_mm;
 #define PF_IDLETASK     4 /* Is this one of the per-CPU idle domains?    */
 #define PF_PRIVILEGED   5 /* Is this domain privileged?                  */
 #define PF_CONSOLEWRITEBUG 6 /* Has this domain used the obsolete console? */
+#define PF_PHYSDEV      7 /* May this domain do IO to physical devices? */
 
 #include <xen/vif.h>
 #include <xen/vbd.h>
 
 #define IS_PRIV(_p) (test_bit(PF_PRIVILEGED, &(_p)->flags))
+#define IS_CAPABLE_PHYSDEV(_p) (test_bit(PF_PHYSDEV, &(_p)->flags))
 
 struct task_struct;
 
@@ -174,6 +176,14 @@ struct task_struct
     spinlock_t       pcidev_lock;
     struct list_head pcidev_list;
 
+    /* The following IO bitmap stuff is x86-dependent. */
+    u64 io_bitmap_sel; /* Selector to tell us which part of the IO bitmap are
+                        * "interesting" (i.e. have clear bits) */
+
+    /* Handy macro - number of longwords of the IO bitmap, per selector bit. */
+#define IOBMP_SELBIT_LWORDS ( IO_BITMAP_SIZE / 64 )
+    unsigned long *io_bitmap; /* Pointer to task's IO bitmap or NULL */
+
     unsigned long flags;
 
     atomic_t refcnt;