bitkeeper revision 1.891.1.5 (409ba2e8A6F60eP06BqyZUGapsn8XA)
author    kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Fri, 7 May 2004 14:53:28 +0000 (14:53 +0000)
committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Fri, 7 May 2004 14:53:28 +0000 (14:53 +0000)
Network interface for the new IO model is now complete.
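
For reference, the new flow has xc_dom_create.py request a guest's network
interface from xend with a single control message (see the xc_dom_create.py
hunk below). A minimal sketch of that call, reusing the names from the patch;
error handling is simplified, and a running xend plus an importable xenctl
package are assumed:

    import xenctl.utils

    def attach_default_vif(dom_id):
        # Ask xend to create network interface 0 for the new domain.
        cmsg = 'new_network_interface(dom=' + str(dom_id) + ')'
        response = xenctl.utils.xend_control_message(cmsg)
        if not response['success']:
            # 'error_type' (plus 'exception_type'/'exception_value' when the
            # error is an exception) describes the failure; xc_dom_create.py
            # destroys the half-built domain at this point.
            raise RuntimeError('xend error: ' + response['error_type'])
        # On success the back-end driver completes the hookup asynchronously.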

34 files changed:
.rootkeys
tools/examples/xc_dom_create.py
tools/xenctl/lib/utils.py
tools/xend/lib/domain_controller.h
tools/xend/lib/main.py
tools/xend/lib/manager.py
tools/xend/lib/netif.py [new file with mode: 0644]
xen/common/dom_mem_ops.c
xen/common/domain.c
xen/common/kernel.c
xen/common/memory.c
xenolinux-2.4.26-sparse/arch/xen/config.in
xenolinux-2.4.26-sparse/arch/xen/defconfig
xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c
xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c
xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c
xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c
xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c
xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c
xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
xenolinux-2.4.26-sparse/include/asm-xen/io.h
xenolinux-2.4.26-sparse/include/asm-xen/pci.h [new file with mode: 0644]
xenolinux-2.4.26-sparse/mkbuildtree
xenolinux-2.4.26-sparse/mm/page_alloc.c [new file with mode: 0644]

index 5a7a5d280390e5728d36c2b9c63406b828675e3c..4c888bbc8e59ac6a5285c31a9378fcb9affea04e 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h
 4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py
 4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py
+409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py
 40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c
 4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py
 4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend
 3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h
 3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h
 3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h
+409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h
 3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
 3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
 3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h
 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c
 3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c
 3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c
+409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c
 3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c
 3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c
 407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
index 22479a9d14f000a43676e2f657b2aa2f9e00746a..19bb2ac9df5d608068f71cf2cc79dde7cb47a778 100755 (executable)
@@ -333,7 +333,18 @@ def make_domain():
                 xc.domain_destroy ( dom=id )
                 sys.exit()
 
-    if not new_io_world:
+    if new_io_world:
+        cmsg = 'new_network_interface(dom='+str(id)+')'
+        xend_response = xenctl.utils.xend_control_message(cmsg)
+        if not xend_response['success']:
+            print "Error creating network interface"
+            print "Error type: " + xend_response['error_type']
+            if xend_response['error_type'] == 'exception':
+                print "Exception type: " + xend_response['exception_type']
+                print "Exception val:  " + xend_response['exception_value']
+            xc.domain_destroy ( dom=id )
+            sys.exit()
+    else:
         # setup virtual firewall rules for all aliases
         for ip in vfr_ipaddr:
             xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
index 3f0914f73fd0ab6c78a8aa6633f1c060b6962f2d..11aadb4f088568dd8d1deb491e92853fd522c209 100644 (file)
@@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'):
             return m.group(1)
     return None
 
-def get_current_ipgw(dev='eth0'):
-    """Return a string containing the IP gateway for the given
-    network interface (default 'eth0').
-    """
+def get_current_ipgw():
+    """Return a string containing the default IP gateway."""
     fd = os.popen( '/sbin/route -n' )
     lines = fd.readlines()
     for line in lines:
-        m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
-                       '\s+\S+\s+\S*G.*' + dev + '.*', line )
+        m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
+                       '\s+0.0.0.0+\s+\S*G.*', line )
         if m:
             return m.group(1)
     return None
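
A quick check of the new pattern against a typical default-route line from
/sbin/route -n (the sample line below is illustrative, not part of the patch)
shows the gateway being captured regardless of which device carries the route:

    import re

    line = '0.0.0.0         192.168.0.1     0.0.0.0         UG    0      0        0 eth0'
    m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
                   '\s+0.0.0.0+\s+\S*G.*', line )
    print m.group(1)    # -> 192.168.0.1
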
index d5c397fe96ef18ed040e1f029dbab34bb51a1cd6..566967dc38f1eb0f775dd175de55d48718babdd6 100644 (file)
@@ -468,7 +468,6 @@ typedef struct {
     unsigned int   evtchn;            /* Event channel for notifications.    */
     unsigned long  tx_shmem_frame;    /* Page cont. tx shared comms window.  */
     unsigned long  rx_shmem_frame;    /* Page cont. rx shared comms window.  */
-    unsigned long  shmem_frame;       
     /* OUT */
     unsigned int   status;
 } netif_be_connect_t; 
index 7b5adbab8312ea25cdddfe4d4d85a292e06e39b7..0056783d7fda95eda650f063e3ec7a1055003c01 100755 (executable)
@@ -5,7 +5,7 @@
 ###########################################################
 
 import errno, re, os, pwd, select, signal, socket, struct, sys, time
-import xend.blkif, xend.console, xend.manager, xend.utils, Xc
+import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc
 
 
 # The following parameters could be placed in a configuration file.
@@ -19,6 +19,8 @@ UNIX_SOCK    = 'management_sock' # relative to CONTROL_DIR
 CMSG_CONSOLE  = 0
 CMSG_BLKIF_BE = 1
 CMSG_BLKIF_FE = 2
+CMSG_NETIF_BE = 3
+CMSG_NETIF_FE = 4
 
 
 def port_from_dom(dom):
@@ -162,6 +164,10 @@ def daemon_loop():
             if xend.blkif.interface.list.has_key(idx):
                 blk_if = xend.blkif.interface.list[idx]
 
+            net_if = False
+            if xend.netif.interface.list.has_key(idx):
+                net_if = xend.netif.interface.list[idx]
+
             # If we pick up a disconnect notification then we do any necessary
             # cleanup.
             if type == notifier.EXCEPTION:
@@ -175,6 +181,9 @@ def daemon_loop():
                     if blk_if:
                         blk_if.destroy()
                         del blk_if
+                    if net_if:
+                        net_if.destroy()
+                        del net_if
                     continue
 
             # Process incoming requests.
@@ -188,6 +197,10 @@ def daemon_loop():
                     blk_if.ctrlif_rx_req(port, msg)
                 elif type == CMSG_BLKIF_BE and port == dom0_port:
                     xend.blkif.backend_rx_req(port, msg)
+                elif type == CMSG_NETIF_FE and net_if:
+                    net_if.ctrlif_rx_req(port, msg)
+                elif type == CMSG_NETIF_BE and port == dom0_port:
+                    xend.netif.backend_rx_req(port, msg)
                 else:
                     port.write_response(msg)
 
@@ -198,6 +211,8 @@ def daemon_loop():
                 type = (msg.get_header())['type']
                 if type == CMSG_BLKIF_BE and port == dom0_port:
                     xend.blkif.backend_rx_rsp(port, msg)
+                elif type == CMSG_NETIF_BE and port == dom0_port:
+                    xend.netif.backend_rx_rsp(port, msg)
 
             # Send console data.
             if con_if and con_if.ctrlif_transmit_work(port):
@@ -207,10 +222,18 @@ def daemon_loop():
             if blk_if and blk_if.ctrlif_transmit_work(port):
                 work_done = True
 
+            # Send netif messages.
+            if net_if and net_if.ctrlif_transmit_work(port):
+                work_done = True
+
             # Back-end block-device work.
             if port == dom0_port and xend.blkif.backend_do_work(port):
                 work_done = True
                 
+            # Back-end network-device work.
+            if port == dom0_port and xend.netif.backend_do_work(port):
+                work_done = True
+                
             # Finally, notify the remote end of any work that we did.
             if work_done:
                 port.notify()
index ea7398cd4ce3a0a13a1a79d4bed95afbb7eef221..2f15683d6654d90e6b57822b1538ff01d274057c 100644 (file)
@@ -4,7 +4,7 @@
 ## Copyright (c) 2004, K A Fraser (University of Cambridge)
 #############################################################
 
-import xend.blkif, xend.console, xend.main, xend.utils
+import xend.blkif, xend.netif, xend.console, xend.main, xend.utils
 
 
 ##
@@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, pdev, start_sect, nr_sect, readonly):
 
     # Response is deferred until back-end driver sends acknowledgement.
     return None
+
+
+##
+## new_network_interface:
+##  Create a new network interface for the specified domain @dom.
+##
+def new_network_interface(dom, handle=-1):
+    # By default we create an interface with handle zero.
+    if handle < 0:
+        handle = 0
+
+    # We only support one interface per domain, which must have handle zero.
+    if handle != 0:
+        response = { 'success': False }
+        response['error_type'] = 'Bad handle %d (only handle 0 ' + \
+                                 'is supported)' % handle
+        return response
+
+    # Find local event-channel port associated with the specified domain.
+    port = xend.main.port_from_dom(dom)
+    if not port:
+        response = { 'success': False }
+        response['error_type'] = 'Unknown domain %d' % dom
+        return response
+
+    # The interface must not already exist.
+    if xend.netif.interface.list.has_key(port.local_port):
+        response = { 'success': False }
+        response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \
+                                 'exists' % (dom, handle)
+        return response
+
+    # Create the new interface. Initially no virtual devices are attached.
+    xend.netif.interface(dom, port.local_port)
+
+    # Response is deferred until back-end driver sends acknowledgement.
+    return None
diff --git a/tools/xend/lib/netif.py b/tools/xend/lib/netif.py
new file mode 100644 (file)
index 0000000..11756c5
--- /dev/null
@@ -0,0 +1,144 @@
+
+###################################################################
+## xend/netif.py -- Network-interface management functions for Xend
+## Copyright (c) 2004, K A Fraser (University of Cambridge)
+###################################################################
+
+import errno, random, re, os, select, signal, socket, struct, sys
+import xend.main, xend.console, xend.manager, xend.utils, Xc
+
+CMSG_NETIF_BE = 3
+CMSG_NETIF_FE = 4
+CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED =  0
+CMSG_NETIF_FE_DRIVER_STATUS_CHANGED    = 32
+CMSG_NETIF_FE_INTERFACE_CONNECT        = 33
+CMSG_NETIF_FE_INTERFACE_DISCONNECT     = 34
+CMSG_NETIF_BE_CREATE      = 0
+CMSG_NETIF_BE_DESTROY     = 1
+CMSG_NETIF_BE_CONNECT     = 2
+CMSG_NETIF_BE_DISCONNECT  = 3
+
+pendmsg = None
+pendaddr = None
+
+def backend_tx_req(msg):
+    port = xend.main.dom0_port
+    if port.space_to_write_request():
+        port.write_request(msg)
+        port.notify()
+    else:
+        xend.netif.pendmsg = msg
+
+def backend_rx_req(port, msg):
+    port.write_response(msg)
+
+def backend_rx_rsp(port, msg):
+    subtype = (msg.get_header())['subtype']
+    print "Received netif-be response, subtype %d" % subtype
+    if subtype == CMSG_NETIF_BE_CREATE:
+        rsp = { 'success': True }
+        xend.main.send_management_response(rsp, xend.netif.pendaddr)
+    elif subtype == CMSG_NETIF_BE_CONNECT:
+        (dom,hnd,evtchn,tx_frame,rx_frame,st) = \
+           struct.unpack("QIILLI", msg.get_payload())
+        netif = interface.list[xend.main.port_from_dom(dom).local_port]
+        msg = xend.utils.message(CMSG_NETIF_FE, \
+                                 CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
+        msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \
+                                       netif.evtchn['port2'], \
+                                       netif.mac[0],netif.mac[1], \
+                                       netif.mac[2],netif.mac[3], \
+                                       netif.mac[4],netif.mac[5], \
+                                       0,0))
+        netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg)
+
+def backend_do_work(port):
+    global pendmsg
+    if pendmsg and port.space_to_write_request():
+        port.write_request(pendmsg)
+        pendmsg = None
+        return True
+    return False
+
+
+class interface:
+
+    # Dictionary of all network-device interfaces.
+    list = {}
+
+
+    # NB. 'key' is an opaque value that has no meaning in this class.
+    def __init__(self, dom, key):
+        self.dom     = dom
+        self.key     = key
+        self.pendmsg = None
+
+        # VIFs get a random MAC address with a "special" vendor id.
+        # 
+        # NB. The vendor is currently an "obsolete" one that used to belong
+        # to DEC (AA-00-00). Using it is probably a bit rude :-)
+        # 
+        # NB2. The first bit of the first random octet is set to zero for
+        # all dynamic MAC addresses. This may allow us to manually specify
+        # MAC addresses for some VIFs with no fear of clashes.
+        self.mac = [ 0xaa, 0x00, 0x00 ]
+        self.mac.append(int(random.random()*128))
+        self.mac.append(int(random.random()*256))
+        self.mac.append(int(random.random()*256))
+                
+        interface.list[key] = self
+        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0)
+        msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \
+                                       self.mac[0],self.mac[1], \
+                                       self.mac[2],self.mac[3], \
+                                       self.mac[4],self.mac[5], \
+                                       0,0,0))
+        xend.netif.pendaddr = xend.main.mgmt_req_addr
+        backend_tx_req(msg)
+
+
+    # Completely destroy this interface.
+    def destroy(self):
+        del interface.list[self.key]
+        msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0)
+        msg.append_payload(struct.pack("QII",self.dom,0,0))
+        backend_tx_req(msg)        
+
+
+    # The parameter @port is the control-interface event channel. This method
+    # returns True if messages were written to the control interface.
+    def ctrlif_transmit_work(self, port):
+        if self.pendmsg and port.space_to_write_request():
+            port.write_request(self.pendmsg)
+            self.pendmsg = None
+            return True
+        return False
+
+    def ctrlif_tx_req(self, port, msg):
+        if port.space_to_write_request():
+            port.write_request(msg)
+            port.notify()
+        else:
+            self.pendmsg = msg
+
+    def ctrlif_rx_req(self, port, msg):
+        port.write_response(msg)
+        subtype = (msg.get_header())['subtype']
+        if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+            msg = xend.utils.message(CMSG_NETIF_FE, \
+                                     CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
+            msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \
+                                           self.mac[1],self.mac[2], \
+                                           self.mac[3],self.mac[4], \
+                                           self.mac[5],0,0))
+            self.ctrlif_tx_req(port, msg)
+        elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT:
+            (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload())
+            xc = Xc.new()
+            self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom)
+            msg = xend.utils.message(CMSG_NETIF_BE, \
+                                     CMSG_NETIF_BE_CONNECT, 0)
+            msg.append_payload(struct.pack("QIILLI",self.dom,0, \
+                                           self.evtchn['port1'],tx_frame, \
+                                           rx_frame,0))
+            backend_tx_req(msg)
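
The control messages above are fixed-layout structs packed with the struct
module; the "QIILLI" payload handled in backend_rx_rsp(), for instance, is the
netif_be_connect_t message whose fields appear in the domain_controller.h hunk
earlier in this patch. A small round-trip sketch with made-up values, following
the field order used by the Python code:

    import struct

    # dom (u64), handle, evtchn (u32), tx/rx shared-memory frames
    # (unsigned long), status (u32) -- order as unpacked in backend_rx_rsp().
    fmt = 'QIILLI'
    payload = struct.pack(fmt, 3, 0, 7, 0x12345, 0x12346, 0)
    (dom, hnd, evtchn, tx_frame, rx_frame, st) = struct.unpack(fmt, payload)
    assert (dom, evtchn, st) == (3, 7, 0)
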
index 7d596026f9385d39f48d8a4eeb6130fb66bfe8b6..79d0bb1df1563de7082b80b47f83334ff50fadaa 100644 (file)
@@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
     {
         /* Leave some slack pages; e.g., for the network. */
         if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
-                                   (PAGE_SHIFT-10))) ) 
+                                   (PAGE_SHIFT-10))) )
+        {
+            DPRINTK("Not enough slack: %u %u\n",
+                    free_pfns,
+                    SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10));
             break;
+        }
 
         /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
         if ( unlikely((page = alloc_domain_page(p)) == NULL) )
+        {
+            DPRINTK("Could not allocate a frame\n");
             break;
-        
+        }
+
         /* Inform the domain of the new page's machine address. */ 
         mpfn = (unsigned long)(page - frame_table);
         copy_to_user(op.pages, &mpfn, sizeof(mpfn));
index a9c40ae98f22451621ac31c55722cd79bbf7941f..1b8759e912e15b4afc5b1a9b3b5ab8ecb7db935c 100644 (file)
@@ -334,6 +334,8 @@ struct pfn_info *alloc_domain_page(struct task_struct *p)
         spin_lock(&p->page_list_lock);
         if ( unlikely(p->tot_pages >= p->max_pages) )
         {
+            DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
+                    p->domain, p->tot_pages, p->max_pages);
             spin_unlock(&p->page_list_lock);
             goto free_and_exit;
         }
@@ -884,7 +886,7 @@ int construct_dom0(struct task_struct *p,
         page->type_and_flags  = 0;
         page->count_and_flags = PGC_allocated | 1;
         list_add_tail(&page->list, &p->page_list);
-        p->tot_pages++;
+        p->tot_pages++; p->max_pages++;
     }
 
     mpt_alloc = (vpt_start - v_start) + alloc_start;
index 7f814391cf5446490794a094dd1d9025097c7711..0d5fa023a14d5910ab8d2b6001f1ccd99ac4db98 100644 (file)
@@ -105,7 +105,6 @@ static struct {
 void cmain(unsigned long magic, multiboot_info_t *mbi)
 {
     struct task_struct *new_dom;
-    dom0_createdomain_t dom0_params;
     unsigned long max_page;
     unsigned char *cmdline;
     module_t *mod = (module_t *)__va(mbi->mods_addr);
@@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboot_info_t *mbi)
     task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
 
     /* Create initial domain 0. */
-    dom0_params.memory_kb = opt_dom0_mem;
     new_dom = do_createdomain(0, 0);
     if ( new_dom == NULL )
         panic("Error creating domain 0\n");
index e4d0590a579e23573f8d68c84953f653b8cb1022..5acfae8482a139e2520ad5240104521cc1b39b4b 100644 (file)
@@ -940,17 +940,25 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         }
         break;
 
+        /* XXX This function is racey! */
     case MMUEXT_REASSIGN_PAGE:
-        if ( !IS_PRIV(current) )
+        if ( unlikely(!IS_PRIV(current)) )
         {
             MEM_LOG("Dom %llu has no privilege to reassign page ownership",
                     current->domain);
             okay = 0;
         }
-        else if ( percpu_info[cpu].gps != NULL )
+        else if ( likely(percpu_info[cpu].gps != NULL) )
         {
+            current->tot_pages--;
+            percpu_info[cpu].gps->tot_pages++;
             page->u.domain = percpu_info[cpu].gps;
         }
+        else
+        {
+            MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn);
+            okay = 0;
+        }
         break;
 
     case MMUEXT_RESET_SUBJECTDOM:
index 16fa5e66d44feb8102b1455125bc183db8a130c8..7f961d852108787093f641e57b3e825716ee8c65 100644 (file)
@@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
    bool 'HIGHMEM I/O support' CONFIG_HIGHIO
 fi
 
+define_int CONFIG_FORCE_MAX_ZONEORDER 12
+
 #bool 'Symmetric multi-processing support' CONFIG_SMP
 #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
 #   define_bool CONFIG_HAVE_DEC_LOCK y
index eaa9171b1f4faf4ea59aee60ce989a740c15458a..013e732c3fa07e3742c2793edcefd4065781e4ff 100644 (file)
@@ -50,6 +50,7 @@ CONFIG_X86_TSC=y
 CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
 
 #
 # General setup
@@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
 # Network testing
 #
 # CONFIG_NET_PKTGEN is not set
+CONFIG_NETDEVICES=y
 
 #
 # Block devices
index 41b05aaaa7741d23ef6744ab64b8c8744d3155bc..3be5b50bfa7bb1f7a565faa19be487716fa8e1df 100644 (file)
@@ -51,6 +51,7 @@ CONFIG_X86_TSC=y
 CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
 
 #
 # General setup
index e6004b4a8e9806ed8fd9bd6bda7af515e0c8c7d0..e80435fbbb8f4e983845f7dbccfc4ed1ba3ff28a 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/rbtree.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <asm/ctrl_if.h>
 #include <asm/io.h>
 #include "../blkif.h"
index 0746ecfab0951cb394af6eecedd01d802fd9eb4e..0b2622465170313abbbe7de56581fb49e01e225a 100644 (file)
@@ -74,7 +74,8 @@ void blkif_ctrlif_init(void)
     ctrl_msg_t                       cmsg;
     blkif_be_driver_status_changed_t st;
 
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type      = CMSG_BLKIF_BE;
index 9acbac35ab4f772853cad1a6ea5d0107b8e9b218..14a6ab324dced0921196fa2f6039ee9d3debde9e 100644 (file)
@@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *create)
     unsigned int  handle = create->blkif_handle;
     blkif_t     **pblkif, *blkif;
 
-    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL )
+    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
     {
         DPRINTK("Could not create blkif: out of memory\n");
         create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
index 4b11ad9a8eee1f59d6b39f9a19788a36145defbf..eb3e32c75f58fe7a8ea248ce460db8fa00ab339e 100644 (file)
 #define MAX_PENDING_REQS 64
 #define BATCH_PER_DOMAIN 16
 
-static struct vm_struct *mmap_vma;
-#define MMAP_PAGES_PER_SEGMENT \
-    ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1)
+static unsigned long mmap_vstart;
 #define MMAP_PAGES_PER_REQUEST \
-    (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
 #define MMAP_PAGES             \
     (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg)            \
-    ((unsigned long)mmap_vma->addr +     \
+#define MMAP_VADDR(_req,_seg)                        \
+    (mmap_vstart +                                   \
      ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
-     ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE))
+     ((_seg) * PAGE_SIZE))
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
     prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
     for ( i = 0; i < req->nr_segments; i++ )
     {
-        if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) )
+        /* Make sure the buffer is page-sized. */
+        if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) ||
+             (blkif_last_sect(req->frame_and_sects[i]) != 7) )
             goto bad_descriptor;
         rc = direct_remap_area_pages(&init_mm, 
                                      MMAP_VADDR(pending_idx, i),
-                                     req->buffer_and_sects[i] & PAGE_MASK, 
+                                     req->frame_and_sects[i] & PAGE_MASK, 
                                      PAGE_SIZE, prot, blkif->domid);
         if ( rc != 0 )
             goto bad_descriptor;
@@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
     extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
     struct buffer_head *bh;
     int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
-    unsigned short nr_sects;
-    unsigned long buffer;
+    short nr_sects;
+    unsigned long buffer, fas;
     int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
     pending_req_t *pending_req;
     pgprot_t       prot;
 
     /* We map virtual scatter/gather segments to physical segments. */
     int new_segs, nr_psegs = 0;
-    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
+    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
 
     /* Check that number of segments is sane. */
     if ( unlikely(req->nr_segments == 0) || 
@@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
      */
     for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
     {
-        buffer   = req->buffer_and_sects[i] & ~0x1FF;
-        nr_sects = req->buffer_and_sects[i] &  0x1FF;
+        fas      = req->frame_and_sects[i];
+        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
 
-        if ( unlikely(nr_sects == 0) )
-            continue;
-
-        if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) )
-        {
-            DPRINTK("Too many sectors in segment\n");
+        if ( nr_sects <= 0 )
             goto bad_descriptor;
-        }
 
         phys_seg[nr_psegs].dev           = req->device;
         phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
@@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
         }
   
         nr_psegs += new_segs;
-        ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2);
+        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
     }
 
     /* Nonsensical zero-sized request? */
@@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
 
     for ( i = 0; i < nr_psegs; i++ )
     {
-        unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + 
-                            (phys_seg[i].nr_sects << 9) + 
-                            (PAGE_SIZE - 1)) & PAGE_MASK;
         int rc = direct_remap_area_pages(&init_mm, 
                                          MMAP_VADDR(pending_idx, i),
                                          phys_seg[i].buffer & PAGE_MASK, 
-                                         sz, prot, blkif->domid);
+                                         PAGE_SIZE, prot, blkif->domid);
         if ( rc != 0 )
         {
             DPRINTK("invalid buffer\n");
@@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
                               MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
             goto bad_descriptor;
         }
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
+            phys_seg[i].buffer >> PAGE_SHIFT;
     }
 
     pending_req = &pending_reqs[pending_idx];
@@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
         bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
         bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
             (phys_seg[i].buffer & ~PAGE_MASK);
+//        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
         bh->b_end_io        = end_block_io_op;
         bh->b_private       = pending_req;
 
@@ -456,13 +451,13 @@ static int __init init_module(void)
 {
     int i;
 
+    if ( !(start_info.flags & SIF_INITDOMAIN) )
+        return 0;
+
     blkif_interface_init();
 
-    if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL )
-    {
-        printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n");
-        return -ENOMEM;
-    }
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+        BUG();
 
     pending_cons = 0;
     pending_prod = MAX_PENDING_REQS;
@@ -484,6 +479,7 @@ static int __init init_module(void)
 
 static void cleanup_module(void)
 {
+    BUG();
 }
 
 module_init(init_module);
index 19b0b3015dff7a978ecc263f2e22fecabdec523f..bb5b6ea74363f03a1f9908f7b545387386fb4312 100644 (file)
@@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *create)
         }
     }
 
-    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) )
+    if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_create: out of memory\n");
         create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
@@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow)
     } 
 
     if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), 
-                               GFP_ATOMIC)) == NULL) )
+                               GFP_KERNEL)) == NULL) )
     {
         DPRINTK("vbd_grow: out of memory\n");
         grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
index 1938f68f8ec8874a15f588cefe1f37bc5a98f144..0a90744c59d41703fd969bca7a7e39e3c872769e 100644 (file)
  */
 #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
 
-#define BLKIF_MAX_SECTORS_PER_SEGMENT  16
-
 typedef struct {
     u8             operation;        /* BLKIF_OP_???                         */
     u8             nr_segments;      /* number of segments                   */
     blkif_vdev_t   device;           /* only for read/write requests         */
     unsigned long  id;               /* private guest value, echoed in resp  */
     blkif_sector_t sector_number;    /* start sector idx on disk (r/w only)  */
-    /* Least 9 bits is 'nr_sects'. High 23 bits is the address.       */
-    /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */
-    unsigned long  buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame.   */
+    /* @first_sect: first sector in frame to transfer (inclusive).           */
+    /* @last_sect: last sector in frame to transfer (inclusive).             */
+    /* @frame: machine page frame number.                                    */
+    unsigned long  frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } blkif_request_t;
 
+#define blkif_first_sect(_fas) (((_fas)>>3)&7)
+#define blkif_last_sect(_fas)  ((_fas)&7)
+
 typedef struct {
     unsigned long   id;              /* copied from request */
     u8              operation;       /* copied from request */
@@ -79,8 +82,8 @@ typedef struct {
  *  @device      == unused (zero)
  *  @id          == any value (echoed in response message)
  *  @sector_num  == unused (zero)
- *  @buffer_and_sects == list of page-aligned, page-sized buffers.
- *                       (i.e., nr_sects == 8).
+ *  @frame_and_sects == list of page-sized buffers.
+ *                       (i.e., @first_sect == 0, @last_sect == 7).
  * 
  * The response is a list of vdisk_t elements copied into the out-of-band
  * probe buffer. On success the response status field contains the number
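
The new frame_and_sects encoding packs a whole segment descriptor into one
word: bits 2:0 hold last_sect, bits 5:3 hold first_sect, and bits 12 and above
hold the machine page frame, exactly as blkif_first_sect()/blkif_last_sect()
decode it. A short illustration of the packing arithmetic (Python is used only
for brevity; the real code is the C elsewhere in this patch, and 4 KB pages
are assumed):

    PAGE_MASK = ~0xfff    # 4 KB pages

    def pack_fas(buffer_ma, first_sect, last_sect):
        # Machine frame in the high bits, 512-byte sector range below it.
        return (buffer_ma & PAGE_MASK) | (first_sect << 3) | last_sect

    def blkif_first_sect(fas): return (fas >> 3) & 7
    def blkif_last_sect(fas):  return fas & 7

    fas = pack_fas(0x1234000, 0, 7)            # a full page: sectors 0..7
    assert blkif_first_sect(fas) == 0
    assert blkif_last_sect(fas)  == 7
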
index 29cc01d08749397ee2ee313fe2f954cbac08100d..63f1aeea26294a01d0fa2f3fa3f30b0da7413acf 100644 (file)
@@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linux/ide.h */
 static unsigned int blkif_state = BLKIF_STATE_CLOSED;
 static unsigned int blkif_evtchn, blkif_irq;
 
-static struct tq_struct blkif_statechange_tq;
-
 static int blkif_control_rsp_valid;
 static blkif_response_t blkif_control_rsp;
 
@@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned long   id,
     struct gendisk     *gd;
     blkif_request_t    *req;
     struct buffer_head *bh;
+    unsigned int        fsect, lsect;
 
-    if ( unlikely(nr_sectors >= (1<<9)) )
-        BUG();
+    fsect = (buffer_ma & ~PAGE_MASK) >> 9;
+    lsect = fsect + nr_sectors - 1;
+
+    /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
     if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
         BUG();
+    if ( lsect > 7 )
+        BUG();
+
+    buffer_ma &= PAGE_MASK;
 
     if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
         return 1;
@@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned long   id,
             bh = (struct buffer_head *)id;
             bh->b_reqnext = (struct buffer_head *)req->id;
             req->id = id;
-            req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
-            if ( ++req->nr_segments < MAX_BLK_SEGS )
+            req->frame_and_sects[req->nr_segments] = 
+                buffer_ma | (fsect<<3) | lsect;
+            if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
                 sg_next_sect += nr_sectors;
             else
                 DISABLE_SCATTERGATHER();
@@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned long   id,
     req->sector_number = (blkif_sector_t)sector_number;
     req->device        = device; 
     req->nr_segments   = 1;
-    req->buffer_and_sects[0] = buffer_ma | nr_sectors;
+    req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
     req_prod++;
 
     return 0;
@@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
 }
 
 
-static void blkif_bringup_phase1(void *unused)
+static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
 {
     ctrl_msg_t                   cmsg;
     blkif_fe_interface_connect_t up;
 
-    /* Move from CLOSED to DISCONNECTED state. */
-    blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
-    blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
-    blkif_state  = BLKIF_STATE_DISCONNECTED;
-
-    /* Construct an interface-CONNECT message for the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
-    cmsg.length    = sizeof(blkif_fe_interface_connect_t);
-    up.handle      = 0;
-    up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
-    memcpy(cmsg.msg, &up, sizeof(up));
-
-    /* Tell the controller to bring up the interface. */
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_bringup_phase2(void *unused)
-{
-    blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
-    (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
-
-    /* Probe for discs that are attached to the interface. */
-    xlvbd_init();
-
-    blkif_state = BLKIF_STATE_CONNECTED;
-
-    /* Kick pending requests. */
-    spin_lock_irq(&io_request_lock);
-    kick_pending_request_queues();
-    spin_unlock_irq(&io_request_lock);
-}
-
-static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
-{
     if ( status->handle != 0 )
     {
         printk(KERN_WARNING "Status change on unsupported blkif %d\n",
@@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
                    " in state %d\n", blkif_state);
             break;
         }
-        blkif_statechange_tq.routine = blkif_bringup_phase1;
-        schedule_task(&blkif_statechange_tq);
+
+        /* Move from CLOSED to DISCONNECTED state. */
+        blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+        blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
+        blkif_state  = BLKIF_STATE_DISCONNECTED;
+
+        /* Construct an interface-CONNECT message for the domain controller. */
+        cmsg.type      = CMSG_BLKIF_FE;
+        cmsg.subtype   = CMSG_BLKIF_FE_INTERFACE_CONNECT;
+        cmsg.length    = sizeof(blkif_fe_interface_connect_t);
+        up.handle      = 0;
+        up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
+        memcpy(cmsg.msg, &up, sizeof(up));
+        
+        /* Tell the controller to bring up the interface. */
+        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
         break;
 
     case BLKIF_INTERFACE_STATUS_CONNECTED:
@@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
                    " in state %d\n", blkif_state);
             break;
         }
+
         blkif_evtchn = status->evtchn;
-        blkif_statechange_tq.routine = blkif_bringup_phase2;
-        schedule_task(&blkif_statechange_tq);
+        blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
+        (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
+        
+        /* Probe for discs that are attached to the interface. */
+        xlvbd_init();
+        
+        blkif_state = BLKIF_STATE_CONNECTED;
+        
+        /* Kick pending requests. */
+        spin_lock_irq(&io_request_lock);
+        kick_pending_request_queues();
+        spin_unlock_irq(&io_request_lock);
         break;
 
     default:
@@ -675,7 +671,11 @@ int __init xlblk_init(void)
     ctrl_msg_t                       cmsg;
     blkif_fe_driver_status_changed_t st;
 
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx);
+    if ( start_info.flags & SIF_INITDOMAIN )
+        return 0;
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type      = CMSG_BLKIF_FE;
index b26907192af3145ef14592ab6e76a93c9b95b346..12ce976cb5f3b1ad819da409fb9bf8e37f8d8bf0 100644 (file)
@@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *disk_info)
     memset(&req, 0, sizeof(req));
     req.operation   = BLKIF_OP_PROBE;
     req.nr_segments = 1;
-    req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
+    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
 
     blkif_control_send(&req, &rsp);
 
index e01896385b5507854602a577ec3a0c40af675c26..244f309467cf08e5ad00e711523675467e908834 100644 (file)
@@ -513,7 +513,7 @@ static int __init xencons_init(void)
     }
     else
     {
-        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx);
+        (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0);
     }
 
     printk("Xen virtual console successfully installed\n");
index e0e43ff2ccdccc27f7b64d3d3d5580c0763a9ae4..cf1b07503100d7f352538b6a7869b5bacf9a0ac7 100644 (file)
@@ -10,8 +10,6 @@
 
 static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
 {
-    DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype);
-    
     switch ( msg->subtype )
     {
     case CMSG_NETIF_BE_CREATE:
@@ -54,7 +52,8 @@ void netif_ctrlif_init(void)
     ctrl_msg_t                       cmsg;
     netif_be_driver_status_changed_t st;
 
-    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx);
+    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type      = CMSG_NETIF_BE;
index 8623d8214ba9f9cb52494d8440ed9f76a8daceb0..b6a9cff69242ba2e2e63033bd3324737aad34d50 100644 (file)
@@ -7,6 +7,7 @@
  */
 
 #include "common.h"
+#include <linux/rtnetlink.h>
 
 #define NETIF_HASHSZ 1024
 #define NETIF_HASH(_d,_h) \
@@ -14,6 +15,7 @@
 
 static netif_t *netif_hash[NETIF_HASHSZ];
 static struct net_device *bridge_dev;
+static struct net_bridge *bridge_br;
 
 netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
 {
@@ -36,8 +38,10 @@ void __netif_disconnect_complete(netif_t *netif)
      */
     unbind_evtchn_from_irq(netif->evtchn);
     vfree(netif->tx); /* Frees netif->rx as well. */
-    (void)br_del_if((struct net_bridge *)bridge_dev->priv, netif->dev);
+    rtnl_lock();
+    (void)br_del_if(bridge_br, netif->dev);
     (void)dev_close(netif->dev);
+    rtnl_unlock();
 
     /* Construct the deferred response message. */
     cmsg.type         = CMSG_NETIF_BE;
@@ -73,7 +77,7 @@ void netif_create(netif_be_create_t *create)
     struct net_device *dev;
     netif_t          **pnetif, *netif;
 
-    dev = alloc_netdev(sizeof(netif_t), "netif-be-%d", ether_setup);
+    dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup);
     if ( dev == NULL )
     {
         DPRINTK("Could not create netif: out of memory\n");
@@ -111,7 +115,10 @@ void netif_create(netif_be_create_t *create)
     dev->hard_start_xmit = netif_be_start_xmit;
     dev->get_stats       = netif_be_get_stats;
     memcpy(dev->dev_addr, create->mac, ETH_ALEN);
-    
+
+    /* XXX In bridge mode we should force a different MAC from remote end. */
+    dev->dev_addr[2] ^= 1;
+
     if ( register_netdev(dev) != 0 )
     {
         DPRINTK("Could not register new net device\n");
@@ -225,15 +232,27 @@ void netif_connect(netif_be_connect_t *connect)
     netif->status         = CONNECTED;
     netif_get(netif);
 
+    rtnl_lock();
+
     (void)dev_open(netif->dev);
-    (void)br_add_if((struct net_bridge *)bridge_dev->priv, netif->dev);
-    /* At this point we try to ensure that eth0 is attached to the bridge. */
+    (void)br_add_if(bridge_br, netif->dev);
+
+    /*
+     * The default config is a very simple binding to eth0.
+     * If eth0 is being used as an IP interface by this OS then someone
+     * must add eth0's IP address to nbe-br, and change the routing table
+     * to refer to nbe-br instead of eth0.
+     */
+    (void)dev_open(bridge_dev);
     if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
     {
         (void)dev_open(eth0_dev);
-        (void)br_add_if((struct net_bridge *)bridge_dev->priv, eth0_dev);
+        (void)br_add_if(bridge_br, eth0_dev);
     }
-    (void)request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif);
+
+    rtnl_unlock();
+
+    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
     netif_start_queue(netif->dev);
 
     connect->status = NETIF_BE_STATUS_OKAY;
@@ -271,8 +290,11 @@ int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
 void netif_interface_init(void)
 {
     memset(netif_hash, 0, sizeof(netif_hash));
-    if ( br_add_bridge("netif-backend") != 0 )
+    if ( br_add_bridge("nbe-br") != 0 )
         BUG();
-    bridge_dev = __dev_get_by_name("netif-be-bridge");
-    (void)dev_open(bridge_dev);
+    bridge_dev = __dev_get_by_name("nbe-br");
+    bridge_br  = (struct net_bridge *)bridge_dev->priv;
+    bridge_br->bridge_hello_time = bridge_br->hello_time = 0;
+    bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
+    bridge_br->stp_enabled = 0;
 }
index 5b84eba9bc77589c30dcaf4a3bb3ec66d05b8379..62a4adf27d80ff85a5be09c2349b2c1e57d33ecf 100644 (file)
@@ -14,7 +14,7 @@
 #include <asm/hypervisor-ifs/dom_mem_ops.h>
 
 static void net_tx_action(unsigned long unused);
-static void tx_skb_release(struct sk_buff *skb);
+static void netif_page_release(struct page *page);
 static void make_tx_response(netif_t *netif, 
                              u16      id,
                              s8       st);
@@ -30,13 +30,13 @@ static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
 #define tx_work_exists(_if) (1)
 
 #define MAX_PENDING_REQS 256
-unsigned long mmap_vstart;
+static unsigned long mmap_vstart;
 #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
 
 #define PKT_PROT_LEN (ETH_HLEN + 20)
 
-/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/
 static u16 pending_id[MAX_PENDING_REQS];
+static netif_t *pending_netif[MAX_PENDING_REQS];
 static u16 pending_ring[MAX_PENDING_REQS];
 static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
 typedef unsigned int PEND_RING_IDX;
@@ -60,8 +60,7 @@ static void __refresh_mfn_list(void)
     op.u.increase.pages = mfn_list;
     if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC )
     {
-        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
-               ret);
+        printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret);
         BUG();
     }
     alloc_index = MAX_MFN_ALLOC;
@@ -100,10 +99,10 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
     netif_t *netif = (netif_t *)dev->priv;
     s8 status = NETIF_RSP_OKAY;
-    u16 size, id;
+    u16 size=0, id;
     mmu_update_t mmu[6];
     pgd_t *pgd; pmd_t *pmd; pte_t *pte;
-    unsigned long vdata, new_mfn;
+    unsigned long vdata, mdata=0, new_mfn;
 
     /* Drop the packet if the target domain has no receive buffers. */
     if ( (netif->rx_req_cons == netif->rx->req_prod) ||
@@ -126,16 +125,23 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
          (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
          ((skb->end - skb->head) < (PAGE_SIZE/2)) )
     {
-        struct sk_buff *nskb = dev_alloc_skb(PAGE_SIZE-1024);
+        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
         int hlen = skb->data - skb->head;
+        if ( unlikely(nskb == NULL) )
+        {
+            DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid);
+            status = NETIF_RSP_ERROR;
+            goto out;
+        }
         skb_reserve(nskb, hlen);
-        skb_put(nskb, skb->len);
+        __skb_put(nskb, skb->len);
         (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
         dev_kfree_skb(skb);
         skb = nskb;
     }
 
     vdata = (unsigned long)skb->data;
+    mdata = virt_to_machine(vdata);
     size  = skb->tail - skb->data;
 
     new_mfn = get_new_mfn();
@@ -153,7 +159,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
     mmu[1].ptr |= MMU_EXTENDED_COMMAND;
     mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H;
 
-    mmu[2].ptr  = virt_to_machine(vdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+    mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
     mmu[2].val  = MMUEXT_REASSIGN_PAGE;
 
     mmu[3].ptr  = MMU_EXTENDED_COMMAND;
@@ -167,6 +173,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
     if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) )
     {
+        DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid);
         dealloc_mfn(new_mfn);
         status = NETIF_RSP_ERROR;
         goto out;
@@ -174,12 +181,12 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
     phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn;
 
-    netif->stats.tx_bytes += size;
-    netif->stats.tx_packets++;
+    netif->stats.rx_bytes += size;
+    netif->stats.rx_packets++;
 
  out:
     spin_lock(&netif->rx_lock);
-    make_rx_response(netif, id, status, virt_to_machine(vdata), size);
+    make_rx_response(netif, id, status, mdata, size);
     spin_unlock(&netif->rx_lock);    
     dev_kfree_skb(skb);
     return 0;
@@ -220,6 +227,16 @@ static void add_to_net_schedule_list_tail(netif_t *netif)
     spin_unlock(&net_schedule_list_lock);
 }
 
+static inline void netif_schedule_work(netif_t *netif)
+{
+    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+    {
+        add_to_net_schedule_list_tail(netif);
+        maybe_schedule_tx_action();
+    }
+}
+
 void netif_deschedule(netif_t *netif)
 {
     remove_from_net_schedule_list(netif);
@@ -229,14 +246,8 @@ void netif_deschedule(netif_t *netif)
 static void tx_credit_callback(unsigned long data)
 {
     netif_t *netif = (netif_t *)data;
-
     netif->remaining_credit = netif->credit_bytes;
-
-    if ( tx_work_exists(netif) )
-    {
-        add_to_net_schedule_list_tail(netif);
-        maybe_schedule_tx_action();
-    }    
+    netif_schedule_work(netif);
 }
 #endif
 
@@ -249,6 +260,7 @@ static void net_tx_action(unsigned long unused)
     u16 pending_idx;
     NETIF_RING_IDX i;
     pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);
+    struct page *page;
 
     while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
             !list_empty(&net_schedule_list) )
@@ -261,7 +273,7 @@ static void net_tx_action(unsigned long unused)
 
         /* Work to do? */
         i = netif->tx_req_cons;
-        if ( (i == netif->tx->req_prod) && 
+        if ( (i == netif->tx->req_prod) ||
              ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
         {
             netif_put(netif);
@@ -296,7 +308,7 @@ static void net_tx_action(unsigned long unused)
         netif->remaining_credit -= tx.size;
 #endif
 
-        add_to_net_schedule_list_tail(netif);
+        netif_schedule_work(netif);
 
         if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
              unlikely(txreq.size > ETH_FRAME_LEN) )
@@ -335,6 +347,7 @@ static void net_tx_action(unsigned long unused)
 
         if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) )
         {
+            DPRINTK("Can't allocate a skb in start_xmit.\n");
             make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
             netif_put(netif);
             vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
@@ -346,29 +359,29 @@ static void net_tx_action(unsigned long unused)
                (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
                PKT_PROT_LEN);
 
-        skb->dev        = netif->dev;
-        skb->protocol   = eth_type_trans(skb, skb->dev);
-        
+        page = virt_to_page(MMAP_VADDR(pending_idx));
+
         /* Append the packet payload as a fragment. */
-        skb_shinfo(skb)->frags[0].page        = 
-            virt_to_page(MMAP_VADDR(pending_idx));
-        skb_shinfo(skb)->frags[0].size        =
-            txreq.size - PKT_PROT_LEN;
+        skb_shinfo(skb)->frags[0].page        = page;
+        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
         skb_shinfo(skb)->frags[0].page_offset = 
             (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
         skb_shinfo(skb)->nr_frags = 1;
         skb->data_len  = txreq.size - PKT_PROT_LEN;
         skb->len      += skb->data_len;
 
+        skb->dev      = netif->dev;
+        skb->protocol = eth_type_trans(skb, skb->dev);
+
         /* Destructor information. */
-        skb->destructor = tx_skb_release;
-        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif;
-        skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx;
+        atomic_set(&page->count, 1);
+        page->mapping = (struct address_space *)netif_page_release;
+        pending_id[pending_idx] = txreq.id;
+        pending_netif[pending_idx] = netif;
 
-        netif->stats.rx_bytes += txreq.size;
-        netif->stats.rx_packets++;
+        netif->stats.tx_bytes += txreq.size;
+        netif->stats.tx_packets++;
 
-        pending_id[pending_idx] = txreq.id;
         pending_cons++;
 
         netif_rx(skb);
@@ -376,28 +389,34 @@ static void net_tx_action(unsigned long unused)
     }
 }
 
-/* Destructor function for tx skbs. */
-static void tx_skb_release(struct sk_buff *skb)
+static void netif_page_release(struct page *page)
 {
     unsigned long flags;
-    netif_t *netif = (netif_t *)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page;
-    u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size;
+    netif_t *netif;
+    u16 pending_idx;
+
+    pending_idx = page - virt_to_page(mmap_vstart);
+
+    netif = pending_netif[pending_idx];
 
     vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
-    
-    skb_shinfo(skb)->nr_frags = 0; 
-    
+        
     spin_lock(&netif->tx_lock);
     make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
     spin_unlock(&netif->tx_lock);
-    
+
+    /*
+     * Scheduling checks must happen after the above response is posted.
+     * This avoids a possible race with a guest OS on another CPU.
+     */
+    mb();
+    netif_schedule_work(netif);
+
     netif_put(netif);
  
     spin_lock_irqsave(&pend_prod_lock, flags);
     pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
     spin_unlock_irqrestore(&pend_prod_lock, flags);
-    maybe_schedule_tx_action();        
 }
 
 #if 0
@@ -493,9 +512,26 @@ static void make_rx_response(netif_t     *netif,
 
 static int __init init_module(void)
 {
+    int i;
+
+    if ( !(start_info.flags & SIF_INITDOMAIN) )
+        return 0;
+
     netif_interface_init();
-    mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS);
+
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
+        BUG();
+
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+
+    spin_lock_init(&net_schedule_list_lock);
+    INIT_LIST_HEAD(&net_schedule_list);
+
     netif_ctrlif_init();
+
     return 0;
 }
 
index af8e660b7c95119f5a769837368437610cb9074b..cc5ac31e826cc0e581c37ff2b42c0e3087d8a3e7 100644 (file)
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
-#include "../netif.h"
+#include <asm/evtchn.h>
+#include <asm/ctrl_if.h>
+#include <asm/hypervisor-ifs/dom_mem_ops.h>
 
-static struct tq_struct netif_statechange_tq;
+#include "../netif.h"
 
 #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
 
-static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
 static void network_tx_buf_gc(struct net_device *dev);
 static void network_alloc_rx_buffers(struct net_device *dev);
 static void cleanup_module(void);
 
-/* Dynamically-mapped IRQs. */
-static int network_irq, debug_irq;
-
 static struct list_head dev_list;
 
 struct net_private
@@ -47,7 +45,7 @@ struct net_private
     struct net_device *dev;
 
     struct net_device_stats stats;
-    NET_RING_IDX rx_resp_cons, tx_resp_cons;
+    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
     unsigned int tx_full;
     
     netif_tx_interface_t *tx;
@@ -69,8 +67,8 @@ struct net_private
      * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
      * array is an index into a chain of free entries.
      */
-    struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1];
-    struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1];
+    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
+    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
 };
 
 /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
@@ -91,7 +89,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle)
     {
         np = list_entry(ent, struct net_private, list);
         if ( np->handle == handle )
-            return np;
+            return np->dev;
     }
     return NULL;
 }
@@ -100,8 +98,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle)
 static int network_open(struct net_device *dev)
 {
     struct net_private *np = dev->priv;
-    netop_t netop;
-    int i, ret;
+    int i;
 
     if ( np->state != NETIF_STATE_CONNECTED )
         return -EINVAL;
@@ -111,15 +108,16 @@ static int network_open(struct net_device *dev)
     spin_lock_init(&np->tx_lock);
 
     /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
-    for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ )
+    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
         np->tx_skbs[i] = (void *)(i+1);
-    for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ )
+    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
         np->rx_skbs[i] = (void *)(i+1);
 
     wmb();
     np->state = NETIF_STATE_ACTIVE;
 
     network_alloc_rx_buffers(dev);
+    np->rx->event = np->rx_resp_cons + 1;
 
     netif_start_queue(dev);
 
@@ -131,18 +129,17 @@ static int network_open(struct net_device *dev)
 
 static void network_tx_buf_gc(struct net_device *dev)
 {
-    NET_RING_IDX i, prod;
+    NETIF_RING_IDX i, prod;
     unsigned short id;
     struct net_private *np = dev->priv;
     struct sk_buff *skb;
-    tx_entry_t *tx_ring = np->net_ring->tx_ring;
 
     do {
-        prod = np->net_idx->tx_resp_prod;
+        prod = np->tx->resp_prod;
 
         for ( i = np->tx_resp_cons; i != prod; i++ )
         {
-            id  = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
+            id  = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id;
             skb = np->tx_skbs[id];
             ADD_ID_TO_FREELIST(np->tx_skbs, id);
             dev_kfree_skb_any(skb);
@@ -158,14 +155,14 @@ static void network_tx_buf_gc(struct net_device *dev)
          * in such cases notification from Xen is likely to be the only kick
          * that we'll get.
          */
-        np->net_idx->tx_event = 
-            prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
+        np->tx->event = 
+            prod + ((np->tx->req_prod - prod) >> 1) + 1;
         mb();
     }
-    while ( prod != np->net_idx->tx_resp_prod );
+    while ( prod != np->tx->resp_prod );
 
     if ( np->tx_full && 
-         ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) )
+         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
     {
         np->tx_full = 0;
         if ( np->state == NETIF_STATE_ACTIVE )
@@ -189,10 +186,14 @@ static void network_alloc_rx_buffers(struct net_device *dev)
     unsigned short id;
     struct net_private *np = dev->priv;
     struct sk_buff *skb;
-    netop_t netop;
-    NET_RING_IDX i = np->net_idx->rx_req_prod;
-
-    if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || 
+    NETIF_RING_IDX i = np->rx->req_prod;
+    dom_mem_op_t op;
+    unsigned long pfn_array[NETIF_RX_RING_SIZE];
+    int ret, nr_pfns = 0;
+    pte_t *pte;
+
+    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
+    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
          unlikely(np->state != NETIF_STATE_ACTIVE) )
         return;
 
@@ -209,13 +210,13 @@ static void network_alloc_rx_buffers(struct net_device *dev)
         id = GET_ID_FROM_FREELIST(np->rx_skbs);
         np->rx_skbs[id] = skb;
 
-        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id   = id;
-        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 
-            virt_to_machine(get_ppte(skb->head));
-
-        np->rx_bufs_to_notify++;
+        np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;
+        
+        pte = get_ppte(skb->head);
+        pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT;
+        queue_l1_entry_update(pte, 0);
     }
-    while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
+    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
 
     /*
      * We may have allocated buffers which have entries outstanding in the page
@@ -223,17 +224,16 @@ static void network_alloc_rx_buffers(struct net_device *dev)
      */
     flush_page_update_queue();
 
-    np->net_idx->rx_req_prod = i;
-    np->net_idx->rx_event    = np->rx_resp_cons + 1;
-        
-    /* Batch Xen notifications. */
-    if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
+    op.op = MEMOP_RESERVATION_DECREASE;
+    op.u.decrease.size  = nr_pfns;
+    op.u.decrease.pages = pfn_array;
+    if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns )
     {
-        netop.cmd = NETOP_PUSH_BUFFERS;
-        netop.vif = np->idx;
-        (void)HYPERVISOR_net_io_op(&netop);
-        np->rx_bufs_to_notify = 0;
+        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
+        BUG();
     }
+
+    np->rx->req_prod = i;
 }
 
 
@@ -241,9 +241,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
     unsigned short id;
     struct net_private *np = (struct net_private *)dev->priv;
-    tx_req_entry_t *tx;
-    netop_t netop;
-    NET_RING_IDX i;
+    netif_tx_request_t *tx;
+    NETIF_RING_IDX i;
 
     if ( unlikely(np->tx_full) )
     {
@@ -262,27 +261,27 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
         memcpy(new_skb->data, skb->data, skb->len);
         dev_kfree_skb(skb);
         skb = new_skb;
-    }   
+    }
     
     spin_lock_irq(&np->tx_lock);
 
-    i = np->net_idx->tx_req_prod;
+    i = np->tx->req_prod;
 
     id = GET_ID_FROM_FREELIST(np->tx_skbs);
     np->tx_skbs[id] = skb;
 
-    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
+    tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req;
 
     tx->id   = id;
-    tx->addr = phys_to_machine(virt_to_phys(skb->data));
+    tx->addr = virt_to_machine(skb->data);
     tx->size = skb->len;
 
     wmb();
-    np->net_idx->tx_req_prod = i + 1;
+    np->tx->req_prod = i + 1;
 
     network_tx_buf_gc(dev);
 
-    if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
+    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
     {
         np->tx_full = 1;
         netif_stop_queue(dev);
@@ -295,12 +294,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
     /* Only notify Xen if there are no outstanding responses. */
     mb();
-    if ( np->net_idx->tx_resp_prod == i )
-    {
-        netop.cmd = NETOP_PUSH_BUFFERS;
-        netop.vif = np->idx;
-        (void)HYPERVISOR_net_io_op(&netop);
-    }
+    if ( np->tx->resp_prod == i )
+        notify_via_evtchn(np->evtchn);
 
     return 0;
 }
@@ -312,22 +307,24 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
     struct net_private *np = dev->priv;
     unsigned long flags;
     struct sk_buff *skb;
-    rx_resp_entry_t *rx;
-    NET_RING_IDX i;
+    netif_rx_response_t *rx;
+    NETIF_RING_IDX i;
+    mmu_update_t mmu[2];
+    pte_t *pte;
 
     spin_lock_irqsave(&np->tx_lock, flags);
     network_tx_buf_gc(dev);
     spin_unlock_irqrestore(&np->tx_lock, flags);
 
  again:
-    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
+    for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ )
     {
-        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
+        rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;
 
         skb = np->rx_skbs[rx->id];
         ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
 
-        if ( unlikely(rx->status != RING_STATUS_OK) )
+        if ( unlikely(rx->status <= 0) )
         {
             /* Gate this error. We get a (valid) slew of them on suspend. */
             if ( np->state == NETIF_STATE_ACTIVE )
@@ -336,6 +333,17 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
             continue;
         }
 
+        /* Remap the page. */
+        pte = get_ppte(skb->head);
+        mmu[0].ptr  = virt_to_machine(pte);
+        mmu[0].val  = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
+        mmu[1].ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
+        mmu[1].val  = __pa(skb->head) >> PAGE_SHIFT;
+        if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
+            BUG();
+        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
+            rx->addr >> PAGE_SHIFT;
+
         /*
         * Set up shinfo -- from alloc_skb. This was particularly nasty: the
          * shared info is hidden at the back of the data area (presumably so it
@@ -348,13 +356,13 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
         phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
             (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
 
-        skb->data = skb->tail = skb->head + rx->offset;
-        skb_put(skb, rx->size);
+        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
+        skb_put(skb, rx->status);
         skb->protocol = eth_type_trans(skb, dev);
 
         np->stats.rx_packets++;
 
-        np->stats.rx_bytes += rx->size;
+        np->stats.rx_bytes += rx->status;
         netif_rx(skb);
         dev->last_rx = jiffies;
     }
@@ -362,10 +370,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
     np->rx_resp_cons = i;
 
     network_alloc_rx_buffers(dev);
+    np->rx->event = np->rx_resp_cons + 1;
     
     /* Deal with hypervisor racing our resetting of rx_event. */
     mb();
-    if ( np->net_idx->rx_resp_prod != i )
+    if ( np->rx->resp_prod != i )
         goto again;
 }
 
@@ -373,16 +382,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
 static int network_close(struct net_device *dev)
 {
     struct net_private *np = dev->priv;
-    netop_t netop;
 
     netif_stop_queue(np->dev);
 
-    netop.cmd = NETOP_FLUSH_BUFFERS;
-    netop.vif = np->idx;
-    (void)HYPERVISOR_net_io_op(&netop);
-
-    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
-            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
+    while ( (np->rx_resp_cons != np->rx->req_prod) ||
+            (np->tx_resp_cons != np->tx->req_prod) )
     {
         barrier();
         current->state = TASK_INTERRUPTIBLE;
@@ -406,55 +410,12 @@ static struct net_device_stats *network_get_stats(struct net_device *dev)
 }
 
 
-static void netif_bringup_phase1(void *unused)
+static void netif_status_change(netif_fe_interface_status_changed_t *status)
 {
     ctrl_msg_t                   cmsg;
     netif_fe_interface_connect_t up;
     struct net_device *dev;
     struct net_private *np;
-
-    dev = find_dev_by_handle(0);
-    np  = dev->priv;
-    
-    /* Move from CLOSED to DISCONNECTED state. */
-    np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
-    np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
-    memset(np->tx, 0, PAGE_SIZE);
-    memset(np->rx, 0, PAGE_SIZE);
-    np->state  = NETIF_STATE_DISCONNECTED;
-
-    /* Construct an interface-CONNECT message for the domain controller. */
-    cmsg.type      = CMSG_NETIF_FE;
-    cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
-    cmsg.length    = sizeof(netif_fe_interface_connect_t);
-    up.handle      = 0;
-    up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
-    up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
-    memcpy(cmsg.msg, &up, sizeof(up));
-
-    /* Tell the controller to bring up the interface. */
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void netif_bringup_phase2(void *unused)
-{
-    struct net_device *dev;
-    struct net_private *np;
-
-    dev = find_dev_by_handle(0);
-    np  = dev->priv;
-    
-    np->irq = bind_evtchn_to_irq(np->evtchn);
-    (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
-                      "netif", dev);
-
-    np->state = NETIF_STATE_CONNECTED;
-}
-
-static void netif_status_change(netif_fe_interface_status_changed_t *status)
-{
-    struct net_device *dev;
-    struct net_private *np;
     
     if ( status->handle != 0 )
     {
@@ -470,31 +431,53 @@ static void netif_status_change(netif_fe_interface_status_changed_t *status)
     {
     case NETIF_INTERFACE_STATUS_DESTROYED:
         printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
-               netif_state);
+               np->state);
         break;
 
     case NETIF_INTERFACE_STATUS_DISCONNECTED:
         if ( np->state != NETIF_STATE_CLOSED )
         {
             printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
-                   " in state %d\n", netif_state);
+                   " in state %d\n", np->state);
             break;
         }
-        netif_statechange_tq.routine = netif_bringup_phase1;
-        schedule_task(&netif_statechange_tq);
+
+        /* Move from CLOSED to DISCONNECTED state. */
+        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
+        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
+        memset(np->tx, 0, PAGE_SIZE);
+        memset(np->rx, 0, PAGE_SIZE);
+        np->state  = NETIF_STATE_DISCONNECTED;
+
+        /* Construct an interface-CONNECT message for the domain controller. */
+        cmsg.type      = CMSG_NETIF_FE;
+        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
+        cmsg.length    = sizeof(netif_fe_interface_connect_t);
+        up.handle      = 0;
+        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
+        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
+        memcpy(cmsg.msg, &up, sizeof(up));
+        
+        /* Tell the controller to bring up the interface. */
+        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
         break;
 
     case NETIF_INTERFACE_STATUS_CONNECTED:
         if ( np->state == NETIF_STATE_CLOSED )
         {
             printk(KERN_WARNING "Unexpected netif-CONNECTED message"
-                   " in state %d\n", netif_state);
+                   " in state %d\n", np->state);
             break;
         }
-        np->evtchn = status->evtchn;
+
         memcpy(dev->dev_addr, status->mac, ETH_ALEN);
-        netif_statechange_tq.routine = netif_bringup_phase2;
-        schedule_task(&netif_statechange_tq);
+
+        np->evtchn = status->evtchn;
+        np->irq = bind_evtchn_to_irq(np->evtchn);
+        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
+                      dev->name, dev);
+        
+        np->state = NETIF_STATE_CONNECTED;
         break;
 
     default:
@@ -532,10 +515,13 @@ static int __init init_module(void)
 {
     ctrl_msg_t                       cmsg;
     netif_fe_driver_status_changed_t st;
-    int i, err;
+    int err;
     struct net_device *dev;
     struct net_private *np;
 
+    if ( start_info.flags & SIF_INITDOMAIN )
+        return 0;
+
     INIT_LIST_HEAD(&dev_list);
 
     if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
@@ -562,7 +548,8 @@ static int __init init_module(void)
     np->dev = dev;
     list_add(&np->list, &dev_list);
 
-    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
+    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
 
     /* Send a driver-UP notification to the domain controller. */
     cmsg.type      = CMSG_NETIF_FE;
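
As an aside (not part of this changeset): the frontend now drives the shared tx/rx rings directly with free-running NETIF_RING_IDX producer/consumer counters, masking them into a slot with MASK_NET_TX_IDX/MASK_NET_RX_IDX and kicking the backend over an event channel only when no responses are outstanding. The stand-alone C sketch below illustrates the free-running-index idiom, including the '(prod - cons) == ring size' full test used in network_start_xmit(); RING_SIZE, MASK_IDX and the toy ring are invented stand-ins for the NETIF_* definitions, not the driver's data structures.

    #include <stdio.h>

    /* Toy ring mirroring the free-running index idiom used by the netif
     * frontend: indices only ever increase and wrap naturally as unsigned
     * ints; they are reduced modulo the (power-of-two) ring size only when
     * used to address a slot. */
    #define RING_SIZE 8u                        /* must be a power of two */
    #define MASK_IDX(i) ((i) & (RING_SIZE - 1))

    static unsigned int ring[RING_SIZE];

    int main(void)
    {
        unsigned int req_prod = 0, resp_cons = 0;

        /* Produce five requests, retire three responses, produce five more. */
        for (unsigned int n = 0; n < 5; n++)
            ring[MASK_IDX(req_prod++)] = n;
        resp_cons += 3;
        for (unsigned int n = 5; n < 10; n++) {
            if (req_prod - resp_cons == RING_SIZE)
                break;                          /* ring full: same test as the tx path */
            ring[MASK_IDX(req_prod++)] = n;
        }

        printf("outstanding entries: %u\n", req_prod - resp_cons);
        return 0;
    }
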
index 715f707eb091c3510a6d69ed8e96d6ec07c0633e..19cb9a33260b231b7b4251e8b13c542a862df74d 100644 (file)
@@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_action;
 static CONTROL_RING_IDX ctrl_if_tx_resp_cons;
 static CONTROL_RING_IDX ctrl_if_rx_req_cons;
 
-/* Incoming message requests: primary message type -> message handler. */
+/* Incoming message requests. */
+    /* Primary message type -> message handler. */
 static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256];
+    /* Primary message type -> callback in process context? */
+static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)];
+    /* Is it late enough during bootstrap to use schedule_task()? */
+static int safe_to_schedule_task;
+    /* Passed to schedule_task(). */
+static struct tq_struct ctrl_if_rxmsg_deferred_tq;
+    /* Queue up messages to be handled in process context. */
+static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE];
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod;
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons;
 
 /* Incoming message responses: message identifier -> message handler/id. */
 static struct {
@@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigned long data)
     }
 }
 
+static void __ctrl_if_rxmsg_deferred(void *unused)
+{
+    ctrl_msg_t *msg;
+
+    while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
+    {
+        msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
+            ctrl_if_rxmsg_deferred_cons++)];
+        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
+    }
+}
+
 static void __ctrl_if_rx_tasklet(unsigned long data)
 {
     control_if_t *ctrl_if = get_ctrl_if();
-    ctrl_msg_t   *msg;
+    ctrl_msg_t    msg, *pmsg;
 
     while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
     {
-        /*
-         * We need no locking or barriers here. There will be one and only one
-         * response as a result of each callback, so the callback handler
-         * doesn't need to worry about the 'msg' being overwritten until:
-         *  1. It returns (if the message must persist then it must be copied).
-         *  2. A response is sent (the response may overwrite the request).
-         */
-        msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
-        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
+        pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
+        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
+        if ( msg.length != 0 )
+            memcpy(msg.msg, pmsg->msg, msg.length);
+        if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) )
+        {
+            pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
+                ctrl_if_rxmsg_deferred_prod++)];
+            memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
+            schedule_task(&ctrl_if_rxmsg_deferred_tq);
+        }
+        else
+        {
+            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
+        }
     }
 }
 
@@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *msg)
     ctrl_if_notify_controller();
 }
 
-int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd)
+int ctrl_if_register_receiver(
+    u8 type, 
+    ctrl_msg_handler_t hnd, 
+    unsigned int flags)
 {
-    unsigned long flags;
+    unsigned long _flags;
     int inuse;
 
-    spin_lock_irqsave(&ctrl_if_lock, flags);
+    spin_lock_irqsave(&ctrl_if_lock, _flags);
 
     inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler);
 
     if ( inuse )
+    {
         printk(KERN_INFO "Receiver %p already established for control "
                "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type);
+    }
     else
+    {
         ctrl_if_rxmsg_handler[type] = hnd;
+        clear_bit(type, &ctrl_if_rxmsg_blocking_context);
+        if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
+        {
+            set_bit(type, &ctrl_if_rxmsg_blocking_context);
+            if ( !safe_to_schedule_task )
+                BUG();
+        }
+    }
 
-    spin_unlock_irqrestore(&ctrl_if_lock, flags);
+    spin_unlock_irqrestore(&ctrl_if_lock, _flags);
 
     return !inuse;
 }
@@ -326,6 +369,7 @@ void __init ctrl_if_init(void)
 
     for ( i = 0; i < 256; i++ )
         ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler;
+    ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred;
 
     spin_lock_init(&ctrl_if_lock);
 
@@ -333,6 +377,15 @@ void __init ctrl_if_init(void)
 }
 
 
+/* This is called after it is safe to call schedule_task(). */
+static int __init ctrl_if_late_setup(void)
+{
+    safe_to_schedule_task = 1;
+    return 0;
+}
+__initcall(ctrl_if_late_setup);
+
+
 /*
  * !! The following are DANGEROUS FUNCTIONS !!
  * Use with care [for example, see xencons_force_flush()].
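
For context (not part of this changeset): the receive tasklet above now copies each request out of the shared ring and, for message types registered with CALLBACK_IN_BLOCKING_CONTEXT, parks the copy in ctrl_if_rxmsg_deferred so that schedule_task() can deliver it from process context, where the handler may sleep. A minimal userspace sketch of that copy-then-defer pattern follows; every name in it (NR_TYPES, DEFER_RING, handle_now(), handle_deferred(), and so on) is invented for the sketch.

    #include <stdio.h>

    /* Copy-then-defer dispatch: messages whose type is marked as needing a
     * blocking context are copied into a private ring and handled later
     * from "process context" instead of directly in the "tasklet". */
    #define NR_TYPES      256
    #define DEFER_RING    16u
    #define MASK(i)       ((i) & (DEFER_RING - 1))
    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    struct msg { unsigned char type; char payload[32]; };

    static unsigned long blocking_type[NR_TYPES / BITS_PER_LONG];
    static struct msg deferred[DEFER_RING];
    static unsigned int defer_prod, defer_cons;

    static void mark_blocking(unsigned char t)
    { blocking_type[t / BITS_PER_LONG] |= 1UL << (t % BITS_PER_LONG); }

    static int type_is_blocking(unsigned char t)
    { return (blocking_type[t / BITS_PER_LONG] >> (t % BITS_PER_LONG)) & 1; }

    static void handle_now(const struct msg *m)        /* "tasklet" handler */
    { printf("immediate: type %u (%s)\n", (unsigned)m->type, m->payload); }

    static void handle_deferred(void)                  /* "process context" */
    {
        while (defer_cons != defer_prod) {
            const struct msg *m = &deferred[MASK(defer_cons++)];
            printf("deferred:  type %u (%s)\n", (unsigned)m->type, m->payload);
        }
    }

    int main(void)
    {
        mark_blocking(9);                              /* type 9 handlers may sleep */

        struct msg in[2] = { { 3, "fast path" }, { 9, "needs to sleep" } };
        for (int i = 0; i < 2; i++) {
            if (type_is_blocking(in[i].type))
                deferred[MASK(defer_prod++)] = in[i];  /* copy now, handle later */
            else
                handle_now(&in[i]);
        }
        handle_deferred();                             /* stands in for schedule_task() */
        return 0;
    }
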
index 20a934adddc2d7a08876628002a3fed6cfdcfc0a..d219c284030b8ae3636f8460f3fd4c267718f9a4 100644 (file)
@@ -1626,7 +1626,7 @@ int __init blk_dev_init(void)
        jsfd_init();
 #endif
 
-#ifdef CONFIG_XEN_VBD
+#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO)
     xlblk_init();
 #endif
 
index a02e2471ea7d53d3933e3f3ebaf7afc77cd7f4fe..5bc6cc22b12bb4fb48184e5eb3ecb0969660ee70 100644 (file)
@@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *msg);
  * Register a receiver for typed messages from the domain controller. The 
  * handler (@hnd) is called for every received message of specified @type.
  * Returns TRUE (non-zero) if the handler was successfully registered.
+ * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
+ * occur in a context in which it is safe to yield (i.e., process context).
  */
-int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd);
+#define CALLBACK_IN_BLOCKING_CONTEXT 1
+int ctrl_if_register_receiver(
+    u8 type, 
+    ctrl_msg_handler_t hnd,
+    unsigned int flags);
 
 /*
  * Unregister a receiver for typed messages from the domain controller. The 
index f5243bb6a7761be4ae1b454b20ddb201980ca7ed..5ab5fe9bfcfbf7f336ac5efd39ecc387b770474f 100644 (file)
@@ -159,46 +159,11 @@ extern void iounmap(void *addr);
 extern void *bt_ioremap(unsigned long offset, unsigned long size);
 extern void bt_iounmap(void *addr, unsigned long size);
 
-#ifdef CONFIG_XEN_PHYSDEV_ACCESS
-
-#ifdef CONFIG_HIGHMEM
-#error "Highmem is not yet compatible with physical device access"
-#endif
-
-/*
- * The bus translation macros need special care if we are executing device
- * accesses to/from other domains' memory. In these cases the virtual address
- * is actually a temporary mapping in the 'vmalloc' space. The physical
- * address will therefore be >max_low_pfn, and will not have a valid entry
- * in the phys_to_mach mapping table.
- */
-static inline unsigned long phys_to_bus(unsigned long phys)
-{
-    extern unsigned long max_pfn;
-    pgd_t *pgd; pmd_t *pmd; pte_t *pte;
-    void *addr;
-    unsigned long bus;
-    if ( (phys >> PAGE_SHIFT) < max_pfn )
-        return phys_to_machine(phys);
-    addr = phys_to_virt(phys);
-    pgd = pgd_offset_k(   (unsigned long)addr);
-    pmd = pmd_offset(pgd, (unsigned long)addr);
-    pte = pte_offset(pmd, (unsigned long)addr);
-    bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK);
-    return bus;
-}
-
-#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x))
-#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
-#define page_to_bus(_x) phys_to_bus(page_to_phys(_x))
-
-#else
-
 #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x))
 #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
 #define page_to_bus(_x) phys_to_machine(page_to_phys(_x))
-
-#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
+#define bus_to_phys(_x) machine_to_phys(_x)
+#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT))
 
 /*
  * readX/writeX() are used to access memory mapped devices. On some
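
A side note (not part of this changeset): with the CONFIG_XEN_PHYSDEV_ACCESS special case gone, virt_to_bus()/bus_to_virt() always translate through the pseudo-physical <-> machine frame tables. The stand-alone sketch below shows that round trip with made-up tables; NR_FRAMES and the frame numbers are arbitrary, and nothing here is the real implementation.

    #include <assert.h>
    #include <stdio.h>

    /* Toy pseudo-physical <-> machine translation, mirroring the idea
     * behind virt_to_bus()/bus_to_virt() in io.h. Frame numbers are
     * invented for the example. */
    #define PAGE_SHIFT 12
    #define NR_FRAMES  4

    static unsigned long phys_to_machine_mapping[NR_FRAMES] = { 7, 2, 5, 9 };
    static unsigned long machine_to_phys_mapping[16];

    static unsigned long phys_to_machine(unsigned long phys)
    {
        return (phys_to_machine_mapping[phys >> PAGE_SHIFT] << PAGE_SHIFT) |
               (phys & ((1UL << PAGE_SHIFT) - 1));
    }

    static unsigned long machine_to_phys(unsigned long mach)
    {
        return (machine_to_phys_mapping[mach >> PAGE_SHIFT] << PAGE_SHIFT) |
               (mach & ((1UL << PAGE_SHIFT) - 1));
    }

    int main(void)
    {
        for (unsigned long pfn = 0; pfn < NR_FRAMES; pfn++)
            machine_to_phys_mapping[phys_to_machine_mapping[pfn]] = pfn;

        unsigned long phys = (2UL << PAGE_SHIFT) | 0x123;  /* page 2, offset 0x123 */
        unsigned long bus  = phys_to_machine(phys);
        assert(machine_to_phys(bus) == phys);              /* round trip holds */
        printf("phys %#lx <-> bus %#lx\n", phys, bus);
        return 0;
    }
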
diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pci.h b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h
new file mode 100644 (file)
index 0000000..74ae5ba
--- /dev/null
@@ -0,0 +1,283 @@
+#ifndef __i386_PCI_H
+#define __i386_PCI_H
+
+#include <linux/config.h>
+
+#ifdef __KERNEL__
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+   already-configured bus numbers - to be used for buggy BIOSes
+   or architectures with incomplete PCI setup by the loader */
+
+#ifdef CONFIG_PCI
+extern unsigned int pcibios_assign_all_busses(void);
+#else
+#define pcibios_assign_all_busses()    0
+#endif
+#define pcibios_scan_all_fns()         0
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO         0x1000
+#define PCIBIOS_MIN_MEM                (pci_mem_start)
+
+void pcibios_config_init(void);
+struct pci_bus * pcibios_scan_root(int bus);
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/scatterlist.h>
+#include <linux/string.h>
+#include <asm/io.h>
+
+struct pci_dev;
+
+/* The networking and block device layers use this boolean for bounce
+ * buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS    (0)
+
+/* Allocate and map kernel buffer using consistent mode DMA for a device.
+ * hwdev should be valid struct pci_dev pointer for PCI devices,
+ * NULL for PCI-like buses (ISA, EISA).
+ * Returns non-NULL cpu-view pointer to the buffer if successful and
+ * sets *dma_addrp to the pci side dma address as well, else *dma_addrp
+ * is undefined.
+ */
+extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
+                                 dma_addr_t *dma_handle);
+
+/* Free and unmap a consistent DMA buffer.
+ * cpu_addr is what was returned from pci_alloc_consistent,
+ * size must be the same as what as passed into pci_alloc_consistent,
+ * and likewise dma_addr must be the same as what *dma_addrp was set to.
+ *
+ * References to the memory and mappings associated with cpu_addr/dma_addr
+ * past this call are illegal.
+ */
+extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
+                               void *vaddr, dma_addr_t dma_handle);
+
+/* Map a single buffer of the indicated size for DMA in streaming mode.
+ * The 32-bit bus address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory
+ * until either pci_unmap_single or pci_dma_sync_single is performed.
+ */
+static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
+                                       size_t size, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       flush_write_buffers();
+       return virt_to_bus(ptr);
+}
+
+/* Unmap a single streaming mode DMA translation.  The dma_addr and size
+ * must match what was provided for in a previous pci_map_single call.  All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
+                                   size_t size, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       /* Nothing to do */
+}
+
+/*
+ * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t, identical
+ * to pci_map_single, but takes a struct page instead of a virtual address
+ */
+static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
+                                     unsigned long offset, size_t size, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+
+       return page_to_bus(page) + offset;
+}
+
+static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
+                                 size_t size, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       /* Nothing to do */
+}
+
+/* pci_unmap_{page,single} is a nop so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME)         (0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL)        do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME)           (0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL)  do { } while (0)
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+                            int nents, int direction)
+{
+       int i;
+
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       /*
+        * temporary 2.4 hack
+        */
+       for (i = 0; i < nents; i++ ) {
+               if (sg[i].address && sg[i].page)
+                       out_of_line_bug();
+               else if (!sg[i].address && !sg[i].page)
+                       out_of_line_bug();
+               if (sg[i].address)
+                       sg[i].dma_address = virt_to_bus(sg[i].address);
+               else
+                       sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
+       }
+       flush_write_buffers();
+       return nents;
+}
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+                               int nents, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       /* Nothing to do */
+}
+
+/* Make physical memory consistent for a single
+ * streaming mode DMA translation after a transfer.
+ *
+ * If you perform a pci_map_single() but wish to interrogate the
+ * buffer using the cpu, yet do not wish to teardown the PCI dma
+ * mapping, you must call this function before doing so.  At the
+ * next point you give the PCI dma address back to the card, the
+ * device again owns the buffer.
+ */
+static inline void pci_dma_sync_single(struct pci_dev *hwdev,
+                                      dma_addr_t dma_handle,
+                                      size_t size, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       flush_write_buffers();
+}
+
+/* Make physical memory consistent for a set of streaming
+ * mode DMA translations after a transfer.
+ *
+ * The same as pci_dma_sync_single but for a scatter-gather list,
+ * same rules and usage.
+ */
+static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
+                                  struct scatterlist *sg,
+                                  int nelems, int direction)
+{
+       if (direction == PCI_DMA_NONE)
+               out_of_line_bug();
+       flush_write_buffers();
+}
+
+/* Return whether the given PCI device DMA address mask can
+ * be supported properly.  For example, if your device can
+ * only drive the low 24-bits during PCI bus mastering, then
+ * you would pass 0x00ffffff as the mask to this function.
+ */
+static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
+{
+        /*
+         * we fall back to GFP_DMA when the mask isn't all 1s,
+         * so we can't guarantee allocations that must be
+         * within a tighter range than GFP_DMA..
+         */
+        if(mask < 0x00ffffff)
+                return 0;
+
+       return 1;
+}
+
+/* This is always fine. */
+#define pci_dac_dma_supported(pci_dev, mask)   (1)
+
+static __inline__ dma64_addr_t
+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
+{
+       return ((dma64_addr_t) page_to_bus(page) +
+               (dma64_addr_t) offset);
+}
+
+static __inline__ struct page *
+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
+{
+       return bus_to_page(dma_addr);
+}
+
+static __inline__ unsigned long
+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
+{
+       return (dma_addr & ~PAGE_MASK);
+}
+
+static __inline__ void
+pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
+{
+       flush_write_buffers();
+}
+
+/* These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg)     ((sg)->dma_address)
+#define sg_dma_len(sg)         ((sg)->length)
+
+/* Return the index of the PCI controller for device. */
+static inline int pci_controller_num(struct pci_dev *dev)
+{
+       return 0;
+}
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+                              enum pci_mmap_state mmap_state, int write_combine);
+
+#endif /* __KERNEL__ */
+
+#endif /* __i386_PCI_H */
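
For illustration only (not part of this changeset): the comments in this new header describe the usual 2.4 streaming-DMA ownership rules, i.e. once a buffer is mapped the device owns it until it is synced or unmapped. The toy model below restates that discipline as runnable C; the toy_* functions, struct dma_buf and enum owner are all invented stand-ins, not the kernel API.

    #include <assert.h>
    #include <stdio.h>

    /* Toy model of streaming-DMA buffer ownership: after mapping, the
     * device owns the buffer; the CPU may look at it again only after a
     * sync or an unmap. Handing the bus address back to the device would
     * transfer ownership to the device once more. */
    enum owner { OWNER_CPU, OWNER_DEVICE };

    struct dma_buf { char data[64]; enum owner owner; };

    static unsigned long toy_map_single(struct dma_buf *b)
    {
        assert(b->owner == OWNER_CPU);   /* only a CPU-owned buffer may be mapped */
        b->owner = OWNER_DEVICE;
        return (unsigned long)b;         /* pretend bus address */
    }

    static void toy_dma_sync_single(struct dma_buf *b)
    {
        b->owner = OWNER_CPU;            /* CPU may now read what the device wrote */
    }

    static void toy_unmap_single(struct dma_buf *b)
    {
        b->owner = OWNER_CPU;            /* mapping torn down, CPU owns the memory */
    }

    int main(void)
    {
        struct dma_buf buf = { "filled by the cpu", OWNER_CPU };

        unsigned long bus = toy_map_single(&buf);
        /* ... the device would DMA to/from 'bus' here ... */
        toy_dma_sync_single(&buf);
        printf("cpu sees \"%s\" (bus cookie %#lx)\n", buf.data, bus);
        toy_unmap_single(&buf);
        assert(buf.owner == OWNER_CPU);
        return 0;
    }
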
index 46fe4784ad86970bc44616d9b20f4f0f956d08bd..2e9f7b992099df0c10ab6500dcb26d3247da7366 100755 (executable)
@@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h
 ln -sf ../asm-i386/namei.h 
 ln -sf ../asm-i386/param.h 
 ln -sf ../asm-i386/parport.h 
-ln -sf ../asm-i386/pci.h
 ln -sf ../asm-i386/pgtable-3level.h 
 ln -sf ../asm-i386/poll.h 
 ln -sf ../asm-i386/posix_types.h 
diff --git a/xenolinux-2.4.26-sparse/mm/page_alloc.c b/xenolinux-2.4.26-sparse/mm/page_alloc.c
new file mode 100644 (file)
index 0000000..62ed775
--- /dev/null
@@ -0,0 +1,930 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list, the system allocates free pages here.
+ *  Note that kmalloc() lives in slab.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+int nr_swap_pages;
+int nr_active_pages;
+int nr_inactive_pages;
+LIST_HEAD(inactive_list);
+LIST_HEAD(active_list);
+pg_data_t *pgdat_list;
+
+/*
+ *
+ * The zone_table array is used to look up the address of the
+ * struct zone corresponding to a given zone number (ZONE_DMA,
+ * ZONE_NORMAL, or ZONE_HIGHMEM).
+ */
+zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
+static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
+static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+
+int vm_gfp_debug = 0;
+
+/*
+ * Temporary debugging check.
+ */
+#define BAD_RANGE(zone, page)                                          \
+(                                                                      \
+       (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
+       || (((page) - mem_map) < (zone)->zone_start_mapnr)              \
+       || ((zone) != page_zone(page))                                  \
+)
+
+/*
+ * Freeing function for a buddy system allocator.
+ * Contrary to prior comments, this is *NOT* hairy, and there
+ * is no reason for anyone not to understand it.
+ *
+ * The concept of a buddy system is to maintain direct-mapped tables
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep one bit for each pair of blocks, which
+ * is set to 1 iff only one of the pair is allocated.  So when we
+ * are allocating or freeing one, we can derive the state of the
+ * other.  That is, if we allocate a small block, and both were   
+ * free, the remainder of the region must be split into blocks.   
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.            
+ *
+ * -- wli
+ */
+
+static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
+static void __free_pages_ok (struct page *page, unsigned int order)
+{
+       unsigned long index, page_idx, mask, flags;
+       free_area_t *area;
+       struct page *base;
+       zone_t *zone;
+
+       /*
+        * Yes, think what happens when other parts of the kernel take 
+        * a reference to a page in order to pin it for io. -ben
+        */
+       if (PageLRU(page)) {
+               if (unlikely(in_interrupt()))
+                       BUG();
+               lru_cache_del(page);
+       }
+
+       if (page->buffers)
+               BUG();
+       if (page->mapping)
+               return (*(void(*)(struct page *))page->mapping)(page);
+       if (!VALID_PAGE(page))
+               BUG();
+       if (PageLocked(page))
+               BUG();
+       if (PageActive(page))
+               BUG();
+       ClearPageReferenced(page);
+       ClearPageDirty(page);
+
+       if (current->flags & PF_FREE_PAGES)
+               goto local_freelist;
+ back_local_freelist:
+
+       zone = page_zone(page);
+
+       mask = (~0UL) << order;
+       base = zone->zone_mem_map;
+       page_idx = page - base;
+       if (page_idx & ~mask)
+               BUG();
+       index = page_idx >> (1 + order);
+
+       area = zone->free_area + order;
+
+       spin_lock_irqsave(&zone->lock, flags);
+
+       zone->free_pages -= mask;
+
+       while (mask + (1 << (MAX_ORDER-1))) {
+               struct page *buddy1, *buddy2;
+
+               if (area >= zone->free_area + MAX_ORDER)
+                       BUG();
+               if (!__test_and_change_bit(index, area->map))
+                       /*
+                        * the buddy page is still allocated.
+                        */
+                       break;
+               /*
+                * Move the buddy up one level.
+                * This code is taking advantage of the identity:
+                *      -mask = 1+~mask
+                */
+               buddy1 = base + (page_idx ^ -mask);
+               buddy2 = base + page_idx;
+               if (BAD_RANGE(zone,buddy1))
+                       BUG();
+               if (BAD_RANGE(zone,buddy2))
+                       BUG();
+
+               list_del(&buddy1->list);
+               mask <<= 1;
+               area++;
+               index >>= 1;
+               page_idx &= mask;
+       }
+       list_add(&(base + page_idx)->list, &area->free_list);
+
+       spin_unlock_irqrestore(&zone->lock, flags);
+       return;
+
+ local_freelist:
+       if (current->nr_local_pages)
+               goto back_local_freelist;
+       if (in_interrupt())
+               goto back_local_freelist;               
+
+       list_add(&page->list, &current->local_pages);
+       page->index = order;
+       current->nr_local_pages++;
+}
+
+#define MARK_USED(index, order, area) \
+       __change_bit((index) >> (1+(order)), (area)->map)
+
+static inline struct page * expand (zone_t *zone, struct page *page,
+        unsigned long index, int low, int high, free_area_t * area)
+{
+       unsigned long size = 1 << high;
+
+       while (high > low) {
+               if (BAD_RANGE(zone,page))
+                       BUG();
+               area--;
+               high--;
+               size >>= 1;
+               list_add(&(page)->list, &(area)->free_list);
+               MARK_USED(index, high, area);
+               index += size;
+               page += size;
+       }
+       if (BAD_RANGE(zone,page))
+               BUG();
+       return page;
+}
+
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
+static struct page * rmqueue(zone_t *zone, unsigned int order)
+{
+       free_area_t * area = zone->free_area + order;
+       unsigned int curr_order = order;
+       struct list_head *head, *curr;
+       unsigned long flags;
+       struct page *page;
+
+       spin_lock_irqsave(&zone->lock, flags);
+       do {
+               head = &area->free_list;
+               curr = head->next;
+
+               if (curr != head) {
+                       unsigned int index;
+
+                       page = list_entry(curr, struct page, list);
+                       if (BAD_RANGE(zone,page))
+                               BUG();
+                       list_del(curr);
+                       index = page - zone->zone_mem_map;
+                       if (curr_order != MAX_ORDER-1)
+                               MARK_USED(index, curr_order, area);
+                       zone->free_pages -= 1UL << order;
+
+                       page = expand(zone, page, index, order, curr_order, area);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+
+                       set_page_count(page, 1);
+                       if (BAD_RANGE(zone,page))
+                               BUG();
+                       if (PageLRU(page))
+                               BUG();
+                       if (PageActive(page))
+                               BUG();
+                       return page;    
+               }
+               curr_order++;
+               area++;
+       } while (curr_order < MAX_ORDER);
+       spin_unlock_irqrestore(&zone->lock, flags);
+
+       return NULL;
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+       return __alloc_pages(gfp_mask, order,
+               contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+}
+#endif
+
+static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
+static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
+{
+       struct page * page = NULL;
+       int __freed;
+
+       if (in_interrupt())
+               BUG();
+
+       current->allocation_order = order;
+       current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
+
+       __freed = try_to_free_pages_zone(classzone, gfp_mask);
+
+       current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
+
+       if (current->nr_local_pages) {
+               struct list_head * entry, * local_pages;
+               struct page * tmp;
+               int nr_pages;
+
+               local_pages = &current->local_pages;
+
+               if (likely(__freed)) {
+                       /* pick from the last inserted so we're lifo */
+                       entry = local_pages->next;
+                       do {
+                               tmp = list_entry(entry, struct page, list);
+                               if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
+                                       list_del(entry);
+                                       current->nr_local_pages--;
+                                       set_page_count(tmp, 1);
+                                       page = tmp;
+
+                                       if (page->buffers)
+                                               BUG();
+                                       if (page->mapping)
+                                               BUG();
+                                       if (!VALID_PAGE(page))
+                                               BUG();
+                                       if (PageLocked(page))
+                                               BUG();
+                                       if (PageLRU(page))
+                                               BUG();
+                                       if (PageActive(page))
+                                               BUG();
+                                       if (PageDirty(page))
+                                               BUG();
+
+                                       break;
+                               }
+                       } while ((entry = entry->next) != local_pages);
+               }
+
+               nr_pages = current->nr_local_pages;
+               /* free in reverse order so that the global order will be lifo */
+               while ((entry = local_pages->prev) != local_pages) {
+                       list_del(entry);
+                       tmp = list_entry(entry, struct page, list);
+                       __free_pages_ok(tmp, tmp->index);
+                       if (!nr_pages--)
+                               BUG();
+               }
+               current->nr_local_pages = 0;
+       }
+
+       *freed = __freed;
+       return page;
+}
+
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
+{
+       long free = zone->free_pages - (1UL << order);
+       return free >= 0 ? free : 0;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator:
+ */
+struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
+{
+       zone_t **zone, * classzone;
+       struct page * page;
+       int freed, class_idx;
+
+       zone = zonelist->zones;
+       classzone = *zone;
+       class_idx = zone_idx(classzone);
+
+       for (;;) {
+               zone_t *z = *(zone++);
+               if (!z)
+                       break;
+
+               if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
+                       page = rmqueue(z, order);
+                       if (page)
+                               return page;
+               }
+       }
+
+       classzone->need_balance = 1;
+       mb();
+       if (waitqueue_active(&kswapd_wait))
+               wake_up_interruptible(&kswapd_wait);
+
+       zone = zonelist->zones;
+       for (;;) {
+               unsigned long min;
+               zone_t *z = *(zone++);
+               if (!z)
+                       break;
+
+               min = z->watermarks[class_idx].min;
+               if (!(gfp_mask & __GFP_WAIT))
+                       min >>= 2;
+               if (zone_free_pages(z, order) > min) {
+                       page = rmqueue(z, order);
+                       if (page)
+                               return page;
+               }
+       }
+
+       /* here we're in the low on memory slow path */
+
+       if ((current->flags & PF_MEMALLOC) && 
+                       (!in_interrupt() || (current->flags & PF_MEMDIE))) {
+               zone = zonelist->zones;
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
+
+                       page = rmqueue(z, order);
+                       if (page)
+                               return page;
+               }
+               return NULL;
+       }
+
+       /* Atomic allocations - we can't balance anything */
+       if (!(gfp_mask & __GFP_WAIT))
+               goto out;
+
+ rebalance:
+       page = balance_classzone(classzone, gfp_mask, order, &freed);
+       if (page)
+               return page;
+
+       zone = zonelist->zones;
+       if (likely(freed)) {
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
+
+                       if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
+                               page = rmqueue(z, order);
+                               if (page)
+                                       return page;
+                       }
+               }
+               goto rebalance;
+       } else {
+               /* 
+                * Check whether another task has been killed in the
+                * meantime, in which case the allocation can succeed.
+                */
+               for (;;) {
+                       zone_t *z = *(zone++);
+                       if (!z)
+                               break;
+
+                       if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
+                               page = rmqueue(z, order);
+                               if (page)
+                                       return page;
+                       }
+               }
+       }
+
+ out:
+       printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
+              order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+       if (unlikely(vm_gfp_debug))
+               dump_stack();
+       return NULL;
+}
+
+/*
+ * Common helper functions.
+ */
+unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+{
+       struct page * page;
+
+       page = alloc_pages(gfp_mask, order);
+       if (!page)
+               return 0;
+       return (unsigned long) page_address(page);
+}
+
+unsigned long get_zeroed_page(unsigned int gfp_mask)
+{
+       struct page * page;
+
+       page = alloc_pages(gfp_mask, 0);
+       if (page) {
+               void *address = page_address(page);
+               clear_page(address);
+               return (unsigned long) address;
+       }
+       return 0;
+}
+
+void __free_pages(struct page *page, unsigned int order)
+{
+       if (!PageReserved(page) && put_page_testzero(page))
+               __free_pages_ok(page, order);
+}
+
+void free_pages(unsigned long addr, unsigned int order)
+{
+       if (addr != 0)
+               __free_pages(virt_to_page(addr), order);
+}
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages (void)
+{
+       unsigned int sum = 0;
+       zone_t *zone;
+
+       for_each_zone(zone)
+               sum += zone->free_pages;
+
+       return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+       pg_data_t *pgdat;
+       unsigned int sum = 0;
+       zonelist_t *zonelist;
+       zone_t **zonep, *zone;
+
+       for_each_pgdat(pgdat) {
+               int class_idx;
+               zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
+               zonep = zonelist->zones;
+               zone = *zonep;
+               class_idx = zone_idx(zone);
+
+               sum += zone->nr_cache_pages;
+               for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+                       int free = zone->free_pages - zone->watermarks[class_idx].high;
+                       if (free <= 0)
+                               continue;
+                       sum += free;
+               }
+       }
+
+       return sum;
+}
+
+#if CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+       pg_data_t *pgdat;
+       unsigned int pages = 0;
+
+       for_each_pgdat(pgdat)
+               pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+       return pages;
+}
+
+unsigned int freeable_lowmem(void)
+{
+       unsigned int pages = 0;
+       pg_data_t *pgdat;
+
+       for_each_pgdat(pgdat) {
+               pages += pgdat->node_zones[ZONE_DMA].free_pages;
+               pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
+               pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
+               pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
+               pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
+               pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
+       }
+
+       return pages;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ */
+void show_free_areas_core(pg_data_t *pgdat)
+{
+       unsigned int order;
+       unsigned type;
+       pg_data_t *tmpdat = pgdat;
+
+       printk("Free pages:      %6dkB (%6dkB HighMem)\n",
+               K(nr_free_pages()),
+               K(nr_free_highpages()));
+
+       while (tmpdat) {
+               zone_t *zone;
+               for (zone = tmpdat->node_zones;
+                               zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
+                       printk("Zone:%s freepages:%6lukB\n", 
+                                       zone->name,
+                                       K(zone->free_pages));
+                       
+               tmpdat = tmpdat->node_next;
+       }
+
+       printk("( Active: %d, inactive: %d, free: %d )\n",
+              nr_active_pages,
+              nr_inactive_pages,
+              nr_free_pages());
+
+       for (type = 0; type < MAX_NR_ZONES; type++) {
+               struct list_head *head, *curr;
+               zone_t *zone = pgdat->node_zones + type;
+               unsigned long nr, total, flags;
+
+               total = 0;
+               if (zone->size) {
+                       spin_lock_irqsave(&zone->lock, flags);
+                       for (order = 0; order < MAX_ORDER; order++) {
+                               head = &(zone->free_area + order)->free_list;
+                               curr = head;
+                               nr = 0;
+                               for (;;) {
+                                       if ((curr = curr->next) == head)
+                                               break;
+                                       nr++;
+                               }
+                               total += nr * (1 << order);
+                               printk("%lu*%lukB ", nr, K(1UL) << order);
+                       }
+                       spin_unlock_irqrestore(&zone->lock, flags);
+               }
+               printk("= %lukB)\n", K(total));
+       }
+
+#ifdef SWAP_CACHE_INFO
+       show_swap_cache_info();
+#endif 
+}
+
+void show_free_areas(void)
+{
+       show_free_areas_core(pgdat_list);
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ */
+static inline void build_zonelists(pg_data_t *pgdat)
+{
+       int i, j, k;
+
+       for (i = 0; i <= GFP_ZONEMASK; i++) {
+               zonelist_t *zonelist;
+               zone_t *zone;
+
+               zonelist = pgdat->node_zonelists + i;
+               memset(zonelist, 0, sizeof(*zonelist));
+
+               j = 0;
+               k = ZONE_NORMAL;
+               if (i & __GFP_HIGHMEM)
+                       k = ZONE_HIGHMEM;
+               if (i & __GFP_DMA)
+                       k = ZONE_DMA;
+
+               switch (k) {
+                       default:
+                               BUG();
+                       /*
+                        * fallthrough:
+                        */
+                       case ZONE_HIGHMEM:
+                               zone = pgdat->node_zones + ZONE_HIGHMEM;
+                               if (zone->size) {
+#ifndef CONFIG_HIGHMEM
+                                       BUG();
+#endif
+                                       zonelist->zones[j++] = zone;
+                               }
+                       case ZONE_NORMAL:
+                               zone = pgdat->node_zones + ZONE_NORMAL;
+                               if (zone->size)
+                                       zonelist->zones[j++] = zone;
+                       case ZONE_DMA:
+                               zone = pgdat->node_zones + ZONE_DMA;
+                               if (zone->size)
+                                       zonelist->zones[j++] = zone;
+               }
+               zonelist->zones[j++] = NULL;
+       } 
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE    256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+       unsigned long size = 1;
+
+       pages /= PAGES_PER_WAITQUEUE;
+
+       while (size < pages)
+               size <<= 1;
+
+       /*
+        * Once we have dozens or even hundreds of threads sleeping
+        * on IO we've got bigger problems than wait queue collision.
+        * Limit the size of the wait table to a reasonable size.
+        */
+       size = min(size, 4096UL);
+
+       return size;
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+       return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
+       unsigned long *zones_size, unsigned long zone_start_paddr, 
+       unsigned long *zholes_size, struct page *lmem_map)
+{
+       unsigned long i, j;
+       unsigned long map_size;
+       unsigned long totalpages, offset, realtotalpages;
+       const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+
+       if (zone_start_paddr & ~PAGE_MASK)
+               BUG();
+
+       totalpages = 0;
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               unsigned long size = zones_size[i];
+               totalpages += size;
+       }
+       realtotalpages = totalpages;
+       if (zholes_size)
+               for (i = 0; i < MAX_NR_ZONES; i++)
+                       realtotalpages -= zholes_size[i];
+                       
+       printk("On node %d totalpages: %lu\n", nid, realtotalpages);
+
+       /*
+        * Some architectures (with lots of mem and discontiguous memory
+        * maps) have to search for a good mem_map area:
+        * For discontigmem, the conceptual mem map array starts from 
+        * PAGE_OFFSET, we need to align the actual array onto a mem map 
+        * boundary, so that MAP_NR works.
+        */
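+       /*
+        * Editor's worked example (hypothetical numbers): assuming
+        * MAP_ALIGN() rounds an offset up to a multiple of
+        * sizeof(struct page), as its use below suggests, then with a
+        * 64-byte struct page an alloc_bootmem_node() result of
+        * PAGE_OFFSET + 0x12345 would be realigned to PAGE_OFFSET +
+        * 0x12380, so that (lmem_map - mem_map) arithmetic and MAP_NR()
+        * land on whole struct page boundaries.
+        */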
+       map_size = (totalpages + 1)*sizeof(struct page);
+       if (lmem_map == (struct page *)0) {
+               lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
+               lmem_map = (struct page *)(PAGE_OFFSET + 
+                       MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
+       }
+       *gmap = pgdat->node_mem_map = lmem_map;
+       pgdat->node_size = totalpages;
+       pgdat->node_start_paddr = zone_start_paddr;
+       pgdat->node_start_mapnr = (lmem_map - mem_map);
+       pgdat->nr_zones = 0;
+
+       offset = lmem_map - mem_map;    
+       for (j = 0; j < MAX_NR_ZONES; j++) {
+               zone_t *zone = pgdat->node_zones + j;
+               unsigned long mask;
+               unsigned long size, realsize;
+               int idx;
+
+               zone_table[nid * MAX_NR_ZONES + j] = zone;
+               realsize = size = zones_size[j];
+               if (zholes_size)
+                       realsize -= zholes_size[j];
+
+               printk("zone(%lu): %lu pages.\n", j, size);
+               zone->size = size;
+               zone->realsize = realsize;
+               zone->name = zone_names[j];
+               zone->lock = SPIN_LOCK_UNLOCKED;
+               zone->zone_pgdat = pgdat;
+               zone->free_pages = 0;
+               zone->need_balance = 0;
+               zone->nr_active_pages = zone->nr_inactive_pages = 0;
+
+               if (!size)
+                       continue;
+
+               /*
+                * The per-page waitqueue mechanism uses hashed waitqueues
+                * per zone.
+                */
+               zone->wait_table_size = wait_table_size(size);
+               zone->wait_table_shift =
+                       BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
+               zone->wait_table = (wait_queue_head_t *)
+                       alloc_bootmem_node(pgdat, zone->wait_table_size
+                                               * sizeof(wait_queue_head_t));
+
+               for(i = 0; i < zone->wait_table_size; ++i)
+                       init_waitqueue_head(zone->wait_table + i);
+
+               pgdat->nr_zones = j+1;
+
+               mask = (realsize / zone_balance_ratio[j]);
+               if (mask < zone_balance_min[j])
+                       mask = zone_balance_min[j];
+               else if (mask > zone_balance_max[j])
+                       mask = zone_balance_max[j];
+               zone->watermarks[j].min = mask;
+               zone->watermarks[j].low = mask*2;
+               zone->watermarks[j].high = mask*3;
+               /* now set the watermarks of the lower zones in the "j" classzone */
+               for (idx = j-1; idx >= 0; idx--) {
+                       zone_t * lower_zone = pgdat->node_zones + idx;
+                       unsigned long lower_zone_reserve;
+                       if (!lower_zone->size)
+                               continue;
+
+                       mask = lower_zone->watermarks[idx].min;
+                       lower_zone->watermarks[j].min = mask;
+                       lower_zone->watermarks[j].low = mask*2;
+                       lower_zone->watermarks[j].high = mask*3;
+
+                       /* now the subtler part: add the lower-zone reserve for this classzone */
+                       lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
+                       lower_zone->watermarks[j].min += lower_zone_reserve;
+                       lower_zone->watermarks[j].low += lower_zone_reserve;
+                       lower_zone->watermarks[j].high += lower_zone_reserve;
+
+                       realsize += lower_zone->realsize;
+               }
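+
+               /*
+                * Editor's worked example with made-up numbers (the
+                * zone_balance_* and lower_zone_reserve_ratio defaults live
+                * earlier in this file; the min/max clamping above is ignored
+                * here for simplicity).  If ZONE_NORMAL has realsize 224000
+                * pages and zone_balance_ratio[ZONE_NORMAL] == 128, then
+                * mask == 1750, giving watermarks min/low/high of
+                * 1750/3500/5250.  For ZONE_DMA beneath it, with
+                * lower_zone_reserve_ratio[0] == 256, an extra
+                * 224000/256 == 875 pages are added to DMA's watermarks in
+                * the ZONE_NORMAL classzone, keeping that many DMA pages out
+                * of reach of plain GFP_KERNEL allocations.
+                */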
+
+               zone->zone_mem_map = mem_map + offset;
+               zone->zone_start_mapnr = offset;
+               zone->zone_start_paddr = zone_start_paddr;
+
+               if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
+                       printk("BUG: wrong zone alignment, it will crash\n");
+
+               /*
+                * Initially all pages are reserved - free ones are freed
+                * up by free_all_bootmem() once the early boot process is
+                * done. Non-atomic initialization, single-pass.
+                */
+               for (i = 0; i < size; i++) {
+                       struct page *page = mem_map + offset + i;
+                       set_page_zone(page, nid * MAX_NR_ZONES + j);
+                       set_page_count(page, 0);
+                       SetPageReserved(page);
+                       INIT_LIST_HEAD(&page->list);
+                       if (j != ZONE_HIGHMEM)
+                               set_page_address(page, __va(zone_start_paddr));
+                       zone_start_paddr += PAGE_SIZE;
+               }
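+
+               /*
+                * Editor's worked example (illustrative): with PAGE_OFFSET at
+                * 0xC0000000, a non-highmem zone starting at physical
+                * 0x01000000 gets its first page's virtual address set to
+                * __va(0x01000000) == 0xC1000000 by the loop above; highmem
+                * pages are skipped because they have no permanent kernel
+                * mapping.
+                */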
+
+               offset += size;
+               for (i = 0; ; i++) {
+                       unsigned long bitmap_size;
+
+                       INIT_LIST_HEAD(&zone->free_area[i].free_list);
+                       if (i == MAX_ORDER-1) {
+                               zone->free_area[i].map = NULL;
+                               break;
+                       }
+
+                       /*
+                        * Page buddy system uses "index >> (i+1)",
+                        * where "index" is at most "size-1".
+                        *
+                        * The extra "+3" is to round down to byte
+                        * size (8 bits per byte assumption). Thus
+                        * we get "(size-1) >> (i+4)" as the last byte
+                        * we can access.
+                        *
+                        * The "+1" is because we want to round the
+                        * byte allocation up rather than down. So
+                        * we should have had a "+7" before we shifted
+                        * down by three. Also, we have to add one as
+                        * we actually _use_ the last bit (it's [0,n]
+                        * inclusive, not [0,n[).
+                        *
+                        * So we actually had +7+1 before we shift
+                        * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+                        * (modulo overflows, which we do not have).
+                        *
+                        * Finally, we LONG_ALIGN because all bitmap
+                        * operations are on longs.
+                        */
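+                       /*
+                        * Editor's worked example: for a zone of 1048576
+                        * pages at order i == 0, the last byte index is
+                        * (1048575 >> 4) == 65535, so bitmap_size becomes
+                        * LONG_ALIGN(65536) == 65536 bytes; at order i == 1
+                        * it halves to 32768, and so on up to MAX_ORDER-2.
+                        */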
+                       bitmap_size = (size-1) >> (i+4);
+                       bitmap_size = LONG_ALIGN(bitmap_size+1);
+                       zone->free_area[i].map = 
+                         (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+               }
+       }
+       build_zonelists(pgdat);
+}
+
+void __init free_area_init(unsigned long *zones_size)
+{
+       free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+}
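+
+/*
+ * Editor's sketch (illustrative only; real callers live in each
+ * architecture's paging_init()): a UMA setup for a machine with 256 MiB
+ * of RAM and a 16 MiB DMA zone might fill zones_size like this before
+ * calling free_area_init().  The function name and numbers are invented.
+ */
+#if 0  /* illustrative sketch only */
+static void __init example_paging_init(void)
+{
+       unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };
+
+       zones_size[ZONE_DMA]    = 16 << (20 - PAGE_SHIFT);     /*  16 MiB */
+       zones_size[ZONE_NORMAL] = 240 << (20 - PAGE_SHIFT);    /* 240 MiB */
+       /* no highmem on this example machine */
+
+       free_area_init(zones_size);
+}
+#endif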
+
+static int __init setup_mem_frac(char *str)
+{
+       int j = 0;
+
+       while (get_option(&str, &zone_balance_ratio[j++]) == 2);
+       printk("setup_mem_frac: ");
+       for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
+       printk("\n");
+       return 1;
+}
+
+__setup("memfrac=", setup_mem_frac);
+
+static int __init setup_lower_zone_reserve(char *str)
+{
+       int j = 0;
+
+       while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
+       printk("setup_lower_zone_reserve: ");
+       for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
+       printk("\n");
+       return 1;
+}
+
+__setup("lower_zone_reserve=", setup_lower_zone_reserve);