Fix nasty tools race between barking xu_autoreap() and xpopen3.wait() -
author: shand@ubuntu.eng.hq.xensource.com <shand@ubuntu.eng.hq.xensource.com>
Mon, 29 Aug 2005 05:40:36 +0000 (21:40 -0800)
committer: shand@ubuntu.eng.hq.xensource.com <shand@ubuntu.eng.hq.xensource.com>
Mon, 29 Aug 2005 05:40:36 +0000 (21:40 -0800)
save/restore now seems more robust, at least from the tools' point of view.

Signed-off-by: Steven Hand <steven@xensource.com>
tools/libxc/xc_linux_save.c
tools/python/xen/xend/XendCheckpoint.py
tools/python/xen/xend/server/SrvDaemon.py

index f83d9a5481b74599c025be9293d26c2f24647f95..a4ae0c8db7fab759b70646cbddc8631f5a0f5786 100644 (file)
@@ -763,8 +763,6 @@ int xc_linux_save(int xc_handle, int io_fd, u32 dom)
                 batch++;
             }
      
-//            DPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
-
             if ( batch == 0 )
                 goto skip; /* vanishingly unlikely... */
       
@@ -915,7 +913,7 @@ int xc_linux_save(int xc_handle, int io_fd, u32 dom)
             continue;
         }
 
-        if ( last_iter ) break;
+        if ( last_iter ) break; 
 
         if ( live )
         {
index e2a7d574ed61608c448c1fb84df01a4d0d1c019a..1fdcda04e24d65c7421518a691b7e23423750a28 100644 (file)
@@ -51,7 +51,7 @@ def save(xd, fd, dominfo):
     p = select.poll()
     p.register(child.fromchild.fileno())
     p.register(child.childerr.fileno())
-    while True:
+    while True: 
         r = p.poll()
         for (fd, event) in r:
             if not event & select.POLLIN:
@@ -69,8 +69,9 @@ def save(xd, fd, dominfo):
                         try:
                             dominfo.db.releaseDomain(dominfo.id)
                         except Exception, ex:
-                            log.warning("error in domain release on xenstore: %s",
-                                        ex)
+                            log.warning(
+                                "error in domain release on xenstore: %s",
+                                ex)
                             pass
                     dominfo.state_wait("suspended")
                     log.info("suspend %d done" % dominfo.id)
index 2b0d6b3b2799823f8d37faa8917f7bc082715a06..0a8060fb4742b7a745b05ee6ceef4b02742da203 100644 (file)
@@ -42,7 +42,8 @@ class Daemon:
         self.traceon = 0
         self.tracefile = None
         self.traceindent = 0
-
+        self.child = 0 
+        
     def daemon_pids(self):
         pids = []
         pidex = '(?P<pid>\d+)'
@@ -140,15 +141,12 @@ class Daemon:
         else:
             return 0
 
-    def install_child_reaper(self):
-        #signal.signal(signal.SIGCHLD, self.onSIGCHLD)
-        # Ensure that zombie children are automatically reaped.
-        xu.autoreap()
-
     def onSIGCHLD(self, signum, frame):
-        code = 1
-        while code > 0:
-            code = os.waitpid(-1, os.WNOHANG)
+        if self.child > 0: 
+            try: 
+                pid, sts = os.waitpid(self.child, os.WNOHANG)
+            except os.error, ex:
+                pass
 
     def fork_pid(self, pidfile):
         """Fork and write the pid of the child to 'pidfile'.
@@ -156,13 +154,16 @@ class Daemon:
         @param pidfile: pid file
         @return: pid of child in parent, 0 in child
         """
-        pid = os.fork()
-        if pid:
+
+        self.child = os.fork()
+
+        if self.child:
             # Parent
             pidfile = open(pidfile, 'w')
-            pidfile.write(str(pid))
+            pidfile.write(str(self.child))
             pidfile.close()
-        return pid
+
+        return self.child
 
     def daemonize(self):
         if not XEND_DAEMONIZE: return
@@ -203,8 +204,7 @@ class Daemon:
             # Trying to run an already-running service is a success.
             return 0
 
-        self.install_child_reaper()
-
+        signal.signal(signal.SIGCHLD, self.onSIGCHLD)
         if self.fork_pid(XEND_PID_FILE):
             #Parent. Sleep to give child time to start.
             time.sleep(1)
@@ -309,7 +309,7 @@ class Daemon:
             print >>sys.stderr, 'Exception starting xend:', ex
             if XEND_DEBUG:
                 traceback.print_exc()
-            log.exception("Exception starting xend")
+            log.exception("Exception starting xend (%s)" % ex)
             self.exit(1)
             
     def createFactories(self):