_MACRO(steal_peer_idle) \
_MACRO(steal_peer_running) \
_MACRO(steal_peer_pinned) \
+ _MACRO(steal_peer_migrating) \
+ _MACRO(steal_peer_best_idler) \
+ _MACRO(steal_loner_candidate) \
+ _MACRO(steal_loner_signal) \
_MACRO(dom_init) \
_MACRO(dom_destroy) \
_MACRO(vcpu_init) \
#define CSCHED_VCPU_CHECK(_vc)
#endif
+/*
+ * Indicates which of two given idle CPUs would be the more efficient
+ * place to run an additional VCPU.
+ *
+ * Returns:
+ * 0: 'one' and 'two' are equally good.
+ * negative: 'one' is less efficient than 'two'.
+ * positive: 'one' is more efficient than 'two'.
+ */
+static int
+csched_idler_compare(int one, int two)
+{
+ cpumask_t idlers;
+ cpumask_t one_idlers;
+ cpumask_t two_idlers;
+
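+    /*
+     * Snapshot the idle CPUs, excluding the two candidates themselves
+     * so that only their idling neighbours get counted.
+     */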
+ idlers = csched_priv.idlers;
+ cpu_clear(one, idlers);
+ cpu_clear(two, idlers);
+
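+    /*
+     * If the two candidates share a socket, compare their idling
+     * hyperthread siblings; otherwise, compare the idlers in each
+     * candidate's socket.
+     */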
+ if ( cpu_isset(one, cpu_core_map[two]) )
+ {
+ cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
+ cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
+ }
+ else
+ {
+ cpus_and(one_idlers, idlers, cpu_core_map[one]);
+ cpus_and(two_idlers, idlers, cpu_core_map[two]);
+ }
+
+ return cpus_weight(one_idlers) - cpus_weight(two_idlers);
+}
+
static inline int
-__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
{
/*
* Don't pick up work that's in the peer's scheduling tail. Also only pick
return 1;
}
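+/*
+ * Check whether the VCPU currently running on a remote CPU would be
+ * better off running on local_cpu instead: it must be allowed to run
+ * on local_cpu, must not already be migrating, and local_cpu must be
+ * a strictly better idle location than its current processor.
+ */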
+static inline int
+__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
+{
+ BUG_ON( is_idle_vcpu(vc) );
+
+ if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
+ {
+ CSCHED_STAT_CRANK(steal_peer_pinned);
+ return 0;
+ }
+
+ if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
+ {
+ CSCHED_STAT_CRANK(steal_peer_migrating);
+ return 0;
+ }
+
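+    /* Only steal if this CPU is a strictly better idle location. */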
+ if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
+ {
+ CSCHED_STAT_CRANK(steal_peer_best_idler);
+ return 0;
+ }
+
+ return 1;
+}
+
static void
csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
{
xfree(sdom);
}
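+/*
+ * Choose a CPU for this VCPU to run on, preferring idle CPUs whose
+ * neighbouring threads and cores are also idle.
+ */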
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+ cpumask_t cpus;
+ int cpu, nxt;
+
+ /*
+ * Pick from online CPUs in VCPU's affinity mask, giving a
+ * preference to its current processor if it's in there.
+ */
+ cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+ ASSERT( !cpus_empty(cpus) );
+ cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);
+
+ /*
+ * Try to find an idle processor within the above constraints.
+ */
+ cpus_and(cpus, cpus, csched_priv.idlers);
+ if ( !cpus_empty(cpus) )
+ {
+ cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
+ cpu_clear(cpu, cpus);
+
+ /*
+ * In multi-core and multi-threaded CPUs, not all idle execution
+ * vehicles are equal!
+ *
+ * We give preference to the idle execution vehicle with the most
+ * idling neighbours in its grouping. This distributes work across
+ * distinct cores first and guarantees we don't do something stupid
+ * like run two VCPUs on co-hyperthreads while there are idle cores
+ * or sockets.
+ */
+ while ( !cpus_empty(cpus) )
+ {
+ nxt = first_cpu(cpus);
+
+ if ( csched_idler_compare(cpu, nxt) < 0 )
+ {
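+                /* 'nxt' has more idling neighbours: prefer it. */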
+ cpu = nxt;
+ cpu_clear(nxt, cpus);
+ }
+ else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
+ {
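+                /* 'nxt' is no better and shares cpu's socket: drop its core. */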
+ cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
+ }
+ else
+ {
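+                /* 'nxt' is in another socket and no better: drop that socket. */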
+ cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
+ }
+
+ ASSERT( !cpu_isset(nxt, cpus) );
+ }
+ }
+
+ return cpu;
+}
+
/*
* This is a O(n) optimized sort of the runq.
*
vc = speer->vcpu;
BUG_ON( is_idle_vcpu(vc) );
- if ( __csched_vcpu_is_stealable(cpu, vc) )
+ if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
{
/* We got a candidate. Grab it! */
__runq_remove(speer);
struct csched_pcpu *spc;
struct vcpu *peer_vcpu;
cpumask_t workers;
+ cpumask_t loners;
int peer_cpu;
if ( snext->pri == CSCHED_PRI_IDLE )
/*
* Peek at non-idling CPUs in the system
*/
+ cpus_clear(loners);
cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
cpu_clear(cpu, workers);
continue;
}
- spc = CSCHED_PCPU(peer_cpu);
peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+ spc = CSCHED_PCPU(peer_cpu);
if ( unlikely(spc == NULL) )
{
CSCHED_STAT_CRANK(steal_peer_down);
- speer = NULL;
}
else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
{
* pick up work from it itself.
*/
CSCHED_STAT_CRANK(steal_peer_idle);
- speer = NULL;
+ }
+ else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
+ {
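+            /*
+             * The peer has nothing queued behind its running VCPU. If
+             * this CPU is about to go idle and that VCPU would run more
+             * efficiently here, note the peer for the second pass below.
+             */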
+ if ( snext->pri == CSCHED_PRI_IDLE &&
+ __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
+ {
+ CSCHED_STAT_CRANK(steal_loner_candidate);
+ cpu_set(peer_cpu, loners);
+ }
}
else
{
- /* Try to steal work from an online non-idle CPU. */
+ /* Try to steal work from a remote CPU's runq. */
speer = csched_runq_steal(spc, cpu, snext->pri);
+ if ( speer != NULL )
+ {
+ spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+ CSCHED_STAT_CRANK(vcpu_migrate);
+ speer->stats.migrate++;
+ return speer;
+ }
}
spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+ }
+
+    /*
+     * If we failed to find any remotely queued VCPUs to move here,
+     * see if it would be more efficient to pull one of the VCPUs
+     * currently running on those remote CPUs over here instead.
+     */
+ while ( !cpus_empty(loners) )
+ {
+ /* For each CPU of interest, starting with our neighbour... */
+ peer_cpu = next_cpu(peer_cpu, loners);
+ if ( peer_cpu == NR_CPUS )
+ peer_cpu = first_cpu(loners);
+
+ cpu_clear(peer_cpu, loners);
- /* Got one? */
- if ( speer )
+ if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
{
- CSCHED_STAT_CRANK(vcpu_migrate);
- speer->stats.migrate++;
- return speer;
+ CSCHED_STAT_CRANK(steal_trylock_failed);
+ continue;
+ }
+
+ peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+ spc = CSCHED_PCPU(peer_cpu);
+
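+        /*
+         * Re-check under the peer's lock: it must still be running a
+         * non-idle VCPU with nothing runnable queued behind it, and
+         * that VCPU must still be worth pulling over here.
+         */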
+ if ( !is_idle_vcpu(peer_vcpu) &&
+ is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
+ __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
+ {
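+            /*
+             * Mark the running VCPU as migrating and kick its CPU to
+             * reschedule; the common scheduler code will then re-place
+             * it via the pick_cpu hook (csched_cpu_pick).
+             */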
+ set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
+ spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+
+ CSCHED_STAT_CRANK(steal_loner_signal);
+ cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
+ }
+ else
+ {
+ spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
}
}
- /* Failed to find more important work */
+ /* Failed to find more important work elsewhere... */
__runq_remove(snext);
return snext;
}
spc = CSCHED_PCPU(cpu);
runq = &spc->runq;
- printk(" tick=%lu, sort=%d\n",
+ printk(" tick=%lu, sort=%d, sibling=0x%lx, core=0x%lx\n",
per_cpu(schedule_data, cpu).tick,
- spc->runq_sort_last);
+ spc->runq_sort_last,
+ cpu_sibling_map[cpu].bits[0],
+ cpu_core_map[cpu].bits[0]);
/* current VCPU */
svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
.adjust = csched_dom_cntl,
+ .pick_cpu = csched_cpu_pick,
.tick = csched_tick,
.do_schedule = csched_schedule,