struct timer ticker;
unsigned int tick;
unsigned int idle_bias;
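+ /*
+ * Count of runnable vCPUs on this pCPU: the ones queued in runq,
+ * plus the currently running one, unless that is the idle vCPU.
+ */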
+ unsigned int nr_runnable;
};
/*
is_idle_vcpu(__runq_elem(RUNQ(cpu)->next)->vcpu);
}
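+
+/*
+ * Helpers for keeping a pCPU's count of runnable vCPUs up to date. The
+ * counter must only be updated with the pCPU's scheduler lock held.
+ */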
+static inline void
+inc_nr_runnable(unsigned int cpu)
+{
+ ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+ CSCHED_PCPU(cpu)->nr_runnable++;
+}
+
+static inline void
+dec_nr_runnable(unsigned int cpu)
+{
+ ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+ ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1);
+ CSCHED_PCPU(cpu)->nr_runnable--;
+}
+
static inline void
__runq_insert(struct csched_vcpu *svc)
{
- const struct list_head * const runq = RUNQ(svc->vcpu->processor);
+ unsigned int cpu = svc->vcpu->processor;
+ const struct list_head * const runq = RUNQ(cpu);
struct list_head *iter;
BUG_ON( __vcpu_on_runq(svc) );
list_add_tail(&svc->runq_elem, iter);
}
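+
+/* Insert svc in the runqueue of its pCPU, accounting for it in nr_runnable. */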
+static inline void
+runq_insert(struct csched_vcpu *svc)
+{
+ __runq_insert(svc);
+ inc_nr_runnable(svc->vcpu->processor);
+}
+
static inline void
__runq_remove(struct csched_vcpu *svc)
{
list_del_init(&svc->runq_elem);
}
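+
+/* Remove svc from the runqueue of its pCPU, updating nr_runnable accordingly. */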
+static inline void
+runq_remove(struct csched_vcpu *svc)
+{
+ dec_nr_runnable(svc->vcpu->processor);
+ __runq_remove(svc);
+}
#define for_each_csched_balance_step(step) \
for ( (step) = 0; (step) <= CSCHED_BALANCE_HARD_AFFINITY; (step)++ )
/* Start off idling... */
BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));
cpumask_set_cpu(cpu, prv->idlers);
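+ /* No vCPU is runnable on this pCPU yet. */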
+ spc->nr_runnable = 0;
}
static void
lock = vcpu_schedule_lock_irq(vc);
if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running )
- __runq_insert(svc);
+ runq_insert(svc);
vcpu_schedule_unlock_irq(lock, vc);
cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
}
else if ( __vcpu_on_runq(svc) )
- __runq_remove(svc);
+ runq_remove(svc);
}
static void
}
/* Put the VCPU on the runq and tickle CPUs */
- __runq_insert(svc);
+ runq_insert(svc);
__runq_tickle(svc);
}
SCHED_VCPU_STAT_CRANK(speer, migrate_q);
SCHED_STAT_CRANK(migrate_queued);
WARN_ON(vc->is_urgent);
- __runq_remove(speer);
+ runq_remove(speer);
vc->processor = cpu;
+ /*
+ * speer will start executing directly on cpu, without having to
+ * go through runq_insert(). So we must update the runnable count
+ * for cpu here.
+ */
+ inc_nr_runnable(cpu);
return speer;
}
}
peer_node = node;
do
{
- /* Find out what the !idle are in this node */
+ /* Select the pCPUs in this node that have work we can steal. */
cpumask_andnot(&workers, online, prv->idlers);
cpumask_and(&workers, &workers, &node_to_cpumask(peer_node));
__cpumask_clear_cpu(cpu, &workers);
goto next_node;
do
{
+ spinlock_t *lock;
+
+ /*
+ * If there is only one runnable vCPU on peer_cpu, it means
+ * there is nothing for us to steal from its runqueue, so skip it.
+ *
+ * Checking this without holding the lock is racy... But that's
+ * the whole point of this optimization!
+ *
+ * In more detail:
+ * - if we race with dec_nr_runnable(), we may try to take the
+ * lock and call csched_runq_steal() for no reason. This is
+ * not a functional issue, and should be infrequent enough.
+ * And we can avoid that by re-checking nr_runnable after
+ * having grabbed the lock, if we want;
+ * - if we race with inc_nr_runnable(), we skip a pCPU that may
+ * have runnable vCPUs in its runqueue, but that's not a
+ * problem because:
+ * + if racing with csched_vcpu_insert() or csched_vcpu_wake(),
+ * __runq_tickle() will be called afterwards, so the vCPU
+ * won't get stuck in the runqueue for too long;
+ * + if racing with csched_runq_steal(), it may be that a
+ * vCPU that we could have picked up, stays in a runqueue
+ * until someone else tries to steal it again. But this is
+ * no worse than what can happen already (without this
+ * optimization), if the pCPU would schedule right after we
+ * have taken the lock, and hence block on it.
+ */
+ if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 )
+ {
+ TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0);
+ goto next_cpu;
+ }
+
/*
* Get ahold of the scheduler lock for this peer CPU.
*
* could cause a deadlock if the peer CPU is also load
* balancing and trying to lock this CPU.
*/
- spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
+ lock = pcpu_schedule_trylock(peer_cpu);
SCHED_STAT_CRANK(steal_trylock);
if ( !lock )
{
SCHED_STAT_CRANK(steal_trylock_failed);
TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0);
- peer_cpu = cpumask_cycle(peer_cpu, &workers);
- continue;
+ goto next_cpu;
}
TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1);
return speer;
}
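+
+ /* Reached when peer_cpu is skipped, and when there was nothing to steal from it. */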
+ next_cpu:
peer_cpu = cpumask_cycle(peer_cpu, &workers);
} while( peer_cpu != cpumask_first(&workers) );
if ( vcpu_runnable(current) )
__runq_insert(scurr);
else
+ {
BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
+ /* Current has blocked. Update the runnable counter for this cpu. */
+ dec_nr_runnable(cpu);
+ }
snext = __runq_elem(runq->next);
ret.migrated = 0;
runq = &spc->runq;
cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
- printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, cpustr);
+ printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ",
+ cpu, spc->nr_runnable, spc->runq_sort_last, cpustr);
cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
printk("core=%s\n", cpustr);