xen: sched: close potential races when switching scheduler to CPUs

author Dario Faggioli <dario.faggioli@citrix.com>

Wed, 6 Apr 2016 13:40:53 +0000 (15:40 +0200)

committer Ian Jackson <Ian.Jackson@eu.citrix.com>

Fri, 8 Apr 2016 14:59:12 +0000 (15:59 +0100)
author Dario Faggioli <dario.faggioli@citrix.com>
Wed, 6 Apr 2016 13:40:53 +0000 (15:40 +0200)
committer Ian Jackson <Ian.Jackson@eu.citrix.com>
Fri, 8 Apr 2016 14:59:12 +0000 (15:59 +0100)
diff --git a/xen/common/sched_arinc653.c b/xen/common/sched_arinc653.c

index b79fcdf7f6c528f596ad125be6409ec72cce15c0..ebd20902889e6918892a2dfb5d2c1c94c2c1325d 100644 (file)
--- a/xen/common/sched_arinc653.c
+++ b/xen/common/sched_arinc653.c
@@ -651,6 +651,38 @@ a653sched_pick_cpu(const struct scheduler *ops, struct vcpu *vc)
      return cpu;
  }
  
+/**
+ * Xen scheduler callback to change the scheduler of a cpu
+ *
+ * @param new_ops   Pointer to this instance of the scheduler structure
+ * @param cpu       The cpu that is changing scheduler
+ * @param pdata     scheduler specific PCPU data (we don't have any)
+ * @param vdata     scheduler specific VCPU data of the idle vcpu
+ */
+static void
+a653_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                  void *pdata, void *vdata)
+{
+    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+    arinc653_vcpu_t *svc = vdata;
+
+    ASSERT(!pdata && svc && is_idle_vcpu(svc->vc));
+
+    idle_vcpu[cpu]->sched_priv = vdata;
+
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+
+    /*
+     * (Re?)route the lock to its default location. We actually do not use
+     * it, but if we leave it pointing to where it does now (i.e., the
+     * runqueue lock for this PCPU in the default scheduler), we'd be
+     * causing unnecessary contention on that lock (in cases where it is
+     * shared among multiple PCPUs, like in Credit2 and RTDS).
+     */
+    sd->schedule_lock = &sd->_lock;
+}
+
  /**
   * Xen scheduler callback function to perform a global (not domain-specific)
   * adjustment. It is used by the ARINC 653 scheduler to put in place a new
@@ -727,6 +759,8 @@ static const struct scheduler sched_arinc653_def = {
  
      .pick_cpu       = a653sched_pick_cpu,
  
+    .switch_sched   = a653_switch_sched,
+
      .adjust         = NULL,
      .adjust_global  = a653sched_adjust_global,
  
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c

index 96a245d2cfc6dd42f4ab385183bed891a4b47d9a..490f10b1cddd7a6f16f0100875f1bb550a790def 100644 (file)
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -578,12 +578,55 @@ csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
  {
      unsigned long flags;
      struct csched_private *prv = CSCHED_PRIV(ops);
+    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+    /*
+     * This is called either during boot, resume or hotplug, in
+     * case Credit1 is the scheduler chosen at boot. In such cases, the
+     * scheduler lock for cpu is already pointing to the default per-cpu
+     * spinlock, as Credit1 needs it, so there is no remapping to be done.
+     */
+    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
  
      spin_lock_irqsave(&prv->lock, flags);
      init_pdata(prv, pdata, cpu);
      spin_unlock_irqrestore(&prv->lock, flags);
  }
  
+/* Change the scheduler of cpu to us (Credit). */
+static void
+csched_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                    void *pdata, void *vdata)
+{
+    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+    struct csched_private *prv = CSCHED_PRIV(new_ops);
+    struct csched_vcpu *svc = vdata;
+
+    ASSERT(svc && is_idle_vcpu(svc->vcpu));
+
+    idle_vcpu[cpu]->sched_priv = vdata;
+
+    /*
+     * We are holding the runqueue lock already (it's been taken in
+     * schedule_cpu_switch()). It actually may or may not be the 'right'
+     * one for this cpu, but that is ok for preventing races.
+     */
+    spin_lock(&prv->lock);
+    init_pdata(prv, pdata, cpu);
+    spin_unlock(&prv->lock);
+
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = pdata;
+
+    /*
+     * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
+     * if it is free (and it can be), we want anyone who manages to take
+     * it to find all the initializations we've done above in place.
+     */
+    smp_mb();
+    sd->schedule_lock = &sd->_lock;
+}
+
  #ifndef NDEBUG
  static inline void
  __csched_vcpu_check(struct vcpu *vc)
@@ -2067,6 +2110,7 @@ static const struct scheduler sched_credit_def = {
      .alloc_pdata    = csched_alloc_pdata,
      .init_pdata     = csched_init_pdata,
      .free_pdata     = csched_free_pdata,
+    .switch_sched   = csched_switch_sched,
      .alloc_domdata  = csched_alloc_domdata,
      .free_domdata   = csched_free_domdata,
  
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c

index 8989eea11d146c7a7ab5ba421f3cd5468878e55d..60c6f5b20e0667223ec95609456d4752f648d746 100644 (file)
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -1971,12 +1971,12 @@ static void deactivate_runqueue(struct csched2_private *prv, int rqi)
      cpumask_clear_cpu(rqi, &prv->active_queues);
  }
  
-static void
+/* Returns the ID of the runqueue the cpu is assigned to. */
+static unsigned
  init_pdata(struct csched2_private *prv, unsigned int cpu)
  {
      unsigned rqi;
      struct csched2_runqueue_data *rqd;
-    spinlock_t *old_lock;
  
      ASSERT(spin_is_locked(&prv->lock));
      ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
@@ -2007,44 +2007,89 @@ init_pdata(struct csched2_private *prv, unsigned int cpu)
          activate_runqueue(prv, rqi);
      }
      
-    /* IRQs already disabled */
-    old_lock = pcpu_schedule_lock(cpu);
-
-    /* Move spinlock to new runq lock.  */
-    per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock;
-
      /* Set the runqueue map */
      prv->runq_map[cpu] = rqi;
      
      cpumask_set_cpu(cpu, &rqd->idle);
      cpumask_set_cpu(cpu, &rqd->active);
-
-    /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
-    spin_unlock(old_lock);
-
      cpumask_set_cpu(cpu, &prv->initialized);
  
-    return;
+    return rqi;
  }
  
  static void
  csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
  {
      struct csched2_private *prv = CSCHED2_PRIV(ops);
+    spinlock_t *old_lock;
      unsigned long flags;
+    unsigned rqi;
  
      spin_lock_irqsave(&prv->lock, flags);
-    init_pdata(prv, cpu);
+    old_lock = pcpu_schedule_lock(cpu);
+
+    rqi = init_pdata(prv, cpu);
+    /* Move the scheduler lock to the new runq lock. */
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+
+    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
+    spin_unlock(old_lock);
      spin_unlock_irqrestore(&prv->lock, flags);
  }
  
+/* Change the scheduler of cpu to us (Credit2). */
+static void
+csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                     void *pdata, void *vdata)
+{
+    struct csched2_private *prv = CSCHED2_PRIV(new_ops);
+    struct csched2_vcpu *svc = vdata;
+    unsigned rqi;
+
+    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+
+    /*
+     * We own one runqueue lock already (from schedule_cpu_switch()). This
+     * looks like it violates this scheduler's locking rules, but it does
+     * not, as what we own is the lock of another scheduler, that hence has
+     * no particular (ordering) relationship with our private global lock.
+     * And owning exactly that one (the lock of the old scheduler of this
+     * cpu) is what is necessary to prevent races.
+     */
+    spin_lock_irq(&prv->lock);
+
+    idle_vcpu[cpu]->sched_priv = vdata;
+
+    rqi = init_pdata(prv, cpu);
+
+    /*
+     * Now that we know what runqueue we'll go in, double check what's said
+     * above: the lock we already hold is not the one of this runqueue of
+     * this scheduler, and so it's safe to have taken it /before/ our
+     * private global lock.
+     */
+    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
+
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+
+    /*
+     * (Re?)route the lock to the cpu's runqueue lock as /last/ thing. In
+     * fact, if it is free (and it can be), we want anyone who manages to
+     * take it to find all the initializations we've done above in place.
+     */
+    smp_mb();
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+
+    spin_unlock_irq(&prv->lock);
+}
+
  static void
  csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
  {
      unsigned long flags;
      struct csched2_private *prv = CSCHED2_PRIV(ops);
      struct csched2_runqueue_data *rqd;
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
      int rqi;
  
      spin_lock_irqsave(&prv->lock, flags);
@@ -2072,11 +2117,6 @@ csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
          deactivate_runqueue(prv, rqi);
      }
  
-    /* Move spinlock to the original lock.  */
-    ASSERT(sd->schedule_lock == &rqd->lock);
-    ASSERT(!spin_is_locked(&sd->_lock));
-    sd->schedule_lock = &sd->_lock;
-
      spin_unlock(&rqd->lock);
  
      cpumask_clear_cpu(cpu, &prv->initialized);
@@ -2170,6 +2210,7 @@ static const struct scheduler sched_credit2_def = {
      .free_vdata     = csched2_free_vdata,
      .init_pdata     = csched2_init_pdata,
      .free_pdata     = csched2_free_pdata,
+    .switch_sched   = csched2_switch_sched,
      .alloc_domdata  = csched2_alloc_domdata,
      .free_domdata   = csched2_free_domdata,
  };
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c

index b96bd9356a968d92d4735b12b50faeeffe29eb8f..3bb8c7119876f0af51b358a015d61e6734ea98ee 100644 (file)
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -682,6 +682,37 @@ rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
      spin_unlock_irqrestore(old_lock, flags);
  }
  
+/* Change the scheduler of cpu to us (RTDS). */
+static void
+rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+                void *pdata, void *vdata)
+{
+    struct rt_private *prv = rt_priv(new_ops);
+    struct rt_vcpu *svc = vdata;
+
+    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+
+    /*
+     * We are holding the runqueue lock already (it's been taken in
+     * schedule_cpu_switch()). It's actually the runqueue lock of
+     * another scheduler, but that is how things need to be, for
+     * preventing races.
+     */
+    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->lock);
+
+    idle_vcpu[cpu]->sched_priv = vdata;
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+
+    /*
+     * (Re?)route the lock to this scheduler's private lock as /last/ thing.
+     * In fact, if it is free (and it can be), we want anyone who manages to
+     * take it to find all the initializations we've done above in place.
+     */
+    smp_mb();
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
+}
+
  static void *
  rt_alloc_pdata(const struct scheduler *ops, int cpu)
  {
@@ -707,19 +738,6 @@ rt_alloc_pdata(const struct scheduler *ops, int cpu)
  static void
  rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
  {
-    struct rt_private *prv = rt_priv(ops);
-    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-    unsigned long flags;
-
-    spin_lock_irqsave(&prv->lock, flags);
-
-    /* Move spinlock back to the default lock */
-    ASSERT(sd->schedule_lock == &prv->lock);
-    ASSERT(!spin_is_locked(&sd->_lock));
-    sd->schedule_lock = &sd->_lock;
-
-    spin_unlock_irqrestore(&prv->lock, flags);
-
      free_cpumask_var(_cpumask_scratch[cpu]);
  }
  
@@ -1468,6 +1486,7 @@ static const struct scheduler sched_rtds_def = {
      .alloc_pdata    = rt_alloc_pdata,
      .free_pdata     = rt_free_pdata,
      .init_pdata     = rt_init_pdata,
+    .switch_sched   = rt_switch_sched,
      .alloc_domdata  = rt_alloc_domdata,
      .free_domdata   = rt_free_domdata,
      .init_domain    = rt_dom_init,
diff --git a/xen/common/schedule.c b/xen/common/schedule.c

index 19416136574aaa620108218372fbcc69cd6f774f..5559aa17c527636fb1bc705aece0551200a203e0 100644 (file)
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1635,11 +1635,11 @@ void __init scheduler_init(void)
  int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
  {
      struct vcpu *idle;
-    spinlock_t *lock;
      void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
      struct scheduler *old_ops = per_cpu(scheduler, cpu);
      struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
      struct cpupool *old_pool = per_cpu(cpupool, cpu);
+    spinlock_t * old_lock;
  
      /*
       * pCPUs only move from a valid cpupool to free (i.e., out of any pool),
@@ -1658,11 +1658,21 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
      if ( old_ops == new_ops )
          goto out;
  
+    /*
+     * To setup the cpu for the new scheduler we need:
+     *  - a valid instance of per-CPU scheduler specific data, as it is
+     *    allocated by SCHED_OP(alloc_pdata). Note that we do not want to
+     *    initialize it yet (i.e., we are not calling SCHED_OP(init_pdata)).
+     *    That will be done by the target scheduler, in SCHED_OP(switch_sched),
+     *    in proper ordering and with locking.
+     *  - a valid instance of per-vCPU scheduler specific data, for the idle
+     *    vCPU of cpu. That is what the target scheduler will use for the
+     *    sched_priv field of the per-vCPU info of the idle domain.
+     */
      idle = idle_vcpu[cpu];
      ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
      if ( IS_ERR(ppriv) )
          return PTR_ERR(ppriv);
-    SCHED_OP(new_ops, init_pdata, ppriv, cpu);
      vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
      if ( vpriv == NULL )
      {
@@ -1670,17 +1680,30 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
          return -ENOMEM;
      }
  
-    lock = pcpu_schedule_lock_irq(cpu);
-
      SCHED_OP(old_ops, tick_suspend, cpu);
+
+    /*
+     * The actual switch, including (if necessary) the rerouting of the
+     * scheduler lock to whatever new_ops prefers, needs to happen in one
+     * critical section, protected by old_ops' lock, or races are possible.
+     * It is, in fact, the lock of another scheduler that we are taking (the
+     * scheduler of the cpupool that cpu still belongs to). But that is ok,
+     * as anyone trying to schedule on this cpu will spin until we release
+     * that lock (bottom of this function). Once they get the lock, thanks
+     * to the loop inside the *_schedule_lock() functions, they will notice
+     * that the lock itself has changed, and retry acquiring the new one
+     * (which will be the correct, remapped one, at that point).
+     */
+    old_lock = pcpu_schedule_lock_irq(cpu);
+
      vpriv_old = idle->sched_priv;
-    idle->sched_priv = vpriv;
-    per_cpu(scheduler, cpu) = new_ops;
      ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
-    per_cpu(schedule_data, cpu).sched_priv = ppriv;
-    SCHED_OP(new_ops, tick_resume, cpu);
+    SCHED_OP(new_ops, switch_sched, cpu, ppriv, vpriv);
  
-    pcpu_schedule_unlock_irq(lock, cpu);
+    /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
+    spin_unlock_irq(old_lock);
+
+    SCHED_OP(new_ops, tick_resume, cpu);
  
      SCHED_OP(old_ops, free_vdata, vpriv_old);
      SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h

index 70c08c6b734131e8e660d2c20391a1ec83667182..9cebe418ab1cf77087d4ffff9649c74653d77885 100644 (file)
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -137,6 +137,9 @@ struct scheduler {
      void         (*free_domdata)   (const struct scheduler *, void *);
      void *       (*alloc_domdata)  (const struct scheduler *, struct domain *);
  
+    void         (*switch_sched)   (struct scheduler *, unsigned int,
+                                    void *, void *);
+
      int          (*init_domain)    (const struct scheduler *, struct domain *);
      void         (*destroy_domain) (const struct scheduler *, struct domain *);
author	Dario Faggioli <dario.faggioli@citrix.com>
	Wed, 6 Apr 2016 13:40:53 +0000 (15:40 +0200)
committer	Ian Jackson <Ian.Jackson@eu.citrix.com>
	Fri, 8 Apr 2016 14:59:12 +0000 (15:59 +0100)
xen/common/sched_arinc653.c		patch \| blob \| history
xen/common/sched_credit.c		patch \| blob \| history
xen/common/sched_credit2.c		patch \| blob \| history
xen/common/sched_rt.c		patch \| blob \| history
xen/common/schedule.c		patch \| blob \| history
xen/include/xen/sched-if.h		patch \| blob \| history