Revert "sched/rt: Simplify the IPI based RT balancing logic"

author Yves-Alexis Perez <corsac@debian.org>

Tue, 23 Jan 2018 07:23:44 +0000 (08:23 +0100)

committer Yves-Alexis Perez <corsac@debian.org>

Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
author Yves-Alexis Perez <corsac@debian.org>
Tue, 23 Jan 2018 07:23:44 +0000 (08:23 +0100)
committer Yves-Alexis Perez <corsac@debian.org>
Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index e5066955cc3a152149df120020192cd9a908c1ce..d748d4e5455d0b11b46a542befcdcfbb3e40345a 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5877,12 +5877,6 @@ static int init_rootdomain(struct root_domain *rd)
         if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_dlo_mask;
  
-#ifdef HAVE_RT_PUSH_IPI
-       rd->rto_cpu = -1;
-       raw_spin_lock_init(&rd->rto_lock);
-       init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-#endif
-
         init_dl_bw(&rd->dl_bw);
         if (cpudl_init(&rd->cpudl) != 0)
                 goto free_dlo_mask;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index 7a360d6f6798fcf36aa40052d429bb8534fe807d..34b1133cbac332f24c4d18ff5a0d769087be0c64 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -72,6 +72,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
         raw_spin_unlock(&rt_b->rt_runtime_lock);
  }
  
+#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
  void init_rt_rq(struct rt_rq *rt_rq)
  {
         struct rt_prio_array *array;
@@ -91,6 +95,13 @@ void init_rt_rq(struct rt_rq *rt_rq)
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
         plist_head_init(&rt_rq->pushable_tasks);
+
+#ifdef HAVE_RT_PUSH_IPI
+       rt_rq->push_flags = 0;
+       rt_rq->push_cpu = nr_cpu_ids;
+       raw_spin_lock_init(&rt_rq->push_lock);
+       init_irq_work(&rt_rq->push_work, push_irq_work_func);
+#endif
  #endif /* CONFIG_SMP */
         /* We start is dequeued state, because no RT tasks are queued */
         rt_rq->rt_queued = 0;
@@ -1853,166 +1864,160 @@ static void push_rt_tasks(struct rq *rq)
  }
  
  #ifdef HAVE_RT_PUSH_IPI
-
  /*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * All CPUs with overloaded RT tasks need to be notified as there is currently
- * no way to know which of these CPUs have the highest priority task waiting
- * to run. Instead of trying to take a spinlock on each of these CPUs,
- * which has shown to cause large latency when done on machines with many
- * CPUs, sending an IPI to the CPUs to have them push off the overloaded
- * RT tasks waiting to run.
- *
- * Just sending an IPI to each of the CPUs is also an issue, as on large
- * count CPU machines, this can cause an IPI storm on a CPU, especially
- * if its the only CPU with multiple RT tasks queued, and a large number
- * of CPUs scheduling a lower priority task at the same time.
- *
- * Each root domain has its own irq work function that can iterate over
- * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
- * push off RT tasks that are waiting to run.
- *
- * When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
- * As it only takes the first CPU that schedules a lower priority task
- * to start the process, the rto_start variable is incremented and if
- * the atomic result is one, then that CPU will try to take the rto_lock.
- * This prevents high contention on the lock as the process handles all
- * CPUs scheduling lower priority tasks.
- *
- * All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
- * checks all RT overloaded CPUs whenever a CPU schedules a new lower
- * priority task, even if the iterator is in the middle of a scan. Incrementing
- * the rt_loop_next will cause the iterator to perform another scan.
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
   *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
   */
  static int rto_next_cpu(struct rq *rq)
  {
-       struct root_domain *rd = rq->rd;
-       int next;
+       int prev_cpu = rq->rt.push_cpu;
         int cpu;
  
+       cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
         /*
-        * When starting the IPI RT pushing, the rto_cpu is set to -1,
-        * rt_next_cpu() will simply return the first CPU found in
-        * the rto_mask.
-        *
-        * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
-        * will return the next CPU found in the rto_mask.
-        *
-        * If there are no more CPUs left in the rto_mask, then a check is made
-        * against rto_loop and rto_loop_next. rto_loop is only updated with
-        * the rto_lock held, but any CPU may increment the rto_loop_next
-        * without any locking.
+        * If the previous cpu is less than the rq's CPU, then it already
+        * passed the end of the mask, and has started from the beginning.
+        * We end if the next CPU is greater or equal to rq's CPU.
          */
-       for (;;) {
-
-               /* When rto_cpu is -1 this acts like cpumask_first() */
-               cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
-
-               rd->rto_cpu = cpu;
-
-               if (cpu < nr_cpu_ids)
-                       return cpu;
-
-               rd->rto_cpu = -1;
+       if (prev_cpu < rq->cpu) {
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
  
+       } else if (cpu >= nr_cpu_ids) {
                 /*
-                * ACQUIRE ensures we see the @rto_mask changes
-                * made prior to the @next value observed.
-                *
-                * Matches WMB in rt_set_overload().
+                * We passed the end of the mask, start at the beginning.
+                * If the result is greater or equal to the rq's CPU, then
+                * the loop is finished.
                  */
-               next = atomic_read_acquire(&rd->rto_loop_next);
-
-               if (rd->rto_loop == next)
-                       break;
-
-               rd->rto_loop = next;
+               cpu = cpumask_first(rq->rd->rto_mask);
+               if (cpu >= rq->cpu)
+                       return nr_cpu_ids;
         }
+       rq->rt.push_cpu = cpu;
  
-       return -1;
+       /* Return cpu to let the caller know if the loop is finished or not */
+       return cpu;
  }
  
-static inline bool rto_start_trylock(atomic_t *v)
+static int find_next_push_cpu(struct rq *rq)
  {
-       return !atomic_cmpxchg_acquire(v, 0, 1);
-}
+       struct rq *next_rq;
+       int cpu;
  
-static inline void rto_start_unlock(atomic_t *v)
-{
-       atomic_set_release(v, 0);
+       while (1) {
+               cpu = rto_next_cpu(rq);
+               if (cpu >= nr_cpu_ids)
+                       break;
+               next_rq = cpu_rq(cpu);
+
+               /* Make sure the next rq can push to this rq */
+               if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+                       break;
+       }
+
+       return cpu;
  }
  
+#define RT_PUSH_IPI_EXECUTING          1
+#define RT_PUSH_IPI_RESTART            2
+
  static void tell_cpu_to_push(struct rq *rq)
  {
-       int cpu = -1;
-
-       /* Keep the loop going if the IPI is currently active */
-       atomic_inc(&rq->rd->rto_loop_next);
-
-       /* Only one CPU can initiate a loop at a time */
-       if (!rto_start_trylock(&rq->rd->rto_loop_start))
-               return;
+       int cpu;
  
-       raw_spin_lock(&rq->rd->rto_lock);
+       if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+               raw_spin_lock(&rq->rt.push_lock);
+               /* Make sure it's still executing */
+               if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+                       /*
+                        * Tell the IPI to restart the loop as things have
+                        * changed since it started.
+                        */
+                       rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+                       raw_spin_unlock(&rq->rt.push_lock);
+                       return;
+               }
+               raw_spin_unlock(&rq->rt.push_lock);
+       }
  
-       /*
-        * The rto_cpu is updated under the lock, if it has a valid cpu
-        * then the IPI is still running and will continue due to the
-        * update to loop_next, and nothing needs to be done here.
-        * Otherwise it is finishing up and an ipi needs to be sent.
-        */
-       if (rq->rd->rto_cpu < 0)
-               cpu = rto_next_cpu(rq);
+       /* When here, there's no IPI going around */
  
-       raw_spin_unlock(&rq->rd->rto_lock);
+       rq->rt.push_cpu = rq->cpu;
+       cpu = find_next_push_cpu(rq);
+       if (cpu >= nr_cpu_ids)
+               return;
  
-       rto_start_unlock(&rq->rd->rto_loop_start);
+       rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
  
-       if (cpu >= 0)
-               irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+       irq_work_queue_on(&rq->rt.push_work, cpu);
  }
  
  /* Called from hardirq context */
-void rto_push_irq_work_func(struct irq_work *work)
+static void try_to_push_tasks(void *arg)
  {
-       struct rq *rq;
+       struct rt_rq *rt_rq = arg;
+       struct rq *rq, *src_rq;
+       int this_cpu;
         int cpu;
  
-       rq = this_rq();
+       this_cpu = rt_rq->push_cpu;
  
-       /*
-        * We do not need to grab the lock to check for has_pushable_tasks.
-        * When it gets updated, a check is made if a push is possible.
-        */
+       /* Paranoid check */
+       BUG_ON(this_cpu != smp_processor_id());
+
+       rq = cpu_rq(this_cpu);
+       src_rq = rq_of_rt_rq(rt_rq);
+
+again:
         if (has_pushable_tasks(rq)) {
                 raw_spin_lock(&rq->lock);
-               push_rt_tasks(rq);
+               push_rt_task(rq);
                 raw_spin_unlock(&rq->lock);
         }
  
-       raw_spin_lock(&rq->rd->rto_lock);
-
         /* Pass the IPI to the next rt overloaded queue */
-       cpu = rto_next_cpu(rq);
+       raw_spin_lock(&rt_rq->push_lock);
+       /*
+        * If the source queue changed since the IPI went out,
+        * we need to restart the search from that CPU again.
+        */
+       if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+               rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+               rt_rq->push_cpu = src_rq->cpu;
+       }
  
-       raw_spin_unlock(&rq->rd->rto_lock);
+       cpu = find_next_push_cpu(src_rq);
  
-       if (cpu < 0)
+       if (cpu >= nr_cpu_ids)
+               rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+       raw_spin_unlock(&rt_rq->push_lock);
+
+       if (cpu >= nr_cpu_ids)
                 return;
  
+       /*
+        * It is possible that a restart caused this CPU to be
+        * chosen again. Don't bother with an IPI, just see if we
+        * have more to push.
+        */
+       if (unlikely(cpu == rq->cpu))
+               goto again;
+
         /* Try the next RT overloaded CPU */
-       irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+       irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+       struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+       try_to_push_tasks(rt_rq);
  }
  #endif /* HAVE_RT_PUSH_IPI */
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index cff985feb6e7dd3b5c33b1bd387a33a4d623fa81..ad77d666583c3ad436c34b31f692c412ead29064 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -463,7 +463,7 @@ static inline int rt_bandwidth_enabled(void)
  }
  
  /* RT IPI pull logic requires IRQ_WORK */
-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
+#ifdef CONFIG_IRQ_WORK
  # define HAVE_RT_PUSH_IPI
  #endif
  
@@ -485,6 +485,12 @@ struct rt_rq {
         unsigned long rt_nr_total;
         int overloaded;
         struct plist_head pushable_tasks;
+#ifdef HAVE_RT_PUSH_IPI
+       int push_flags;
+       int push_cpu;
+       struct irq_work push_work;
+       raw_spinlock_t push_lock;
+#endif
  #endif /* CONFIG_SMP */
         int rt_queued;
  
@@ -566,19 +572,6 @@ struct root_domain {
         struct dl_bw dl_bw;
         struct cpudl cpudl;
  
-#ifdef HAVE_RT_PUSH_IPI
-       /*
-        * For IPI pull requests, loop across the rto_mask.
-        */
-       struct irq_work rto_push_work;
-       raw_spinlock_t rto_lock;
-       /* These are only updated and read within rto_lock */
-       int rto_loop;
-       int rto_cpu;
-       /* These atomics are updated outside of a lock */
-       atomic_t rto_loop_next;
-       atomic_t rto_loop_start;
-#endif
         /*
          * The "RT overload" flag: it gets set if a CPU has more than
          * one runnable RT task.
@@ -591,9 +584,6 @@ struct root_domain {
  
  extern struct root_domain def_root_domain;
  
-#ifdef HAVE_RT_PUSH_IPI
-extern void rto_push_irq_work_func(struct irq_work *work);
-#endif
  #endif /* CONFIG_SMP */
  
  /*
author	Yves-Alexis Perez <corsac@debian.org>
	Tue, 23 Jan 2018 07:23:44 +0000 (08:23 +0100)
committer	Yves-Alexis Perez <corsac@debian.org>
	Fri, 9 Feb 2018 12:58:52 +0000 (12:58 +0000)
kernel/sched/core.c		patch \| blob \| history
kernel/sched/rt.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history