From: Juergen Gross Date: Wed, 22 Jan 2020 14:06:43 +0000 (+0100) Subject: xen/sched: move schedulers and cpupool coding to dedicated directory X-Git-Tag: archive/raspbian/4.14.0+80-gd101b417b7-1+rpi1^2~63^2~854 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=6cb4b01c033b7abc3e7175501330dfb01fb09da5;p=xen.git xen/sched: move schedulers and cpupool coding to dedicated directory Move sched*c and cpupool.c to a new directory common/sched. Signed-off-by: Juergen Gross Reviewed-by: Dario Faggioli --- diff --git a/MAINTAINERS b/MAINTAINERS index a91080cde5..dadcfb63d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -174,7 +174,7 @@ M: Josh Whitehead M: Stewart Hildebrand S: Supported L: xen-devel@dornerworks.com -F: xen/common/sched_arinc653.c +F: xen/common/sched/arinc653.c F: tools/libxc/xc_arinc653.c ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE @@ -218,7 +218,7 @@ CPU POOLS M: Juergen Gross M: Dario Faggioli S: Supported -F: xen/common/cpupool.c +F: xen/common/sched/cpupool.c DEVICE TREE M: Stefano Stabellini @@ -384,13 +384,13 @@ RTDS SCHEDULER M: Dario Faggioli M: Meng Xu S: Supported -F: xen/common/sched_rt.c +F: xen/common/sched/rt.c SCHEDULING M: George Dunlap M: Dario Faggioli S: Supported -F: xen/common/sched* +F: xen/common/sched/ SEABIOS UPSTREAM M: Wei Liu diff --git a/xen/common/Kconfig b/xen/common/Kconfig index b3d161d057..9d6d09eb37 100644 --- a/xen/common/Kconfig +++ b/xen/common/Kconfig @@ -275,71 +275,7 @@ config ARGO If unsure, say N. -menu "Schedulers" - visible if EXPERT = "y" - -config SCHED_CREDIT - bool "Credit scheduler support" - default y - ---help--- - The traditional credit scheduler is a general purpose scheduler. - -config SCHED_CREDIT2 - bool "Credit2 scheduler support" - default y - ---help--- - The credit2 scheduler is a general purpose scheduler that is - optimized for lower latency and higher VM density. - -config SCHED_RTDS - bool "RTDS scheduler support (EXPERIMENTAL)" - default y - ---help--- - The RTDS scheduler is a soft and firm real-time scheduler for - multicore, targeted for embedded, automotive, graphics and gaming - in the cloud, and general low-latency workloads. - -config SCHED_ARINC653 - bool "ARINC653 scheduler support (EXPERIMENTAL)" - default DEBUG - ---help--- - The ARINC653 scheduler is a hard real-time scheduler for single - cores, targeted for avionics, drones, and medical devices. - -config SCHED_NULL - bool "Null scheduler support (EXPERIMENTAL)" - default y - ---help--- - The null scheduler is a static, zero overhead scheduler, - for when there always are less vCPUs than pCPUs, typically - in embedded or HPC scenarios. - -choice - prompt "Default Scheduler?" 
- default SCHED_CREDIT2_DEFAULT - - config SCHED_CREDIT_DEFAULT - bool "Credit Scheduler" if SCHED_CREDIT - config SCHED_CREDIT2_DEFAULT - bool "Credit2 Scheduler" if SCHED_CREDIT2 - config SCHED_RTDS_DEFAULT - bool "RT Scheduler" if SCHED_RTDS - config SCHED_ARINC653_DEFAULT - bool "ARINC653 Scheduler" if SCHED_ARINC653 - config SCHED_NULL_DEFAULT - bool "Null Scheduler" if SCHED_NULL -endchoice - -config SCHED_DEFAULT - string - default "credit" if SCHED_CREDIT_DEFAULT - default "credit2" if SCHED_CREDIT2_DEFAULT - default "rtds" if SCHED_RTDS_DEFAULT - default "arinc653" if SCHED_ARINC653_DEFAULT - default "null" if SCHED_NULL_DEFAULT - default "credit2" - -endmenu +source "common/sched/Kconfig" config CRYPTO bool diff --git a/xen/common/Makefile b/xen/common/Makefile index 62b34e69e9..2abb8250b0 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -3,7 +3,6 @@ obj-y += bitmap.o obj-y += bsearch.o obj-$(CONFIG_CORE_PARKING) += core_parking.o obj-y += cpu.o -obj-y += cpupool.o obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o obj-y += domctl.o @@ -38,12 +37,6 @@ obj-y += radix-tree.o obj-y += rbtree.o obj-y += rcupdate.o obj-y += rwlock.o -obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o -obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o -obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o -obj-$(CONFIG_SCHED_RTDS) += sched_rt.o -obj-$(CONFIG_SCHED_NULL) += sched_null.o -obj-y += schedule.o obj-y += shutdown.o obj-y += softirq.o obj-y += sort.o @@ -74,6 +67,7 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o kernel.o memory.o multicall extra-y := symbols-dummy.o subdir-$(CONFIG_COVERAGE) += coverage +subdir-y += sched subdir-$(CONFIG_UBSAN) += ubsan subdir-$(CONFIG_NEEDS_LIBELF) += libelf diff --git a/xen/common/compat/schedule.c b/xen/common/compat/schedule.c deleted file mode 100644 index 8b6e6f107d..0000000000 --- a/xen/common/compat/schedule.c +++ /dev/null @@ -1,55 +0,0 @@ -/**************************************************************************** - * schedule.c - * - */ - -#include - -#define COMPAT -#define ret_t int - -#define do_sched_op compat_sched_op - -#define xen_sched_pin_override sched_pin_override -CHECK_sched_pin_override; -#undef xen_sched_pin_override - -#define xen_sched_shutdown sched_shutdown -CHECK_sched_shutdown; -#undef xen_sched_shutdown - -#define xen_sched_remote_shutdown sched_remote_shutdown -CHECK_sched_remote_shutdown; -#undef xen_sched_remote_shutdown - -static int compat_poll(struct compat_sched_poll *compat) -{ - struct sched_poll native; - -#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ - guest_from_compat_handle((_d_)->ports, (_s_)->ports) - XLAT_sched_poll(&native, compat); -#undef XLAT_sched_poll_HNDL_ports - - return do_poll(&native); -} - -#define do_poll compat_poll -#define sched_poll compat_sched_poll - -#include "../schedule.c" - -int compat_set_timer_op(u32 lo, s32 hi) -{ - return do_set_timer_op(((s64)hi << 32) | lo); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c deleted file mode 100644 index d66b541a94..0000000000 --- a/xen/common/cpupool.c +++ /dev/null @@ -1,979 +0,0 @@ -/****************************************************************************** - * cpupool.c - * - * Generic cpupool-handling functions. - * - * Cpupools are a feature to have configurable scheduling domains. 
Each - * cpupool runs an own scheduler on a dedicated set of physical cpus. - * A domain is bound to one cpupool at any time, but it can be moved to - * another cpupool. - * - * (C) 2009, Juergen Gross, Fujitsu Technology Solutions - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define for_each_cpupool(ptr) \ - for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) - -struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ -cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ - -static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ - -static int cpupool_moving_cpu = -1; -static struct cpupool *cpupool_cpu_moving = NULL; -static cpumask_t cpupool_locked_cpus; - -static DEFINE_SPINLOCK(cpupool_lock); - -static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu; -static unsigned int __read_mostly sched_granularity = 1; - -#ifdef CONFIG_HAS_SCHED_GRANULARITY -static int __init sched_select_granularity(const char *str) -{ - if ( strcmp("cpu", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_cpu; - else if ( strcmp("core", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_core; - else if ( strcmp("socket", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_socket; - else - return -EINVAL; - - return 0; -} -custom_param("sched-gran", sched_select_granularity); -#endif - -static unsigned int __init cpupool_check_granularity(void) -{ - unsigned int cpu; - unsigned int siblings, gran = 0; - - if ( opt_sched_granularity == SCHED_GRAN_cpu ) - return 1; - - for_each_online_cpu ( cpu ) - { - siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity, - cpu)); - if ( gran == 0 ) - gran = siblings; - else if ( gran != siblings ) - return 0; - } - - sched_disable_smt_switching = true; - - return gran; -} - -/* Setup data for selected scheduler granularity. */ -static void __init cpupool_gran_init(void) -{ - unsigned int gran = 0; - const char *fallback = NULL; - - while ( gran == 0 ) - { - gran = cpupool_check_granularity(); - - if ( gran == 0 ) - { - switch ( opt_sched_granularity ) - { - case SCHED_GRAN_core: - opt_sched_granularity = SCHED_GRAN_cpu; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=cpu.\n"; - break; - case SCHED_GRAN_socket: - opt_sched_granularity = SCHED_GRAN_core; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=core.\n"; - break; - default: - ASSERT_UNREACHABLE(); - break; - } - } - } - - if ( fallback ) - warning_add(fallback); - - sched_granularity = gran; -} - -unsigned int cpupool_get_granularity(const struct cpupool *c) -{ - return c ? sched_granularity : 1; -} - -static void free_cpupool_struct(struct cpupool *c) -{ - if ( c ) - { - free_cpumask_var(c->res_valid); - free_cpumask_var(c->cpu_valid); - } - xfree(c); -} - -static struct cpupool *alloc_cpupool_struct(void) -{ - struct cpupool *c = xzalloc(struct cpupool); - - if ( !c ) - return NULL; - - if ( !zalloc_cpumask_var(&c->cpu_valid) || - !zalloc_cpumask_var(&c->res_valid) ) - { - free_cpupool_struct(c); - c = NULL; - } - - return c; -} - -/* - * find a cpupool by it's id. to be called with cpupool lock held - * if exact is not specified, the first cpupool with an id larger or equal to - * the searched id is returned - * returns NULL if not found. 
- */ -static struct cpupool *__cpupool_find_by_id(int id, int exact) -{ - struct cpupool **q; - - ASSERT(spin_is_locked(&cpupool_lock)); - - for_each_cpupool(q) - if ( (*q)->cpupool_id >= id ) - break; - - return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL; -} - -static struct cpupool *cpupool_find_by_id(int poolid) -{ - return __cpupool_find_by_id(poolid, 1); -} - -static struct cpupool *__cpupool_get_by_id(int poolid, int exact) -{ - struct cpupool *c; - spin_lock(&cpupool_lock); - c = __cpupool_find_by_id(poolid, exact); - if ( c != NULL ) - atomic_inc(&c->refcnt); - spin_unlock(&cpupool_lock); - return c; -} - -struct cpupool *cpupool_get_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 1); -} - -static struct cpupool *cpupool_get_next_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 0); -} - -void cpupool_put(struct cpupool *pool) -{ - if ( !atomic_dec_and_test(&pool->refcnt) ) - return; - scheduler_free(pool->sched); - free_cpupool_struct(pool); -} - -/* - * create a new cpupool with specified poolid and scheduler - * returns pointer to new cpupool structure if okay, NULL else - * possible failures: - * - no memory - * - poolid already used - * - unknown scheduler - */ -static struct cpupool *cpupool_create( - int poolid, unsigned int sched_id, int *perr) -{ - struct cpupool *c; - struct cpupool **q; - int last = 0; - - *perr = -ENOMEM; - if ( (c = alloc_cpupool_struct()) == NULL ) - return NULL; - - /* One reference for caller, one reference for cpupool_destroy(). */ - atomic_set(&c->refcnt, 2); - - debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); - - spin_lock(&cpupool_lock); - - for_each_cpupool(q) - { - last = (*q)->cpupool_id; - if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) - break; - } - if ( *q != NULL ) - { - if ( (*q)->cpupool_id == poolid ) - { - *perr = -EEXIST; - goto err; - } - c->next = *q; - } - - c->cpupool_id = (poolid == CPUPOOLID_NONE) ? 
(last + 1) : poolid; - if ( poolid == 0 ) - { - c->sched = scheduler_get_default(); - } - else - { - c->sched = scheduler_alloc(sched_id, perr); - if ( c->sched == NULL ) - goto err; - } - c->gran = opt_sched_granularity; - - *q = c; - - spin_unlock(&cpupool_lock); - - debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n", - c->cpupool_id, c->sched->name, c->sched->opt_name); - - *perr = 0; - return c; - - err: - spin_unlock(&cpupool_lock); - free_cpupool_struct(c); - return NULL; -} -/* - * destroys the given cpupool - * returns 0 on success, 1 else - * possible failures: - * - pool still in use - * - cpus still assigned to pool - * - pool not in list - */ -static int cpupool_destroy(struct cpupool *c) -{ - struct cpupool **q; - - spin_lock(&cpupool_lock); - for_each_cpupool(q) - if ( *q == c ) - break; - if ( *q != c ) - { - spin_unlock(&cpupool_lock); - return -ENOENT; - } - if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) - { - spin_unlock(&cpupool_lock); - return -EBUSY; - } - *q = c->next; - spin_unlock(&cpupool_lock); - - cpupool_put(c); - - debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id); - return 0; -} - -/* - * Move domain to another cpupool - */ -static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c) -{ - int ret; - - if ( unlikely(d->cpupool == c) ) - return 0; - - d->cpupool->n_dom--; - ret = sched_move_domain(d, c); - if ( ret ) - d->cpupool->n_dom++; - else - c->n_dom++; - - return ret; -} -int cpupool_move_domain(struct domain *d, struct cpupool *c) -{ - int ret; - - spin_lock(&cpupool_lock); - - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * assign a specific cpu to a cpupool - * cpupool_lock must be held - */ -static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - cpus = sched_get_opt_cpumask(c->gran, cpu); - - if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) - return -EADDRNOTAVAIL; - ret = schedule_cpu_add(cpumask_first(cpus), c); - if ( ret ) - return ret; - - rcu_read_lock(&sched_res_rculock); - - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - if (cpupool_moving_cpu == cpu) - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - cpumask_or(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&sched_res_rculock); - - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return 0; -} - -static int cpupool_unassign_cpu_finish(struct cpupool *c) -{ - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; - struct domain *d; - int ret; - - if ( c != cpupool_cpu_moving ) - return -EADDRNOTAVAIL; - - /* - * We need this for scanning the domain list, both in - * cpu_disable_scheduler(), and at the bottom of this function. - */ - rcu_read_lock(&domlist_read_lock); - ret = cpu_disable_scheduler(cpu); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - - /* - * cpu_disable_scheduler() returning an error doesn't require resetting - * cpupool_free_cpus' cpu bit. All error cases should be of temporary - * nature and tools will retry the operation. 
Even if the number of - * retries may be limited, the in-between state can easily be repaired - * by adding the cpu to the cpupool again. - */ - if ( !ret ) - { - ret = schedule_cpu_rm(cpu); - if ( ret ) - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - else - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - } - rcu_read_unlock(&sched_res_rculock); - - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return ret; -} - -static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - ret = -EADDRNOTAVAIL; - if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid)) - && (cpu != cpupool_moving_cpu) ) - goto out; - - ret = 0; - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - - if ( (c->n_dom > 0) && - (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) && - (cpu != cpupool_moving_cpu) ) - { - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - if ( !d->is_dying && system_state == SYS_STATE_active ) - { - ret = -EBUSY; - break; - } - ret = cpupool_move_domain_locked(d, cpupool0); - if ( ret ) - break; - } - rcu_read_unlock(&domlist_read_lock); - if ( ret ) - goto out; - } - cpupool_moving_cpu = cpu; - atomic_inc(&c->refcnt); - cpupool_cpu_moving = c; - cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&domlist_read_lock); -out: - spin_unlock(&cpupool_lock); - - return ret; -} - -static long cpupool_unassign_cpu_helper(void *info) -{ - struct cpupool *c = info; - long ret; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); - spin_lock(&cpupool_lock); - - ret = cpupool_unassign_cpu_finish(c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); - - return ret; -} - -/* - * unassign a specific cpu from a cpupool - * we must be sure not to run on the cpu to be unassigned! to achieve this - * the main functionality is performed via continue_hypercall_on_cpu on a - * specific cpu. - * if the cpu to be removed is the last one of the cpupool no active domain - * must be bound to the cpupool. dying domains are moved to cpupool0 as they - * might be zombies. 
- * possible failures: - * - last cpu and still active domains in cpupool - * - cpu just being unplugged - */ -static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) -{ - int work_cpu; - int ret; - unsigned int master_cpu; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - c->cpupool_id, cpu); - - master_cpu = sched_get_resource_cpu(cpu); - ret = cpupool_unassign_cpu_start(c, master_cpu); - if ( ret ) - { - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", - c->cpupool_id, cpu, ret); - return ret; - } - - work_cpu = sched_get_resource_cpu(smp_processor_id()); - if ( work_cpu == master_cpu ) - { - work_cpu = cpumask_first(cpupool0->cpu_valid); - if ( work_cpu == master_cpu ) - work_cpu = cpumask_last(cpupool0->cpu_valid); - } - return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); -} - -/* - * add a new domain to a cpupool - * possible failures: - * - pool does not exist - * - no cpu assigned to pool - */ -int cpupool_add_domain(struct domain *d, int poolid) -{ - struct cpupool *c; - int rc; - int n_dom = 0; - - if ( poolid == CPUPOOLID_NONE ) - return 0; - spin_lock(&cpupool_lock); - c = cpupool_find_by_id(poolid); - if ( c == NULL ) - rc = -ESRCH; - else if ( !cpumask_weight(c->cpu_valid) ) - rc = -ENODEV; - else - { - c->n_dom++; - n_dom = c->n_dom; - d->cpupool = c; - rc = 0; - } - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", - d->domain_id, poolid, n_dom, rc); - return rc; -} - -/* - * remove a domain from a cpupool - */ -void cpupool_rm_domain(struct domain *d) -{ - int cpupool_id; - int n_dom; - - if ( d->cpupool == NULL ) - return; - spin_lock(&cpupool_lock); - cpupool_id = d->cpupool->cpupool_id; - d->cpupool->n_dom--; - n_dom = d->cpupool->n_dom; - d->cpupool = NULL; - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", - d->domain_id, cpupool_id, n_dom); - return; -} - -/* - * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0, - * as they must have been in there when unplugged. - */ -static int cpupool_cpu_add(unsigned int cpu) -{ - int ret = 0; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - cpumask_clear_cpu(cpu, &cpupool_locked_cpus); - cpumask_set_cpu(cpu, &cpupool_free_cpus); - - /* - * If we are not resuming, we are hot-plugging cpu, and in which case - * we add it to pool0, as it certainly was there when hot-unplagged - * (or unplugging would have failed) and that is the default behavior - * anyway. - */ - rcu_read_lock(&sched_res_rculock); - get_sched_res(cpu)->cpupool = NULL; - - cpus = sched_get_opt_cpumask(cpupool0->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - ret = cpupool_assign_cpu_locked(cpupool0, cpu); - - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * This function is called in stop_machine context, so we can be sure no - * non-idle vcpu is active on the system. - */ -static void cpupool_cpu_remove(unsigned int cpu) -{ - int ret; - - ASSERT(is_idle_vcpu(current)); - - if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) - { - ret = cpupool_unassign_cpu_finish(cpupool0); - BUG_ON(ret); - } - cpumask_clear_cpu(cpu, &cpupool_free_cpus); -} - -/* - * Called before a CPU is being removed from the system. - * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved - * to free cpus actually before removing them). - * The CPU is locked, to forbid adding it again to another cpupool. 
- */ -static int cpupool_cpu_remove_prologue(unsigned int cpu) -{ - int ret = 0; - cpumask_t *cpus; - unsigned int master_cpu; - - spin_lock(&cpupool_lock); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - master_cpu = sched_get_resource_cpu(cpu); - if ( cpumask_intersects(cpus, &cpupool_locked_cpus) ) - ret = -EBUSY; - else - cpumask_set_cpu(cpu, &cpupool_locked_cpus); - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - if ( ret ) - return ret; - - if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) ) - { - /* Cpupool0 is populated only after all cpus are up. */ - ASSERT(system_state == SYS_STATE_active); - - ret = cpupool_unassign_cpu_start(cpupool0, master_cpu); - } - else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) ) - ret = -ENODEV; - - return ret; -} - -/* - * Called during resume for all cpus which didn't come up again. The cpu must - * be removed from the cpupool it is assigned to. In case a cpupool will be - * left without cpu we move all domains of that cpupool to cpupool0. - * As we are called with all domains still frozen there is no need to take the - * cpupool lock here. - */ -static void cpupool_cpu_remove_forced(unsigned int cpu) -{ - struct cpupool **c; - int ret; - unsigned int master_cpu = sched_get_resource_cpu(cpu); - - for_each_cpupool ( c ) - { - if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) ) - { - ret = cpupool_unassign_cpu_start(*c, master_cpu); - BUG_ON(ret); - ret = cpupool_unassign_cpu_finish(*c); - BUG_ON(ret); - } - } - - cpumask_clear_cpu(cpu, &cpupool_free_cpus); - - rcu_read_lock(&sched_res_rculock); - sched_rm_cpu(cpu); - rcu_read_unlock(&sched_res_rculock); -} - -/* - * do cpupool related sysctl operations - */ -int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) -{ - int ret; - struct cpupool *c; - - switch ( op->op ) - { - - case XEN_SYSCTL_CPUPOOL_OP_CREATE: - { - int poolid; - - poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? 
- CPUPOOLID_NONE: op->cpupool_id; - c = cpupool_create(poolid, op->sched_id, &ret); - if ( c != NULL ) - { - op->cpupool_id = c->cpupool_id; - cpupool_put(c); - } - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_DESTROY: - { - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - ret = cpupool_destroy(c); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_INFO: - { - c = cpupool_get_next_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - op->cpupool_id = c->cpupool_id; - op->sched_id = c->sched->sched_id; - op->n_dom = c->n_dom; - ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: - { - unsigned cpu; - const cpumask_t *cpus; - - cpu = op->cpu; - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", - op->cpupool_id, cpu); - - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - goto addcpu_out; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - { - for_each_cpu ( cpu, &cpupool_free_cpus ) - { - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - break; - } - ret = -ENODEV; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - } - ret = -EINVAL; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - ret = -ENODEV; - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( !cpumask_subset(cpus, &cpupool_free_cpus) || - cpumask_intersects(cpus, &cpupool_locked_cpus) ) - goto addcpu_out; - ret = cpupool_assign_cpu_locked(c, cpu); - - addcpu_out: - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", - op->cpupool_id, cpu, ret); - - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_RMCPU: - { - unsigned cpu; - - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - cpu = op->cpu; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - cpu = cpumask_last(c->cpu_valid); - ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL; - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: - { - struct domain *d; - - ret = rcu_lock_remote_domain_by_id(op->domid, &d); - if ( ret ) - break; - if ( d->cpupool == NULL ) - { - ret = -EINVAL; - rcu_unlock_domain(d); - break; - } - if ( op->cpupool_id == d->cpupool->cpupool_id ) - { - ret = 0; - rcu_unlock_domain(d); - break; - } - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n", - d->domain_id, op->cpupool_id); - ret = -ENOENT; - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", - d->domain_id, op->cpupool_id, ret); - rcu_unlock_domain(d); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: - { - ret = cpumask_to_xenctl_bitmap( - &op->cpumap, &cpupool_free_cpus); - } - break; - - default: - ret = -ENOSYS; - break; - } - - return ret; -} - -void dump_runq(unsigned char key) -{ - unsigned long flags; - s_time_t now = NOW(); - struct cpupool **c; - - spin_lock(&cpupool_lock); - local_irq_save(flags); - - printk("sched_smt_power_savings: %s\n", - sched_smt_power_savings? 
"enabled":"disabled"); - printk("NOW=%"PRI_stime"\n", now); - - printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map)); - if ( !cpumask_empty(&cpupool_free_cpus) ) - { - printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus)); - schedule_dump(NULL); - } - - for_each_cpupool(c) - { - printk("Cpupool %d:\n", (*c)->cpupool_id); - printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid)); - schedule_dump(*c); - } - - local_irq_restore(flags); - spin_unlock(&cpupool_lock); -} - -static int cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - switch ( action ) - { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_add(cpu); - break; - case CPU_DOWN_PREPARE: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_remove_prologue(cpu); - break; - case CPU_DYING: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - cpupool_cpu_remove(cpu); - break; - case CPU_RESUME_FAILED: - cpupool_cpu_remove_forced(cpu); - break; - default: - break; - } - - return !rc ? NOTIFY_DONE : notifier_from_errno(rc); -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __init cpupool_init(void) -{ - unsigned int cpu; - int err; - - cpupool_gran_init(); - - cpupool0 = cpupool_create(0, 0, &err); - BUG_ON(cpupool0 == NULL); - cpupool_put(cpupool0); - register_cpu_notifier(&cpu_nfb); - - spin_lock(&cpupool_lock); - - cpumask_copy(&cpupool_free_cpus, &cpu_online_map); - - for_each_cpu ( cpu, &cpupool_free_cpus ) - cpupool_assign_cpu_locked(cpupool0, cpu); - - spin_unlock(&cpupool_lock); - - return 0; -} -__initcall(cpupool_init); - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/sched/Kconfig b/xen/common/sched/Kconfig new file mode 100644 index 0000000000..883ac87cab --- /dev/null +++ b/xen/common/sched/Kconfig @@ -0,0 +1,65 @@ +menu "Schedulers" + visible if EXPERT = "y" + +config SCHED_CREDIT + bool "Credit scheduler support" + default y + ---help--- + The traditional credit scheduler is a general purpose scheduler. + +config SCHED_CREDIT2 + bool "Credit2 scheduler support" + default y + ---help--- + The credit2 scheduler is a general purpose scheduler that is + optimized for lower latency and higher VM density. + +config SCHED_RTDS + bool "RTDS scheduler support (EXPERIMENTAL)" + default y + ---help--- + The RTDS scheduler is a soft and firm real-time scheduler for + multicore, targeted for embedded, automotive, graphics and gaming + in the cloud, and general low-latency workloads. + +config SCHED_ARINC653 + bool "ARINC653 scheduler support (EXPERIMENTAL)" + default DEBUG + ---help--- + The ARINC653 scheduler is a hard real-time scheduler for single + cores, targeted for avionics, drones, and medical devices. + +config SCHED_NULL + bool "Null scheduler support (EXPERIMENTAL)" + default y + ---help--- + The null scheduler is a static, zero overhead scheduler, + for when there always are less vCPUs than pCPUs, typically + in embedded or HPC scenarios. + +choice + prompt "Default Scheduler?" 
+ default SCHED_CREDIT2_DEFAULT + + config SCHED_CREDIT_DEFAULT + bool "Credit Scheduler" if SCHED_CREDIT + config SCHED_CREDIT2_DEFAULT + bool "Credit2 Scheduler" if SCHED_CREDIT2 + config SCHED_RTDS_DEFAULT + bool "RT Scheduler" if SCHED_RTDS + config SCHED_ARINC653_DEFAULT + bool "ARINC653 Scheduler" if SCHED_ARINC653 + config SCHED_NULL_DEFAULT + bool "Null Scheduler" if SCHED_NULL +endchoice + +config SCHED_DEFAULT + string + default "credit" if SCHED_CREDIT_DEFAULT + default "credit2" if SCHED_CREDIT2_DEFAULT + default "rtds" if SCHED_RTDS_DEFAULT + default "arinc653" if SCHED_ARINC653_DEFAULT + default "null" if SCHED_NULL_DEFAULT + default "credit2" + +endmenu diff --git a/xen/common/sched/Makefile b/xen/common/sched/Makefile new file mode 100644 index 0000000000..3537f2a68d --- /dev/null +++ b/xen/common/sched/Makefile @@ -0,0 +1,7 @@ +obj-y += cpupool.o +obj-$(CONFIG_SCHED_ARINC653) += arinc653.o +obj-$(CONFIG_SCHED_CREDIT) += credit.o +obj-$(CONFIG_SCHED_CREDIT2) += credit2.o +obj-$(CONFIG_SCHED_RTDS) += rt.o +obj-$(CONFIG_SCHED_NULL) += null.o +obj-y += core.o diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c new file mode 100644 index 0000000000..565575c326 --- /dev/null +++ b/xen/common/sched/arinc653.c @@ -0,0 +1,739 @@ +/****************************************************************************** + * sched_arinc653.c + * + * An ARINC653-compatible scheduling algorithm for use in Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2010, DornerWorks, Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************************************************** + * Private Macros * + **************************************************************************/ + +/** + * Default timeslice for domain 0. 
+ */ +#define DEFAULT_TIMESLICE MILLISECS(10) + +/** + * Retrieve the idle UNIT for a given physical CPU + */ +#define IDLETASK(cpu) (sched_idle_unit(cpu)) + +/** + * Return a pointer to the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit) + */ +#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv) + +/** + * Return the global scheduler private data given the scheduler ops pointer + */ +#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) + +/************************************************************************** + * Private Type Definitions * + **************************************************************************/ + +/** + * The arinc653_unit_t structure holds ARINC 653-scheduler-specific + * information for all non-idle UNITs + */ +typedef struct arinc653_unit_s +{ + /* unit points to Xen's struct sched_unit so we can get to it from an + * arinc653_unit_t pointer. */ + struct sched_unit * unit; + /* awake holds whether the UNIT has been woken with vcpu_wake() */ + bool_t awake; + /* list holds the linked list information for the list this UNIT + * is stored in */ + struct list_head list; +} arinc653_unit_t; + +/** + * The sched_entry_t structure holds a single entry of the + * ARINC 653 schedule. + */ +typedef struct sched_entry_s +{ + /* dom_handle holds the handle ("UUID") for the domain that this + * schedule entry refers to. */ + xen_domain_handle_t dom_handle; + /* unit_id holds the UNIT number for the UNIT that this schedule + * entry refers to. */ + int unit_id; + /* runtime holds the number of nanoseconds that the UNIT for this + * schedule entry should be allowed to run per major frame. */ + s_time_t runtime; + /* unit holds a pointer to the Xen sched_unit structure */ + struct sched_unit * unit; +} sched_entry_t; + +/** + * This structure defines data that is global to an instance of the scheduler + */ +typedef struct a653sched_priv_s +{ + /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ + spinlock_t lock; + + /** + * This array holds the active ARINC 653 schedule. + * + * When the system tries to start a new UNIT, this schedule is scanned + * to look for a matching (handle, UNIT #) pair. If both the handle (UUID) + * and UNIT number match, then the UNIT is allowed to run. Its run time + * (per major frame) is given in the third entry of the schedule. + */ + sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; + + /** + * This variable holds the number of entries that are valid in + * the arinc653_schedule table. + * + * This is not necessarily the same as the number of domains in the + * schedule. A domain could be listed multiple times within the schedule, + * or a domain with multiple UNITs could have a different + * schedule entry for each UNIT. + */ + unsigned int num_schedule_entries; + + /** + * the major frame time for the ARINC 653 schedule. + */ + s_time_t major_frame; + + /** + * the time that the next major frame starts + */ + s_time_t next_major_frame; + + /** + * pointers to all Xen UNIT structures for iterating through + */ + struct list_head unit_list; +} a653sched_priv_t; + +/************************************************************************** + * Helper functions * + **************************************************************************/ + +/** + * This function compares two domain handles. + * + * @param h1 Pointer to handle 1 + * @param h2 Pointer to handle 2 + * + * @return
+ *         <0: handle 1 is less than handle 2
+ *          0: handle 1 is equal to handle 2
+ *         >0: handle 1 is greater than handle 2
+ */ +static int dom_handle_cmp(const xen_domain_handle_t h1, + const xen_domain_handle_t h2) +{ + return memcmp(h1, h2, sizeof(xen_domain_handle_t)); +} + +/** + * This function searches the unit list to find a UNIT that matches + * the domain handle and UNIT ID specified. + * + * @param ops Pointer to this instance of the scheduler structure + * @param handle Pointer to handler + * @param unit_id UNIT ID + * + * @return
+ *         Pointer to the matching UNIT if one is found
+ *         NULL otherwise
+ */ +static struct sched_unit *find_unit( + const struct scheduler *ops, + xen_domain_handle_t handle, + int unit_id) +{ + arinc653_unit_t *aunit; + + /* loop through the unit_list looking for the specified UNIT */ + list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list ) + if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0) + && (unit_id == aunit->unit->unit_id) ) + return aunit->unit; + + return NULL; +} + +/** + * This function updates the pointer to the Xen UNIT structure for each entry + * in the ARINC 653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * @return + */ +static void update_schedule_units(const struct scheduler *ops) +{ + unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; + + for ( i = 0; i < n_entries; i++ ) + SCHED_PRIV(ops)->schedule[i].unit = + find_unit(ops, + SCHED_PRIV(ops)->schedule[i].dom_handle, + SCHED_PRIV(ops)->schedule[i].unit_id); +} + +/** + * This function is called by the adjust_global scheduler hook to put + * in place a new ARINC653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +arinc653_sched_set( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + s_time_t total_runtime = 0; + unsigned int i; + unsigned long flags; + int rc = -EINVAL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* Check for valid major frame and number of schedule entries. */ + if ( (schedule->major_frame <= 0) + || (schedule->num_sched_entries < 1) + || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) + goto fail; + + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + /* Check for a valid run time. */ + if ( schedule->sched_entries[i].runtime <= 0 ) + goto fail; + + /* Add this entry's run time to total run time. */ + total_runtime += schedule->sched_entries[i].runtime; + } + + /* + * Error if the major frame is not large enough to run all entries as + * indicated by comparing the total run time to the major frame length. + */ + if ( total_runtime > schedule->major_frame ) + goto fail; + + /* Copy the new schedule into place. */ + sched_priv->num_schedule_entries = schedule->num_sched_entries; + sched_priv->major_frame = schedule->major_frame; + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + memcpy(sched_priv->schedule[i].dom_handle, + schedule->sched_entries[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + sched_priv->schedule[i].unit_id = + schedule->sched_entries[i].vcpu_id; + sched_priv->schedule[i].runtime = + schedule->sched_entries[i].runtime; + } + update_schedule_units(ops); + + /* + * The newly-installed schedule takes effect immediately. We do not even + * wait for the current major frame to expire. + * + * Signal a new major frame to begin. The next major frame is set up by + * the do_schedule callback function when it is next invoked. + */ + sched_priv->next_major_frame = NOW(); + + rc = 0; + + fail: + spin_unlock_irqrestore(&sched_priv->lock, flags); + return rc; +} + +/** + * This function is called by the adjust_global scheduler hook to read the + * current ARINC 653 schedule + * + * @param ops Pointer to this instance of the scheduler structure + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +arinc653_sched_get( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + unsigned int i; + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + schedule->num_sched_entries = sched_priv->num_schedule_entries; + schedule->major_frame = sched_priv->major_frame; + for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) + { + memcpy(schedule->sched_entries[i].dom_handle, + sched_priv->schedule[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id; + schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; + } + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return 0; +} + +/************************************************************************** + * Scheduler callback functions * + **************************************************************************/ + +/** + * This function performs initialization for an instance of the scheduler. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +a653sched_init(struct scheduler *ops) +{ + a653sched_priv_t *prv; + + prv = xzalloc(a653sched_priv_t); + if ( prv == NULL ) + return -ENOMEM; + + ops->sched_data = prv; + + prv->next_major_frame = 0; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->unit_list); + + return 0; +} + +/** + * This function performs deinitialization for an instance of the scheduler + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_deinit(struct scheduler *ops) +{ + xfree(SCHED_PRIV(ops)); + ops->sched_data = NULL; +} + +/** + * This function allocates scheduler-specific data for a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Pointer to the allocated data + */ +static void * +a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *svc; + unsigned int entry; + unsigned long flags; + + /* + * Allocate memory for the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit). + */ + svc = xmalloc(arinc653_unit_t); + if ( svc == NULL ) + return NULL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* + * Add every one of dom0's units to the schedule, as long as there are + * slots available. + */ + if ( unit->domain->domain_id == 0 ) + { + entry = sched_priv->num_schedule_entries; + + if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) + { + sched_priv->schedule[entry].dom_handle[0] = '\0'; + sched_priv->schedule[entry].unit_id = unit->unit_id; + sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; + sched_priv->schedule[entry].unit = unit; + + sched_priv->major_frame += DEFAULT_TIMESLICE; + ++sched_priv->num_schedule_entries; + } + } + + /* + * Initialize our ARINC 653 scheduler-specific information for the UNIT. + * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it + * will call the vcpu_wake scheduler callback function and our scheduler + * will mark the UNIT awake. + */ + svc->unit = unit; + svc->awake = 0; + if ( !is_idle_unit(unit) ) + list_add(&svc->list, &SCHED_PRIV(ops)->unit_list); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return svc; +} + +/** + * This function frees scheduler-specific UNIT data + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_free_udata(const struct scheduler *ops, void *priv) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *av = priv; + unsigned long flags; + + if (av == NULL) + return; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( !is_idle_unit(av->unit) ) + list_del(&av->list); + + xfree(av); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); +} + +/** + * Xen scheduler callback function to sleep a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 0; + + /* + * If the UNIT being put to sleep is the same one that is currently + * running, raise a softirq to invoke the scheduler to switch domains. 
+ */ + if ( get_sched_res(sched_unit_master(unit))->curr == unit ) + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to wake up a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 1; + + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to select a UNIT to run. + * This is the main scheduler routine. + * + * @param ops Pointer to this instance of the scheduler structure + * @param now Current time + */ +static void +a653sched_do_schedule( + const struct scheduler *ops, + struct sched_unit *prev, + s_time_t now, + bool tasklet_work_scheduled) +{ + struct sched_unit *new_task = NULL; + static unsigned int sched_index = 0; + static s_time_t next_switch_time; + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + const unsigned int cpu = sched_get_resource_cpu(smp_processor_id()); + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( sched_priv->num_schedule_entries < 1 ) + sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; + else if ( now >= sched_priv->next_major_frame ) + { + /* time to enter a new major frame + * the first time this function is called, this will be true */ + /* start with the first domain in the schedule */ + sched_index = 0; + sched_priv->next_major_frame = now + sched_priv->major_frame; + next_switch_time = now + sched_priv->schedule[0].runtime; + } + else + { + while ( (now >= next_switch_time) + && (sched_index < sched_priv->num_schedule_entries) ) + { + /* time to switch to the next domain in this major frame */ + sched_index++; + next_switch_time += sched_priv->schedule[sched_index].runtime; + } + } + + /* + * If we exhausted the domains in the schedule and still have time left + * in the major frame then switch next at the next major frame. + */ + if ( sched_index >= sched_priv->num_schedule_entries ) + next_switch_time = sched_priv->next_major_frame; + + /* + * If there are more domains to run in the current major frame, set + * new_task equal to the address of next domain's sched_unit structure. + * Otherwise, set new_task equal to the address of the idle task's + * sched_unit structure. + */ + new_task = (sched_index < sched_priv->num_schedule_entries) + ? sched_priv->schedule[sched_index].unit + : IDLETASK(cpu); + + /* Check to see if the new task can be run (awake & runnable). */ + if ( !((new_task != NULL) + && (AUNIT(new_task) != NULL) + && AUNIT(new_task)->awake + && unit_runnable_state(new_task)) ) + new_task = IDLETASK(cpu); + BUG_ON(new_task == NULL); + + /* + * Check to make sure we did not miss a major frame. + * This is a good test for robust partitioning. + */ + BUG_ON(now >= sched_priv->next_major_frame); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + /* Tasklet work (which runs in idle UNIT context) overrides all else. */ + if ( tasklet_work_scheduled ) + new_task = IDLETASK(cpu); + + /* Running this task would result in a migration */ + if ( !is_idle_unit(new_task) + && (sched_unit_master(new_task) != cpu) ) + new_task = IDLETASK(cpu); + + /* + * Return the amount of time the next domain has to run and the address + * of the selected task's UNIT structure. 
+ */ + prev->next_time = next_switch_time - now; + prev->next_task = new_task; + new_task->migrated = false; + + BUG_ON(prev->next_time <= 0); +} + +/** + * Xen scheduler callback function to select a resource for the UNIT to run on + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Scheduler resource to run on + */ +static struct sched_resource * +a653sched_pick_resource(const struct scheduler *ops, + const struct sched_unit *unit) +{ + cpumask_t *online; + unsigned int cpu; + + /* + * If present, prefer unit's current processor, else + * just find the first valid unit. + */ + online = cpupool_domain_master_cpumask(unit->domain); + + cpu = cpumask_first(online); + + if ( cpumask_test_cpu(sched_unit_master(unit), online) + || (cpu >= nr_cpu_ids) ) + cpu = sched_unit_master(unit); + + return get_sched_res(cpu); +} + +/** + * Xen scheduler callback to change the scheduler of a cpu + * + * @param new_ops Pointer to this instance of the scheduler structure + * @param cpu The cpu that is changing scheduler + * @param pdata scheduler specific PCPU data (we don't have any) + * @param vdata scheduler specific UNIT data of the idle unit + */ +static spinlock_t * +a653_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + arinc653_unit_t *svc = vdata; + + ASSERT(!pdata && svc && is_idle_unit(svc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + return &sr->_lock; +} + +/** + * Xen scheduler callback function to perform a global (not domain-specific) + * adjustment. It is used by the ARINC 653 scheduler to put in place a new + * ARINC 653 schedule or to retrieve the schedule currently in place. + * + * @param ops Pointer to this instance of the scheduler structure + * @param sc Pointer to the scheduler operation specified by Domain 0 + */ +static int +a653sched_adjust_global(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + struct xen_sysctl_arinc653_schedule local_sched; + int rc = -EINVAL; + + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) + { + rc = -EFAULT; + break; + } + + rc = arinc653_sched_set(ops, &local_sched); + break; + case XEN_SYSCTL_SCHEDOP_getinfo: + memset(&local_sched, -1, sizeof(local_sched)); + rc = arinc653_sched_get(ops, &local_sched); + if ( rc ) + break; + + if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) + rc = -EFAULT; + break; + } + + return rc; +} + +/** + * This structure defines our scheduler for Xen. + * The entries tell Xen where to find our scheduler-specific + * callback functions. + * The symbol must be visible to the rest of Xen at link time. 
+ */ +static const struct scheduler sched_arinc653_def = { + .name = "ARINC 653 Scheduler", + .opt_name = "arinc653", + .sched_id = XEN_SCHEDULER_ARINC653, + .sched_data = NULL, + + .init = a653sched_init, + .deinit = a653sched_deinit, + + .free_udata = a653sched_free_udata, + .alloc_udata = a653sched_alloc_udata, + + .insert_unit = NULL, + .remove_unit = NULL, + + .sleep = a653sched_unit_sleep, + .wake = a653sched_unit_wake, + .yield = NULL, + .context_saved = NULL, + + .do_schedule = a653sched_do_schedule, + + .pick_resource = a653sched_pick_resource, + + .switch_sched = a653_switch_sched, + + .adjust = NULL, + .adjust_global = a653sched_adjust_global, + + .dump_settings = NULL, + .dump_cpu_state = NULL, +}; + +REGISTER_SCHEDULER(sched_arinc653_def); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c new file mode 100644 index 0000000000..040b4caca2 --- /dev/null +++ b/xen/common/sched/compat.c @@ -0,0 +1,55 @@ +/**************************************************************************** + * schedule.c + * + */ + +#include + +#define COMPAT +#define ret_t int + +#define do_sched_op compat_sched_op + +#define xen_sched_pin_override sched_pin_override +CHECK_sched_pin_override; +#undef xen_sched_pin_override + +#define xen_sched_shutdown sched_shutdown +CHECK_sched_shutdown; +#undef xen_sched_shutdown + +#define xen_sched_remote_shutdown sched_remote_shutdown +CHECK_sched_remote_shutdown; +#undef xen_sched_remote_shutdown + +static int compat_poll(struct compat_sched_poll *compat) +{ + struct sched_poll native; + +#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ + guest_from_compat_handle((_d_)->ports, (_s_)->ports) + XLAT_sched_poll(&native, compat); +#undef XLAT_sched_poll_HNDL_ports + + return do_poll(&native); +} + +#define do_poll compat_poll +#define sched_poll compat_sched_poll + +#include "core.c" + +int compat_set_timer_op(u32 lo, s32 hi) +{ + return do_set_timer_op(((s64)hi << 32) | lo); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c new file mode 100644 index 0000000000..4d8eb4c617 --- /dev/null +++ b/xen/common/sched/core.c @@ -0,0 +1,3144 @@ +/**************************************************************************** + * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge + * (C) 2002-2003 University of Cambridge + * (C) 2004 - Mark Williamson - Intel Research Cambridge + **************************************************************************** + * + * File: common/schedule.c + * Author: Rolf Neugebauer & Keir Fraser + * Updated for generic API by Mark Williamson + * + * Description: Generic CPU scheduling code + * implements support functionality for the Xen scheduler API. 
+ * + */ + +#ifndef COMPAT +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XEN_GUEST +#include +#else +#define pv_shim false +#endif + +/* opt_sched: scheduler - default to configured value */ +static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; +string_param("sched", opt_sched); + +/* if sched_smt_power_savings is set, + * scheduler will give preferrence to partially idle package compared to + * the full idle package, when picking pCPU to schedule vCPU. + */ +bool_t sched_smt_power_savings = 0; +boolean_param("sched_smt_power_savings", sched_smt_power_savings); + +/* Default scheduling rate limit: 1ms + * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined + * */ +int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; +integer_param("sched_ratelimit_us", sched_ratelimit_us); + +/* Number of vcpus per struct sched_unit. */ +bool __read_mostly sched_disable_smt_switching; +cpumask_t sched_res_mask; + +/* Common lock for free cpus. */ +static DEFINE_SPINLOCK(sched_free_cpu_lock); + +/* Various timer handlers. */ +static void s_timer_fn(void *unused); +static void vcpu_periodic_timer_fn(void *data); +static void vcpu_singleshot_timer_fn(void *data); +static void poll_timer_fn(void *data); + +/* This is global for now so that private implementations can reach it */ +DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res); +static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx); +DEFINE_RCU_READ_LOCK(sched_res_rculock); + +/* Scratch space for cpumasks. */ +DEFINE_PER_CPU(cpumask_t, cpumask_scratch); + +/* How many urgent vcpus. */ +DEFINE_PER_CPU(atomic_t, sched_urgent_count); + +extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; +#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) +#define schedulers __start_schedulers_array + +static struct scheduler __read_mostly ops; + +static bool scheduler_active; + +static void sched_set_affinity( + struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft); + +static struct sched_resource * +sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + return unit->res; +} + +static void * +sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + /* Any non-NULL pointer is fine here. */ + return ZERO_BLOCK_PTR; +} + +static void +sched_idle_free_udata(const struct scheduler *ops, void *priv) +{ +} + +static void sched_idle_schedule( + const struct scheduler *ops, struct sched_unit *unit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cpu = smp_processor_id(); + + unit->next_time = -1; + unit->next_task = sched_idle_unit(cpu); +} + +static struct scheduler sched_idle_ops = { + .name = "Idle Scheduler", + .opt_name = "idle", + .sched_data = NULL, + + .pick_resource = sched_idle_res_pick, + .do_schedule = sched_idle_schedule, + + .alloc_udata = sched_idle_alloc_udata, + .free_udata = sched_idle_free_udata, +}; + +static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu); + const struct domain *d = unit->domain; + + return (idx < d->max_vcpus) ? 
d->vcpu[idx] : NULL; +} + +static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + struct vcpu *v = unit2vcpu_cpu(unit, cpu); + + return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu]; +} + +static inline struct scheduler *dom_scheduler(const struct domain *d) +{ + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is the idle domain. This is special + * because the idle domain does not really belong to any cpupool, and, + * hence, does not really have a scheduler. + * + * This is (should be!) only called like this for allocating the idle + * vCPUs for the first time, during boot, in which case what we want + * is the default scheduler that has been, choosen at boot. + */ + ASSERT(is_idle_domain(d)); + return &ops; +} + +static inline struct scheduler *unit_scheduler(const struct sched_unit *unit) +{ + struct domain *d = unit->domain; + + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is a unit of the idle domain. And this + * case is special because the idle domain does not really belong to + * a cpupool and, hence, doesn't really have a scheduler). In fact, its + * units (may) run on pCPUs which are in different pools, with different + * schedulers. + * + * What we want, in this case, is the scheduler of the pCPU where this + * particular idle unit is running. And, since unit->res never changes + * for idle units, it is safe to use it, with no locks, to figure that out. + */ + + ASSERT(is_idle_domain(d)); + return unit->res->scheduler; +} + +static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) +{ + return unit_scheduler(v->sched_unit); +} +#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) + +static inline void trace_runstate_change(struct vcpu *v, int new_state) +{ + struct { uint32_t vcpu:16, domain:16; } d; + uint32_t event; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + event = TRC_SCHED_RUNSTATE_CHANGE; + event |= ( v->runstate.state & 0x3 ) << 8; + event |= ( new_state & 0x3 ) << 4; + + __trace_var(event, 1/*tsc*/, sizeof(d), &d); +} + +static inline void trace_continue_running(struct vcpu *v) +{ + struct { uint32_t vcpu:16, domain:16; } d; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); +} + +static inline void vcpu_urgent_count_update(struct vcpu *v) +{ + if ( is_idle_vcpu(v) ) + return; + + if ( unlikely(v->is_urgent) ) + { + if ( !(v->pause_flags & VPF_blocked) || + !test_bit(v->vcpu_id, v->domain->poll_mask) ) + { + v->is_urgent = 0; + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + } + } + else + { + if ( unlikely(v->pause_flags & VPF_blocked) && + unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) + { + v->is_urgent = 1; + atomic_inc(&per_cpu(sched_urgent_count, v->processor)); + } + } +} + +static inline void vcpu_runstate_change( + struct vcpu *v, int new_state, s_time_t new_entry_time) +{ + s_time_t delta; + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + if ( v->runstate.state == new_state ) + return; + + vcpu_urgent_count_update(v); + + trace_runstate_change(v, new_state); + + if ( !is_idle_vcpu(v) ) + { + unit->runstate_cnt[v->runstate.state]--; + unit->runstate_cnt[new_state]++; + } + + delta = new_entry_time - 
v->runstate.state_entry_time; + if ( delta > 0 ) + { + v->runstate.time[v->runstate.state] += delta; + v->runstate.state_entry_time = new_entry_time; + } + + v->runstate.state = new_state; +} + +void sched_guest_idle(void (*idle) (void), unsigned int cpu) +{ + /* + * Another vcpu of the unit is active in guest context while this one is + * idle. In case of a scheduling event we don't want to have high latencies + * due to a cpu needing to wake up from deep C state for joining the + * rendezvous, so avoid those deep C states by incrementing the urgent + * count of the cpu. + */ + atomic_inc(&per_cpu(sched_urgent_count, cpu)); + idle(); + atomic_dec(&per_cpu(sched_urgent_count, cpu)); +} + +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) +{ + spinlock_t *lock; + s_time_t delta; + + rcu_read_lock(&sched_res_rculock); + + lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit); + memcpy(runstate, &v->runstate, sizeof(*runstate)); + delta = NOW() - runstate->state_entry_time; + if ( delta > 0 ) + runstate->time[runstate->state] += delta; + + if ( unlikely(lock != NULL) ) + unit_schedule_unlock_irq(lock, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +uint64_t get_cpu_idle_time(unsigned int cpu) +{ + struct vcpu_runstate_info state = { 0 }; + struct vcpu *v = idle_vcpu[cpu]; + + if ( cpu_online(cpu) && v ) + vcpu_runstate_get(v, &state); + + return state.time[RUNSTATE_running]; +} + +/* + * If locks are different, take the one with the lower address first. + * This avoids dead- or live-locks when this code is running on both + * cpus at the same time. + */ +static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long *flags) +{ + if ( lock1 == lock2 ) + { + spin_lock_irqsave(lock1, *flags); + } + else if ( lock1 < lock2 ) + { + spin_lock_irqsave(lock1, *flags); + spin_lock(lock2); + } + else + { + spin_lock_irqsave(lock2, *flags); + spin_lock(lock1); + } +} + +static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long flags) +{ + if ( lock1 != lock2 ) + spin_unlock(lock2); + spin_unlock_irqrestore(lock1, flags); +} + +static void sched_free_unit_mem(struct sched_unit *unit) +{ + struct sched_unit *prev_unit; + struct domain *d = unit->domain; + + if ( d->sched_unit_list == unit ) + d->sched_unit_list = unit->next_in_list; + else + { + for_each_sched_unit ( d, prev_unit ) + { + if ( prev_unit->next_in_list == unit ) + { + prev_unit->next_in_list = unit->next_in_list; + break; + } + } + } + + free_cpumask_var(unit->cpu_hard_affinity); + free_cpumask_var(unit->cpu_hard_affinity_saved); + free_cpumask_var(unit->cpu_soft_affinity); + + xfree(unit); +} + +static void sched_free_unit(struct sched_unit *unit, struct vcpu *v) +{ + struct vcpu *vunit; + unsigned int cnt = 0; + + /* Don't count to be released vcpu, might be not in vcpu list yet. */ + for_each_sched_unit_vcpu ( unit, vunit ) + if ( vunit != v ) + cnt++; + + v->sched_unit = NULL; + unit->runstate_cnt[v->runstate.state]--; + + if ( unit->vcpu_list == v ) + unit->vcpu_list = v->next_in_list; + + if ( !cnt ) + sched_free_unit_mem(unit); +} + +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v) +{ + v->sched_unit = unit; + + /* All but idle vcpus are allocated with sequential vcpu_id. */ + if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id ) + { + unit->vcpu_list = v; + /* + * unit_id is always the same as lowest vcpu_id of unit. 
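/*
 * A self-contained sketch of the "lower address first" lock ordering used by
 * sched_spin_lock_double() above, here with POSIX mutexes.  Whatever order
 * callers name the two locks in, both acquire them in the same global order,
 * so an ABBA deadlock cannot occur.  Helper names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a == b )
        pthread_mutex_lock(a);
    else if ( a < b )
    {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    }
    else
    {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a != b )
        pthread_mutex_unlock(b);
    pthread_mutex_unlock(a);
}

int main(void)
{
    pthread_mutex_t l1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t l2 = PTHREAD_MUTEX_INITIALIZER;

    lock_double(&l2, &l1);      /* caller order does not matter */
    puts("both locks held");
    unlock_double(&l2, &l1);
    return 0;
}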
+ * This is used for stopping for_each_sched_unit_vcpu() loop and in + * order to support cpupools with different granularities. + */ + unit->unit_id = v->vcpu_id; + } + unit->runstate_cnt[v->runstate.state]++; +} + +static struct sched_unit *sched_alloc_unit_mem(void) +{ + struct sched_unit *unit; + + unit = xzalloc(struct sched_unit); + if ( !unit ) + return NULL; + + if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || + !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || + !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) + { + sched_free_unit_mem(unit); + unit = NULL; + } + + return unit; +} + +static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) +{ + struct sched_unit **prev_unit; + + unit->domain = d; + + for ( prev_unit = &d->sched_unit_list; *prev_unit; + prev_unit = &(*prev_unit)->next_in_list ) + if ( (*prev_unit)->next_in_list && + (*prev_unit)->next_in_list->unit_id > unit->unit_id ) + break; + + unit->next_in_list = *prev_unit; + *prev_unit = unit; +} + +static struct sched_unit *sched_alloc_unit(struct vcpu *v) +{ + struct sched_unit *unit; + struct domain *d = v->domain; + unsigned int gran = cpupool_get_granularity(d->cpupool); + + for_each_sched_unit ( d, unit ) + if ( unit->unit_id / gran == v->vcpu_id / gran ) + break; + + if ( unit ) + { + sched_unit_add_vcpu(unit, v); + return unit; + } + + if ( (unit = sched_alloc_unit_mem()) == NULL ) + return NULL; + + sched_unit_add_vcpu(unit, v); + sched_domain_insert_unit(unit, d); + + return unit; +} + +static unsigned int sched_select_initial_cpu(const struct vcpu *v) +{ + const struct domain *d = v->domain; + nodeid_t node; + spinlock_t *lock; + unsigned long flags; + unsigned int cpu_ret, cpu = smp_processor_id(); + cpumask_t *cpus = cpumask_scratch_cpu(cpu); + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + cpumask_clear(cpus); + for_each_node_mask ( node, d->node_affinity ) + cpumask_or(cpus, cpus, &node_to_cpumask(node)); + cpumask_and(cpus, cpus, d->cpupool->cpu_valid); + if ( cpumask_empty(cpus) ) + cpumask_copy(cpus, d->cpupool->cpu_valid); + + if ( v->vcpu_id == 0 ) + cpu_ret = cpumask_first(cpus); + else + { + /* We can rely on previous vcpu being available. */ + ASSERT(!is_idle_domain(d)); + + cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); + + return cpu_ret; +} + +int sched_init_vcpu(struct vcpu *v) +{ + struct domain *d = v->domain; + struct sched_unit *unit; + unsigned int processor; + + if ( (unit = sched_alloc_unit(v)) == NULL ) + return 1; + + if ( is_idle_domain(d) ) + processor = v->vcpu_id; + else + processor = sched_select_initial_cpu(v); + + /* Initialise the per-vcpu timers. */ + spin_lock_init(&v->periodic_timer_lock); + init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); + init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); + init_timer(&v->poll_timer, poll_timer_fn, v, processor); + + /* If this is not the first vcpu of the unit we are done. */ + if ( unit->priv != NULL ) + { + v->processor = processor; + return 0; + } + + rcu_read_lock(&sched_res_rculock); + + /* The first vcpu of an unit can be set via sched_set_res(). */ + sched_set_res(unit, get_sched_res(processor)); + + unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); + if ( unit->priv == NULL ) + { + sched_free_unit(unit, v); + rcu_read_unlock(&sched_res_rculock); + return 1; + } + + /* + * Initialize affinity settings. 
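/*
 * A small sketch of how sched_alloc_unit() above groups vCPUs into units for
 * a given cpupool granularity: two vCPUs share a unit iff their ids divided by
 * the granularity are equal, and the unit id is the lowest vcpu_id in the
 * group.  Purely illustrative arithmetic, no Xen structures involved.
 */
#include <stdio.h>

static unsigned int unit_id_of(unsigned int vcpu_id, unsigned int gran)
{
    return (vcpu_id / gran) * gran;   /* lowest vcpu_id of the unit */
}

int main(void)
{
    const unsigned int gran = 2;      /* e.g. core scheduling on 2-thread SMT */

    for ( unsigned int v = 0; v < 6; v++ )
        printf("vcpu %u -> unit %u\n", v, unit_id_of(v, gran));
    /* Prints: 0->0, 1->0, 2->2, 3->2, 4->4, 5->4 */
    return 0;
}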
The idler, and potentially + * domain-0 VCPUs, are pinned onto their respective physical CPUs. + */ + if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) + sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); + else + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ + if ( is_idle_domain(d) ) + { + get_sched_res(v->processor)->curr = unit; + get_sched_res(v->processor)->sched_unit_idle = unit; + v->is_running = 1; + unit->is_running = true; + unit->state_entry_time = NOW(); + } + else + { + sched_insert_unit(dom_scheduler(d), unit); + } + + rcu_read_unlock(&sched_res_rculock); + + return 0; +} + +static void vcpu_move_irqs(struct vcpu *v) +{ + arch_move_irqs(v); + evtchn_move_pirqs(v); +} + +static void sched_move_irqs(const struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + vcpu_move_irqs(v); +} + +int sched_move_domain(struct domain *d, struct cpupool *c) +{ + struct vcpu *v; + struct sched_unit *unit; + unsigned int new_p, unit_idx; + void **unit_priv; + void *domdata; + void *unitdata; + struct scheduler *old_ops; + void *old_domdata; + unsigned int gran = cpupool_get_granularity(c); + int ret = 0; + + for_each_vcpu ( d, v ) + { + if ( v->affinity_broken ) + return -EBUSY; + } + + rcu_read_lock(&sched_res_rculock); + + domdata = sched_alloc_domdata(c->sched, d); + if ( IS_ERR(domdata) ) + { + ret = PTR_ERR(domdata); + goto out; + } + + unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran)); + if ( unit_priv == NULL ) + { + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata); + if ( unit_priv[unit_idx] == NULL ) + { + for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ ) + sched_free_udata(c->sched, unit_priv[unit_idx]); + xfree(unit_priv); + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + unit_idx++; + } + + domain_pause(d); + + old_ops = dom_scheduler(d); + old_domdata = d->sched_priv; + + for_each_sched_unit ( d, unit ) + { + sched_remove_unit(old_ops, unit); + } + + d->cpupool = c; + d->sched_priv = domdata; + + new_p = cpumask_first(c->cpu_valid); + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + spinlock_t *lock; + unsigned int unit_p = new_p; + + unitdata = unit->priv; + + for_each_sched_unit_vcpu ( unit, v ) + { + migrate_timer(&v->periodic_timer, new_p); + migrate_timer(&v->singleshot_timer, new_p); + migrate_timer(&v->poll_timer, new_p); + new_p = cpumask_cycle(new_p, c->cpu_valid); + } + + lock = unit_schedule_lock_irq(unit); + + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + sched_set_res(unit, get_sched_res(unit_p)); + /* + * With v->processor modified we must not + * - make any further changes assuming we hold the scheduler lock, + * - use unit_schedule_unlock_irq(). 
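/*
 * A sketch of the "allocate everything first, commit only if nothing failed"
 * pattern that sched_move_domain() above follows: all new per-unit data is
 * prepared up front, any allocation failure rolls everything back, and the old
 * data is swapped out and freed only once no further error is possible.  The
 * names below are illustrative only.
 */
#include <stdlib.h>

struct item { void *priv; };

static int move_items(struct item *items, size_t n, size_t new_priv_size)
{
    void **new_priv = calloc(n, sizeof(*new_priv));
    size_t i;

    if ( !new_priv )
        return -1;

    /* Phase 1: allocate all replacements; roll back completely on failure. */
    for ( i = 0; i < n; i++ )
    {
        new_priv[i] = calloc(1, new_priv_size);
        if ( !new_priv[i] )
        {
            while ( i-- )
                free(new_priv[i]);
            free(new_priv);
            return -1;
        }
    }

    /* Phase 2: commit - swap pointers and release the old data. */
    for ( i = 0; i < n; i++ )
    {
        free(items[i].priv);
        items[i].priv = new_priv[i];
    }
    free(new_priv);
    return 0;
}

int main(void)
{
    struct item items[4] = { { NULL } };

    return move_items(items, 4, 32) ? 1 : 0;
}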
+ */ + spin_unlock_irq(lock); + + unit->priv = unit_priv[unit_idx]; + if ( !d->is_dying ) + sched_move_irqs(unit); + + sched_insert_unit(c->sched, unit); + + sched_free_udata(old_ops, unitdata); + + unit_idx++; + } + + domain_update_node_affinity(d); + + domain_unpause(d); + + sched_free_domdata(old_ops, old_domdata); + + xfree(unit_priv); + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +void sched_destroy_vcpu(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + kill_timer(&v->periodic_timer); + kill_timer(&v->singleshot_timer); + kill_timer(&v->poll_timer); + if ( test_and_clear_bool(v->is_urgent) ) + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + /* + * Vcpus are being destroyed top-down. So being the first vcpu of an unit + * is the same as being the only one. + */ + if ( unit->vcpu_list == v ) + { + rcu_read_lock(&sched_res_rculock); + + sched_remove_unit(vcpu_scheduler(v), unit); + sched_free_udata(vcpu_scheduler(v), unit->priv); + sched_free_unit(unit, v); + + rcu_read_unlock(&sched_res_rculock); + } +} + +int sched_init_domain(struct domain *d, int poolid) +{ + void *sdom; + int ret; + + ASSERT(d->cpupool == NULL); + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( (ret = cpupool_add_domain(d, poolid)) ) + return ret; + + SCHED_STAT_CRANK(dom_init); + TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sdom = sched_alloc_domdata(dom_scheduler(d), d); + + rcu_read_unlock(&sched_res_rculock); + + if ( IS_ERR(sdom) ) + return PTR_ERR(sdom); + + d->sched_priv = sdom; + + return 0; +} + +void sched_destroy_domain(struct domain *d) +{ + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( d->cpupool ) + { + SCHED_STAT_CRANK(dom_destroy); + TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sched_free_domdata(dom_scheduler(d), d->sched_priv); + d->sched_priv = NULL; + + rcu_read_unlock(&sched_res_rculock); + + cpupool_rm_domain(d); + } +} + +static void vcpu_sleep_nosync_locked(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + + if ( likely(!vcpu_runnable(v)) ) + { + if ( v->runstate.state == RUNSTATE_runnable ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + + /* Only put unit to sleep in case all vcpus are not runnable. 
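/*
 * A sketch of the per-unit bookkeeping behind "only put the unit to sleep once
 * no vCPU of it is runnable" in vcpu_sleep_nosync_locked() above: keeping a
 * counter of runnable vCPUs per unit makes that check O(1) instead of a walk
 * over all vCPUs.  Types and names below are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_unit {
    unsigned int nr_vcpus;
    unsigned int nr_runnable;   /* cf. unit->runstate_cnt[] in the real code */
};

static void toy_vcpu_set_runnable(struct toy_unit *u, bool was, bool now)
{
    if ( was == now )
        return;
    if ( now )
        u->nr_runnable++;
    else
        u->nr_runnable--;
}

static bool toy_unit_should_sleep(const struct toy_unit *u)
{
    return u->nr_runnable == 0;
}

int main(void)
{
    struct toy_unit u = { .nr_vcpus = 2, .nr_runnable = 2 };

    toy_vcpu_set_runnable(&u, true, false);   /* one vCPU blocks */
    printf("sleep unit? %s\n", toy_unit_should_sleep(&u) ? "yes" : "no");
    toy_vcpu_set_runnable(&u, true, false);   /* the other one blocks too */
    printf("sleep unit? %s\n", toy_unit_should_sleep(&u) ? "yes" : "no");
    return 0;
}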
*/ + if ( likely(!unit_runnable(unit)) ) + sched_sleep(unit_scheduler(unit), unit); + else if ( unit_running(unit) > 1 && v->is_running && + !v->force_context_switch ) + { + v->force_context_switch = true; + cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); + } + } +} + +void vcpu_sleep_nosync(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + + TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); + + vcpu_sleep_nosync_locked(v); + + unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +void vcpu_sleep_sync(struct vcpu *v) +{ + vcpu_sleep_nosync(v); + + while ( !vcpu_runnable(v) && v->is_running ) + cpu_relax(); + + sync_vcpu_execstate(v); +} + +void vcpu_wake(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + struct sched_unit *unit = v->sched_unit; + + TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(unit, &flags); + + if ( likely(vcpu_runnable(v)) ) + { + if ( v->runstate.state >= RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); + /* + * Call sched_wake() unconditionally, even if unit is running already. + * We might have not been de-scheduled after vcpu_sleep_nosync_locked() + * and are now to be woken up again. + */ + sched_wake(unit_scheduler(unit), unit); + if ( unit->is_running && !v->is_running && !v->force_context_switch ) + { + v->force_context_switch = true; + cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); + } + } + else if ( !(v->pause_flags & VPF_blocked) ) + { + if ( v->runstate.state == RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + } + + unit_schedule_unlock_irqrestore(lock, flags, unit); + + rcu_read_unlock(&sched_res_rculock); +} + +void vcpu_unblock(struct vcpu *v) +{ + if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) + return; + + /* Polling period ends when a VCPU is unblocked. */ + if ( unlikely(v->poll_evtchn != 0) ) + { + v->poll_evtchn = 0; + /* + * We *must* re-clear _VPF_blocked to avoid racing other wakeups of + * this VCPU (and it then going back to sleep on poll_mask). + * Test-and-clear is idiomatic and ensures clear_bit not reordered. + */ + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + clear_bit(_VPF_blocked, &v->pause_flags); + } + + vcpu_wake(v); +} + +/* + * Do the actual movement of an unit from old to new CPU. Locks for *both* + * CPUs needs to have been taken already when calling this! + */ +static void sched_unit_move_locked(struct sched_unit *unit, + unsigned int new_cpu) +{ + unsigned int old_cpu = unit->res->master_cpu; + struct vcpu *v; + + rcu_read_lock(&sched_res_rculock); + + /* + * Transfer urgency status to new CPU before switching CPUs, as + * once the switch occurs, v->is_urgent is no longer protected by + * the per-CPU scheduler lock we are holding. + */ + for_each_sched_unit_vcpu ( unit, v ) + { + if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) + { + atomic_inc(&per_cpu(sched_urgent_count, new_cpu)); + atomic_dec(&per_cpu(sched_urgent_count, old_cpu)); + } + } + + /* + * Actual CPU switch to new CPU. This is safe because the lock + * pointer can't change while the current lock is held. 
+ */ + sched_migrate(unit_scheduler(unit), unit, new_cpu); + + rcu_read_unlock(&sched_res_rculock); +} + +/* + * Initiating migration + * + * In order to migrate, we need the unit in question to have stopped + * running and have called sched_sleep() (to take it off any + * runqueues, for instance); and if it is currently running, it needs + * to be scheduled out. Finally, we need to hold the scheduling locks + * for both the processor we're migrating from, and the processor + * we're migrating to. + * + * In order to avoid deadlock while satisfying the final requirement, + * we must release any scheduling lock we hold, then try to grab both + * locks we want, then double-check to make sure that what we started + * to do hasn't been changed in the mean time. + * + * These steps are encapsulated in the following two functions; they + * should be called like this: + * + * lock = unit_schedule_lock_irq(unit); + * sched_unit_migrate_start(unit); + * unit_schedule_unlock_irq(lock, unit) + * sched_unit_migrate_finish(unit); + * + * sched_unit_migrate_finish() will do the work now if it can, or simply + * return if it can't (because unit is still running); in that case + * sched_unit_migrate_finish() will be called by unit_context_saved(). + */ +static void sched_unit_migrate_start(struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + { + set_bit(_VPF_migrating, &v->pause_flags); + vcpu_sleep_nosync_locked(v); + } +} + +static void sched_unit_migrate_finish(struct sched_unit *unit) +{ + unsigned long flags; + unsigned int old_cpu, new_cpu; + spinlock_t *old_lock, *new_lock; + bool_t pick_called = 0; + struct vcpu *v; + + /* + * If the unit is currently running, this will be handled by + * unit_context_saved(); and in any case, if the bit is cleared, then + * someone else has already done the work so we don't need to. + */ + if ( unit->is_running ) + return; + for_each_sched_unit_vcpu ( unit, v ) + if ( !test_bit(_VPF_migrating, &v->pause_flags) ) + return; + + old_cpu = new_cpu = unit->res->master_cpu; + for ( ; ; ) + { + /* + * We need another iteration if the pre-calculated lock addresses + * are not correct any longer after evaluating old and new cpu holding + * the locks. + */ + old_lock = get_sched_res(old_cpu)->schedule_lock; + new_lock = get_sched_res(new_cpu)->schedule_lock; + + sched_spin_lock_double(old_lock, new_lock, &flags); + + old_cpu = unit->res->master_cpu; + if ( old_lock == get_sched_res(old_cpu)->schedule_lock ) + { + /* + * If we selected a CPU on the previosu iteration, check if it + * remains suitable for running this vCPU. + */ + if ( pick_called && + (new_lock == get_sched_res(new_cpu)->schedule_lock) && + cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) && + cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) + break; + + /* Select a new CPU. */ + new_cpu = sched_pick_resource(unit_scheduler(unit), + unit)->master_cpu; + if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) && + cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) + break; + pick_called = 1; + } + else + { + /* + * We do not hold the scheduler lock appropriate for this vCPU. + * Thus we cannot select a new CPU on this iteration. Try again. + */ + pick_called = 0; + } + + sched_spin_unlock_double(old_lock, new_lock, flags); + } + + /* + * NB. Check of v->running happens /after/ setting migration flag + * because they both happen in (different) spinlock regions, and those + * regions are strictly serialised. 
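/*
 * A simplified sketch of the optimistic "take a lock, then re-check it is
 * still the right one" idea that the retry loop of sched_unit_migrate_finish()
 * above applies to both runqueue locks: the lock protecting an object can be
 * repointed while we wait for it, so after acquiring we validate and retry.
 * Only one lock is shown here and the names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

struct guarded {
    pthread_mutex_t *lock;   /* may be repointed by whoever currently holds it */
    int value;
};

static pthread_mutex_t *lock_guarded(struct guarded *g)
{
    for ( ; ; )
    {
        pthread_mutex_t *candidate = g->lock;

        pthread_mutex_lock(candidate);
        if ( candidate == g->lock )       /* still the right lock? */
            return candidate;
        pthread_mutex_unlock(candidate);  /* raced with a move: retry */
    }
}

int main(void)
{
    pthread_mutex_t l = PTHREAD_MUTEX_INITIALIZER;
    struct guarded g = { .lock = &l, .value = 42 };
    pthread_mutex_t *held = lock_guarded(&g);

    printf("value = %d\n", g.value);
    pthread_mutex_unlock(held);
    return 0;
}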
+ */ + if ( unit->is_running ) + { + sched_spin_unlock_double(old_lock, new_lock, flags); + return; + } + for_each_sched_unit_vcpu ( unit, v ) + { + if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) + { + sched_spin_unlock_double(old_lock, new_lock, flags); + return; + } + } + + sched_unit_move_locked(unit, new_cpu); + + sched_spin_unlock_double(old_lock, new_lock, flags); + + if ( old_cpu != new_cpu ) + { + /* Vcpus are moved to other pcpus, commit their states to memory. */ + for_each_sched_unit_vcpu ( unit, v ) + sync_vcpu_execstate(v); + sched_move_irqs(unit); + } + + /* Wake on new CPU. */ + for_each_sched_unit_vcpu ( unit, v ) + vcpu_wake(v); +} + +static bool sched_check_affinity_broken(const struct sched_unit *unit) +{ + const struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + if ( v->affinity_broken ) + return true; + + return false; +} + +static void sched_reset_affinity_broken(struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + v->affinity_broken = false; +} + +void restore_vcpu_affinity(struct domain *d) +{ + unsigned int cpu = smp_processor_id(); + struct sched_unit *unit; + + ASSERT(system_state == SYS_STATE_resume); + + rcu_read_lock(&sched_res_rculock); + + for_each_sched_unit ( d, unit ) + { + spinlock_t *lock; + unsigned int old_cpu = sched_unit_master(unit); + struct sched_resource *res; + + ASSERT(!unit_runnable(unit)); + + /* + * Re-assign the initial processor as after resume we have no + * guarantee the old processor has come back to life again. + * + * Therefore, here, before actually unpausing the domains, we should + * set v->processor of each of their vCPUs to something that will + * make sense for the scheduler of the cpupool in which they are in. + */ + lock = unit_schedule_lock_irq(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + { + if ( sched_check_affinity_broken(unit) ) + { + sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); + sched_reset_affinity_broken(unit); + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + } + + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + { + /* Affinity settings of one vcpu are for the complete unit. */ + printk(XENLOG_DEBUG "Breaking affinity for %pv\n", + unit->vcpu_list); + sched_set_affinity(unit, &cpumask_all, NULL); + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + } + } + + res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); + sched_set_res(unit, res); + + spin_unlock_irq(lock); + + /* v->processor might have changed, so reacquire the lock. */ + lock = unit_schedule_lock_irq(unit); + res = sched_pick_resource(unit_scheduler(unit), unit); + sched_set_res(unit, res); + spin_unlock_irq(lock); + + if ( old_cpu != sched_unit_master(unit) ) + sched_move_irqs(unit); + } + + rcu_read_unlock(&sched_res_rculock); + + domain_update_node_affinity(d); +} + +/* + * This function is used by cpu_hotplug code via cpu notifier chain + * and from cpupools to switch schedulers on a cpu. + * Caller must get domlist_read_lock. 
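/*
 * A bitmask sketch of the affinity fallback cascade in restore_vcpu_affinity()
 * above: try the hard affinity intersected with the pool's online CPUs, then
 * the saved hard affinity, and as a last resort break affinity and allow any
 * CPU of the pool.  A 64-bit word stands in for cpumask_t; illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pick_cpus(uint64_t hard, uint64_t saved_hard, uint64_t pool)
{
    if ( hard & pool )
        return hard & pool;
    if ( saved_hard & pool )        /* temporary pinning broke the affinity */
        return saved_hard & pool;
    return pool;                    /* "Breaking affinity": anywhere in pool */
}

int main(void)
{
    uint64_t pool = 0x0f;           /* CPUs 0-3 online in the pool */

    printf("%#llx\n", (unsigned long long)pick_cpus(0x30, 0x03, pool)); /* 0x3 */
    printf("%#llx\n", (unsigned long long)pick_cpus(0x30, 0x30, pool)); /* 0xf */
    return 0;
}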
+ */ +int cpu_disable_scheduler(unsigned int cpu) +{ + struct domain *d; + struct cpupool *c; + cpumask_t online_affinity; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + c = get_sched_res(cpu)->cpupool; + if ( c == NULL ) + goto out; + + for_each_domain_in_cpupool ( d, c ) + { + struct sched_unit *unit; + + for_each_sched_unit ( d, unit ) + { + unsigned long flags; + spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags); + + cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid); + if ( cpumask_empty(&online_affinity) && + cpumask_test_cpu(cpu, unit->cpu_hard_affinity) ) + { + if ( sched_check_affinity_broken(unit) ) + { + /* The unit is temporarily pinned, can't move it. */ + unit_schedule_unlock_irqrestore(lock, flags, unit); + ret = -EADDRINUSE; + break; + } + + printk(XENLOG_DEBUG "Breaking affinity for %pv\n", + unit->vcpu_list); + + sched_set_affinity(unit, &cpumask_all, NULL); + } + + if ( unit->res != get_sched_res(cpu) ) + { + /* The unit is not on this cpu, so we can move on. */ + unit_schedule_unlock_irqrestore(lock, flags, unit); + continue; + } + + /* If it is on this cpu, we must send it away. + * We are doing some cpupool manipulations: + * * we want to call the scheduler, and let it re-evaluation + * the placement of the vcpu, taking into account the new + * cpupool configuration; + * * the scheduler will always find a suitable solution, or + * things would have failed before getting in here. + */ + sched_unit_migrate_start(unit); + unit_schedule_unlock_irqrestore(lock, flags, unit); + sched_unit_migrate_finish(unit); + + /* + * The only caveat, in this case, is that if a vcpu active in + * the hypervisor isn't migratable. In this case, the caller + * should try again after releasing and reaquiring all locks. + */ + if ( unit->res == get_sched_res(cpu) ) + ret = -EAGAIN; + } + } + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +static int cpu_disable_scheduler_check(unsigned int cpu) +{ + struct domain *d; + struct vcpu *v; + struct cpupool *c; + + c = get_sched_res(cpu)->cpupool; + if ( c == NULL ) + return 0; + + for_each_domain_in_cpupool ( d, c ) + for_each_vcpu ( d, v ) + if ( v->affinity_broken ) + return -EADDRINUSE; + + return 0; +} + +/* + * In general, this must be called with the scheduler lock held, because the + * adjust_affinity hook may want to modify the vCPU state. However, when the + * vCPU is being initialized (either for dom0 or domU) there is no risk of + * races, and it's fine to not take the look (we're talking about + * sched_setup_dom0_vcpus() an sched_init_vcpu()). 
+ */ +static void sched_set_affinity( + struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) +{ + rcu_read_lock(&sched_res_rculock); + sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft); + rcu_read_unlock(&sched_res_rculock); + + if ( hard ) + cpumask_copy(unit->cpu_hard_affinity, hard); + if ( soft ) + cpumask_copy(unit->cpu_soft_affinity, soft); + + unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity, + unit->cpu_soft_affinity) && + cpumask_intersects(unit->cpu_soft_affinity, + unit->cpu_hard_affinity); +} + +static int vcpu_set_affinity( + struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which) +{ + struct sched_unit *unit = v->sched_unit; + spinlock_t *lock; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(unit); + + if ( v->affinity_broken ) + ret = -EBUSY; + else + { + /* + * Tell the scheduler we changes something about affinity, + * and ask to re-evaluate vcpu placement. + */ + if ( which == unit->cpu_hard_affinity ) + { + sched_set_affinity(unit, affinity, NULL); + } + else + { + ASSERT(which == unit->cpu_soft_affinity); + sched_set_affinity(unit, NULL, affinity); + } + sched_unit_migrate_start(unit); + } + + unit_schedule_unlock_irq(lock, unit); + + domain_update_node_affinity(v->domain); + + sched_unit_migrate_finish(unit); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity) +{ + cpumask_t online_affinity; + cpumask_t *online; + + online = VCPU2ONLINE(v); + cpumask_and(&online_affinity, affinity, online); + if ( cpumask_empty(&online_affinity) ) + return -EINVAL; + + return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity); +} + +int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity) +{ + return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity); +} + +/* Block the currently-executing domain until a pertinent event occurs. */ +void vcpu_block(void) +{ + struct vcpu *v = current; + + set_bit(_VPF_blocked, &v->pause_flags); + + arch_vcpu_block(v); + + /* Check for events /after/ blocking: avoids wakeup waiting race. */ + if ( local_events_need_delivery() ) + { + clear_bit(_VPF_blocked, &v->pause_flags); + } + else + { + TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + } +} + +static void vcpu_block_enable_events(void) +{ + local_event_delivery_enable(); + vcpu_block(); +} + +static long do_poll(struct sched_poll *sched_poll) +{ + struct vcpu *v = current; + struct domain *d = v->domain; + evtchn_port_t port = 0; + long rc; + unsigned int i; + + /* Fairly arbitrary limit. */ + if ( sched_poll->nr_ports > 128 ) + return -EINVAL; + + if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) + return -EFAULT; + + set_bit(_VPF_blocked, &v->pause_flags); + v->poll_evtchn = -1; + set_bit(v->vcpu_id, d->poll_mask); + + arch_vcpu_block(v); + +#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ + /* Check for events /after/ setting flags: avoids wakeup waiting race. */ + smp_mb(); + + /* + * Someone may have seen we are blocked but not that we are polling, or + * vice versa. We are certainly being woken, so clean up and bail. Beyond + * this point others can be guaranteed to clean up for us if they wake us. 
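/*
 * A sketch of the soft_aff_effective computation in sched_set_affinity()
 * above: the soft affinity only influences placement when the hard mask is
 * not already contained in it and the two masks actually intersect.  64-bit
 * words stand in for cpumasks; illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool soft_aff_effective(uint64_t hard, uint64_t soft)
{
    bool hard_subset_of_soft = (hard & ~soft) == 0;

    return !hard_subset_of_soft && (hard & soft) != 0;
}

int main(void)
{
    printf("%d\n", soft_aff_effective(0x0f, 0x03)); /* 1: prefers CPUs 0-1  */
    printf("%d\n", soft_aff_effective(0x03, 0x0f)); /* 0: soft adds nothing */
    printf("%d\n", soft_aff_effective(0x0c, 0x03)); /* 0: no overlap at all */
    return 0;
}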
+ */ + rc = 0; + if ( (v->poll_evtchn == 0) || + !test_bit(_VPF_blocked, &v->pause_flags) || + !test_bit(v->vcpu_id, d->poll_mask) ) + goto out; +#endif + + rc = 0; + if ( local_events_need_delivery() ) + goto out; + + for ( i = 0; i < sched_poll->nr_ports; i++ ) + { + rc = -EFAULT; + if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) + goto out; + + rc = -EINVAL; + if ( port >= d->max_evtchns ) + goto out; + + rc = 0; + if ( evtchn_port_is_pending(d, port) ) + goto out; + } + + if ( sched_poll->nr_ports == 1 ) + v->poll_evtchn = port; + + if ( sched_poll->timeout != 0 ) + set_timer(&v->poll_timer, sched_poll->timeout); + + TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + + return 0; + + out: + v->poll_evtchn = 0; + clear_bit(v->vcpu_id, d->poll_mask); + clear_bit(_VPF_blocked, &v->pause_flags); + return rc; +} + +/* Voluntarily yield the processor for this allocation. */ +long vcpu_yield(void) +{ + struct vcpu * v=current; + spinlock_t *lock; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(v->sched_unit); + sched_yield(vcpu_scheduler(v), v->sched_unit); + unit_schedule_unlock_irq(lock, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); + + SCHED_STAT_CRANK(vcpu_yield); + + TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + return 0; +} + +static void domain_watchdog_timeout(void *data) +{ + struct domain *d = data; + + if ( d->is_shutting_down || d->is_dying ) + return; + + printk("Watchdog timer fired for domain %u\n", d->domain_id); + domain_shutdown(d, SHUTDOWN_watchdog); +} + +static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) +{ + if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) + return -EINVAL; + + spin_lock(&d->watchdog_lock); + + if ( id == 0 ) + { + for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) + { + if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) + continue; + set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); + break; + } + spin_unlock(&d->watchdog_lock); + return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1; + } + + id -= 1; + if ( !test_bit(id, &d->watchdog_inuse_map) ) + { + spin_unlock(&d->watchdog_lock); + return -EINVAL; + } + + if ( timeout == 0 ) + { + stop_timer(&d->watchdog_timer[id]); + clear_bit(id, &d->watchdog_inuse_map); + } + else + { + set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); + } + + spin_unlock(&d->watchdog_lock); + return 0; +} + +void watchdog_domain_init(struct domain *d) +{ + unsigned int i; + + spin_lock_init(&d->watchdog_lock); + + d->watchdog_inuse_map = 0; + + for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) + init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); +} + +void watchdog_domain_destroy(struct domain *d) +{ + unsigned int i; + + for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) + kill_timer(&d->watchdog_timer[i]); +} + +/* + * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if + * cpu is NR_CPUS). + * Temporary pinning can be done due to two reasons, which may be nested: + * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case + * of a conflict (e.g. in case cpupool doesn't include requested CPU, or + * another conflicting temporary pinning is already in effect. + * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the + * CPU it is just running on. Can't fail if used properly. 
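/*
 * A sketch of the slot convention used by domain_watchdog() above: id 0 means
 * "allocate the first free watchdog slot and return its handle (slot + 1)",
 * while a non-zero id re-arms that slot or, with timeout 0, releases it.  A
 * plain bitmap replaces the in-use map; names below are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS 2

static uint32_t inuse_map;          /* bit i set => slot i allocated */
static uint32_t timeouts[NR_SLOTS]; /* stand-in for the actual timers */

static int toy_watchdog(uint32_t id, uint32_t timeout)
{
    if ( id > NR_SLOTS )
        return -1;                              /* cf. -EINVAL */

    if ( id == 0 )
    {
        for ( uint32_t slot = 0; slot < NR_SLOTS; slot++ )
            if ( !(inuse_map & (1u << slot)) )
            {
                inuse_map |= 1u << slot;        /* cf. test_and_set_bit() */
                timeouts[slot] = timeout;       /* cf. set_timer() */
                return (int)slot + 1;           /* handle given to the guest */
            }
        return -2;                              /* cf. -ENOSPC: all slots busy */
    }

    id -= 1;
    if ( !(inuse_map & (1u << id)) )
        return -1;                              /* unknown handle */

    if ( timeout == 0 )
        inuse_map &= ~(1u << id);               /* stop and release the slot */
    else
        timeouts[id] = timeout;                 /* re-arm */
    return 0;
}

int main(void)
{
    int h = toy_watchdog(0, 30);                /* allocate -> handle 1 */

    printf("handle %d\n", h);
    toy_watchdog((uint32_t)h, 30);              /* kick the watchdog */
    toy_watchdog((uint32_t)h, 0);               /* release it again */
    return 0;
}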
+ */ +int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason) +{ + struct sched_unit *unit = v->sched_unit; + spinlock_t *lock; + int ret = -EINVAL; + bool migrate; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(unit); + + if ( cpu == NR_CPUS ) + { + if ( v->affinity_broken & reason ) + { + ret = 0; + v->affinity_broken &= ~reason; + } + if ( !ret && !sched_check_affinity_broken(unit) ) + sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); + } + else if ( cpu < nr_cpu_ids ) + { + if ( (v->affinity_broken & reason) || + (sched_check_affinity_broken(unit) && v->processor != cpu) ) + ret = -EBUSY; + else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) ) + { + if ( !sched_check_affinity_broken(unit) ) + { + cpumask_copy(unit->cpu_hard_affinity_saved, + unit->cpu_hard_affinity); + sched_set_affinity(unit, cpumask_of(cpu), NULL); + } + v->affinity_broken |= reason; + ret = 0; + } + } + + migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity); + if ( migrate ) + sched_unit_migrate_start(unit); + + unit_schedule_unlock_irq(lock, unit); + + if ( migrate ) + sched_unit_migrate_finish(unit); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +typedef long ret_t; + +#endif /* !COMPAT */ + +ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) +{ + ret_t ret = 0; + + switch ( cmd ) + { + case SCHEDOP_yield: + { + ret = vcpu_yield(); + break; + } + + case SCHEDOP_block: + { + vcpu_block_enable_events(); + break; + } + + case SCHEDOP_shutdown: + { + struct sched_shutdown sched_shutdown; + + ret = -EFAULT; + if ( copy_from_guest(&sched_shutdown, arg, 1) ) + break; + + TRACE_3D(TRC_SCHED_SHUTDOWN, + current->domain->domain_id, current->vcpu_id, + sched_shutdown.reason); + ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); + + break; + } + + case SCHEDOP_shutdown_code: + { + struct sched_shutdown sched_shutdown; + struct domain *d = current->domain; + + ret = -EFAULT; + if ( copy_from_guest(&sched_shutdown, arg, 1) ) + break; + + TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, + d->domain_id, current->vcpu_id, sched_shutdown.reason); + + spin_lock(&d->shutdown_lock); + if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) + d->shutdown_code = (u8)sched_shutdown.reason; + spin_unlock(&d->shutdown_lock); + + ret = 0; + break; + } + + case SCHEDOP_poll: + { + struct sched_poll sched_poll; + + ret = -EFAULT; + if ( copy_from_guest(&sched_poll, arg, 1) ) + break; + + ret = do_poll(&sched_poll); + + break; + } + + case SCHEDOP_remote_shutdown: + { + struct domain *d; + struct sched_remote_shutdown sched_remote_shutdown; + + ret = -EFAULT; + if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); + if ( d == NULL ) + break; + + ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); + if ( likely(!ret) ) + domain_shutdown(d, sched_remote_shutdown.reason); + + rcu_unlock_domain(d); + + break; + } + + case SCHEDOP_watchdog: + { + struct sched_watchdog sched_watchdog; + + ret = -EFAULT; + if ( copy_from_guest(&sched_watchdog, arg, 1) ) + break; + + ret = domain_watchdog( + current->domain, sched_watchdog.id, sched_watchdog.timeout); + break; + } + + case SCHEDOP_pin_override: + { + struct sched_pin_override sched_pin_override; + unsigned int cpu; + + ret = -EPERM; + if ( !is_hardware_domain(current->domain) ) + break; + + ret = -EFAULT; + if ( copy_from_guest(&sched_pin_override, arg, 1) ) + break; + + ret = -EINVAL; + if ( 
sched_pin_override.pcpu >= NR_CPUS ) + break; + + cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu; + ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE); + + break; + } + + default: + ret = -ENOSYS; + } + + return ret; +} + +#ifndef COMPAT + +/* Per-vcpu oneshot-timer hypercall. */ +long do_set_timer_op(s_time_t timeout) +{ + struct vcpu *v = current; + s_time_t offset = timeout - NOW(); + + if ( timeout == 0 ) + { + stop_timer(&v->singleshot_timer); + } + else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ + unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) + { + /* + * Linux workaround: occasionally we will see timeouts a long way in + * the future due to wrapping in Linux's jiffy time handling. We check + * for timeouts wrapped negative, and for positive timeouts more than + * about 13 days in the future (2^50ns). The correct fix is to trigger + * an interrupt immediately (since Linux in fact has pending work to + * do in this situation). However, older guests also set a long timeout + * when they have *no* pending timers at all: setting an immediate + * timeout in this case can burn a lot of CPU. We therefore go for a + * reasonable middleground of triggering a timer event in 100ms. + */ + gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", + timeout); + set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); + } + else + { + migrate_timer(&v->singleshot_timer, smp_processor_id()); + set_timer(&v->singleshot_timer, timeout); + } + + return 0; +} + +/* sched_id - fetch ID of current scheduler */ +int sched_id(void) +{ + return ops.sched_id; +} + +/* Adjust scheduling parameter for a given domain. */ +long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) +{ + long ret; + + ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); + if ( ret ) + return ret; + + if ( op->sched_id != dom_scheduler(d)->sched_id ) + return -EINVAL; + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_putinfo: + case XEN_DOMCTL_SCHEDOP_getinfo: + case XEN_DOMCTL_SCHEDOP_putvcpuinfo: + case XEN_DOMCTL_SCHEDOP_getvcpuinfo: + break; + default: + return -EINVAL; + } + + /* NB: the pluggable scheduler code needs to take care + * of locking by itself. */ + rcu_read_lock(&sched_res_rculock); + + if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 ) + TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +long sched_adjust_global(struct xen_sysctl_scheduler_op *op) +{ + struct cpupool *pool; + int rc; + + rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); + if ( rc ) + return rc; + + if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && + (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) + return -EINVAL; + + pool = cpupool_get_by_id(op->cpupool_id); + if ( pool == NULL ) + return -ESRCH; + + rcu_read_lock(&sched_res_rculock); + + rc = ((op->sched_id == pool->sched->sched_id) + ? 
sched_adjust_cpupool(pool->sched, op) : -EINVAL); + + rcu_read_unlock(&sched_res_rculock); + + cpupool_put(pool); + + return rc; +} + +static void vcpu_periodic_timer_work_locked(struct vcpu *v) +{ + s_time_t now; + s_time_t periodic_next_event; + + now = NOW(); + periodic_next_event = v->periodic_last_event + v->periodic_period; + + if ( now >= periodic_next_event ) + { + send_timer_event(v); + v->periodic_last_event = now; + periodic_next_event = now + v->periodic_period; + } + + migrate_timer(&v->periodic_timer, v->processor); + set_timer(&v->periodic_timer, periodic_next_event); +} + +static void vcpu_periodic_timer_work(struct vcpu *v) +{ + if ( v->periodic_period == 0 ) + return; + + spin_lock(&v->periodic_timer_lock); + if ( v->periodic_period ) + vcpu_periodic_timer_work_locked(v); + spin_unlock(&v->periodic_timer_lock); +} + +/* + * Set the periodic timer of a vcpu. + */ +void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value) +{ + spin_lock(&v->periodic_timer_lock); + + stop_timer(&v->periodic_timer); + + v->periodic_period = value; + if ( value ) + vcpu_periodic_timer_work_locked(v); + + spin_unlock(&v->periodic_timer_lock); +} + +static void sched_switch_units(struct sched_resource *sr, + struct sched_unit *next, struct sched_unit *prev, + s_time_t now) +{ + unsigned int cpu; + + ASSERT(unit_running(prev)); + + if ( prev != next ) + { + sr->curr = next; + sr->prev = prev; + + TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, + prev->unit_id, now - prev->state_entry_time); + TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, + next->unit_id, + (next->vcpu_list->runstate.state == RUNSTATE_runnable) ? + (now - next->state_entry_time) : 0, prev->next_time); + TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id, + next->domain->domain_id, next->unit_id); + + ASSERT(!unit_running(next)); + + /* + * NB. Don't add any trace records from here until the actual context + * switch, else lost_records resume will not work properly. + */ + + ASSERT(!next->is_running); + next->is_running = true; + next->state_entry_time = now; + + if ( is_idle_unit(prev) ) + { + prev->runstate_cnt[RUNSTATE_running] = 0; + prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity; + } + if ( is_idle_unit(next) ) + { + next->runstate_cnt[RUNSTATE_running] = sr->granularity; + next->runstate_cnt[RUNSTATE_runnable] = 0; + } + } + + for_each_cpu ( cpu, sr->cpus ) + { + struct vcpu *vprev = get_cpu_current(cpu); + struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu); + + if ( vprev != vnext || vprev->runstate.state != vnext->new_state ) + { + vcpu_runstate_change(vprev, + ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : + (vcpu_runnable(vprev) ? 
RUNSTATE_runnable : RUNSTATE_offline)), + now); + vcpu_runstate_change(vnext, vnext->new_state, now); + } + + vnext->is_running = 1; + + if ( is_idle_vcpu(vnext) ) + vnext->sched_unit = next; + } +} + +static bool sched_tasklet_check_cpu(unsigned int cpu) +{ + unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu); + + switch ( *tasklet_work ) + { + case TASKLET_enqueued: + set_bit(_TASKLET_scheduled, tasklet_work); + /* fallthrough */ + case TASKLET_enqueued|TASKLET_scheduled: + return true; + break; + case TASKLET_scheduled: + clear_bit(_TASKLET_scheduled, tasklet_work); + /* fallthrough */ + case 0: + /* return false; */ + break; + default: + BUG(); + } + + return false; +} + +static bool sched_tasklet_check(unsigned int cpu) +{ + bool tasklet_work_scheduled = false; + const cpumask_t *mask = get_sched_res(cpu)->cpus; + unsigned int cpu_iter; + + for_each_cpu ( cpu_iter, mask ) + if ( sched_tasklet_check_cpu(cpu_iter) ) + tasklet_work_scheduled = true; + + return tasklet_work_scheduled; +} + +static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now, + unsigned int cpu) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct scheduler *sched = sr->scheduler; + struct sched_unit *next; + + /* get policy-specific decision on scheduling... */ + sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu)); + + next = prev->next_task; + + if ( prev->next_time >= 0 ) /* -ve means no limit */ + set_timer(&sr->s_timer, now + prev->next_time); + + sched_switch_units(sr, next, prev, now); + + return next; +} + +static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext) +{ + /* Clear running flag /after/ writing context to memory. */ + smp_wmb(); + + if ( vprev != vnext ) + vprev->is_running = 0; +} + +static void unit_context_saved(struct sched_resource *sr) +{ + struct sched_unit *unit = sr->prev; + + if ( !unit ) + return; + + unit->is_running = false; + unit->state_entry_time = NOW(); + sr->prev = NULL; + + /* Check for migration request /after/ clearing running flag. */ + smp_mb(); + + sched_context_saved(unit_scheduler(unit), unit); + + /* Idle never migrates and idle vcpus might belong to other units. */ + if ( !is_idle_unit(unit) ) + sched_unit_migrate_finish(unit); +} + +/* + * Rendezvous on end of context switch. + * As no lock is protecting this rendezvous function we need to use atomic + * access functions on the counter. + * The counter will be 0 in case no rendezvous is needed. For the rendezvous + * case it is initialised to the number of cpus to rendezvous plus 1. Each + * member entering decrements the counter. The last one will decrement it to + * 1 and perform the final needed action in that case (call of + * unit_context_saved()), and then set the counter to zero. The other members + * will wait until the counter becomes zero until they proceed. + */ +void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext) +{ + struct sched_unit *next = vnext->sched_unit; + struct sched_resource *sr; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(smp_processor_id()); + + if ( atomic_read(&next->rendezvous_out_cnt) ) + { + int cnt = atomic_dec_return(&next->rendezvous_out_cnt); + + vcpu_context_saved(vprev, vnext); + + /* Call unit_context_saved() before releasing other waiters. 
*/ + if ( cnt == 1 ) + { + unit_context_saved(sr); + atomic_set(&next->rendezvous_out_cnt, 0); + } + else + while ( atomic_read(&next->rendezvous_out_cnt) ) + cpu_relax(); + } + else + { + vcpu_context_saved(vprev, vnext); + if ( sr->granularity == 1 ) + unit_context_saved(sr); + } + + if ( is_idle_vcpu(vprev) && vprev != vnext ) + vprev->sched_unit = sr->sched_unit_idle; + + rcu_read_unlock(&sched_res_rculock); +} + +static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext, + bool reset_idle_unit, s_time_t now) +{ + if ( unlikely(vprev == vnext) ) + { + TRACE_4D(TRC_SCHED_SWITCH_INFCONT, + vnext->domain->domain_id, vnext->sched_unit->unit_id, + now - vprev->runstate.state_entry_time, + vprev->sched_unit->next_time); + sched_context_switched(vprev, vnext); + + /* + * We are switching from a non-idle to an idle unit. + * A vcpu of the idle unit might have been running before due to + * the guest vcpu being blocked. We must adjust the unit of the idle + * vcpu which might have been set to the guest's one. + */ + if ( reset_idle_unit ) + vnext->sched_unit = + get_sched_res(smp_processor_id())->sched_unit_idle; + + rcu_read_unlock(&sched_res_rculock); + + trace_continue_running(vnext); + return continue_running(vprev); + } + + SCHED_STAT_CRANK(sched_ctx); + + stop_timer(&vprev->periodic_timer); + + if ( vnext->sched_unit->migrated ) + vcpu_move_irqs(vnext); + + vcpu_periodic_timer_work(vnext); + + rcu_read_unlock(&sched_res_rculock); + + context_switch(vprev, vnext); +} + +/* + * Force a context switch of a single vcpu of an unit. + * Might be called either if a vcpu of an already running unit is woken up + * or if a vcpu of a running unit is put asleep with other vcpus of the same + * unit still running. + * Returns either NULL if v is already in the correct state or the vcpu to + * run next. + */ +static struct vcpu *sched_force_context_switch(struct vcpu *vprev, + struct vcpu *v, + unsigned int cpu, s_time_t now) +{ + v->force_context_switch = false; + + if ( vcpu_runnable(v) == v->is_running ) + return NULL; + + if ( vcpu_runnable(v) ) + { + if ( is_idle_vcpu(vprev) ) + { + vcpu_runstate_change(vprev, RUNSTATE_runnable, now); + vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; + } + vcpu_runstate_change(v, RUNSTATE_running, now); + } + else + { + /* Make sure not to switch last vcpu of an unit away. */ + if ( unit_running(v->sched_unit) == 1 ) + return NULL; + + v->new_state = vcpu_runstate_blocked(v); + vcpu_runstate_change(v, v->new_state, now); + v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu); + if ( v != vprev ) + { + if ( is_idle_vcpu(vprev) ) + { + vcpu_runstate_change(vprev, RUNSTATE_runnable, now); + vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; + } + else + { + v->sched_unit = vprev->sched_unit; + vcpu_runstate_change(v, RUNSTATE_running, now); + } + } + } + + /* This vcpu will be switched to. */ + v->is_running = true; + + /* Make sure not to loose another slave call. */ + raise_softirq(SCHED_SLAVE_SOFTIRQ); + + return v; +} + +/* + * Rendezvous before taking a scheduling decision. + * Called with schedule lock held, so all accesses to the rendezvous counter + * can be normal ones (no atomic accesses needed). + * The counter is initialized to the number of cpus to rendezvous initially. + * Each cpu entering will decrement the counter. In case the counter becomes + * zero do_schedule() is called and the rendezvous counter for leaving + * context_switch() is set. 
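/*
 * A user-space sketch of the rendezvous-out scheme described above, using C11
 * atomics and POSIX threads: the counter starts at the number of participants
 * plus one, every participant decrements it, and the one that brings it down
 * to 1 performs the shared clean-up before releasing the others by writing 0.
 * Thread count and the "clean-up" below are illustrative only.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_THREADS 4

static atomic_int rendezvous_out = NR_THREADS + 1;
static int cleanup_done;                  /* written by exactly one thread */

static void *participant(void *arg)
{
    (void)arg;

    if ( atomic_fetch_sub(&rendezvous_out, 1) - 1 == 1 )
    {
        cleanup_done = 1;                 /* the common work, done exactly once */
        atomic_store(&rendezvous_out, 0); /* release the waiters */
    }
    else
        while ( atomic_load(&rendezvous_out) != 0 )
            ;                             /* cpu_relax() in the hypervisor */

    return NULL;
}

int main(void)
{
    pthread_t t[NR_THREADS];

    for ( int i = 0; i < NR_THREADS; i++ )
        pthread_create(&t[i], NULL, participant, NULL);
    for ( int i = 0; i < NR_THREADS; i++ )
        pthread_join(t[i], NULL);

    printf("clean-up done: %d, counter: %d\n",
           cleanup_done, atomic_load(&rendezvous_out));
    return 0;
}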
All other members will wait until the counter is + * becoming zero, dropping the schedule lock in between. + */ +static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev, + spinlock_t **lock, int cpu, + s_time_t now) +{ + struct sched_unit *next; + struct vcpu *v; + unsigned int gran = get_sched_res(cpu)->granularity; + + if ( !--prev->rendezvous_in_cnt ) + { + next = do_schedule(prev, now, cpu); + atomic_set(&next->rendezvous_out_cnt, gran + 1); + return next; + } + + v = unit2vcpu_cpu(prev, cpu); + while ( prev->rendezvous_in_cnt ) + { + if ( v && v->force_context_switch ) + { + struct vcpu *vprev = current; + + v = sched_force_context_switch(vprev, v, cpu, now); + + if ( v ) + { + /* We'll come back another time, so adjust rendezvous_in_cnt. */ + prev->rendezvous_in_cnt++; + atomic_set(&prev->rendezvous_out_cnt, 0); + + pcpu_schedule_unlock_irq(*lock, cpu); + + sched_context_switch(vprev, v, false, now); + + return NULL; /* ARM only. */ + } + + v = unit2vcpu_cpu(prev, cpu); + } + /* + * Coming from idle might need to do tasklet work. + * In order to avoid deadlocks we can't do that here, but have to + * continue the idle loop. + * Undo the rendezvous_in_cnt decrement and schedule another call of + * sched_slave(). + */ + if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) ) + { + struct vcpu *vprev = current; + + prev->rendezvous_in_cnt++; + atomic_set(&prev->rendezvous_out_cnt, 0); + + pcpu_schedule_unlock_irq(*lock, cpu); + + raise_softirq(SCHED_SLAVE_SOFTIRQ); + sched_context_switch(vprev, vprev, false, now); + + return NULL; /* ARM only. */ + } + + pcpu_schedule_unlock_irq(*lock, cpu); + + cpu_relax(); + + *lock = pcpu_schedule_lock_irq(cpu); + + if ( unlikely(!scheduler_active) ) + { + ASSERT(is_idle_unit(prev)); + atomic_set(&prev->next_task->rendezvous_out_cnt, 0); + prev->rendezvous_in_cnt = 0; + } + } + + return prev->next_task; +} + +static void sched_slave(void) +{ + struct vcpu *v, *vprev = current; + struct sched_unit *prev = vprev->sched_unit, *next; + s_time_t now; + spinlock_t *lock; + bool do_softirq = false; + unsigned int cpu = smp_processor_id(); + + ASSERT_NOT_IN_ATOMIC(); + + rcu_read_lock(&sched_res_rculock); + + lock = pcpu_schedule_lock_irq(cpu); + + now = NOW(); + + v = unit2vcpu_cpu(prev, cpu); + if ( v && v->force_context_switch ) + { + v = sched_force_context_switch(vprev, v, cpu, now); + + if ( v ) + { + pcpu_schedule_unlock_irq(lock, cpu); + + sched_context_switch(vprev, v, false, now); + + return; + } + + do_softirq = true; + } + + if ( !prev->rendezvous_in_cnt ) + { + pcpu_schedule_unlock_irq(lock, cpu); + + rcu_read_unlock(&sched_res_rculock); + + /* Check for failed forced context switch. */ + if ( do_softirq ) + raise_softirq(SCHEDULE_SOFTIRQ); + + return; + } + + stop_timer(&get_sched_res(cpu)->s_timer); + + next = sched_wait_rendezvous_in(prev, &lock, cpu, now); + if ( !next ) + return; + + pcpu_schedule_unlock_irq(lock, cpu); + + sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), + is_idle_unit(next) && !is_idle_unit(prev), now); +} + +/* + * The main function + * - deschedule the current domain (scheduler independent). + * - pick a new domain (scheduler dependent). 
+ */ +static void schedule(void) +{ + struct vcpu *vnext, *vprev = current; + struct sched_unit *prev = vprev->sched_unit, *next = NULL; + s_time_t now; + struct sched_resource *sr; + spinlock_t *lock; + int cpu = smp_processor_id(); + unsigned int gran; + + ASSERT_NOT_IN_ATOMIC(); + + SCHED_STAT_CRANK(sched_run); + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + gran = sr->granularity; + + lock = pcpu_schedule_lock_irq(cpu); + + if ( prev->rendezvous_in_cnt ) + { + /* + * We have a race: sched_slave() should be called, so raise a softirq + * in order to re-enter schedule() later and call sched_slave() now. + */ + pcpu_schedule_unlock_irq(lock, cpu); + + rcu_read_unlock(&sched_res_rculock); + + raise_softirq(SCHEDULE_SOFTIRQ); + return sched_slave(); + } + + stop_timer(&sr->s_timer); + + now = NOW(); + + if ( gran > 1 ) + { + cpumask_t mask; + + prev->rendezvous_in_cnt = gran; + cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu)); + cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ); + next = sched_wait_rendezvous_in(prev, &lock, cpu, now); + if ( !next ) + return; + } + else + { + prev->rendezvous_in_cnt = 0; + next = do_schedule(prev, now, cpu); + atomic_set(&next->rendezvous_out_cnt, 0); + } + + pcpu_schedule_unlock_irq(lock, cpu); + + vnext = sched_unit2vcpu_cpu(next, cpu); + sched_context_switch(vprev, vnext, + !is_idle_unit(prev) && is_idle_unit(next), now); +} + +/* The scheduler timer: force a run through the scheduler */ +static void s_timer_fn(void *unused) +{ + raise_softirq(SCHEDULE_SOFTIRQ); + SCHED_STAT_CRANK(sched_irq); +} + +/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ +static void vcpu_periodic_timer_fn(void *data) +{ + struct vcpu *v = data; + vcpu_periodic_timer_work(v); +} + +/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ +static void vcpu_singleshot_timer_fn(void *data) +{ + struct vcpu *v = data; + send_timer_event(v); +} + +/* SCHEDOP_poll timeout callback. */ +static void poll_timer_fn(void *data) +{ + struct vcpu *v = data; + + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + vcpu_unblock(v); +} + +static struct sched_resource *sched_alloc_res(void) +{ + struct sched_resource *sr; + + sr = xzalloc(struct sched_resource); + if ( sr == NULL ) + return NULL; + if ( !zalloc_cpumask_var(&sr->cpus) ) + { + xfree(sr); + return NULL; + } + return sr; +} + +static int cpu_schedule_up(unsigned int cpu) +{ + struct sched_resource *sr; + + sr = sched_alloc_res(); + if ( sr == NULL ) + return -ENOMEM; + + sr->master_cpu = cpu; + cpumask_copy(sr->cpus, cpumask_of(cpu)); + set_sched_res(cpu, sr); + + sr->scheduler = &sched_idle_ops; + spin_lock_init(&sr->_lock); + sr->schedule_lock = &sched_free_cpu_lock; + init_timer(&sr->s_timer, s_timer_fn, NULL, cpu); + atomic_set(&per_cpu(sched_urgent_count, cpu), 0); + + /* We start with cpu granularity. */ + sr->granularity = 1; + + cpumask_set_cpu(cpu, &sched_res_mask); + + /* Boot CPU is dealt with later in scheduler_init(). */ + if ( cpu == 0 ) + return 0; + + if ( idle_vcpu[cpu] == NULL ) + vcpu_create(idle_vcpu[0]->domain, cpu); + else + idle_vcpu[cpu]->sched_unit->res = sr; + + if ( idle_vcpu[cpu] == NULL ) + return -ENOMEM; + + idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; + + /* + * No need to allocate any scheduler data, as cpus coming online are + * free initially and the idle scheduler doesn't need any data areas + * allocated. 
+ */ + + sr->curr = idle_vcpu[cpu]->sched_unit; + sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit; + + sr->sched_priv = NULL; + + return 0; +} + +static void sched_res_free(struct rcu_head *head) +{ + struct sched_resource *sr = container_of(head, struct sched_resource, rcu); + + free_cpumask_var(sr->cpus); + if ( sr->sched_unit_idle ) + sched_free_unit_mem(sr->sched_unit_idle); + xfree(sr); +} + +static void cpu_schedule_down(unsigned int cpu) +{ + struct sched_resource *sr; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + + kill_timer(&sr->s_timer); + + cpumask_clear_cpu(cpu, &sched_res_mask); + set_sched_res(cpu, NULL); + + /* Keep idle unit. */ + sr->sched_unit_idle = NULL; + call_rcu(&sr->rcu, sched_res_free); + + rcu_read_unlock(&sched_res_rculock); +} + +void sched_rm_cpu(unsigned int cpu) +{ + int rc; + + rcu_read_lock(&domlist_read_lock); + rc = cpu_disable_scheduler(cpu); + BUG_ON(rc); + rcu_read_unlock(&domlist_read_lock); + cpu_schedule_down(cpu); +} + +static int cpu_schedule_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + /* + * All scheduler related suspend/resume handling needed is done in + * cpupool.c. + */ + if ( system_state > SYS_STATE_active ) + return NOTIFY_DONE; + + rcu_read_lock(&sched_res_rculock); + + /* + * From the scheduler perspective, bringing up a pCPU requires + * allocating and initializing the per-pCPU scheduler specific data, + * as well as "registering" this pCPU to the scheduler (which may + * involve modifying some scheduler wide data structures). + * As new pCPUs always start as "free" cpus with the minimal idle + * scheduler being in charge, we don't need any of that. + * + * On the other hand, at teardown, we need to reverse what has been done + * during initialization, and then free the per-pCPU specific data. A + * pCPU brought down is not forced through "free" cpus, so here we need to + * use the appropriate hooks. + * + * This happens by calling the deinit_pdata and free_pdata hooks, in this + * order. If no per-pCPU memory was allocated, there is no need to + * provide an implementation of free_pdata. deinit_pdata may, however, + * be necessary/useful in this case too (e.g., it can undo something done + * on scheduler wide data structure during init_pdata). Both deinit_pdata + * and free_pdata are called during CPU_DEAD. + * + * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED. + */ + switch ( action ) + { + case CPU_UP_PREPARE: + rc = cpu_schedule_up(cpu); + break; + case CPU_DOWN_PREPARE: + rcu_read_lock(&domlist_read_lock); + rc = cpu_disable_scheduler_check(cpu); + rcu_read_unlock(&domlist_read_lock); + break; + case CPU_DEAD: + sched_rm_cpu(cpu); + break; + case CPU_UP_CANCELED: + cpu_schedule_down(cpu); + break; + default: + break; + } + + rcu_read_unlock(&sched_res_rculock); + + return !rc ? 
NOTIFY_DONE : notifier_from_errno(rc); +} + +static struct notifier_block cpu_schedule_nfb = { + .notifier_call = cpu_schedule_callback +}; + +const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu) +{ + const cpumask_t *mask; + + switch ( opt ) + { + case SCHED_GRAN_cpu: + mask = cpumask_of(cpu); + break; + case SCHED_GRAN_core: + mask = per_cpu(cpu_sibling_mask, cpu); + break; + case SCHED_GRAN_socket: + mask = per_cpu(cpu_core_mask, cpu); + break; + default: + ASSERT_UNREACHABLE(); + return NULL; + } + + return mask; +} + +static void schedule_dummy(void) +{ + sched_tasklet_check_cpu(smp_processor_id()); +} + +void scheduler_disable(void) +{ + scheduler_active = false; + open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy); + open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy); +} + +void scheduler_enable(void) +{ + open_softirq(SCHEDULE_SOFTIRQ, schedule); + open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave); + scheduler_active = true; +} + +/* Initialise the data structures. */ +void __init scheduler_init(void) +{ + struct domain *idle_domain; + int i; + + scheduler_enable(); + + for ( i = 0; i < NUM_SCHEDULERS; i++) + { +#define sched_test_func(f) \ + if ( !schedulers[i]->f ) \ + { \ + printk("scheduler %s misses .%s, dropped\n", \ + schedulers[i]->opt_name, #f); \ + schedulers[i] = NULL; \ + } + + sched_test_func(init); + sched_test_func(deinit); + sched_test_func(pick_resource); + sched_test_func(alloc_udata); + sched_test_func(free_udata); + sched_test_func(switch_sched); + sched_test_func(do_schedule); + +#undef sched_test_func + + if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) + { + printk("scheduler %s failed initialization, dropped\n", + schedulers[i]->opt_name); + schedulers[i] = NULL; + } + + if ( schedulers[i] && !ops.name && + !strcmp(schedulers[i]->opt_name, opt_sched) ) + ops = *schedulers[i]; + } + + if ( !ops.name ) + { + printk("Could not find scheduler: %s\n", opt_sched); + for ( i = 0; i < NUM_SCHEDULERS; i++ ) + if ( schedulers[i] && + !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) ) + { + ops = *schedulers[i]; + break; + } + BUG_ON(!ops.name); + printk("Using '%s' (%s)\n", ops.name, ops.opt_name); + } + + if ( cpu_schedule_up(0) ) + BUG(); + register_cpu_notifier(&cpu_schedule_nfb); + + printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); + if ( sched_init(&ops) ) + panic("scheduler returned error on init\n"); + + if ( sched_ratelimit_us && + (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX + || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) + { + printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" + " Resetting to default %u\n", + XEN_SYSCTL_SCHED_RATELIMIT_MIN, + XEN_SYSCTL_SCHED_RATELIMIT_MAX, + SCHED_DEFAULT_RATELIMIT_US); + sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; + } + + idle_domain = domain_create(DOMID_IDLE, NULL, false); + BUG_ON(IS_ERR(idle_domain)); + BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu)); + idle_domain->vcpu = idle_vcpu; + idle_domain->max_vcpus = nr_cpu_ids; + if ( vcpu_create(idle_domain, 0) == NULL ) + BUG(); + + rcu_read_lock(&sched_res_rculock); + + get_sched_res(0)->curr = idle_vcpu[0]->sched_unit; + get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit; + + rcu_read_unlock(&sched_res_rculock); +} + +/* + * Move a pCPU from free cpus (running the idle scheduler) to a cpupool + * using any "real" scheduler. + * The cpu is still marked as "free" and not yet valid for its cpupool. 
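+ *
+ * In rough outline the switch is:
+ *   1. allocate the new scheduler's per-cpu data and the idle unit's data
+ *      (sched_alloc_pdata() / sched_alloc_udata());
+ *   2. take the current (free cpu) scheduler lock and, for granularities
+ *      above one, merge the sibling cpus' idle units and sched_resources;
+ *   3. let the new scheduler take over via sched_switch_sched() and only
+ *      afterwards reroute sr->schedule_lock to the lock it returned;
+ *   4. drop the old lock and kick the cpu with SCHEDULE_SOFTIRQ so it
+ *      picks up work in its new pool.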
+ */ +int schedule_cpu_add(unsigned int cpu, struct cpupool *c) +{ + struct vcpu *idle; + void *ppriv, *vpriv; + struct scheduler *new_ops = c->sched; + struct sched_resource *sr; + spinlock_t *old_lock, *new_lock; + unsigned long flags; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid)); + ASSERT(get_sched_res(cpu)->cpupool == NULL); + + /* + * To setup the cpu for the new scheduler we need: + * - a valid instance of per-CPU scheduler specific data, as it is + * allocated by sched_alloc_pdata(). Note that we do not want to + * initialize it yet (i.e., we are not calling sched_init_pdata()). + * That will be done by the target scheduler, in sched_switch_sched(), + * in proper ordering and with locking. + * - a valid instance of per-vCPU scheduler specific data, for the idle + * vCPU of cpu. That is what the target scheduler will use for the + * sched_priv field of the per-vCPU info of the idle domain. + */ + idle = idle_vcpu[cpu]; + ppriv = sched_alloc_pdata(new_ops, cpu); + if ( IS_ERR(ppriv) ) + { + ret = PTR_ERR(ppriv); + goto out; + } + + vpriv = sched_alloc_udata(new_ops, idle->sched_unit, + idle->domain->sched_priv); + if ( vpriv == NULL ) + { + sched_free_pdata(new_ops, ppriv, cpu); + ret = -ENOMEM; + goto out; + } + + /* + * The actual switch, including the rerouting of the scheduler lock to + * whatever new_ops prefers, needs to happen in one critical section, + * protected by old_ops' lock, or races are possible. + * It is, in fact, the lock of the idle scheduler that we are taking. + * But that is ok as anyone trying to schedule on this cpu will spin until + * when we release that lock (bottom of this function). When he'll get the + * lock --thanks to the loop inside *_schedule_lock() functions-- he'll + * notice that the lock itself changed, and retry acquiring the new one + * (which will be the correct, remapped one, at that point). + */ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + if ( cpupool_get_granularity(c) > 1 ) + { + const cpumask_t *mask; + unsigned int cpu_iter, idx = 0; + struct sched_unit *old_unit, *master_unit; + struct sched_resource *sr_old; + + /* + * We need to merge multiple idle_vcpu units and sched_resource structs + * into one. As the free cpus all share the same lock we are fine doing + * that now. The worst which could happen would be someone waiting for + * the lock, thus dereferencing sched_res->schedule_lock. This is the + * reason we are freeing struct sched_res via call_rcu() to avoid the + * lock pointer suddenly disappearing. + */ + mask = sched_get_opt_cpumask(c->gran, cpu); + master_unit = idle_vcpu[cpu]->sched_unit; + + for_each_cpu ( cpu_iter, mask ) + { + if ( idx ) + cpumask_clear_cpu(cpu_iter, &sched_res_mask); + + per_cpu(sched_res_idx, cpu_iter) = idx++; + + if ( cpu == cpu_iter ) + continue; + + old_unit = idle_vcpu[cpu_iter]->sched_unit; + sr_old = get_sched_res(cpu_iter); + kill_timer(&sr_old->s_timer); + idle_vcpu[cpu_iter]->sched_unit = master_unit; + master_unit->runstate_cnt[RUNSTATE_running]++; + set_sched_res(cpu_iter, sr); + cpumask_set_cpu(cpu_iter, sr->cpus); + + call_rcu(&sr_old->rcu, sched_res_free); + } + } + + new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv); + + sr->scheduler = new_ops; + sr->sched_priv = ppriv; + + /* + * Reroute the lock to the per pCPU lock as /last/ thing. 
In fact, + * if it is free (and it can be) we want that anyone that manages + * taking it, finds all the initializations we've done above in place. + */ + smp_wmb(); + sr->schedule_lock = new_lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */ + spin_unlock_irqrestore(old_lock, flags); + + sr->granularity = cpupool_get_granularity(c); + sr->cpupool = c; + /* The cpu is added to a pool, trigger it to go pick up some work */ + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +/* + * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops + * (the idle scheduler). + * The cpu is already marked as "free" and not valid any longer for its + * cpupool. + */ +int schedule_cpu_rm(unsigned int cpu) +{ + void *ppriv_old, *vpriv_old; + struct sched_resource *sr, **sr_new = NULL; + struct sched_unit *unit; + struct scheduler *old_ops; + spinlock_t *old_lock; + unsigned long flags; + int idx, ret = -ENOMEM; + unsigned int cpu_iter; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + old_ops = sr->scheduler; + + if ( sr->granularity > 1 ) + { + sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); + if ( !sr_new ) + goto out; + for ( idx = 0; idx < sr->granularity - 1; idx++ ) + { + sr_new[idx] = sched_alloc_res(); + if ( sr_new[idx] ) + { + sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); + if ( !sr_new[idx]->sched_unit_idle ) + { + sched_res_free(&sr_new[idx]->rcu); + sr_new[idx] = NULL; + } + } + if ( !sr_new[idx] ) + { + for ( idx--; idx >= 0; idx-- ) + sched_res_free(&sr_new[idx]->rcu); + goto out; + } + sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; + sr_new[idx]->scheduler = &sched_idle_ops; + sr_new[idx]->granularity = 1; + + /* We want the lock not to change when replacing the resource. */ + sr_new[idx]->schedule_lock = sr->schedule_lock; + } + } + + ret = 0; + ASSERT(sr->cpupool != NULL); + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); + + /* See comment in schedule_cpu_add() regarding lock switching. */ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + vpriv_old = idle_vcpu[cpu]->sched_unit->priv; + ppriv_old = sr->sched_priv; + + idx = 0; + for_each_cpu ( cpu_iter, sr->cpus ) + { + per_cpu(sched_res_idx, cpu_iter) = 0; + if ( cpu_iter == cpu ) + { + idle_vcpu[cpu_iter]->sched_unit->priv = NULL; + } + else + { + /* Initialize unit. */ + unit = sr_new[idx]->sched_unit_idle; + unit->res = sr_new[idx]; + unit->is_running = true; + sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); + sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); + + /* Adjust cpu masks of resources (old and new). */ + cpumask_clear_cpu(cpu_iter, sr->cpus); + cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); + + /* Init timer. */ + init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); + + /* Last resource initializations and insert resource pointer. */ + sr_new[idx]->master_cpu = cpu_iter; + set_sched_res(cpu_iter, sr_new[idx]); + + /* Last action: set the new lock pointer. */ + smp_mb(); + sr_new[idx]->schedule_lock = &sched_free_cpu_lock; + + idx++; + } + } + sr->scheduler = &sched_idle_ops; + sr->sched_priv = NULL; + + smp_mb(); + sr->schedule_lock = &sched_free_cpu_lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! 
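+     * The lock pointer was just redirected to sched_free_cpu_lock above, so
+     * the release has to go through the old_lock value saved earlier.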
*/ + spin_unlock_irqrestore(old_lock, flags); + + sched_deinit_pdata(old_ops, ppriv_old, cpu); + + sched_free_udata(old_ops, vpriv_old); + sched_free_pdata(old_ops, ppriv_old, cpu); + + sr->granularity = 1; + sr->cpupool = NULL; + +out: + rcu_read_unlock(&sched_res_rculock); + xfree(sr_new); + + return ret; +} + +struct scheduler *scheduler_get_default(void) +{ + return &ops; +} + +struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) +{ + int i; + struct scheduler *sched; + + for ( i = 0; i < NUM_SCHEDULERS; i++ ) + if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) + goto found; + *perr = -ENOENT; + return NULL; + + found: + *perr = -ENOMEM; + if ( (sched = xmalloc(struct scheduler)) == NULL ) + return NULL; + memcpy(sched, schedulers[i], sizeof(*sched)); + if ( (*perr = sched_init(sched)) != 0 ) + { + xfree(sched); + sched = NULL; + } + + return sched; +} + +void scheduler_free(struct scheduler *sched) +{ + BUG_ON(sched == &ops); + sched_deinit(sched); + xfree(sched); +} + +void schedule_dump(struct cpupool *c) +{ + unsigned int i; + struct scheduler *sched; + cpumask_t *cpus; + + /* Locking, if necessary, must be handled withing each scheduler */ + + rcu_read_lock(&sched_res_rculock); + + if ( c != NULL ) + { + sched = c->sched; + cpus = c->cpu_valid; + printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); + sched_dump_settings(sched); + } + else + { + sched = &ops; + cpus = &cpupool_free_cpus; + } + + if ( sched->dump_cpu_state != NULL ) + { + printk("CPUs info:\n"); + for_each_cpu (i, cpus) + sched_dump_cpu_state(sched, i); + } + + rcu_read_unlock(&sched_res_rculock); +} + +void sched_tick_suspend(void) +{ + rcu_idle_enter(smp_processor_id()); + rcu_idle_timer_start(); +} + +void sched_tick_resume(void) +{ + rcu_idle_timer_stop(); + rcu_idle_exit(smp_processor_id()); +} + +void wait(void) +{ + schedule(); +} + +#ifdef CONFIG_X86 +void __init sched_setup_dom0_vcpus(struct domain *d) +{ + unsigned int i; + struct sched_unit *unit; + + for ( i = 1; i < d->max_vcpus; i++ ) + vcpu_create(d, i); + + /* + * PV-shim: vcpus are pinned 1:1. + * Initially only 1 cpu is online, others will be dealt with when + * onlining them. This avoids pinning a vcpu to a not yet online cpu here. + */ + if ( pv_shim ) + sched_set_affinity(d->vcpu[0]->sched_unit, + cpumask_of(0), cpumask_of(0)); + else + { + for_each_sched_unit ( d, unit ) + { + if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed ) + sched_set_affinity(unit, &dom0_cpus, NULL); + sched_set_affinity(unit, NULL, &dom0_cpus); + } + } + + domain_update_node_affinity(d); +} +#endif + +#ifdef CONFIG_COMPAT +#include "compat.c" +#endif + +#endif /* !COMPAT */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c new file mode 100644 index 0000000000..d66b541a94 --- /dev/null +++ b/xen/common/sched/cpupool.c @@ -0,0 +1,979 @@ +/****************************************************************************** + * cpupool.c + * + * Generic cpupool-handling functions. + * + * Cpupools are a feature to have configurable scheduling domains. Each + * cpupool runs an own scheduler on a dedicated set of physical cpus. + * A domain is bound to one cpupool at any time, but it can be moved to + * another cpupool. 
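+ * At boot all online cpus end up in Pool-0 (see cpupool_init() at the
+ * bottom of this file); cpus and domains are moved between pools via the
+ * XEN_SYSCTL_CPUPOOL_OP_* operations handled in cpupool_do_sysctl(),
+ * typically driven by the toolstack (e.g. xl's cpupool commands).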
+ * + * (C) 2009, Juergen Gross, Fujitsu Technology Solutions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define for_each_cpupool(ptr) \ + for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) + +struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ +cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ + +static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ + +static int cpupool_moving_cpu = -1; +static struct cpupool *cpupool_cpu_moving = NULL; +static cpumask_t cpupool_locked_cpus; + +static DEFINE_SPINLOCK(cpupool_lock); + +static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu; +static unsigned int __read_mostly sched_granularity = 1; + +#ifdef CONFIG_HAS_SCHED_GRANULARITY +static int __init sched_select_granularity(const char *str) +{ + if ( strcmp("cpu", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_cpu; + else if ( strcmp("core", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_core; + else if ( strcmp("socket", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_socket; + else + return -EINVAL; + + return 0; +} +custom_param("sched-gran", sched_select_granularity); +#endif + +static unsigned int __init cpupool_check_granularity(void) +{ + unsigned int cpu; + unsigned int siblings, gran = 0; + + if ( opt_sched_granularity == SCHED_GRAN_cpu ) + return 1; + + for_each_online_cpu ( cpu ) + { + siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity, + cpu)); + if ( gran == 0 ) + gran = siblings; + else if ( gran != siblings ) + return 0; + } + + sched_disable_smt_switching = true; + + return gran; +} + +/* Setup data for selected scheduler granularity. */ +static void __init cpupool_gran_init(void) +{ + unsigned int gran = 0; + const char *fallback = NULL; + + while ( gran == 0 ) + { + gran = cpupool_check_granularity(); + + if ( gran == 0 ) + { + switch ( opt_sched_granularity ) + { + case SCHED_GRAN_core: + opt_sched_granularity = SCHED_GRAN_cpu; + fallback = "Asymmetric cpu configuration.\n" + "Falling back to sched-gran=cpu.\n"; + break; + case SCHED_GRAN_socket: + opt_sched_granularity = SCHED_GRAN_core; + fallback = "Asymmetric cpu configuration.\n" + "Falling back to sched-gran=core.\n"; + break; + default: + ASSERT_UNREACHABLE(); + break; + } + } + } + + if ( fallback ) + warning_add(fallback); + + sched_granularity = gran; +} + +unsigned int cpupool_get_granularity(const struct cpupool *c) +{ + return c ? sched_granularity : 1; +} + +static void free_cpupool_struct(struct cpupool *c) +{ + if ( c ) + { + free_cpumask_var(c->res_valid); + free_cpumask_var(c->cpu_valid); + } + xfree(c); +} + +static struct cpupool *alloc_cpupool_struct(void) +{ + struct cpupool *c = xzalloc(struct cpupool); + + if ( !c ) + return NULL; + + if ( !zalloc_cpumask_var(&c->cpu_valid) || + !zalloc_cpumask_var(&c->res_valid) ) + { + free_cpupool_struct(c); + c = NULL; + } + + return c; +} + +/* + * find a cpupool by it's id. to be called with cpupool lock held + * if exact is not specified, the first cpupool with an id larger or equal to + * the searched id is returned + * returns NULL if not found. + */ +static struct cpupool *__cpupool_find_by_id(int id, int exact) +{ + struct cpupool **q; + + ASSERT(spin_is_locked(&cpupool_lock)); + + for_each_cpupool(q) + if ( (*q)->cpupool_id >= id ) + break; + + return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? 
*q : NULL; +} + +static struct cpupool *cpupool_find_by_id(int poolid) +{ + return __cpupool_find_by_id(poolid, 1); +} + +static struct cpupool *__cpupool_get_by_id(int poolid, int exact) +{ + struct cpupool *c; + spin_lock(&cpupool_lock); + c = __cpupool_find_by_id(poolid, exact); + if ( c != NULL ) + atomic_inc(&c->refcnt); + spin_unlock(&cpupool_lock); + return c; +} + +struct cpupool *cpupool_get_by_id(int poolid) +{ + return __cpupool_get_by_id(poolid, 1); +} + +static struct cpupool *cpupool_get_next_by_id(int poolid) +{ + return __cpupool_get_by_id(poolid, 0); +} + +void cpupool_put(struct cpupool *pool) +{ + if ( !atomic_dec_and_test(&pool->refcnt) ) + return; + scheduler_free(pool->sched); + free_cpupool_struct(pool); +} + +/* + * create a new cpupool with specified poolid and scheduler + * returns pointer to new cpupool structure if okay, NULL else + * possible failures: + * - no memory + * - poolid already used + * - unknown scheduler + */ +static struct cpupool *cpupool_create( + int poolid, unsigned int sched_id, int *perr) +{ + struct cpupool *c; + struct cpupool **q; + int last = 0; + + *perr = -ENOMEM; + if ( (c = alloc_cpupool_struct()) == NULL ) + return NULL; + + /* One reference for caller, one reference for cpupool_destroy(). */ + atomic_set(&c->refcnt, 2); + + debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); + + spin_lock(&cpupool_lock); + + for_each_cpupool(q) + { + last = (*q)->cpupool_id; + if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) + break; + } + if ( *q != NULL ) + { + if ( (*q)->cpupool_id == poolid ) + { + *perr = -EEXIST; + goto err; + } + c->next = *q; + } + + c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid; + if ( poolid == 0 ) + { + c->sched = scheduler_get_default(); + } + else + { + c->sched = scheduler_alloc(sched_id, perr); + if ( c->sched == NULL ) + goto err; + } + c->gran = opt_sched_granularity; + + *q = c; + + spin_unlock(&cpupool_lock); + + debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n", + c->cpupool_id, c->sched->name, c->sched->opt_name); + + *perr = 0; + return c; + + err: + spin_unlock(&cpupool_lock); + free_cpupool_struct(c); + return NULL; +} +/* + * destroys the given cpupool + * returns 0 on success, 1 else + * possible failures: + * - pool still in use + * - cpus still assigned to pool + * - pool not in list + */ +static int cpupool_destroy(struct cpupool *c) +{ + struct cpupool **q; + + spin_lock(&cpupool_lock); + for_each_cpupool(q) + if ( *q == c ) + break; + if ( *q != c ) + { + spin_unlock(&cpupool_lock); + return -ENOENT; + } + if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) + { + spin_unlock(&cpupool_lock); + return -EBUSY; + } + *q = c->next; + spin_unlock(&cpupool_lock); + + cpupool_put(c); + + debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id); + return 0; +} + +/* + * Move domain to another cpupool + */ +static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c) +{ + int ret; + + if ( unlikely(d->cpupool == c) ) + return 0; + + d->cpupool->n_dom--; + ret = sched_move_domain(d, c); + if ( ret ) + d->cpupool->n_dom++; + else + c->n_dom++; + + return ret; +} +int cpupool_move_domain(struct domain *d, struct cpupool *c) +{ + int ret; + + spin_lock(&cpupool_lock); + + ret = cpupool_move_domain_locked(d, c); + + spin_unlock(&cpupool_lock); + + return ret; +} + +/* + * assign a specific cpu to a cpupool + * cpupool_lock must be held + */ +static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) +{ + int ret; + 
struct domain *d; + const cpumask_t *cpus; + + cpus = sched_get_opt_cpumask(c->gran, cpu); + + if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) + return -EADDRNOTAVAIL; + ret = schedule_cpu_add(cpumask_first(cpus), c); + if ( ret ) + return ret; + + rcu_read_lock(&sched_res_rculock); + + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + if (cpupool_moving_cpu == cpu) + { + cpupool_moving_cpu = -1; + cpupool_put(cpupool_cpu_moving); + cpupool_cpu_moving = NULL; + } + cpumask_or(c->cpu_valid, c->cpu_valid, cpus); + cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); + + rcu_read_unlock(&sched_res_rculock); + + rcu_read_lock(&domlist_read_lock); + for_each_domain_in_cpupool(d, c) + { + domain_update_node_affinity(d); + } + rcu_read_unlock(&domlist_read_lock); + + return 0; +} + +static int cpupool_unassign_cpu_finish(struct cpupool *c) +{ + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; + struct domain *d; + int ret; + + if ( c != cpupool_cpu_moving ) + return -EADDRNOTAVAIL; + + /* + * We need this for scanning the domain list, both in + * cpu_disable_scheduler(), and at the bottom of this function. + */ + rcu_read_lock(&domlist_read_lock); + ret = cpu_disable_scheduler(cpu); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + + /* + * cpu_disable_scheduler() returning an error doesn't require resetting + * cpupool_free_cpus' cpu bit. All error cases should be of temporary + * nature and tools will retry the operation. Even if the number of + * retries may be limited, the in-between state can easily be repaired + * by adding the cpu to the cpupool again. + */ + if ( !ret ) + { + ret = schedule_cpu_rm(cpu); + if ( ret ) + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + else + { + cpupool_moving_cpu = -1; + cpupool_put(cpupool_cpu_moving); + cpupool_cpu_moving = NULL; + } + } + rcu_read_unlock(&sched_res_rculock); + + for_each_domain_in_cpupool(d, c) + { + domain_update_node_affinity(d); + } + rcu_read_unlock(&domlist_read_lock); + + return ret; +} + +static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu) +{ + int ret; + struct domain *d; + const cpumask_t *cpus; + + spin_lock(&cpupool_lock); + ret = -EADDRNOTAVAIL; + if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid)) + && (cpu != cpupool_moving_cpu) ) + goto out; + + ret = 0; + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + + if ( (c->n_dom > 0) && + (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) && + (cpu != cpupool_moving_cpu) ) + { + rcu_read_lock(&domlist_read_lock); + for_each_domain_in_cpupool(d, c) + { + if ( !d->is_dying && system_state == SYS_STATE_active ) + { + ret = -EBUSY; + break; + } + ret = cpupool_move_domain_locked(d, cpupool0); + if ( ret ) + break; + } + rcu_read_unlock(&domlist_read_lock); + if ( ret ) + goto out; + } + cpupool_moving_cpu = cpu; + atomic_inc(&c->refcnt); + cpupool_cpu_moving = c; + cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus); + cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); + + rcu_read_unlock(&domlist_read_lock); +out: + spin_unlock(&cpupool_lock); + + return ret; +} + +static long cpupool_unassign_cpu_helper(void *info) +{ + struct cpupool *c = info; + long ret; + + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); + spin_lock(&cpupool_lock); + + ret = cpupool_unassign_cpu_finish(c); + + 
spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); + + return ret; +} + +/* + * unassign a specific cpu from a cpupool + * we must be sure not to run on the cpu to be unassigned! to achieve this + * the main functionality is performed via continue_hypercall_on_cpu on a + * specific cpu. + * if the cpu to be removed is the last one of the cpupool no active domain + * must be bound to the cpupool. dying domains are moved to cpupool0 as they + * might be zombies. + * possible failures: + * - last cpu and still active domains in cpupool + * - cpu just being unplugged + */ +static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) +{ + int work_cpu; + int ret; + unsigned int master_cpu; + + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + c->cpupool_id, cpu); + + master_cpu = sched_get_resource_cpu(cpu); + ret = cpupool_unassign_cpu_start(c, master_cpu); + if ( ret ) + { + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", + c->cpupool_id, cpu, ret); + return ret; + } + + work_cpu = sched_get_resource_cpu(smp_processor_id()); + if ( work_cpu == master_cpu ) + { + work_cpu = cpumask_first(cpupool0->cpu_valid); + if ( work_cpu == master_cpu ) + work_cpu = cpumask_last(cpupool0->cpu_valid); + } + return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); +} + +/* + * add a new domain to a cpupool + * possible failures: + * - pool does not exist + * - no cpu assigned to pool + */ +int cpupool_add_domain(struct domain *d, int poolid) +{ + struct cpupool *c; + int rc; + int n_dom = 0; + + if ( poolid == CPUPOOLID_NONE ) + return 0; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(poolid); + if ( c == NULL ) + rc = -ESRCH; + else if ( !cpumask_weight(c->cpu_valid) ) + rc = -ENODEV; + else + { + c->n_dom++; + n_dom = c->n_dom; + d->cpupool = c; + rc = 0; + } + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", + d->domain_id, poolid, n_dom, rc); + return rc; +} + +/* + * remove a domain from a cpupool + */ +void cpupool_rm_domain(struct domain *d) +{ + int cpupool_id; + int n_dom; + + if ( d->cpupool == NULL ) + return; + spin_lock(&cpupool_lock); + cpupool_id = d->cpupool->cpupool_id; + d->cpupool->n_dom--; + n_dom = d->cpupool->n_dom; + d->cpupool = NULL; + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, cpupool_id, n_dom); + return; +} + +/* + * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0, + * as they must have been in there when unplugged. + */ +static int cpupool_cpu_add(unsigned int cpu) +{ + int ret = 0; + const cpumask_t *cpus; + + spin_lock(&cpupool_lock); + cpumask_clear_cpu(cpu, &cpupool_locked_cpus); + cpumask_set_cpu(cpu, &cpupool_free_cpus); + + /* + * If we are not resuming, we are hot-plugging cpu, and in which case + * we add it to pool0, as it certainly was there when hot-unplagged + * (or unplugging would have failed) and that is the default behavior + * anyway. + */ + rcu_read_lock(&sched_res_rculock); + get_sched_res(cpu)->cpupool = NULL; + + cpus = sched_get_opt_cpumask(cpupool0->gran, cpu); + if ( cpumask_subset(cpus, &cpupool_free_cpus) ) + ret = cpupool_assign_cpu_locked(cpupool0, cpu); + + rcu_read_unlock(&sched_res_rculock); + + spin_unlock(&cpupool_lock); + + return ret; +} + +/* + * This function is called in stop_machine context, so we can be sure no + * non-idle vcpu is active on the system. 
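+ * It is the CPU_DYING counterpart of cpupool_cpu_remove_prologue() below
+ * (which runs at CPU_DOWN_PREPARE); see cpu_callback() at the end of this
+ * file for how the two are wired up.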
+ */ +static void cpupool_cpu_remove(unsigned int cpu) +{ + int ret; + + ASSERT(is_idle_vcpu(current)); + + if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) + { + ret = cpupool_unassign_cpu_finish(cpupool0); + BUG_ON(ret); + } + cpumask_clear_cpu(cpu, &cpupool_free_cpus); +} + +/* + * Called before a CPU is being removed from the system. + * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved + * to free cpus actually before removing them). + * The CPU is locked, to forbid adding it again to another cpupool. + */ +static int cpupool_cpu_remove_prologue(unsigned int cpu) +{ + int ret = 0; + cpumask_t *cpus; + unsigned int master_cpu; + + spin_lock(&cpupool_lock); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + master_cpu = sched_get_resource_cpu(cpu); + if ( cpumask_intersects(cpus, &cpupool_locked_cpus) ) + ret = -EBUSY; + else + cpumask_set_cpu(cpu, &cpupool_locked_cpus); + rcu_read_unlock(&sched_res_rculock); + + spin_unlock(&cpupool_lock); + + if ( ret ) + return ret; + + if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) ) + { + /* Cpupool0 is populated only after all cpus are up. */ + ASSERT(system_state == SYS_STATE_active); + + ret = cpupool_unassign_cpu_start(cpupool0, master_cpu); + } + else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) ) + ret = -ENODEV; + + return ret; +} + +/* + * Called during resume for all cpus which didn't come up again. The cpu must + * be removed from the cpupool it is assigned to. In case a cpupool will be + * left without cpu we move all domains of that cpupool to cpupool0. + * As we are called with all domains still frozen there is no need to take the + * cpupool lock here. + */ +static void cpupool_cpu_remove_forced(unsigned int cpu) +{ + struct cpupool **c; + int ret; + unsigned int master_cpu = sched_get_resource_cpu(cpu); + + for_each_cpupool ( c ) + { + if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) ) + { + ret = cpupool_unassign_cpu_start(*c, master_cpu); + BUG_ON(ret); + ret = cpupool_unassign_cpu_finish(*c); + BUG_ON(ret); + } + } + + cpumask_clear_cpu(cpu, &cpupool_free_cpus); + + rcu_read_lock(&sched_res_rculock); + sched_rm_cpu(cpu); + rcu_read_unlock(&sched_res_rculock); +} + +/* + * do cpupool related sysctl operations + */ +int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) +{ + int ret; + struct cpupool *c; + + switch ( op->op ) + { + + case XEN_SYSCTL_CPUPOOL_OP_CREATE: + { + int poolid; + + poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? 
+ CPUPOOLID_NONE: op->cpupool_id; + c = cpupool_create(poolid, op->sched_id, &ret); + if ( c != NULL ) + { + op->cpupool_id = c->cpupool_id; + cpupool_put(c); + } + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_DESTROY: + { + c = cpupool_get_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + ret = cpupool_destroy(c); + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_INFO: + { + c = cpupool_get_next_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + op->cpupool_id = c->cpupool_id; + op->sched_id = c->sched->sched_id; + op->n_dom = c->n_dom; + ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: + { + unsigned cpu; + const cpumask_t *cpus; + + cpu = op->cpu; + debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", + op->cpupool_id, cpu); + + spin_lock(&cpupool_lock); + + c = cpupool_find_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + goto addcpu_out; + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + { + for_each_cpu ( cpu, &cpupool_free_cpus ) + { + cpus = sched_get_opt_cpumask(c->gran, cpu); + if ( cpumask_subset(cpus, &cpupool_free_cpus) ) + break; + } + ret = -ENODEV; + if ( cpu >= nr_cpu_ids ) + goto addcpu_out; + } + ret = -EINVAL; + if ( cpu >= nr_cpu_ids ) + goto addcpu_out; + ret = -ENODEV; + cpus = sched_get_opt_cpumask(c->gran, cpu); + if ( !cpumask_subset(cpus, &cpupool_free_cpus) || + cpumask_intersects(cpus, &cpupool_locked_cpus) ) + goto addcpu_out; + ret = cpupool_assign_cpu_locked(c, cpu); + + addcpu_out: + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", + op->cpupool_id, cpu, ret); + + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_RMCPU: + { + unsigned cpu; + + c = cpupool_get_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + cpu = op->cpu; + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + cpu = cpumask_last(c->cpu_valid); + ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL; + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: + { + struct domain *d; + + ret = rcu_lock_remote_domain_by_id(op->domid, &d); + if ( ret ) + break; + if ( d->cpupool == NULL ) + { + ret = -EINVAL; + rcu_unlock_domain(d); + break; + } + if ( op->cpupool_id == d->cpupool->cpupool_id ) + { + ret = 0; + rcu_unlock_domain(d); + break; + } + debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n", + d->domain_id, op->cpupool_id); + ret = -ENOENT; + spin_lock(&cpupool_lock); + + c = cpupool_find_by_id(op->cpupool_id); + if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) + ret = cpupool_move_domain_locked(d, c); + + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", + d->domain_id, op->cpupool_id, ret); + rcu_unlock_domain(d); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: + { + ret = cpumask_to_xenctl_bitmap( + &op->cpumap, &cpupool_free_cpus); + } + break; + + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +void dump_runq(unsigned char key) +{ + unsigned long flags; + s_time_t now = NOW(); + struct cpupool **c; + + spin_lock(&cpupool_lock); + local_irq_save(flags); + + printk("sched_smt_power_savings: %s\n", + sched_smt_power_savings? 
"enabled":"disabled"); + printk("NOW=%"PRI_stime"\n", now); + + printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map)); + if ( !cpumask_empty(&cpupool_free_cpus) ) + { + printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus)); + schedule_dump(NULL); + } + + for_each_cpupool(c) + { + printk("Cpupool %d:\n", (*c)->cpupool_id); + printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid)); + schedule_dump(*c); + } + + local_irq_restore(flags); + spin_unlock(&cpupool_lock); +} + +static int cpu_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + switch ( action ) + { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_add(cpu); + break; + case CPU_DOWN_PREPARE: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_remove_prologue(cpu); + break; + case CPU_DYING: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) + cpupool_cpu_remove(cpu); + break; + case CPU_RESUME_FAILED: + cpupool_cpu_remove_forced(cpu); + break; + default: + break; + } + + return !rc ? NOTIFY_DONE : notifier_from_errno(rc); +} + +static struct notifier_block cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init cpupool_init(void) +{ + unsigned int cpu; + int err; + + cpupool_gran_init(); + + cpupool0 = cpupool_create(0, 0, &err); + BUG_ON(cpupool0 == NULL); + cpupool_put(cpupool0); + register_cpu_notifier(&cpu_nfb); + + spin_lock(&cpupool_lock); + + cpumask_copy(&cpupool_free_cpus, &cpu_online_map); + + for_each_cpu ( cpu, &cpupool_free_cpus ) + cpupool_assign_cpu_locked(cpupool0, cpu); + + spin_unlock(&cpupool_lock); + + return 0; +} +__initcall(cpupool_init); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c new file mode 100644 index 0000000000..aa41a3301b --- /dev/null +++ b/xen/common/sched/credit.c @@ -0,0 +1,2284 @@ +/**************************************************************************** + * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. + **************************************************************************** + * + * File: common/csched_credit.c + * Author: Emmanuel Ackaouy + * + * Description: Credit-based SMP CPU scheduler + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Locking: + * - Scheduler-lock (a.k.a. runqueue lock): + * + is per-runqueue, and there is one runqueue per-cpu; + * + serializes all runqueue manipulation operations; + * - Private data lock (a.k.a. private scheduler lock): + * + serializes accesses to the scheduler global state (weight, + * credit, balance_credit, etc); + * + serializes updates to the domains' scheduling parameters. + * + * Ordering is "private lock always comes first": + * + if we need both locks, we must acquire the private + * scheduler lock for first; + * + if we already own a runqueue lock, we must never acquire + * the private scheduler lock. + */ + +/* + * Basic constants + */ +#define CSCHED_DEFAULT_WEIGHT 256 +#define CSCHED_TICKS_PER_TSLICE 3 +/* Default timeslice: 30ms */ +#define CSCHED_DEFAULT_TSLICE_MS 30 +#define CSCHED_CREDITS_PER_MSEC 10 +/* Never set a timer shorter than this value. 
*/ +#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN + + +/* + * Priorities + */ +#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ +#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ +#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ +#define CSCHED_PRI_IDLE -64 /* idle */ + + +/* + * Flags + * + * Note that svc->flags (where these flags live) is protected by an + * inconsistent set of locks. Therefore atomic-safe bit operations must + * be used for accessing it. + */ +#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */ +#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */ +#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */ +#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */ + + +/* + * Useful macros + */ +#define CSCHED_PRIV(_ops) \ + ((struct csched_private *)((_ops)->sched_data)) +#define CSCHED_PCPU(_c) \ + ((struct csched_pcpu *)get_sched_res(_c)->sched_priv) +#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv) +#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) +#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) + + +/* + * CSCHED_STATS + * + * Manage very basic per-unit counters and stats. + * + * Useful for debugging live systems. The stats are displayed + * with runq dumps ('r' on the Xen console). + */ +#ifdef SCHED_STATS + +#define CSCHED_STATS + +#define SCHED_UNIT_STATS_RESET(_V) \ + do \ + { \ + memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ + } while ( 0 ) + +#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++) + +#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) + +#else /* !SCHED_STATS */ + +#undef CSCHED_STATS + +#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 ) +#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 ) +#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 ) + +#endif /* SCHED_STATS */ + + +/* + * Credit tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. 
+ */ +#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) +#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) +#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) +#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4) +#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) +#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) +#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) +#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) +#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) +#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) +#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) + +/* + * Boot parameters + */ +static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; +integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); + +/* + * Physical CPU + */ +struct csched_pcpu { + struct list_head runq; + uint32_t runq_sort_last; + + unsigned int idle_bias; + unsigned int nr_runnable; + + unsigned int tick; + struct timer ticker; +}; + +/* + * Virtual UNIT + */ +struct csched_unit { + struct list_head runq_elem; + struct list_head active_unit_elem; + + /* Up-pointers */ + struct csched_dom *sdom; + struct sched_unit *unit; + + s_time_t start_time; /* When we were scheduled (used for credit) */ + unsigned flags; + int pri; + + atomic_t credit; + unsigned int residual; + + s_time_t last_sched_time; + +#ifdef CSCHED_STATS + struct { + int credit_last; + uint32_t credit_incr; + uint32_t state_active; + uint32_t state_idle; + uint32_t migrate_q; + uint32_t migrate_r; + uint32_t kicked_away; + } stats; +#endif +}; + +/* + * Domain + */ +struct csched_dom { + struct list_head active_unit; + struct list_head active_sdom_elem; + struct domain *dom; + uint16_t active_unit_count; + uint16_t weight; + uint16_t cap; +}; + +/* + * System-wide private data + */ +struct csched_private { + /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ + spinlock_t lock; + + cpumask_var_t idlers; + cpumask_var_t cpus; + uint32_t *balance_bias; + uint32_t runq_sort; + uint32_t ncpus; + + /* Period of master and tick in milliseconds */ + unsigned int tick_period_us, ticks_per_tslice; + s_time_t ratelimit, tslice, unit_migr_delay; + + struct list_head active_sdom; + uint32_t weight; + uint32_t credit; + int credit_balance; + unsigned int credits_per_tslice; + + unsigned int master; + struct timer master_ticker; +}; + +static void csched_tick(void *_cpu); +static void csched_acct(void *dummy); + +static inline int +__unit_on_runq(struct csched_unit *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static inline struct csched_unit * +__runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched_unit, runq_elem); +} + +/* Is the first element of cpu's runq (if any) cpu's idle unit? */ +static inline bool_t is_runq_idle(unsigned int cpu) +{ + /* + * We're peeking at cpu's runq, we must hold the proper lock. 
+ */ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + return list_empty(RUNQ(cpu)) || + is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit); +} + +static inline void +inc_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + CSCHED_PCPU(cpu)->nr_runnable++; + +} + +static inline void +dec_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); + CSCHED_PCPU(cpu)->nr_runnable--; +} + +static inline void +__runq_insert(struct csched_unit *svc) +{ + unsigned int cpu = sched_unit_master(svc->unit); + const struct list_head * const runq = RUNQ(cpu); + struct list_head *iter; + + BUG_ON( __unit_on_runq(svc) ); + + list_for_each( iter, runq ) + { + const struct csched_unit * const iter_svc = __runq_elem(iter); + if ( svc->pri > iter_svc->pri ) + break; + } + + /* If the unit yielded, try to put it behind one lower-priority + * runnable unit if we can. The next runq_sort will bring it forward + * within 30ms if the queue too long. */ + if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags) + && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) + { + iter=iter->next; + + /* Some sanity checks */ + BUG_ON(iter == runq); + } + + list_add_tail(&svc->runq_elem, iter); +} + +static inline void +runq_insert(struct csched_unit *svc) +{ + __runq_insert(svc); + inc_nr_runnable(sched_unit_master(svc->unit)); +} + +static inline void +__runq_remove(struct csched_unit *svc) +{ + BUG_ON( !__unit_on_runq(svc) ); + list_del_init(&svc->runq_elem); +} + +static inline void +runq_remove(struct csched_unit *svc) +{ + dec_nr_runnable(sched_unit_master(svc->unit)); + __runq_remove(svc); +} + +static void burn_credits(struct csched_unit *svc, s_time_t now) +{ + s_time_t delta; + uint64_t val; + unsigned int credits; + + /* Assert svc is current */ + ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) ); + + if ( (delta = now - svc->start_time) <= 0 ) + return; + + val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; + svc->residual = do_div(val, MILLISECS(1)); + credits = val; + ASSERT(credits == val); /* make sure we haven't truncated val */ + atomic_sub(credits, &svc->credit); + svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; +} + +static bool_t __read_mostly opt_tickle_one_idle = 1; +boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); + +DEFINE_PER_CPU(unsigned int, last_tickle_cpu); + +static inline void __runq_tickle(struct csched_unit *new) +{ + unsigned int cpu = sched_unit_master(new->unit); + struct sched_resource *sr = get_sched_res(cpu); + struct sched_unit *unit = new->unit; + struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu)); + struct csched_private *prv = CSCHED_PRIV(sr->scheduler); + cpumask_t mask, idle_mask, *online; + int balance_step, idlers_empty; + + ASSERT(cur); + cpumask_clear(&mask); + + online = cpupool_domain_master_cpumask(new->sdom->dom); + cpumask_and(&idle_mask, prv->idlers, online); + idlers_empty = cpumask_empty(&idle_mask); + + /* + * Exclusive pinning is when a unit has hard-affinity with only one + * cpu, and there is no other unit that has hard-affinity with that + * same cpu. This is infrequent, but if it happens, is for achieving + * the most possible determinism, and least possible overhead for + * the units in question. + * + * Try to identify the vast majority of these situations, and deal + * with them quickly. 
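+     * Concretely: if the waking unit is flagged as pinned and its single
+     * allowed cpu is currently idle, only that cpu gets tickled and the
+     * affinity balancing below is skipped entirely.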
+ */ + if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) && + cpumask_test_cpu(cpu, &idle_mask)) ) + { + ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); + SCHED_STAT_CRANK(tickled_idle_cpu_excl); + __cpumask_set_cpu(cpu, &mask); + goto tickle; + } + + /* + * If the pcpu is idle, or there are no idlers and the new + * unit is a higher priority than the old unit, run it here. + * + * If there are idle cpus, first try to find one suitable to run + * new, so we can avoid preempting cur. If we cannot find a + * suitable idler on which to run new, run it here, but try to + * find a suitable idler on which to run cur instead. + */ + if ( cur->pri == CSCHED_PRI_IDLE + || (idlers_empty && new->pri > cur->pri) ) + { + if ( cur->pri != CSCHED_PRI_IDLE ) + SCHED_STAT_CRANK(tickled_busy_cpu); + else + SCHED_STAT_CRANK(tickled_idle_cpu); + __cpumask_set_cpu(cpu, &mask); + } + else if ( !idlers_empty ) + { + /* + * Soft and hard affinity balancing loop. For units without + * a useful soft affinity, consider hard affinity only. + */ + for_each_affinity_balance_step( balance_step ) + { + int new_idlers_empty; + + if ( balance_step == BALANCE_SOFT_AFFINITY + && !has_soft_affinity(unit) ) + continue; + + /* Are there idlers suitable for new (for this balance step)? */ + affinity_balance_cpumask(unit, balance_step, + cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), + cpumask_scratch_cpu(cpu), &idle_mask); + new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); + + /* + * Let's not be too harsh! If there aren't idlers suitable + * for new in its soft affinity mask, make sure we check its + * hard affinity as well, before taking final decisions. + */ + if ( new_idlers_empty + && balance_step == BALANCE_SOFT_AFFINITY ) + continue; + + /* + * If there are no suitable idlers for new, and it's higher + * priority than cur, check whether we can migrate cur away. + * We have to do it indirectly, via _VPF_migrating (instead + * of just tickling any idler suitable for cur) because cur + * is running. + * + * If there are suitable idlers for new, no matter priorities, + * leave cur alone (as it is running and is, likely, cache-hot) + * and wake some of them (which is waking up and so is, likely, + * cache cold anyway). + */ + if ( new_idlers_empty && new->pri > cur->pri ) + { + if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) ) + { + SCHED_UNIT_STAT_CRANK(cur, kicked_away); + SCHED_UNIT_STAT_CRANK(cur, migrate_r); + SCHED_STAT_CRANK(migrate_kicked_away); + sched_set_pause_flags_atomic(cur->unit, _VPF_migrating); + } + /* Tickle cpu anyway, to let new preempt cur. */ + SCHED_STAT_CRANK(tickled_busy_cpu); + __cpumask_set_cpu(cpu, &mask); + } + else if ( !new_idlers_empty ) + { + /* Which of the idlers suitable for new shall we wake up? */ + SCHED_STAT_CRANK(tickled_idle_cpu); + if ( opt_tickle_one_idle ) + { + this_cpu(last_tickle_cpu) = + cpumask_cycle(this_cpu(last_tickle_cpu), + cpumask_scratch_cpu(cpu)); + __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + } + else + cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); + } + + /* Did we find anyone? */ + if ( !cpumask_empty(&mask) ) + break; + } + } + + tickle: + if ( !cpumask_empty(&mask) ) + { + if ( unlikely(tb_init_done) ) + { + /* Avoid TRACE_*: saves checking !tb_init_done each step */ + for_each_cpu(cpu, &mask) + __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); + } + + /* + * Mark the designated CPUs as busy and send them all the scheduler + * interrupt. 
We need the for_each_cpu for dealing with the + * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and + * can't use cpumask_andnot(), because prv->idlers needs atomic access. + * + * In the default (and most common) case, when opt_rickle_one_idle is + * true, the loop does only one step, and only one bit is cleared. + */ + for_each_cpu(cpu, &mask) + cpumask_clear_cpu(cpu, prv->idlers); + cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); + } + else + SCHED_STAT_CRANK(tickled_no_cpu); +} + +static void +csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + + /* + * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're + * beeing called from CPU_UP_CANCELLED, because bringing up a pCPU failed + * very early. xfree() does not really mind, but we want to be sure that, + * when we get here, either init_pdata has never been called, or + * deinit_pdata has been called already. + */ + ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); + + xfree(pcpu); +} + +static void +csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc = pcpu; + unsigned int node = cpu_to_node(cpu); + unsigned long flags; + + /* + * Scheduler specific data for this pCPU must still be there and and be + * valid. In fact, if we are here: + * 1. alloc_pdata must have been called for this cpu, and free_pdata + * must not have been called on it before us, + * 2. init_pdata must have been called on this cpu, and deinit_pdata + * (us!) must not have been called on it already. + */ + ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); + + spin_lock_irqsave(&prv->lock, flags); + + prv->credit -= prv->credits_per_tslice; + prv->ncpus--; + cpumask_clear_cpu(cpu, prv->idlers); + cpumask_clear_cpu(cpu, prv->cpus); + if ( (prv->master == cpu) && (prv->ncpus > 0) ) + { + prv->master = cpumask_first(prv->cpus); + migrate_timer(&prv->master_ticker, prv->master); + } + if ( prv->balance_bias[node] == cpu ) + { + cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); + if ( !cpumask_empty(cpumask_scratch) ) + prv->balance_bias[node] = cpumask_first(cpumask_scratch); + } + kill_timer(&spc->ticker); + if ( prv->ncpus == 0 ) + kill_timer(&prv->master_ticker); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void * +csched_alloc_pdata(const struct scheduler *ops, int cpu) +{ + struct csched_pcpu *spc; + + /* Allocate per-PCPU info */ + spc = xzalloc(struct csched_pcpu); + if ( spc == NULL ) + return ERR_PTR(-ENOMEM); + + return spc; +} + +static void +init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) +{ + ASSERT(spin_is_locked(&prv->lock)); + /* cpu data needs to be allocated, but STILL uninitialized. 
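+     * i.e. it still is the zero-filled memory from the xzalloc() in
+     * csched_alloc_pdata(), which is what the ASSERT below verifies for
+     * the runqueue list head.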
*/ + ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); + + /* Initialize/update system-wide config */ + prv->credit += prv->credits_per_tslice; + prv->ncpus++; + cpumask_set_cpu(cpu, prv->cpus); + if ( prv->ncpus == 1 ) + { + prv->master = cpu; + init_timer(&prv->master_ticker, csched_acct, prv, cpu); + set_timer(&prv->master_ticker, NOW() + prv->tslice); + } + + cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); + if ( cpumask_weight(cpumask_scratch) == 1 ) + prv->balance_bias[cpu_to_node(cpu)] = cpu; + + init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); + set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); + + INIT_LIST_HEAD(&spc->runq); + spc->runq_sort_last = prv->runq_sort; + spc->idle_bias = nr_cpu_ids - 1; + + /* Start off idling... */ + BUG_ON(!is_idle_unit(curr_on_cpu(cpu))); + cpumask_set_cpu(cpu, prv->idlers); + spc->nr_runnable = 0; +} + +static void +csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + unsigned long flags; + struct csched_private *prv = CSCHED_PRIV(ops); + + spin_lock_irqsave(&prv->lock, flags); + init_pdata(prv, pdata, cpu); + spin_unlock_irqrestore(&prv->lock, flags); +} + +/* Change the scheduler of cpu to us (Credit). */ +static spinlock_t * +csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct csched_private *prv = CSCHED_PRIV(new_ops); + struct csched_unit *svc = vdata; + + ASSERT(svc && is_idle_unit(svc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It actually may or may not be the 'right' + * one for this cpu, but that is ok for preventing races. + */ + ASSERT(!local_irq_is_enabled()); + spin_lock(&prv->lock); + init_pdata(prv, pdata, cpu); + spin_unlock(&prv->lock); + + return &sr->_lock; +} + +#ifndef NDEBUG +static inline void +__csched_unit_check(struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + struct csched_dom * const sdom = svc->sdom; + + BUG_ON( svc->unit != unit ); + BUG_ON( sdom != CSCHED_DOM(unit->domain) ); + if ( sdom ) + { + BUG_ON( is_idle_unit(unit) ); + BUG_ON( sdom->dom != unit->domain ); + } + else + { + BUG_ON( !is_idle_unit(unit) ); + } + + SCHED_STAT_CRANK(unit_check); +} +#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit)) +#else +#define CSCHED_UNIT_CHECK(unit) +#endif + +/* + * Delay, in microseconds, between migrations of a UNIT between PCPUs. + * This prevents rapid fluttering of a UNIT between CPUs, and reduces the + * implicit overheads such as cache-warming. 1ms (1000) has been measured + * as a good value. + */ +static unsigned int vcpu_migration_delay_us; +integer_param("vcpu_migration_delay", vcpu_migration_delay_us); + +static inline bool +__csched_vcpu_is_cache_hot(const struct csched_private *prv, + const struct csched_unit *svc) +{ + bool hot = prv->unit_migr_delay && + (NOW() - svc->last_sched_time) < prv->unit_migr_delay; + + if ( hot ) + SCHED_STAT_CRANK(unit_hot); + + return hot; +} + +static inline int +__csched_unit_is_migrateable(const struct csched_private *prv, + struct sched_unit *unit, + int dest_cpu, cpumask_t *mask) +{ + const struct csched_unit *svc = CSCHED_UNIT(unit); + /* + * Don't pick up work that's hot on peer PCPU, or that can't (or + * would prefer not to) run on cpu. 
+ * + * The caller is supposed to have already checked that unit is also + * not running. + */ + ASSERT(!unit->is_running); + + return !__csched_vcpu_is_cache_hot(prv, svc) && + cpumask_test_cpu(dest_cpu, mask); +} + +static int +_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit, + bool_t commit) +{ + int cpu = sched_unit_master(unit); + /* We must always use cpu's scratch space */ + cpumask_t *cpus = cpumask_scratch_cpu(cpu); + cpumask_t idlers; + cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); + struct csched_pcpu *spc = NULL; + int balance_step; + + for_each_affinity_balance_step( balance_step ) + { + affinity_balance_cpumask(unit, balance_step, cpus); + cpumask_and(cpus, online, cpus); + /* + * We want to pick up a pcpu among the ones that are online and + * can accommodate vc. As far as hard affinity is concerned, there + * always will be at least one of these pcpus in the scratch cpumask, + * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below + * are ok. + * + * On the other hand, when considering soft affinity, it is possible + * that the mask is empty (for instance, if the domain has been put + * in a cpupool that does not contain any of the pcpus in its soft + * affinity), which would result in the ASSERT()-s inside cpumask_*() + * operations triggering (in debug builds). + * + * Therefore, if that is the case, we just skip the soft affinity + * balancing step all together. + */ + if ( balance_step == BALANCE_SOFT_AFFINITY && + (!has_soft_affinity(unit) || cpumask_empty(cpus)) ) + continue; + + /* If present, prefer vc's current processor */ + cpu = cpumask_test_cpu(sched_unit_master(unit), cpus) + ? sched_unit_master(unit) + : cpumask_cycle(sched_unit_master(unit), cpus); + ASSERT(cpumask_test_cpu(cpu, cpus)); + + /* + * Try to find an idle processor within the above constraints. + * + * In multi-core and multi-threaded CPUs, not all idle execution + * vehicles are equal! + * + * We give preference to the idle execution vehicle with the most + * idling neighbours in its grouping. This distributes work across + * distinct cores first and guarantees we don't do something stupid + * like run two UNITs on co-hyperthreads while there are idle cores + * or sockets. + * + * Notice that, when computing the "idleness" of cpu, we may want to + * discount unit. That is, iff unit is the currently running and the + * only runnable unit on cpu, we add cpu to the idlers. + */ + cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); + if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) ) + __cpumask_set_cpu(cpu, &idlers); + cpumask_and(cpus, &idlers, cpus); + + /* + * It is important that cpu points to an idle processor, if a suitable + * one exists (and we can use cpus to check and, possibly, choose a new + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and + * cpu points to a busy thread with an idle sibling, both the threads + * will be considered the same, from the "idleness" calculation point + * of view", preventing unit from being moved to the thread that is + * actually idle. + * + * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so + * we check for it first. 
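+ * As a concrete case: if cpu is a busy thread whose idle sibling is in
+ * cpus, cpu itself is not in cpus, so the cpumask_cycle() below hops onto
+ * an idle cpu (possibly that very sibling) instead of staying put.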
+ */ + if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) ) + cpu = cpumask_cycle(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); + + while ( !cpumask_empty(cpus) ) + { + cpumask_t cpu_idlers; + cpumask_t nxt_idlers; + int nxt, weight_cpu, weight_nxt; + int migrate_factor; + + nxt = cpumask_cycle(cpu, cpus); + + if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + { + /* We're on the same socket, so check the busy-ness of threads. + * Migrate if # of idlers is less at all */ + ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 1; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, + cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, + nxt)); + } + else + { + /* We're on different sockets, so check the busy-ness of cores. + * Migrate only if the other core is twice as idle */ + ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 2; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + } + + weight_cpu = cpumask_weight(&cpu_idlers); + weight_nxt = cpumask_weight(&nxt_idlers); + /* smt_power_savings: consolidate work rather than spreading it */ + if ( sched_smt_power_savings ? + weight_cpu > weight_nxt : + weight_cpu * migrate_factor < weight_nxt ) + { + cpumask_and(&nxt_idlers, &nxt_idlers, cpus); + spc = CSCHED_PCPU(nxt); + cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); + cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu)); + } + else + { + cpumask_andnot(cpus, cpus, &nxt_idlers); + } + } + + /* Stop if cpu is idle */ + if ( cpumask_test_cpu(cpu, &idlers) ) + break; + } + + if ( commit && spc ) + spc->idle_bias = cpu; + + TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id, + cpu); + + return cpu; +} + +static struct sched_resource * +csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + struct csched_unit *svc = CSCHED_UNIT(unit); + + /* + * We have been called by vcpu_migrate() (in schedule.c), as part + * of the process of seeing if vc can be migrated to another pcpu. + * We make a note about this in svc->flags so that later, in + * csched_unit_wake() (still called from vcpu_migrate()) we won't + * get boosted, which we don't deserve as we are "only" migrating. 
+ */ + set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); + return get_sched_res(_csched_cpu_pick(ops, unit, 1)); +} + +static inline void +__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + if ( list_empty(&svc->active_unit_elem) ) + { + SCHED_UNIT_STAT_CRANK(svc, state_active); + SCHED_STAT_CRANK(acct_unit_active); + + sdom->active_unit_count++; + list_add(&svc->active_unit_elem, &sdom->active_unit); + /* Make weight per-unit */ + prv->weight += sdom->weight; + if ( list_empty(&sdom->active_sdom_elem) ) + { + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + } + } + + TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, + svc->unit->unit_id, sdom->active_unit_count); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static inline void +__csched_unit_acct_stop_locked(struct csched_private *prv, + struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + + BUG_ON( list_empty(&svc->active_unit_elem) ); + + SCHED_UNIT_STAT_CRANK(svc, state_idle); + SCHED_STAT_CRANK(acct_unit_idle); + + BUG_ON( prv->weight < sdom->weight ); + sdom->active_unit_count--; + list_del_init(&svc->active_unit_elem); + prv->weight -= sdom->weight; + if ( list_empty(&sdom->active_unit) ) + { + list_del_init(&sdom->active_sdom_elem); + } + + TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, + svc->unit->unit_id, sdom->active_unit_count); +} + +static void +csched_unit_acct(struct csched_private *prv, unsigned int cpu) +{ + struct sched_unit *currunit = current->sched_unit; + struct csched_unit * const svc = CSCHED_UNIT(currunit); + struct sched_resource *sr = get_sched_res(cpu); + const struct scheduler *ops = sr->scheduler; + + ASSERT( sched_unit_master(currunit) == cpu ); + ASSERT( svc->sdom != NULL ); + ASSERT( !is_idle_unit(svc->unit) ); + + /* + * If this UNIT's priority was boosted when it last awoke, reset it. + * If the UNIT is found here, then it's consuming a non-negligeable + * amount of CPU resources and should no longer be boosted. + */ + if ( svc->pri == CSCHED_PRI_TS_BOOST ) + { + svc->pri = CSCHED_PRI_TS_UNDER; + TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, + svc->unit->unit_id); + } + + /* + * Update credits + */ + burn_credits(svc, NOW()); + + /* + * Put this UNIT and domain back on the active list if it was + * idling. + */ + if ( list_empty(&svc->active_unit_elem) ) + { + __csched_unit_acct_start(prv, svc); + } + else + { + unsigned int new_cpu; + unsigned long flags; + spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags); + + /* + * If it's been active a while, check if we'd be better off + * migrating it to run elsewhere (see multi-core and multi-thread + * support in csched_res_pick()). + */ + new_cpu = _csched_cpu_pick(ops, currunit, 0); + + unit_schedule_unlock_irqrestore(lock, flags, currunit); + + if ( new_cpu != cpu ) + { + SCHED_UNIT_STAT_CRANK(svc, migrate_r); + SCHED_STAT_CRANK(migrate_running); + sched_set_pause_flags_atomic(currunit, _VPF_migrating); + /* + * As we are about to tickle cpu, we should clear its bit in + * idlers. But, if we are here, it means there is someone running + * on it, and hence the bit must be zero already. 
+ */ + ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + } +} + +static void * +csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + struct csched_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct csched_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->runq_elem); + INIT_LIST_HEAD(&svc->active_unit_elem); + svc->sdom = dd; + svc->unit = unit; + svc->pri = is_idle_unit(unit) ? + CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; + SCHED_UNIT_STATS_RESET(svc); + SCHED_STAT_CRANK(unit_alloc); + return svc; +} + +static void +csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit *svc = unit->priv; + spinlock_t *lock; + + BUG_ON( is_idle_unit(unit) ); + + /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */ + lock = unit_schedule_lock_irq(unit); + + sched_set_res(unit, csched_res_pick(ops, unit)); + + spin_unlock_irq(lock); + + lock = unit_schedule_lock_irq(unit); + + if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running ) + runq_insert(svc); + + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_insert); +} + +static void +csched_free_udata(const struct scheduler *ops, void *priv) +{ + struct csched_unit *svc = priv; + + BUG_ON( !list_empty(&svc->runq_elem) ); + + xfree(svc); +} + +static void +csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_unit * const svc = CSCHED_UNIT(unit); + struct csched_dom * const sdom = svc->sdom; + + SCHED_STAT_CRANK(unit_remove); + + ASSERT(!__unit_on_runq(svc)); + + if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + SCHED_STAT_CRANK(unit_unpark); + sched_unit_unpause(svc->unit); + } + + spin_lock_irq(&prv->lock); + + if ( !list_empty(&svc->active_unit_elem) ) + __csched_unit_acct_stop_locked(prv, svc); + + spin_unlock_irq(&prv->lock); + + BUG_ON( sdom == NULL ); +} + +static void +csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + unsigned int cpu = sched_unit_master(unit); + struct sched_resource *sr = get_sched_res(cpu); + + SCHED_STAT_CRANK(unit_sleep); + + BUG_ON( is_idle_unit(unit) ); + + if ( curr_on_cpu(cpu) == unit ) + { + /* + * We are about to tickle cpu, so we should clear its bit in idlers. + * But, we are here because unit is going to sleep while running on cpu, + * so the bit must be zero already. + */ + ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + else if ( __unit_on_runq(svc) ) + runq_remove(svc); +} + +static void +csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + bool_t migrating; + + BUG_ON( is_idle_unit(unit) ); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + if ( unlikely(__unit_on_runq(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* + * We temporarily boost the priority of awaking UNITs! + * + * If this UNIT consumes a non negligible amount of CPU, it + * will eventually find itself in the credit accounting code + * path where its priority will be reset to normal. 
+ * + * If on the other hand the UNIT consumes little CPU and is + * blocking and awoken a lot (doing I/O for example), its + * priority will remain boosted, optimizing it's wake-to-run + * latencies. + * + * This allows wake-to-run latency sensitive UNITs to preempt + * more CPU resource intensive UNITs without impacting overall + * system fairness. + * + * There are two cases, when we don't want to boost: + * - UNITs that are waking up after a migration, rather than + * after having block; + * - UNITs of capped domains unpausing after earning credits + * they had overspent. + */ + migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); + + if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && + !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id, + unit->unit_id); + SCHED_STAT_CRANK(unit_boost); + svc->pri = CSCHED_PRI_TS_BOOST; + } + + /* Put the UNIT on the runq and tickle CPUs */ + runq_insert(svc); + __runq_tickle(svc); +} + +static void +csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + + /* Let the scheduler know that this vcpu is trying to yield */ + set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags); +} + +static int +csched_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct csched_dom * const sdom = CSCHED_DOM(d); + struct csched_private *prv = CSCHED_PRIV(ops); + unsigned long flags; + int rc = 0; + + /* Protect both get and put branches with the pluggable scheduler + * lock. Runq lock not needed anywhere in here. */ + spin_lock_irqsave(&prv->lock, flags); + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + op->u.credit.weight = sdom->weight; + op->u.credit.cap = sdom->cap; + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + if ( op->u.credit.weight != 0 ) + { + if ( !list_empty(&sdom->active_sdom_elem) ) + { + prv->weight -= sdom->weight * sdom->active_unit_count; + prv->weight += op->u.credit.weight * sdom->active_unit_count; + } + sdom->weight = op->u.credit.weight; + } + + if ( op->u.credit.cap != (uint16_t)~0U ) + sdom->cap = op->u.credit.cap; + break; + default: + rc = -EINVAL; + break; + } + + spin_unlock_irqrestore(&prv->lock, flags); + + return rc; +} + +static void +csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, + const cpumask_t *hard, const cpumask_t *soft) +{ + struct csched_unit *svc = CSCHED_UNIT(unit); + + if ( !hard ) + return; + + /* Are we becoming exclusively pinned? 
*/ + if ( cpumask_weight(hard) == 1 ) + set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); + else + clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); +} + +static inline void +__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms) +{ + prv->tslice = MILLISECS(timeslice_ms); + prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; + if ( timeslice_ms < prv->ticks_per_tslice ) + prv->ticks_per_tslice = 1; + prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice; + prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms; + prv->credit = prv->credits_per_tslice * prv->ncpus; +} + +static int +csched_sys_cntl(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + int rc = -EINVAL; + struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; + struct csched_private *prv = CSCHED_PRIV(ops); + unsigned long flags; + + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX + || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN + || (params->ratelimit_us + && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX + || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) + || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) + || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) + goto out; + + spin_lock_irqsave(&prv->lock, flags); + __csched_set_tslice(prv, params->tslice_ms); + if ( !prv->ratelimit && params->ratelimit_us ) + printk(XENLOG_INFO "Enabling context switch rate limiting\n"); + else if ( prv->ratelimit && !params->ratelimit_us ) + printk(XENLOG_INFO "Disabling context switch rate limiting\n"); + prv->ratelimit = MICROSECS(params->ratelimit_us); + prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us); + spin_unlock_irqrestore(&prv->lock, flags); + + /* FALLTHRU */ + case XEN_SYSCTL_SCHEDOP_getinfo: + params->tslice_ms = prv->tslice / MILLISECS(1); + params->ratelimit_us = prv->ratelimit / MICROSECS(1); + params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1); + rc = 0; + break; + } + out: + return rc; +} + +static void * +csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + struct csched_dom *sdom; + + sdom = xzalloc(struct csched_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + /* Initialize credit and weight */ + INIT_LIST_HEAD(&sdom->active_unit); + INIT_LIST_HEAD(&sdom->active_sdom_elem); + sdom->dom = dom; + sdom->weight = CSCHED_DEFAULT_WEIGHT; + + return sdom; +} + +static void +csched_free_domdata(const struct scheduler *ops, void *data) +{ + xfree(data); +} + +/* + * This is a O(n) optimized sort of the runq. + * + * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk + * through the runq and move up any UNDERs that are preceded by OVERS. We + * remember the last UNDER to make the move up operation O(1). 
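+ * For instance, a runq ordered U O U O U (U=UNDER, O=OVER) comes out as
+ * U U U O O, with the relative order within each class preserved.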
+ */ +static void +csched_runq_sort(struct csched_private *prv, unsigned int cpu) +{ + struct csched_pcpu * const spc = CSCHED_PCPU(cpu); + struct list_head *runq, *elem, *next, *last_under; + struct csched_unit *svc_elem; + spinlock_t *lock; + unsigned long flags; + int sort_epoch; + + sort_epoch = prv->runq_sort; + if ( sort_epoch == spc->runq_sort_last ) + return; + + spc->runq_sort_last = sort_epoch; + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + runq = &spc->runq; + elem = runq->next; + last_under = runq; + + while ( elem != runq ) + { + next = elem->next; + svc_elem = __runq_elem(elem); + + if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) + { + /* does elem need to move up the runq? */ + if ( elem->prev != last_under ) + { + list_del(elem); + list_add(elem, last_under); + } + last_under = elem; + } + + elem = next; + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); +} + +static void +csched_acct(void* dummy) +{ + struct csched_private *prv = dummy; + unsigned long flags; + struct list_head *iter_unit, *next_unit; + struct list_head *iter_sdom, *next_sdom; + struct csched_unit *svc; + struct csched_dom *sdom; + uint32_t credit_total; + uint32_t weight_total; + uint32_t weight_left; + uint32_t credit_fair; + uint32_t credit_peak; + uint32_t credit_cap; + int credit_balance; + int credit_xtra; + int credit; + + + spin_lock_irqsave(&prv->lock, flags); + + weight_total = prv->weight; + credit_total = prv->credit; + + /* Converge balance towards 0 when it drops negative */ + if ( prv->credit_balance < 0 ) + { + credit_total -= prv->credit_balance; + SCHED_STAT_CRANK(acct_balance); + } + + if ( unlikely(weight_total == 0) ) + { + prv->credit_balance = 0; + spin_unlock_irqrestore(&prv->lock, flags); + SCHED_STAT_CRANK(acct_no_work); + goto out; + } + + SCHED_STAT_CRANK(acct_run); + + weight_left = weight_total; + credit_balance = 0; + credit_xtra = 0; + credit_cap = 0U; + + list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) + { + sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); + + BUG_ON( is_idle_domain(sdom->dom) ); + BUG_ON( sdom->active_unit_count == 0 ); + BUG_ON( sdom->weight == 0 ); + BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left ); + + weight_left -= ( sdom->weight * sdom->active_unit_count ); + + /* + * A domain's fair share is computed using its weight in competition + * with that of all other active domains. + * + * At most, a domain can use credits to run all its active UNITs + * for one full accounting period. We allow a domain to earn more + * only when the system-wide credit balance is negative. + */ + credit_peak = sdom->active_unit_count * prv->credits_per_tslice; + if ( prv->credit_balance < 0 ) + { + credit_peak += ( ( -prv->credit_balance + * sdom->weight + * sdom->active_unit_count) + + (weight_total - 1) + ) / weight_total; + } + + if ( sdom->cap != 0U ) + { + credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; + if ( credit_cap < credit_peak ) + credit_peak = credit_cap; + + /* FIXME -- set cap per-unit as well...? 
*/ + credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 ) + ) / sdom->active_unit_count; + } + + credit_fair = ( ( credit_total + * sdom->weight + * sdom->active_unit_count ) + + (weight_total - 1) + ) / weight_total; + + if ( credit_fair < credit_peak ) + { + credit_xtra = 1; + } + else + { + if ( weight_left != 0U ) + { + /* Give other domains a chance at unused credits */ + credit_total += ( ( ( credit_fair - credit_peak + ) * weight_total + ) + ( weight_left - 1 ) + ) / weight_left; + } + + if ( credit_xtra ) + { + /* + * Lazily keep domains with extra credits at the head of + * the queue to give others a chance at them in future + * accounting periods. + */ + SCHED_STAT_CRANK(acct_reorder); + list_del(&sdom->active_sdom_elem); + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + } + + credit_fair = credit_peak; + } + + /* Compute fair share per UNIT */ + credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 ) + ) / sdom->active_unit_count; + + + list_for_each_safe( iter_unit, next_unit, &sdom->active_unit ) + { + svc = list_entry(iter_unit, struct csched_unit, active_unit_elem); + BUG_ON( sdom != svc->sdom ); + + /* Increment credit */ + atomic_add(credit_fair, &svc->credit); + credit = atomic_read(&svc->credit); + + /* + * Recompute priority or, if UNIT is idling, remove it from + * the active list. + */ + if ( credit < 0 ) + { + svc->pri = CSCHED_PRI_TS_OVER; + + /* Park running UNITs of capped-out domains */ + if ( sdom->cap != 0U && + credit < -credit_cap && + !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + SCHED_STAT_CRANK(unit_park); + sched_unit_pause_nosync(svc->unit); + } + + /* Lower bound on credits */ + if ( credit < -prv->credits_per_tslice ) + { + SCHED_STAT_CRANK(acct_min_credit); + credit = -prv->credits_per_tslice; + atomic_set(&svc->credit, credit); + } + } + else + { + svc->pri = CSCHED_PRI_TS_UNDER; + + /* Unpark any capped domains whose credits go positive */ + if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + /* + * It's important to unset the flag AFTER the unpause() + * call to make sure the UNIT's priority is not boosted + * if it is woken up here. + */ + SCHED_STAT_CRANK(unit_unpark); + sched_unit_unpause(svc->unit); + clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags); + } + + /* Upper bound on credits means UNIT stops earning */ + if ( credit > prv->credits_per_tslice ) + { + __csched_unit_acct_stop_locked(prv, svc); + /* Divide credits in half, so that when it starts + * accounting again, it starts a little bit "ahead" */ + credit /= 2; + atomic_set(&svc->credit, credit); + } + } + + SCHED_UNIT_STAT_SET(svc, credit_last, credit); + SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair); + credit_balance += credit; + } + } + + prv->credit_balance = credit_balance; + + spin_unlock_irqrestore(&prv->lock, flags); + + /* Inform each CPU that its runq needs to be sorted */ + prv->runq_sort++; + +out: + set_timer( &prv->master_ticker, NOW() + prv->tslice); +} + +static void +csched_tick(void *_cpu) +{ + unsigned int cpu = (unsigned long)_cpu; + struct sched_resource *sr = get_sched_res(cpu); + struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_private *prv = CSCHED_PRIV(sr->scheduler); + + spc->tick++; + + /* + * Accounting for running UNIT + */ + if ( !is_idle_unit(current->sched_unit) ) + csched_unit_acct(prv, cpu); + + /* + * Check if runq needs to be sorted + * + * Every physical CPU resorts the runq after the accounting master has + * modified priorities. 
This is a special O(n) sort and runs at most + * once per accounting period (currently 30 milliseconds). + */ + csched_runq_sort(prv, cpu); + + set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); +} + +static struct csched_unit * +csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) +{ + struct sched_resource *sr = get_sched_res(cpu); + const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler); + const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); + struct csched_unit *speer; + struct list_head *iter; + struct sched_unit *unit; + + ASSERT(peer_pcpu != NULL); + + /* + * Don't steal from an idle CPU's runq because it's about to + * pick up work from it itself. + */ + if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) ) + goto out; + + list_for_each( iter, &peer_pcpu->runq ) + { + speer = __runq_elem(iter); + + /* + * If next available UNIT here is not of strictly higher + * priority than ours, this PCPU is useless to us. + */ + if ( speer->pri <= pri ) + break; + + /* Is this UNIT runnable on our PCPU? */ + unit = speer->unit; + BUG_ON( is_idle_unit(unit) ); + + /* + * If the unit is still in peer_cpu's scheduling tail, or if it + * has no useful soft affinity, skip it. + * + * In fact, what we want is to check if we have any "soft-affine + * work" to steal, before starting to look at "hard-affine work". + * + * Notice that, if not even one unit on this runq has a useful + * soft affinity, we could have avoid considering this runq for + * a soft balancing step in the first place. This, for instance, + * can be implemented by taking note of on what runq there are + * units with useful soft affinities in some sort of bitmap + * or counter. + */ + if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(unit)) ) + continue; + + affinity_balance_cpumask(unit, balance_step, cpumask_scratch); + if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) ) + { + /* We got a candidate. Grab it! */ + TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu, + unit->domain->domain_id, unit->unit_id); + SCHED_UNIT_STAT_CRANK(speer, migrate_q); + SCHED_STAT_CRANK(migrate_queued); + runq_remove(speer); + sched_set_res(unit, get_sched_res(cpu)); + /* + * speer will start executing directly on cpu, without having to + * go through runq_insert(). So we must update the runnable count + * for cpu here. + */ + inc_nr_runnable(cpu); + return speer; + } + } + out: + SCHED_STAT_CRANK(steal_peer_idle); + return NULL; +} + +static struct csched_unit * +csched_load_balance(struct csched_private *prv, int cpu, + struct csched_unit *snext, bool *stolen) +{ + struct cpupool *c = get_sched_res(cpu)->cpupool; + struct csched_unit *speer; + cpumask_t workers; + cpumask_t *online = c->res_valid; + int peer_cpu, first_cpu, peer_node, bstep; + int node = cpu_to_node(cpu); + + BUG_ON(get_sched_res(cpu) != snext->unit->res); + + /* + * If this CPU is going offline, or is not (yet) part of any cpupool + * (as it happens, e.g., during cpu bringup), we shouldn't steal work. + */ + if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) ) + goto out; + + if ( snext->pri == CSCHED_PRI_IDLE ) + SCHED_STAT_CRANK(load_balance_idle); + else if ( snext->pri == CSCHED_PRI_TS_OVER ) + SCHED_STAT_CRANK(load_balance_over); + else + SCHED_STAT_CRANK(load_balance_other); + + /* + * Let's look around for work to steal, taking both hard affinity + * and soft affinity into account. More specifically, we check all + * the non-idle CPUs' runq, looking for: + * 1. 
any "soft-affine work" to steal first, + * 2. if not finding anything, any "hard-affine work" to steal. + */ + for_each_affinity_balance_step( bstep ) + { + /* + * We peek at the non-idling CPUs in a node-wise fashion. In fact, + * it is more likely that we find some affine work on our same + * node, not to mention that migrating units within the same node + * could well expected to be cheaper than across-nodes (memory + * stays local, there might be some node-wide cache[s], etc.). + */ + peer_node = node; + do + { + /* Select the pCPUs in this node that have work we can steal. */ + cpumask_andnot(&workers, online, prv->idlers); + cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); + __cpumask_clear_cpu(cpu, &workers); + + first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); + if ( first_cpu >= nr_cpu_ids ) + goto next_node; + peer_cpu = first_cpu; + do + { + spinlock_t *lock; + + /* + * If there is only one runnable unit on peer_cpu, it means + * there's no one to be stolen in its runqueue, so skip it. + * + * Checking this without holding the lock is racy... But that's + * the whole point of this optimization! + * + * In more details: + * - if we race with dec_nr_runnable(), we may try to take the + * lock and call csched_runq_steal() for no reason. This is + * not a functional issue, and should be infrequent enough. + * And we can avoid that by re-checking nr_runnable after + * having grabbed the lock, if we want; + * - if we race with inc_nr_runnable(), we skip a pCPU that may + * have runnable units in its runqueue, but that's not a + * problem because: + * + if racing with csched_unit_insert() or csched_unit_wake(), + * __runq_tickle() will be called afterwords, so the unit + * won't get stuck in the runqueue for too long; + * + if racing with csched_runq_steal(), it may be that an + * unit that we could have picked up, stays in a runqueue + * until someone else tries to steal it again. But this is + * no worse than what can happen already (without this + * optimization), it the pCPU would schedule right after we + * have taken the lock, and hence block on it. + */ + if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) + { + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); + goto next_cpu; + } + + /* + * Get ahold of the scheduler lock for this peer CPU. + * + * Note: We don't spin on this lock but simply try it. Spinning + * could cause a deadlock if the peer CPU is also load + * balancing and trying to lock this CPU. + */ + lock = pcpu_schedule_trylock(peer_cpu); + SCHED_STAT_CRANK(steal_trylock); + if ( !lock ) + { + SCHED_STAT_CRANK(steal_trylock_failed); + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); + goto next_cpu; + } + + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); + + /* Any work over there to steal? */ + speer = cpumask_test_cpu(peer_cpu, online) ? + csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; + pcpu_schedule_unlock(lock, peer_cpu); + + /* As soon as one unit is found, balancing ends */ + if ( speer != NULL ) + { + *stolen = true; + /* + * Next time we'll look for work to steal on this node, we + * will start from the next pCPU, with respect to this one, + * so we don't risk stealing always from the same ones. 
+ */ + prv->balance_bias[peer_node] = peer_cpu; + return speer; + } + + next_cpu: + peer_cpu = cpumask_cycle(peer_cpu, &workers); + + } while( peer_cpu != first_cpu ); + + next_node: + peer_node = cycle_node(peer_node, node_online_map); + } while( peer_node != node ); + } + + out: + /* Failed to find more important work elsewhere... */ + __runq_remove(snext); + return snext; +} + +/* + * This function is in the critical path. It is designed to be simple and + * fast for the common case. + */ +static void csched_schedule( + const struct scheduler *ops, struct sched_unit *unit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu); + struct list_head * const runq = RUNQ(sched_cpu); + struct csched_unit * const scurr = CSCHED_UNIT(unit); + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_unit *snext; + s_time_t runtime, tslice; + bool migrated = false; + + SCHED_STAT_CRANK(schedule); + CSCHED_UNIT_CHECK(unit); + + /* + * Here in Credit1 code, we usually just call TRACE_nD() helpers, and + * don't care about packing. But scheduling happens very often, so it + * actually is important that the record is as small as possible. + */ + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, tasklet:8, idle:8; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + d.idle = is_idle_unit(unit); + __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), + (unsigned char *)&d); + } + + runtime = now - unit->state_entry_time; + if ( runtime < 0 ) /* Does this ever happen? */ + runtime = 0; + + if ( !is_idle_unit(unit) ) + { + /* Update credits of a non-idle UNIT. */ + burn_credits(scurr, now); + scurr->start_time -= now; + scurr->last_sched_time = now; + } + else + { + /* Re-instate a boosted idle UNIT as normal-idle. */ + scurr->pri = CSCHED_PRI_IDLE; + } + + /* Choices, choices: + * - If we have a tasklet, we need to run the idle unit no matter what. + * - If sched rate limiting is in effect, and the current unit has + * run for less than that amount of time, continue the current one, + * but with a shorter timeslice and return it immediately + * - Otherwise, chose the one with the highest priority (which may + * be the one currently running) + * - If the currently running one is TS_OVER, see if there + * is a higher priority one waiting on the runqueue of another + * cpu and steal it. + */ + + /* + * If we have schedule rate limiting enabled, check to see + * how long we've run for. + * + * If scurr is yielding, however, we don't let rate limiting kick in. + * In fact, it may be the case that scurr is about to spin, and there's + * no point forcing it to do so until rate limiting expires. + */ + if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags) + && !tasklet_work_scheduled + && prv->ratelimit + && unit_runnable_state(unit) + && !is_idle_unit(unit) + && runtime < prv->ratelimit ) + { + snext = scurr; + snext->start_time += now; + perfc_incr(delay_ms); + /* + * Next timeslice must last just until we'll have executed for + * ratelimit. However, to avoid setting a really short timer, which + * will most likely be inaccurate and counterproductive, we never go + * below CSCHED_MIN_TIMER. 
+ */ + tslice = prv->ratelimit - runtime; + if ( unlikely(runtime < CSCHED_MIN_TIMER) ) + tslice = CSCHED_MIN_TIMER; + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned runtime; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.runtime = runtime; + __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), + (unsigned char *)&d); + } + + goto out; + } + tslice = prv->tslice; + + /* + * Select next runnable local UNIT (ie top of local runq) + */ + if ( unit_runnable(unit) ) + __runq_insert(scurr); + else + { + BUG_ON( is_idle_unit(unit) || list_empty(runq) ); + /* Current has blocked. Update the runnable counter for this cpu. */ + dec_nr_runnable(sched_cpu); + } + + /* + * Clear YIELD flag before scheduling out + */ + clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags); + + do { + snext = __runq_elem(runq->next); + + /* Tasklet work (which runs in idle UNIT context) overrides all else. */ + if ( tasklet_work_scheduled ) + { + TRACE_0D(TRC_CSCHED_SCHED_TASKLET); + snext = CSCHED_UNIT(sched_idle_unit(sched_cpu)); + snext->pri = CSCHED_PRI_TS_BOOST; + } + + /* + * SMP Load balance: + * + * If the next highest priority local runnable UNIT has already eaten + * through its credits, look on other PCPUs to see if we have more + * urgent work... If not, csched_load_balance() will return snext, but + * already removed from the runq. + */ + if ( snext->pri > CSCHED_PRI_TS_OVER ) + __runq_remove(snext); + else + snext = csched_load_balance(prv, sched_cpu, snext, &migrated); + + } while ( !unit_runnable_state(snext->unit) ); + + /* + * Update idlers mask if necessary. When we're idling, other CPUs + * will tickle us when they get extra work. + */ + if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) + { + if ( !cpumask_test_cpu(sched_cpu, prv->idlers) ) + cpumask_set_cpu(sched_cpu, prv->idlers); + } + else if ( cpumask_test_cpu(sched_cpu, prv->idlers) ) + { + cpumask_clear_cpu(sched_cpu, prv->idlers); + } + + if ( !is_idle_unit(snext->unit) ) + snext->start_time += now; + +out: + /* + * Return task to run next... + */ + unit->next_time = (is_idle_unit(snext->unit) ? + -1 : tslice); + unit->next_task = snext->unit; + snext->unit->migrated = migrated; + + /* Stop credit tick when going to idle, restart it when coming from idle. 
*/ + if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) ) + stop_timer(&spc->ticker); + if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) ) + set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) + - now % MICROSECS(prv->tick_period_us) ); + + CSCHED_UNIT_CHECK(unit->next_task); +} + +static void +csched_dump_unit(struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + + printk("[%i.%i] pri=%i flags=%x cpu=%i", + svc->unit->domain->domain_id, + svc->unit->unit_id, + svc->pri, + svc->flags, + sched_unit_master(svc->unit)); + + if ( sdom ) + { + printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), + sdom->weight, sdom->cap); +#ifdef CSCHED_STATS + printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", + svc->stats.credit_last, + svc->stats.credit_incr, + svc->stats.state_active, + svc->stats.state_idle, + svc->stats.migrate_q, + svc->stats.migrate_r, + svc->stats.kicked_away); +#endif + } + + printk("\n"); +} + +static void +csched_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct list_head *runq, *iter; + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc; + struct csched_unit *svc; + spinlock_t *lock; + unsigned long flags; + int loop; + + /* + * We need both locks: + * - csched_dump_unit() wants to access domains' scheduling + * parameters, which are protected by the private scheduler lock; + * - we scan through the runqueue, so we need the proper runqueue + * lock (the one of the runqueue of this cpu). + */ + spin_lock_irqsave(&prv->lock, flags); + lock = pcpu_schedule_lock(cpu); + + spc = CSCHED_PCPU(cpu); + runq = &spc->runq; + + printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n", + cpu, spc->nr_runnable, spc->runq_sort_last, + CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + + /* current UNIT (nothing to say if that's the idle unit). 
*/ + svc = CSCHED_UNIT(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + printk("\trun: "); + csched_dump_unit(svc); + } + + loop = 0; + list_for_each( iter, runq ) + { + svc = __runq_elem(iter); + if ( svc ) + { + printk("\t%3d: ", ++loop); + csched_dump_unit(svc); + } + } + + pcpu_schedule_unlock(lock, cpu); + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void +csched_dump(const struct scheduler *ops) +{ + struct list_head *iter_sdom, *iter_svc; + struct csched_private *prv = CSCHED_PRIV(ops); + int loop; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + printk("info:\n" + "\tncpus = %u\n" + "\tmaster = %u\n" + "\tcredit = %u\n" + "\tcredit balance = %d\n" + "\tweight = %u\n" + "\trunq_sort = %u\n" + "\tdefault-weight = %d\n" + "\ttslice = %"PRI_stime"ms\n" + "\tratelimit = %"PRI_stime"us\n" + "\tcredits per msec = %d\n" + "\tticks per tslice = %d\n" + "\tmigration delay = %"PRI_stime"us\n", + prv->ncpus, + prv->master, + prv->credit, + prv->credit_balance, + prv->weight, + prv->runq_sort, + CSCHED_DEFAULT_WEIGHT, + prv->tslice / MILLISECS(1), + prv->ratelimit / MICROSECS(1), + CSCHED_CREDITS_PER_MSEC, + prv->ticks_per_tslice, + prv->unit_migr_delay/ MICROSECS(1)); + + printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers)); + + printk("active units:\n"); + loop = 0; + list_for_each( iter_sdom, &prv->active_sdom ) + { + struct csched_dom *sdom; + sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); + + list_for_each( iter_svc, &sdom->active_unit ) + { + struct csched_unit *svc; + spinlock_t *lock; + + svc = list_entry(iter_svc, struct csched_unit, active_unit_elem); + lock = unit_schedule_lock(svc->unit); + + printk("\t%3d: ", ++loop); + csched_dump_unit(svc); + + unit_schedule_unlock(lock, svc->unit); + } + } + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static int __init +csched_global_init(void) +{ + if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || + sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) + { + printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" + " Resetting to default %u\n", + XEN_SYSCTL_CSCHED_TSLICE_MIN, + XEN_SYSCTL_CSCHED_TSLICE_MAX, + CSCHED_DEFAULT_TSLICE_MS); + sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; + } + + if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) + printk("WARNING: sched_ratelimit_us >" + "sched_credit_tslice_ms is undefined\n" + "Setting ratelimit to tslice\n"); + + if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) + { + vcpu_migration_delay_us = 0; + printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n" + "Resetting to default: %u\n", + XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us); + } + + return 0; +} + +static int +csched_init(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = xzalloc(struct csched_private); + if ( prv == NULL ) + return -ENOMEM; + + prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); + if ( prv->balance_bias == NULL ) + { + xfree(prv); + return -ENOMEM; + } + + if ( !zalloc_cpumask_var(&prv->cpus) || + !zalloc_cpumask_var(&prv->idlers) ) + { + free_cpumask_var(prv->cpus); + xfree(prv->balance_bias); + xfree(prv); + return -ENOMEM; + } + + ops->sched_data = prv; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->active_sdom); + prv->master = UINT_MAX; + + __csched_set_tslice(prv, sched_credit_tslice_ms); + + if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) + prv->ratelimit = prv->tslice; + else + 
prv->ratelimit = MICROSECS(sched_ratelimit_us); + + prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us); + + return 0; +} + +static void +csched_deinit(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( prv != NULL ) + { + ops->sched_data = NULL; + free_cpumask_var(prv->cpus); + free_cpumask_var(prv->idlers); + xfree(prv->balance_bias); + xfree(prv); + } +} + +static const struct scheduler sched_credit_def = { + .name = "SMP Credit Scheduler", + .opt_name = "credit", + .sched_id = XEN_SCHEDULER_CREDIT, + .sched_data = NULL, + + .global_init = csched_global_init, + + .insert_unit = csched_unit_insert, + .remove_unit = csched_unit_remove, + + .sleep = csched_unit_sleep, + .wake = csched_unit_wake, + .yield = csched_unit_yield, + + .adjust = csched_dom_cntl, + .adjust_affinity= csched_aff_cntl, + .adjust_global = csched_sys_cntl, + + .pick_resource = csched_res_pick, + .do_schedule = csched_schedule, + + .dump_cpu_state = csched_dump_pcpu, + .dump_settings = csched_dump, + .init = csched_init, + .deinit = csched_deinit, + .alloc_udata = csched_alloc_udata, + .free_udata = csched_free_udata, + .alloc_pdata = csched_alloc_pdata, + .init_pdata = csched_init_pdata, + .deinit_pdata = csched_deinit_pdata, + .free_pdata = csched_free_pdata, + .switch_sched = csched_switch_sched, + .alloc_domdata = csched_alloc_domdata, + .free_domdata = csched_free_domdata, +}; + +REGISTER_SCHEDULER(sched_credit_def); diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c new file mode 100644 index 0000000000..f7c477053c --- /dev/null +++ b/xen/common/sched/credit2.c @@ -0,0 +1,4122 @@ + +/**************************************************************************** + * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd + **************************************************************************** + * + * File: common/sched_credit2.c + * Author: George Dunlap + * + * Description: Credit-based SMP CPU scheduler + * Based on an earlier verson by Emmanuel Ackaouy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Meant only for helping developers during debugging. */ +/* #define d2printk printk */ +#define d2printk(x...) + + +/* + * Credit2 tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. 
+ */ +#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) +#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) +#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) +#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) +#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) +#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) +#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) +#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) +#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) +#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) +#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) +#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) +#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) +#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) +#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) +#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) +#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) +#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) +#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) +#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) +#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) +#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) + +/* + * TODO: + * + Hyperthreading + * - "Discount" time run on a thread with busy siblings + * + Algorithm: + * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., + * a flash animation in the background) can we schedule it with low enough latency + * so that audio doesn't skip? + * + Optimizing + * - Profiling, making new algorithms, making math more efficient (no long division) + */ + +/* + * Design: + * + * VMs "burn" credits based on their weight; higher weight means + * credits burn more slowly. The highest weight unit burns credits at + * a rate of 1 credit per nanosecond. Others burn proportionally + * more. + * + * units are inserted into the runqueue by credit order. + * + * Credits are "reset" when the next unit in the runqueue is less than + * or equal to zero. At that point, everyone's credits are "clipped" + * to a small value, and a fixed credit is added to everyone. + */ + +/* + * Utilization cap: + * + * Setting an pCPU utilization cap for a domain means the following: + * + * - a domain can have a cap, expressed in terms of % of physical CPU time. + * A domain that must not use more than 1/4 of _one_ physical CPU, will + * be given a cap of 25%; a domain that must not use more than 1+1/2 of + * physical CPU time, will be given a cap of 150%; + * + * - caps are per-domain (not per-unit). If a domain has only 1 unit, and + * a 40% cap, that one unit will use 40% of one pCPU. If a somain has 4 + * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be + * split among the v units. How much each of the units will actually get, + * during any given interval of time, is unspecified (as it depends on + * various aspects: workload, system load, etc.). For instance, it is + * possible that, during a given time interval, 2 units use 100% each, + * and the other two use nothing; while during another time interval, + * two units use 80%, one uses 10% and the other 30%; or that each use + * 50% (and so on and so forth). 
+ *
+ * For implementing this, we use the following approach:
+ *
+ * - each domain is given a 'budget', and each domain has a timer, which
+ * replenishes the domain's budget periodically. The budget is the amount
+ * of time the units of the domain can use every 'period';
+ *
+ * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
+ * (but each domain has its own timer; so they are all periodic by the same
+ * period, but replenishments of the budgets of the various domains, at
+ * period boundaries, are not synchronous);
+ *
+ * - when units run, they consume budget. When they don't run, they don't
+ * consume budget. If there is no budget left for the domain, no unit of
+ * that domain can run. If an unit tries to run and finds that there is no
+ * budget, it blocks.
+ * At whatever time an unit wants to run, it must check the domain's budget,
+ * and if there is some, it can use it.
+ *
+ * - budget is replenished to the top of the capacity for the domain once
+ * per period. Even if there was some leftover budget from the previous
+ * period, though, the budget after a replenishment will always be at most
+ * equal to the total capacity of the domain ('tot_budget');
+ *
+ * - when a budget replenishment occurs, if there are units that had been
+ * blocked because of lack of budget, they'll be unblocked, and they will
+ * (potentially) be able to run again.
+ *
+ * Finally, some even more implementation-related details:
+ *
+ * - budget is stored in a domain-wide pool. Units of the domain that want
+ * to run go to such pool, and grab some. When they do so, the amount
+ * they grabbed is _immediately_ removed from the pool. This happens in
+ * unit_grab_budget();
+ *
+ * - when units stop running, if they've not consumed all the budget they
+ * took, the leftover is put back in the pool. This happens in
+ * unit_return_budget();
+ *
+ * - the above means that an unit can find out that there is no budget and
+ * block, not only if the cap has actually been reached (for this period),
+ * but also if some other units, in order to run, have grabbed a certain
+ * quota of budget, no matter whether they've already used it all or not.
+ * An unit blocking because (any form of) lack of budget is said to be
+ * "parked", and such blocking happens in park_unit();
+ *
+ * - when an unit stops running, and puts back some budget in the domain pool,
+ * we need to check whether there is someone that has been parked and that
+ * can be unparked. This happens in unpark_parked_units(), called from
+ * csched2_context_saved();
+ *
+ * - of course, unparking happens also as a consequence of the domain's budget
+ * being replenished by the periodic timer. This also occurs by means of
+ * calling csched2_context_saved() (but from replenish_domain_budget());
+ *
+ * - parked units of a domain are kept in a (per-domain) list, called
+ * 'parked_units'. Manipulation of the list and of the domain-wide budget
+ * pool must occur only when holding the 'budget_lock'.
+ */
+
+/*
+ * Locking:
+ *
+ * - runqueue lock
+ * + it is per-runqueue, so:
+ * * cpus in a runqueue take the runqueue lock, when using
+ * pcpu_schedule_lock() / unit_schedule_lock() (and friends),
+ * * a cpu may (try to) take a "remote" runqueue lock, e.g., for
+ * load balancing;
+ * + serializes runqueue operations (removing and inserting units);
+ * + protects runqueue-wide data in csched2_runqueue_data;
+ * + protects unit parameters in csched2_unit for the unit in the
+ * runqueue.
+ * + * - Private scheduler lock + * + protects scheduler-wide data in csched2_private, such as: + * * the list of domains active in this scheduler, + * * what cpus and what runqueues are active and in what + * runqueue each cpu is; + * + serializes the operation of changing the weights of domains; + * + * - Budget lock + * + it is per-domain; + * + protects, in domains that have an utilization cap; + * * manipulation of the total budget of the domain (as it is shared + * among all units of the domain), + * * manipulation of the list of units that are blocked waiting for + * some budget to be available. + * + * - Type: + * + runqueue locks are 'regular' spinlocks; + * + the private scheduler lock can be an rwlock. In fact, data + * it protects is modified only during initialization, cpupool + * manipulation and when changing weights, and read in all + * other cases (e.g., during load balancing); + * + budget locks are 'regular' spinlocks. + * + * Ordering: + * + tylock must be used when wanting to take a runqueue lock, + * if we already hold another one; + * + if taking both a runqueue lock and the private scheduler + * lock is, the latter must always be taken for first; + * + if taking both a runqueue lock and a budget lock, the former + * must always be taken for first. + */ + +/* + * Basic constants + */ +/* Default weight: How much a new domain starts with. */ +#define CSCHED2_DEFAULT_WEIGHT 256 +/* + * Min timer: Minimum length a timer will be set, to + * achieve efficiency. + */ +#define CSCHED2_MIN_TIMER MICROSECS(500) +/* + * Amount of credit VMs begin with, and are reset to. + * ATM, set so that highest-weight VMs can only run for 10ms + * before a reset event. + */ +#define CSCHED2_CREDIT_INIT MILLISECS(10) +/* + * Amount of credit the idle units have. It never changes, as idle + * units does not consume credits, and it must be lower than whatever + * amount of credit 'regular' unit would end up with. + */ +#define CSCHED2_IDLE_CREDIT (-(1U<<30)) +/* + * Carryover: How much "extra" credit may be carried over after + * a reset. + */ +#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER +/* + * Stickiness: Cross-L2 migration resistance. Should be less than + * MIN_TIMER. + */ +#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) +/* How much to "compensate" an unit for L2 migration. */ +#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50) +/* How tolerant we should be when peeking at runtime of units on other cpus */ +#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50) +/* Reset: Value below which credit will be reset. */ +#define CSCHED2_CREDIT_RESET 0 +/* Max timer: Maximum time a guest can be run for. */ +#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT +/* Period of the cap replenishment timer. */ +#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) + +/* + * Flags + */ +/* + * CSFLAG_scheduled: Is this unit either running on, or context-switching off, + * a physical cpu? + * + Accessed only with runqueue lock held + * + Set when chosen as next in csched2_schedule(). + * + Cleared after context switch has been saved in csched2_context_saved() + * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should + * set CSFLAG_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __CSFLAG_scheduled 1 +#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) +/* + * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done + * being context switched out? 
+ * + Set when scheduling out in csched2_schedule() if prev is runnable + * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set + * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and + * clears the bit. + */ +#define __CSFLAG_delayed_runq_add 2 +#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) +/* + * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a + * credit2-initiated runq migrate request; migrate it to the runqueue indicated + * in the svc struct. + */ +#define __CSFLAG_runq_migrate_request 3 +#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) +/* + * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The + * scheduler is invoked to see if we can give the cpu to someone else, and + * get back to the yielding unit in a while. + */ +#define __CSFLAG_unit_yield 4 +#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield) +/* + * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard + * affinity set to one and only 1 cpu (and, hence, can only run there). + */ +#define __CSFLAG_pinned 5 +#define CSFLAG_pinned (1U<<__CSFLAG_pinned) + +static unsigned int __read_mostly opt_migrate_resist = 500; +integer_param("sched_credit2_migrate_resist", opt_migrate_resist); + +/* + * Load tracking and load balancing + * + * Load history of runqueues and units is accounted for by using an + * exponential weighted moving average algorithm. However, instead of using + * fractions,we shift everything to left by the number of bits we want to + * use for representing the fractional part (Q-format). + * + * We may also want to reduce the precision of time accounting, to + * accommodate 'longer windows'. So, if that is the case, we just need to + * shift all time samples to the right. + * + * The details of the formulas used for load tracking are explained close to + * update_runq_load(). Let's just say here that, with full nanosecond time + * granularity, a 30 bits wide 'decaying window' is ~1 second long. + * + * We want to consider the following equations: + * + * avg[0] = load*P + * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W + * + * where W is the length of the window, P the multiplier for transitiong into + * Q-format fixed point arithmetic and load is the instantaneous load of a + * runqueue, which basically is the number of runnable units there are on the + * runqueue (for the meaning of the other terms, look at the doc comment to + * update_runq_load()). + * + * So, again, with full nanosecond granularity, and 1 second window, we have: + * + * W = 2^30 + * P = 2^18 + * + * The maximum possible value for the average load, which we want to store in + * s_time_t type variables (i.e., we have 63 bits available) is load*P. This + * means that, with P 18 bits wide, load can occupy 45 bits. This in turn + * means we can have 2^45 units in each runqueue, before overflow occurs! + * + * However, it can happen that, at step j+1, if: + * + * avg[j] = load*P + * delta = W + * + * then: + * + * avg[j+i] = avg[j] + W*load*P/W - W*load*P/W + * + * So we must be able to deal with W*load*P. This means load can't be higher + * than: + * + * 2^(63 - 30 - 18) = 2^15 = 32768 + * + * So 32768 is the maximum number of units the we can have in a runqueue, + * at any given time, and still not have problems with the load tracking + * calculations... and this is more than fine. + * + * As a matter of fact, since we are using microseconds granularity, we have + * W=2^20. 
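+ * (For intuition, the recurrence above can be rewritten as
+ * avg[i+1] = avg[i] + (delta/W) * (load*P - avg[i]), i.e., each sample
+ * moves the stored average a fraction delta/W of the way toward the
+ * instantaneous load*P.)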
So, still with 18 fractional bits and a 1 second long window, there + * may be 2^25 = 33554432 units in a runq before we have to start thinking + * about overflow. + */ + +/* If >0, decreases the granularity of time samples used for load tracking. */ +#define LOADAVG_GRANULARITY_SHIFT (10) +/* Time window during which we still give value to previous load history. */ +#define LOADAVG_WINDOW_SHIFT (30) +/* 18 bits by default (and not less than 4) for decimals. */ +#define LOADAVG_PRECISION_SHIFT (18) +#define LOADAVG_PRECISION_SHIFT_MIN (4) + +/* + * Both the length of the window and the number of fractional bits can be + * decided with boot parameters. + * + * The length of the window is always expressed in nanoseconds. The actual + * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. + */ +static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; +integer_param("credit2_load_window_shift", opt_load_window_shift); +static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; +integer_param("credit2_load_precision_shift", opt_load_precision_shift); + +static int __read_mostly opt_underload_balance_tolerance = 0; +integer_param("credit2_balance_under", opt_underload_balance_tolerance); +static int __read_mostly opt_overload_balance_tolerance = -3; +integer_param("credit2_balance_over", opt_overload_balance_tolerance); +/* + * Domains subject to a cap receive a replenishment of their runtime budget + * once every opt_cap_period interval. Default is 10 ms. The amount of budget + * they receive depends on their cap. For instance, a domain with a 50% cap + * will receive 50% of 10 ms, so 5 ms. + */ +static unsigned int __read_mostly opt_cap_period = 10; /* ms */ +integer_param("credit2_cap_period_ms", opt_cap_period); + +/* + * Runqueue organization. + * + * The various cpus are to be assigned each one to a runqueue, and we + * want that to happen basing on topology. At the moment, it is possible + * to choose to arrange runqueues to be: + * + * - per-cpu: meaning that there will be one runqueue per logical cpu. This + * will happen when if the opt_runqueue parameter is set to 'cpu'. + * + * - per-core: meaning that there will be one runqueue per each physical + * core of the host. This will happen if the opt_runqueue + * parameter is set to 'core'; + * + * - per-socket: meaning that there will be one runqueue per each physical + * socket (AKA package, which often, but not always, also + * matches a NUMA node) of the host; This will happen if + * the opt_runqueue parameter is set to 'socket'; + * + * - per-node: meaning that there will be one runqueue per each physical + * NUMA node of the host. This will happen if the opt_runqueue + * parameter is set to 'node'; + * + * - global: meaning that there will be only one runqueue to which all the + * (logical) processors of the host belong. This will happen if + * the opt_runqueue parameter is set to 'all'. + * + * Depending on the value of opt_runqueue, therefore, cpus that are part of + * either the same physical core, the same physical socket, the same NUMA + * node, or just all of them, will be put together to form runqueues. 
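These arrangements are selected at boot via the credit2_runqueue= parameter, parsed just below. As a minimal standalone model of the grouping rule (fake topology structs and hypothetical names; the real matching is done by cpu_to_runqueue() further down):

    #include <stdbool.h>
    #include <stdio.h>

    enum { RQ_CPU, RQ_CORE, RQ_SOCKET, RQ_NODE, RQ_ALL };

    struct topo { unsigned int core, socket, node; };

    /* Do two distinct cpus share a runqueue under the given policy? */
    static bool same_runqueue(int policy, struct topo a, struct topo b)
    {
        switch ( policy )
        {
        case RQ_CPU:    return false;                 /* one runqueue per cpu   */
        case RQ_CORE:   return a.socket == b.socket && a.core == b.core;
        case RQ_SOCKET: return a.socket == b.socket;
        case RQ_NODE:   return a.node == b.node;
        case RQ_ALL:    return true;                  /* single global runqueue */
        }
        return false;
    }

    int main(void)
    {
        struct topo cpu0 = { .core = 0, .socket = 0, .node = 0 };
        struct topo cpu1 = { .core = 1, .socket = 0, .node = 0 };

        /* Same socket, different core: grouped for socket/node/all, not core. */
        printf("%d %d\n", same_runqueue(RQ_CORE, cpu0, cpu1),
               same_runqueue(RQ_SOCKET, cpu0, cpu1));
        return 0;
    }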
+ */ +#define OPT_RUNQUEUE_CPU 0 +#define OPT_RUNQUEUE_CORE 1 +#define OPT_RUNQUEUE_SOCKET 2 +#define OPT_RUNQUEUE_NODE 3 +#define OPT_RUNQUEUE_ALL 4 +static const char *const opt_runqueue_str[] = { + [OPT_RUNQUEUE_CPU] = "cpu", + [OPT_RUNQUEUE_CORE] = "core", + [OPT_RUNQUEUE_SOCKET] = "socket", + [OPT_RUNQUEUE_NODE] = "node", + [OPT_RUNQUEUE_ALL] = "all" +}; +static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; + +static int __init parse_credit2_runqueue(const char *s) +{ + unsigned int i; + + for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) + { + if ( !strcmp(s, opt_runqueue_str[i]) ) + { + opt_runqueue = i; + return 0; + } + } + + return -EINVAL; +} +custom_param("credit2_runqueue", parse_credit2_runqueue); + +/* + * Per-runqueue data + */ +struct csched2_runqueue_data { + spinlock_t lock; /* Lock for this runqueue */ + + struct list_head runq; /* Ordered list of runnable vms */ + unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */ + int id; /* ID of this runqueue (-1 if invalid) */ + + int load; /* Instantaneous load (num of non-idle units) */ + s_time_t load_last_update; /* Last time average was updated */ + s_time_t avgload; /* Decaying queue load */ + s_time_t b_avgload; /* Decaying queue load modified by balancing */ + + cpumask_t active, /* CPUs enabled for this runqueue */ + smt_idle, /* Fully idle-and-untickled cores (see below) */ + tickled, /* Have been asked to go through schedule */ + idle; /* Currently idle pcpus */ + + struct list_head svc; /* List of all units assigned to the runqueue */ + unsigned int max_weight; /* Max weight of the units in this runqueue */ + unsigned int pick_bias; /* Last picked pcpu. Start from it next time */ +}; + +/* + * System-wide private data + */ +struct csched2_private { + rwlock_t lock; /* Private scheduler lock */ + + unsigned int load_precision_shift; /* Precision of load calculations */ + unsigned int load_window_shift; /* Lenght of load decaying window */ + unsigned int ratelimit_us; /* Rate limiting for this scheduler */ + + cpumask_t active_queues; /* Runqueues with (maybe) active cpus */ + struct csched2_runqueue_data *rqd; /* Data of the various runqueues */ + + cpumask_t initialized; /* CPUs part of this scheduler */ + struct list_head sdom; /* List of domains (for debug key) */ +}; + +/* + * Physical CPU + */ +struct csched2_pcpu { + cpumask_t sibling_mask; /* Siblings in the same runqueue */ + int runq_id; +}; + +/* + * Schedule Unit + */ +struct csched2_unit { + struct csched2_dom *sdom; /* Up-pointer to domain */ + struct sched_unit *unit; /* Up-pointer, to schedule unit */ + struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ + + int credit; /* Current amount of credit */ + unsigned int weight; /* Weight of this unit */ + unsigned int residual; /* Reminder of div(max_weight/weight) */ + unsigned flags; /* Status flags (16 bits would be ok, */ + s_time_t budget; /* Current budget (if domains has cap) */ + /* but clear_bit() does not like that) */ + s_time_t budget_quota; /* Budget to which unit is entitled */ + + s_time_t start_time; /* Time we were scheduled (for credit) */ + + /* Individual contribution to load */ + s_time_t load_last_update; /* Last time average was updated */ + s_time_t avgload; /* Decaying queue load */ + + struct list_head runq_elem; /* On the runqueue (rqd->runq) */ + struct list_head parked_elem; /* On the parked_units list */ + struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ + struct csched2_runqueue_data *migrate_rqd; /* Pre-determined 
migr. target */ + int tickled_cpu; /* Cpu that will pick us (-1 if none) */ +}; + +/* + * Domain + */ +struct csched2_dom { + struct domain *dom; /* Up-pointer to domain */ + + spinlock_t budget_lock; /* Serialized budget calculations */ + s_time_t tot_budget; /* Total amount of budget */ + s_time_t budget; /* Currently available budget */ + + struct timer repl_timer; /* Timer for periodic replenishment of budget */ + s_time_t next_repl; /* Time at which next replenishment occurs */ + struct list_head parked_units; /* List of CPUs waiting for budget */ + + struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */ + uint16_t weight; /* User specified weight */ + uint16_t cap; /* User specified cap */ + uint16_t nr_units; /* Number of units of this domain */ +}; + +/* + * Accessor helpers functions. + */ +static inline struct csched2_private *csched2_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu) +{ + return get_sched_res(cpu)->sched_priv; +} + +static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline struct csched2_dom *csched2_dom(const struct domain *d) +{ + return d->sched_priv; +} + +/* CPU to runq_id macro */ +static inline int c2r(unsigned int cpu) +{ + return csched2_pcpu(cpu)->runq_id; +} + +/* CPU to runqueue struct macro */ +static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops, + unsigned int cpu) +{ + return &csched2_priv(ops)->rqd[c2r(cpu)]; +} + +/* Does the domain of this unit have a cap? */ +static inline bool has_cap(const struct csched2_unit *svc) +{ + return svc->budget != STIME_MAX; +} + +/* + * Hyperthreading (SMT) support. + * + * We use a special per-runq mask (smt_idle) and update it according to the + * following logic: + * - when _all_ the SMT sibling in a core are idle, all their corresponding + * bits are set in the smt_idle mask; + * - when even _just_one_ of the SMT siblings in a core is not idle, all the + * bits correspondings to it and to all its siblings are clear in the + * smt_idle mask. + * + * Once we have such a mask, it is easy to implement a policy that, either: + * - uses fully idle cores first: it is enough to try to schedule the units + * on pcpus from smt_idle mask first. This is what happens if + * sched_smt_power_savings was not set at boot (default), and it maximizes + * true parallelism, and hence performance; + * - uses already busy cores first: it is enough to try to schedule the units + * on pcpus that are idle, but are not in smt_idle. This is what happens if + * sched_smt_power_savings is set at boot, and it allows as more cores as + * possible to stay in low power states, minimizing power consumption. + * + * This logic is entirely implemented in runq_tickle(), and that is enough. + * In fact, in this scheduler, placement of an unit on one of the pcpus of a + * runq, _always_ happens by means of tickling: + * - when an unit wakes up, it calls csched2_unit_wake(), which calls + * runq_tickle(); + * - when a migration is initiated in schedule.c, we call csched2_res_pick(), + * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake(). + * csched2_res_pick() looks for the least loaded runq and return just any + * of its processors. 
Then, csched2_unit_migrate() just moves the unit to + * the chosen runq, and it is again runq_tickle(), called by + * csched2_unit_wake() that actually decides what pcpu to use within the + * chosen runq; + * - when a migration is initiated in sched_credit2.c, by calling migrate() + * directly, that again temporarily use a random pcpu from the new runq, + * and then calls runq_tickle(), by itself. + */ + +/* + * If all the siblings of cpu (including cpu itself) are both idle and + * untickled, set all their bits in mask. + * + * NB that rqd->smt_idle is different than rqd->idle. rqd->idle + * records pcpus that at are merely idle (i.e., at the moment do not + * have an unit running on them). But you have to manually filter out + * which pcpus have been tickled in order to find cores that are not + * going to be busy soon. Filtering out tickled cpus pairwise is a + * lot of extra pain; so for rqd->smt_idle, we explicitly make so that + * the bits of a pcpu are set only if all the threads on its core are + * both idle *and* untickled. + * + * This means changing the mask when either rqd->idle or rqd->tickled + * changes. + */ +static inline +void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, + cpumask_t *mask) +{ + const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; + + if ( cpumask_subset(cpu_siblings, idlers) ) + cpumask_or(mask, mask, cpu_siblings); +} + +/* + * Clear the bits of all the siblings of cpu from mask (if necessary). + */ +static inline +void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) +{ + const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; + + if ( cpumask_subset(cpu_siblings, mask) ) + cpumask_andnot(mask, mask, cpu_siblings); +} + +/* + * In csched2_res_pick(), it may not be possible to actually look at remote + * runqueues (the trylock-s on their spinlocks can fail!). If that happens, + * we pick, in order of decreasing preference: + * 1) svc's current pcpu, if it is part of svc's soft affinity; + * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; + * 3) svc's current pcpu, if it is part of svc's hard affinity; + * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; + * 5) just one valid pcpu from svc's hard affinity + * + * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also + * note that at least 5 is guaranteed to _always_ return at least one pcpu. + */ +static int get_fallback_cpu(struct csched2_unit *svc) +{ + struct sched_unit *unit = svc->unit; + unsigned int bs; + + SCHED_STAT_CRANK(need_fallback_cpu); + + for_each_affinity_balance_step( bs ) + { + int cpu = sched_unit_master(unit); + + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + cpupool_domain_master_cpumask(unit->domain)); + + /* + * This is cases 1 or 3 (depending on bs): if processor is (still) + * in our affinity, go for it, for cache betterness. + */ + if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) + return cpu; + + /* + * This is cases 2 or 4 (depending on bs): v->processor isn't there + * any longer, check if we at least can stay in our current runq. 
+ */ + if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), + &svc->rqd->active)) ) + { + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &svc->rqd->active); + return cpumask_first(cpumask_scratch_cpu(cpu)); + } + + /* + * We may well pick any valid pcpu from our soft-affinity, outside + * of our current runqueue, but we decide not to. In fact, changing + * runqueue is slow, affects load distribution, and is a source of + * overhead for the units running on the other runqueue (we need the + * lock). So, better do that as a consequence of a well informed + * decision (or if we really don't have any other chance, as we will, + * at step 5, if we get to there). + * + * Also, being here, looking for a fallback, is an unfortunate and + * infrequent event, while the decision of putting us in the runqueue + * wehere we are was (likely) made taking all the relevant factors + * into account. So let's not disrupt that, just for the sake of + * soft-affinity, and let's wait here to be able to made (hopefully, + * soon), another similar well informed decision. + */ + if ( bs == BALANCE_SOFT_AFFINITY ) + continue; + + /* + * This is cases 5: last stand, just one valid pcpu from our hard + * affinity. It's guaranteed that there is at least one valid cpu, + * and therefore we are sure that we return it, and never really + * exit the loop. + */ + ASSERT(bs == BALANCE_HARD_AFFINITY && + !cpumask_empty(cpumask_scratch_cpu(cpu))); + cpu = cpumask_first(cpumask_scratch_cpu(cpu)); + if ( likely(cpu < nr_cpu_ids) ) + return cpu; + } + ASSERT_UNREACHABLE(); + /* + * We can't be here. But if that somehow happen (in non-debug builds), + * at least return something which both online and in our hard-affinity. + */ + return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit))); +} + +/* + * Time-to-credit, credit-to-time. + * + * We keep track of the "residual" time to make sure that frequent short + * schedules still get accounted for in the end. + * + * FIXME: Do pre-calculated division? + */ +static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time, + struct csched2_unit *svc) +{ + uint64_t val = time * rqd->max_weight + svc->residual; + + svc->residual = do_div(val, svc->weight); + svc->credit -= val; +} + +static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc) +{ + return credit * svc->weight / rqd->max_weight; +} + +/* + * Runqueue related code. 
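Before moving on to the runqueue code, a standalone sketch of the conversion implemented by t2c_update() and c2t() just above: credit burns at a rate of max_weight/weight, so a unit with half of its runqueue's maximum weight consumes credit twice as fast. Residual handling is omitted and the function names are made up; illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    static int64_t time_to_credit(int64_t time, unsigned int max_weight,
                                  unsigned int weight)
    {
        return time * max_weight / weight;
    }

    static int64_t credit_to_time(int64_t credit, unsigned int max_weight,
                                  unsigned int weight)
    {
        return credit * weight / max_weight;
    }

    int main(void)
    {
        /* 1ms of runtime, on a runqueue whose heaviest unit has weight 256. */
        printf("%lld\n", (long long)time_to_credit(1000000, 256, 256)); /* 1000000 */
        printf("%lld\n", (long long)time_to_credit(1000000, 256, 128)); /* 2000000 */
        /* Conversely, 1ms worth of credit lasts half as long at weight 128. */
        printf("%lld\n", (long long)credit_to_time(1000000, 256, 128)); /*  500000 */
        return 0;
    }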
+ */ + +static inline int unit_on_runq(struct csched2_unit *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static inline struct csched2_unit * runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched2_unit, runq_elem); +} + +static void activate_runqueue(struct csched2_private *prv, int rqi) +{ + struct csched2_runqueue_data *rqd; + + rqd = prv->rqd + rqi; + + BUG_ON(!cpumask_empty(&rqd->active)); + + rqd->max_weight = 1; + rqd->id = rqi; + INIT_LIST_HEAD(&rqd->svc); + INIT_LIST_HEAD(&rqd->runq); + spin_lock_init(&rqd->lock); + + __cpumask_set_cpu(rqi, &prv->active_queues); +} + +static void deactivate_runqueue(struct csched2_private *prv, int rqi) +{ + struct csched2_runqueue_data *rqd; + + rqd = prv->rqd + rqi; + + BUG_ON(!cpumask_empty(&rqd->active)); + + rqd->id = -1; + + __cpumask_clear_cpu(rqi, &prv->active_queues); +} + +static inline bool same_node(unsigned int cpua, unsigned int cpub) +{ + return cpu_to_node(cpua) == cpu_to_node(cpub); +} + +static inline bool same_socket(unsigned int cpua, unsigned int cpub) +{ + return cpu_to_socket(cpua) == cpu_to_socket(cpub); +} + +static inline bool same_core(unsigned int cpua, unsigned int cpub) +{ + return same_socket(cpua, cpub) && + cpu_to_core(cpua) == cpu_to_core(cpub); +} + +static unsigned int +cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu) +{ + struct csched2_runqueue_data *rqd; + unsigned int rqi; + + for ( rqi = 0; rqi < nr_cpu_ids; rqi++ ) + { + unsigned int peer_cpu; + + /* + * As soon as we come across an uninitialized runqueue, use it. + * In fact, either: + * - we are initializing the first cpu, and we assign it to + * runqueue 0. This is handy, especially if we are dealing + * with the boot cpu (if credit2 is the default scheduler), + * as we would not be able to use cpu_to_socket() and similar + * helpers anyway (they're result of which is not reliable yet); + * - we have gone through all the active runqueues, and have not + * found anyone whose cpus' topology matches the one we are + * dealing with, so activating a new runqueue is what we want. + */ + if ( prv->rqd[rqi].id == -1 ) + break; + + rqd = prv->rqd + rqi; + BUG_ON(cpumask_empty(&rqd->active)); + + peer_cpu = cpumask_first(&rqd->active); + BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID || + cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); + + if (opt_runqueue == OPT_RUNQUEUE_CPU) + continue; + if ( opt_runqueue == OPT_RUNQUEUE_ALL || + (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || + (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || + (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) ) + break; + } + + /* We really expect to be able to assign each cpu to a runqueue. */ + BUG_ON(rqi >= nr_cpu_ids); + + return rqi; +} + +/* Find the domain with the highest weight. 
*/ +static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, + int old_weight) +{ + /* Try to avoid brute-force search: + * - If new_weight is larger, max_weigth <- new_weight + * - If old_weight != max_weight, someone else is still max_weight + * (No action required) + * - If old_weight == max_weight, brute-force search for max weight + */ + if ( new_weight > rqd->max_weight ) + { + rqd->max_weight = new_weight; + SCHED_STAT_CRANK(upd_max_weight_quick); + } + else if ( old_weight == rqd->max_weight ) + { + struct list_head *iter; + int max_weight = 1; + + list_for_each( iter, &rqd->svc ) + { + struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem); + + if ( svc->weight > max_weight ) + max_weight = svc->weight; + } + + rqd->max_weight = max_weight; + SCHED_STAT_CRANK(upd_max_weight_full); + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned rqi:16, max_weight:16; + } d; + d.rqi = rqd->id; + d.max_weight = rqd->max_weight; + __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +/* Add and remove from runqueue assignment (not active run queue) */ +static void +_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd) +{ + + svc->rqd = rqd; + list_add_tail(&svc->rqd_elem, &svc->rqd->svc); + + update_max_weight(svc->rqd, svc->weight, 0); + + /* Expected new load based on adding this unit */ + rqd->b_avgload += svc->avgload; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned rqi:16; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.rqi=rqd->id; + __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, + sizeof(d), + (unsigned char *)&d); + } + +} + +static void +runq_assign(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + + ASSERT(svc->rqd == NULL); + + _runq_assign(svc, c2rqd(ops, sched_unit_master(unit))); +} + +static void +_runq_deassign(struct csched2_unit *svc) +{ + struct csched2_runqueue_data *rqd = svc->rqd; + + ASSERT(!unit_on_runq(svc)); + ASSERT(!(svc->flags & CSFLAG_scheduled)); + + list_del_init(&svc->rqd_elem); + update_max_weight(rqd, 0, svc->weight); + + /* Expected new load based on removing this unit */ + rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); + + svc->rqd = NULL; +} + +static void +runq_deassign(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + _runq_deassign(svc); +} + +/* + * Track the runq load by gathering instantaneous load samples, and using + * exponentially weighted moving average (EWMA) for the 'decaying'. + * + * We consider a window of length W=2^(prv->load_window_shift) nsecs + * (which takes LOADAVG_GRANULARITY_SHIFT into account). + * + * If load is the instantaneous load, the formula for EWMA looks as follows, + * for the i-eth sample: + * + * avg[i] = a*load + (1 - a)*avg[i-1] + * + * where avg[i] is the new value of the average load, avg[i-1] is the value + * of the average load calculated so far, and a is a coefficient less or + * equal to 1. + * + * So, for us, it becomes: + * + * avgload = a*load + (1 - a)*avgload + * + * For determining a, we consider _when_ we are doing the load update, wrt + * the length of the window. 
We define delta as follows: + * + * delta = t - load_last_update + * + * where t is current time (i.e., time at which we are both sampling and + * updating the load average) and load_last_update is the last time we did + * that. + * + * There are two possible situations: + * + * a) delta <= W + * this means that, during the last window of length W, the runeuque load + * was avgload for (W - detla) time, and load for delta time: + * + * |----------- W ---------| + * | | + * | load_last_update t + * -------------------------|---------|--- + * | | | + * \__W - delta__/\_delta__/ + * | | | + * |___avgload___|__load___| + * + * So, what about using delta/W as our smoothing coefficient a. If we do, + * here's what happens: + * + * a = delta / W + * 1 - a = 1 - (delta / W) = (W - delta) / W + * + * Which matches the above description of what happened in the last + * window of length W. + * + * Note that this also means that the weight that we assign to both the + * latest load sample, and to previous history, varies at each update. + * The longer the latest load sample has been in efect, within the last + * window, the higher it weights (and the lesser the previous history + * weights). + * + * This is some sort of extension of plain EWMA to fit even better to our + * use case. + * + * b) delta > W + * this means more than a full window has passed since the last update: + * + * |----------- W ---------| + * | | + * load_last_update t + * ----|------------------------------|--- + * | | + * \_________________delta________/ + * + * Basically, it means the last load sample has been in effect for more + * than W time, and hence we should just use it, and forget everything + * before that. + * + * This can be seen as a 'reset condition', occurring when, for whatever + * reason, load has not been updated for longer than we expected. (It is + * also how avgload is assigned its first value.) + * + * The formula for avgload then becomes: + * + * avgload = (delta/W)*load + (W - delta)*avgload/W + * avgload = delta*load/W + W*avgload/W - delta*avgload/W + * avgload = avgload + delta*load/W - delta*avgload/W + * + * So, final form is: + * + * avgload_0 = load + * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W + * + * As a confirmation, let's look at the extremes, when delta is 0 (i.e., + * what happens if we update the load twice, at the same time instant?): + * + * avgload = avgload + 0*load/W - 0*avgload/W + * avgload = avgload + * + * and when delta is W (i.e., what happens if we update at the last + * possible instant before the window 'expires'?): + * + * avgload = avgload + W*load/W - W*avgload/W + * avgload = avgload + load - avgload + * avgload = load + * + * Which, in both cases, is what we expect. + */ +static void +update_runq_load(const struct scheduler *ops, + struct csched2_runqueue_data *rqd, int change, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + s_time_t delta, load = rqd->load; + unsigned int P, W; + + W = prv->load_window_shift; + P = prv->load_precision_shift; + now >>= LOADAVG_GRANULARITY_SHIFT; + + /* + * To avoid using fractions, we shift to left by load_precision_shift, + * and use the least last load_precision_shift bits as fractional part. 
+ * Looking back at the formula we want to use, we now have: + * + * P = 2^(load_precision_shift) + * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) + * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W + * + * And if we are ok storing and using P*avgload, we can rewrite this as: + * + * P*avgload = avgload' + * avgload' = avgload' + delta*P*load/W - delta*avgload'/W + * + * Coupled with, of course: + * + * avgload_0' = P*load + */ + + if ( rqd->load_last_update + (1ULL << W) < now ) + { + rqd->avgload = load << P; + rqd->b_avgload = load << P; + } + else + { + delta = now - rqd->load_last_update; + if ( unlikely(delta < 0) ) + { + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", + __func__, now, rqd->load_last_update); + delta = 0; + } + + /* + * Note that, if we were to enforce (or check) some relationship + * between P and W, we may save one shift. E.g., if we are sure + * that P < W, we could write: + * + * (delta * (load << P)) >> W + * + * as: + * + * (delta * load) >> (W - P) + */ + rqd->avgload = rqd->avgload + + ((delta * (load << P)) >> W) - + ((delta * rqd->avgload) >> W); + rqd->b_avgload = rqd->b_avgload + + ((delta * (load << P)) >> W) - + ((delta * rqd->b_avgload) >> W); + } + rqd->load += change; + rqd->load_last_update = now; + + /* Overflow, capable of making the load look negative, must not occur. */ + ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t rq_avgload, b_avgload; + unsigned rq_load:16, rq_id:8, shift:8; + } d; + d.rq_id = rqd->id; + d.rq_load = rqd->load; + d.rq_avgload = rqd->avgload; + d.b_avgload = rqd->b_avgload; + d.shift = P; + __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static void +update_svc_load(const struct scheduler *ops, + struct csched2_unit *svc, int change, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + s_time_t delta, unit_load; + unsigned int P, W; + + if ( change == -1 ) + unit_load = 1; + else if ( change == 1 ) + unit_load = 0; + else + unit_load = unit_runnable(svc->unit); + + W = prv->load_window_shift; + P = prv->load_precision_shift; + now >>= LOADAVG_GRANULARITY_SHIFT; + + if ( svc->load_last_update + (1ULL << W) < now ) + { + svc->avgload = unit_load << P; + } + else + { + delta = now - svc->load_last_update; + if ( unlikely(delta < 0) ) + { + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", + __func__, now, svc->load_last_update); + delta = 0; + } + + svc->avgload = svc->avgload + + ((delta * (unit_load << P)) >> W) - + ((delta * svc->avgload) >> W); + } + svc->load_last_update = now; + + /* Overflow, capable of making the load look negative, must not occur. 
*/ + ASSERT(svc->avgload >= 0); + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t v_avgload; + unsigned unit:16, dom:16; + unsigned shift; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.v_avgload = svc->avgload; + d.shift = P; + __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static void +update_load(const struct scheduler *ops, + struct csched2_runqueue_data *rqd, + struct csched2_unit *svc, int change, s_time_t now) +{ + trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); + + update_runq_load(ops, rqd, change, now); + if ( svc ) + update_svc_load(ops, svc, change, now); +} + +static void +runq_insert(const struct scheduler *ops, struct csched2_unit *svc) +{ + struct list_head *iter; + unsigned int cpu = sched_unit_master(svc->unit); + struct list_head * runq = &c2rqd(ops, cpu)->runq; + int pos = 0; + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + ASSERT(!unit_on_runq(svc)); + ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit))); + + ASSERT(&svc->rqd->runq == runq); + ASSERT(!is_idle_unit(svc->unit)); + ASSERT(!svc->unit->is_running); + ASSERT(!(svc->flags & CSFLAG_scheduled)); + + list_for_each( iter, runq ) + { + struct csched2_unit * iter_svc = runq_elem(iter); + + if ( svc->credit > iter_svc->credit ) + break; + + pos++; + } + list_add_tail(&svc->runq_elem, iter); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned pos; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.pos = pos; + __trace_var(TRC_CSCHED2_RUNQ_POS, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static inline void runq_remove(struct csched2_unit *svc) +{ + ASSERT(unit_on_runq(svc)); + list_del_init(&svc->runq_elem); +} + +void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t); + +static inline void +tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) +{ + __cpumask_set_cpu(cpu, &rqd->tickled); + smt_idle_mask_clear(cpu, &rqd->smt_idle); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); +} + +/* + * What we want to know is whether svc, which we assume to be running on some + * pcpu, can be interrupted and preempted (which, so far, basically means + * whether or not it already run for more than the ratelimit, to which we + * apply some tolerance). + */ +static inline bool is_preemptable(const struct csched2_unit *svc, + s_time_t now, s_time_t ratelimit) +{ + if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) + return true; + + ASSERT(svc->unit->is_running); + return now - svc->unit->state_entry_time > + ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; +} + +/* + * Score to preempt the target cpu. Return a negative number if the + * credit isn't high enough; if it is, favor a preemption on cpu in + * this order: + * - cpu is in new's soft-affinity, not in cur's soft-affinity + * (2 x CSCHED2_CREDIT_INIT score bonus); + * - cpu is in new's soft-affinity and cur's soft-affinity, or + * cpu is not in new's soft-affinity, nor in cur's soft-affinity + * (1x CSCHED2_CREDIT_INIT score bonus); + * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity + * (no bonus). + * + * Within the same class, the highest difference of credit. 
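The scoring just described condenses to a few lines. A standalone sketch, with constants mirroring the defaults defined earlier (CSCHED2_CREDIT_INIT = 10ms, opt_migrate_resist = 500us) and hypothetical names; not the hypervisor code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CREDIT_INIT    10000000LL   /* 10ms, in ns   */
    #define MIGRATE_RESIST   500000LL   /* 500us default */

    static int64_t preempt_score(int64_t new_credit, int64_t cur_credit,
                                 bool same_cpu, bool in_new_soft, bool in_cur_soft)
    {
        int64_t score = new_credit - cur_credit;

        if ( !same_cpu )
            score -= MIGRATE_RESIST;

        if ( score > 0 )
        {
            if ( in_new_soft )          /* cpu is in new's soft affinity... */
                score += CREDIT_INIT;
            if ( !in_cur_soft )         /* ...and/or not in cur's           */
                score += CREDIT_INIT;
        }
        return score;                   /* negative: don't preempt this cpu */
    }

    int main(void)
    {
        /* Remote cpu in new's soft affinity but not in cur's: full 2x bonus. */
        printf("%lld\n",
               (long long)preempt_score(2000000, 1000000, false, true, false));
        return 0;
    }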
+ */ +static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, + struct csched2_unit *new, unsigned int cpu) +{ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu)); + struct csched2_private *prv = csched2_priv(ops); + s_time_t score; + + /* + * We are dealing with cpus that are marked non-idle (i.e., that are not + * in rqd->idle). However, some of them may be running their idle unit, + * if taking care of tasklets. In that case, we want to leave it alone. + */ + if ( unlikely(is_idle_unit(cur->unit) || + !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) + return -1; + + burn_credits(rqd, cur, now); + + score = new->credit - cur->credit; + if ( sched_unit_master(new->unit) != cpu ) + score -= CSCHED2_MIGRATE_RESIST; + + /* + * If score is positive, it means new has enough credits (i.e., + * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). + * + * Let's compute the bonuses for soft-affinities. + */ + if ( score > 0 ) + { + if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) ) + score += CSCHED2_CREDIT_INIT; + + if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) ) + score += CSCHED2_CREDIT_INIT; + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit, score; + } d; + d.dom = cur->unit->domain->domain_id; + d.unit = cur->unit->unit_id; + d.credit = cur->credit; + d.score = score; + __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + return score; +} + +/* + * Check what processor it is best to 'wake', for picking up an unit that has + * just been put (back) in the runqueue. Logic is as follows: + * 1. if there are idle processors in the runq, wake one of them; + * 2. if there aren't idle processor, check the one were the unit was + * running before to see if we can preempt what's running there now + * (and hence doing just one migration); + * 3. last stand: check all processors and see if the unit is in right + * of preempting any of the other units running on them (this requires + * two migrations, and that's indeed why it is left as the last stand). + * + * Note that when we say 'idle processors' what we really mean is (pretty + * much always) both _idle_ and _not_already_tickled_. In fact, if a + * processor has been tickled, it will run csched2_schedule() shortly, and + * pick up some work, so it would be wrong to consider it idle. + */ +static void +runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now) +{ + int i, ipid = -1; + s_time_t max = 0; + struct sched_unit *unit = new->unit; + unsigned int bs, cpu = sched_unit_master(unit); + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); + cpumask_t mask; + + ASSERT(new->rqd == rqd); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned processor; + int credit; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.processor = cpu; + d.credit = new->credit; + __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* + * Exclusive pinning is when an unit has hard-affinity with only one + * cpu, and there is no other unit that has hard-affinity with that + * same cpu. This is infrequent, but if it happens, is for achieving + * the most possible determinism, and least possible overhead for + * the units in question. 
+ * + * Try to identify the vast majority of these situations, and deal + * with them quickly. + */ + if ( unlikely((new->flags & CSFLAG_pinned) && + cpumask_test_cpu(cpu, &rqd->idle) && + !cpumask_test_cpu(cpu, &rqd->tickled)) ) + { + ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); + SCHED_STAT_CRANK(tickled_idle_cpu_excl); + ipid = cpu; + goto tickle; + } + + for_each_affinity_balance_step( bs ) + { + /* Just skip first step, if we don't have a soft affinity */ + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + + /* + * First of all, consider idle cpus, checking if we can just + * re-use the pcpu where we were running before. + * + * If there are cores where all the siblings are idle, consider + * them first, honoring whatever the spreading-vs-consolidation + * SMT policy wants us to do. + */ + if ( unlikely(sched_smt_power_savings) ) + { + cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); + cpumask_and(&mask, &mask, online); + } + else + cpumask_and(&mask, &rqd->smt_idle, online); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + i = cpumask_test_or_cycle(cpu, &mask); + if ( i < nr_cpu_ids ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + ipid = i; + goto tickle; + } + + /* + * If there are no fully idle cores, check all idlers, after + * having filtered out pcpus that have been tickled but haven't + * gone through the scheduler yet. + */ + cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + i = cpumask_test_or_cycle(cpu, &mask); + if ( i < nr_cpu_ids ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + ipid = i; + goto tickle; + } + } + + /* + * Note that, if we are here, it means we have done the hard-affinity + * balancing step of the loop, and hence what we have in cpumask_scratch + * is what we put there for last, i.e., new's unit_hard_affinity & online + * which is exactly what we need for the next part of the function. + */ + + /* + * Otherwise, look for the non-idle (and non-tickled) processors with + * the lowest credit, among the ones new is allowed to run on. Again, + * the cpu were it was running on would be the best candidate. + * + * For deciding which cpu to tickle, we use tickle_score(), which will + * factor in both new's soft-affinity, and the soft-affinity of the + * unit running on each cpu that we consider. 
+ */ + cpumask_andnot(&mask, &rqd->active, &rqd->idle); + cpumask_andnot(&mask, &mask, &rqd->tickled); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) + { + s_time_t score = tickle_score(ops, now, new, cpu); + + if ( score > max ) + { + max = score; + ipid = cpu; + + /* If this is in new's soft affinity, just take it */ + if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) ) + { + SCHED_STAT_CRANK(tickled_busy_cpu); + goto tickle; + } + } + } + + for_each_cpu(i, &mask) + { + s_time_t score; + + /* Already looked at this one above */ + ASSERT(i != cpu); + + score = tickle_score(ops, now, new, i); + + if ( score > max ) + { + max = score; + ipid = i; + } + } + + if ( ipid == -1 ) + { + SCHED_STAT_CRANK(tickled_no_cpu); + return; + } + + ASSERT(!is_idle_unit(curr_on_cpu(ipid))); + SCHED_STAT_CRANK(tickled_busy_cpu); + tickle: + BUG_ON(ipid == -1); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, pad:16; + } d; + d.cpu = ipid; d.pad = 0; + __trace_var(TRC_CSCHED2_TICKLE, 1, + sizeof(d), + (unsigned char *)&d); + } + + tickle_cpu(ipid, rqd); + + if ( unlikely(new->tickled_cpu != -1) ) + SCHED_STAT_CRANK(tickled_cpu_overwritten); + new->tickled_cpu = ipid; +} + +/* + * Credit-related code + */ +static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, + struct csched2_unit *snext) +{ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct list_head *iter; + int m; + + /* + * Under normal circumstances, snext->credit should never be less + * than -CSCHED2_MIN_TIMER. However, under some circumstances, an + * unit with low credits may be allowed to run long enough that + * its credits are actually less than -CSCHED2_CREDIT_INIT. + * (Instances have been observed, for example, where an unit with + * 200us of credit was allowed to run for 11ms, giving it -10.8ms + * of credit. Thus it was still negative even after the reset.) + * + * If this is the case for snext, we simply want to keep moving + * everyone up until it is in the black again. This fair because + * none of the other units want to run at the moment. + * + * Rather than looping, however, we just calculate a multiplier, + * avoiding an integer division and multiplication in the common + * case. + */ + m = 1; + if ( snext->credit < -CSCHED2_CREDIT_INIT ) + m += (-snext->credit) / CSCHED2_CREDIT_INIT; + + list_for_each( iter, &rqd->svc ) + { + unsigned int svc_cpu; + struct csched2_unit * svc; + int start_credit; + + svc = list_entry(iter, struct csched2_unit, rqd_elem); + svc_cpu = sched_unit_master(svc->unit); + + ASSERT(!is_idle_unit(svc->unit)); + ASSERT(svc->rqd == rqd); + + /* + * If svc is running, it is our responsibility to make sure, here, + * that the credit it has spent so far get accounted. + */ + if ( svc->unit == curr_on_cpu(svc_cpu) ) + { + burn_credits(rqd, svc, now); + /* + * And, similarly, in case it has run out of budget, as a + * consequence of this round of accounting, we also must inform + * its pCPU that it's time to park it, and pick up someone else. + */ + if ( unlikely(svc->budget <= 0) ) + tickle_cpu(svc_cpu, rqd); + } + + start_credit = svc->credit; + + /* + * Add INIT * m, avoiding integer multiplication in the common case. 
+ */ + if ( likely(m==1) ) + svc->credit += CSCHED2_CREDIT_INIT; + else + svc->credit += m * CSCHED2_CREDIT_INIT; + + /* "Clip" credits to max carryover */ + if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) + svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; + + svc->start_time = now; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit_start, credit_end; + unsigned multiplier; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.credit_start = start_credit; + d.credit_end = svc->credit; + d.multiplier = m; + __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, + sizeof(d), + (unsigned char *)&d); + } + } + + SCHED_STAT_CRANK(credit_reset); + + /* No need to resort runqueue, as everyone's order should be the same. */ +} + +void burn_credits(struct csched2_runqueue_data *rqd, + struct csched2_unit *svc, s_time_t now) +{ + s_time_t delta; + + ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit)))); + + if ( unlikely(is_idle_unit(svc->unit)) ) + { + ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); + return; + } + + delta = now - svc->start_time; + + if ( unlikely(delta <= 0) ) + { + if ( unlikely(delta < 0) ) + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime + " start_time %"PRI_stime"\n", __func__, now, + svc->start_time); + goto out; + } + + SCHED_STAT_CRANK(burn_credits_t2c); + t2c_update(rqd, delta, svc); + + if ( has_cap(svc) ) + svc->budget -= delta; + + svc->start_time = now; + + out: + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit, budget; + int delta; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.credit = svc->credit; + d.budget = has_cap(svc) ? svc->budget : INT_MIN; + d.delta = delta; + __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +/* + * Budget-related code. + */ + +static void park_unit(struct csched2_unit *svc) +{ + struct sched_unit *unit = svc->unit; + + ASSERT(spin_is_locked(&svc->sdom->budget_lock)); + + /* + * It was impossible to find budget for this unit, so it has to be + * "parked". This implies it is not runnable, so we mark it as such in + * its pause_flags. If the unit is currently scheduled (which means we + * are here after being called from within csched_schedule()), flagging + * is enough, as we'll choose someone else, and then context_saved() + * will take care of updating the load properly. + * + * If, OTOH, the unit is sitting in the runqueue (which means we are here + * after being called from within runq_candidate()), we must go all the + * way down to taking it out of there, and updating the load accordingly. + * + * In both cases, we also add it to the list of parked units of the domain. + */ + sched_set_pause_flags(unit, _VPF_parked); + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); + } + list_add(&svc->parked_elem, &svc->sdom->parked_units); +} + +static bool unit_grab_budget(struct csched2_unit *svc) +{ + struct csched2_dom *sdom = svc->sdom; + unsigned int cpu = sched_unit_master(svc->unit); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + if ( svc->budget > 0 ) + return true; + + /* budget_lock nests inside runqueue lock. */ + spin_lock(&sdom->budget_lock); + + /* + * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if + * above!). 
That basically means the unit has overrun a bit --because of + * various reasons-- and we want to take that into account. With the +=, + * we are actually subtracting the amount of budget the unit has + * overconsumed, from the total domain budget. + */ + sdom->budget += svc->budget; + + if ( sdom->budget > 0 ) + { + s_time_t budget; + + /* Get our quota, if there's at least as much budget */ + if ( likely(sdom->budget >= svc->budget_quota) ) + budget = svc->budget_quota; + else + budget = sdom->budget; + + svc->budget = budget; + sdom->budget -= budget; + } + else + { + svc->budget = 0; + park_unit(svc); + } + + spin_unlock(&sdom->budget_lock); + + return svc->budget > 0; +} + +static void +unit_return_budget(struct csched2_unit *svc, struct list_head *parked) +{ + struct csched2_dom *sdom = svc->sdom; + unsigned int cpu = sched_unit_master(svc->unit); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + ASSERT(list_empty(parked)); + + /* budget_lock nests inside runqueue lock. */ + spin_lock(&sdom->budget_lock); + + /* + * The unit is stopping running (e.g., because it's blocking, or it has + * been preempted). If it hasn't consumed all the budget it got when, + * starting to run, put that remaining amount back in the domain's budget + * pool. + */ + sdom->budget += svc->budget; + svc->budget = 0; + + /* + * Making budget available again to the domain means that parked units + * may be unparked and run. They are, if any, in the domain's parked_units + * list, so we want to go through that and unpark them (so they can try + * to get some budget). + * + * Touching the list requires the budget_lock, which we hold. Let's + * therefore put everyone in that list in another, temporary list, which + * then the caller will traverse, unparking the units it finds there. + * + * In fact, we can't do the actual unparking here, because that requires + * taking the runqueue lock of the units being unparked, and we can't + * take any runqueue locks while we hold a budget_lock. + */ + if ( sdom->budget > 0 ) + list_splice_init(&sdom->parked_units, parked); + + spin_unlock(&sdom->budget_lock); +} + +static void +unpark_parked_units(const struct scheduler *ops, struct list_head *units) +{ + struct csched2_unit *svc, *tmp; + spinlock_t *lock; + + list_for_each_entry_safe ( svc, tmp, units, parked_elem ) + { + unsigned long flags; + s_time_t now; + + lock = unit_schedule_lock_irqsave(svc->unit, &flags); + + sched_clear_pause_flags(svc->unit, _VPF_parked); + if ( unlikely(svc->flags & CSFLAG_scheduled) ) + { + /* + * We end here if a budget replenishment arrived between + * csched2_schedule() (and, in particular, after a call to + * unit_grab_budget() that returned false), and + * context_saved(). By setting __CSFLAG_delayed_runq_add, + * we tell context_saved() to put the unit back in the + * runqueue, from where it will compete with the others + * for the newly replenished budget. + */ + ASSERT( svc->rqd != NULL ); + ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd ); + __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); + } + else if ( unit_runnable(svc->unit) ) + { + /* + * The unit should go back to the runqueue, and compete for + * the newly replenished budget, but only if it is actually + * runnable (and was therefore offline only because of the + * lack of budget). 
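+ * If the unit is not runnable for some other reason, clearing _VPF_parked
+ * above is enough: it will be put back on a runqueue by csched2_unit_wake()
+ * when it eventually wakes up.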
+ */ + now = NOW(); + update_load(ops, svc->rqd, svc, 1, now); + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + } + list_del_init(&svc->parked_elem); + + unit_schedule_unlock_irqrestore(lock, flags, svc->unit); + } +} + +static inline void do_replenish(struct csched2_dom *sdom) +{ + sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; + sdom->budget += sdom->tot_budget; +} + +static void replenish_domain_budget(void* data) +{ + struct csched2_dom *sdom = data; + unsigned long flags; + s_time_t now; + LIST_HEAD(parked); + + spin_lock_irqsave(&sdom->budget_lock, flags); + + now = NOW(); + + /* + * Let's do the replenishment. Note, though, that a domain may overrun, + * which means the budget would have gone below 0 (reasons may be system + * overbooking, accounting issues, etc.). It also may happen that we are + * handling the replenishment (much) later than we should (reasons may + * again be overbooking, or issues with timers). + * + * Even in cases of overrun or delay, however, we expect that in 99% of + * cases, doing just one replenishment will be good enough for being able + * to unpark the units that are waiting for some budget. + */ + do_replenish(sdom); + + /* + * And now, the special cases: + * 1) if we are late enough to have skipped (at least) one full period, + * what we must do is doing more replenishments. Note that, however, + * every time we add tot_budget to the budget, we also move next_repl + * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always + * respected. + */ + if ( unlikely(sdom->next_repl <= now) ) + { + do + do_replenish(sdom); + while ( sdom->next_repl <= now ); + } + /* + * 2) if we overrun by more than tot_budget, then budget+tot_budget is + * still < 0, which means that we can't unpark the units. Let's bail, + * and wait for future replenishments. + */ + if ( unlikely(sdom->budget <= 0) ) + { + spin_unlock_irqrestore(&sdom->budget_lock, flags); + goto out; + } + + /* Since we do more replenishments, make sure we didn't overshot. */ + sdom->budget = min(sdom->budget, sdom->tot_budget); + + /* + * As above, let's prepare the temporary list, out of the domain's + * parked_units list, now that we hold the budget_lock. Then, drop such + * lock, and pass the list to the unparking function. + */ + list_splice_init(&sdom->parked_units, &parked); + + spin_unlock_irqrestore(&sdom->budget_lock, flags); + + unpark_parked_units(sdom->dom->cpupool->sched, &parked); + + out: + set_timer(&sdom->repl_timer, sdom->next_repl); +} + +#ifndef NDEBUG +static inline void +csched2_unit_check(struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + struct csched2_dom * const sdom = svc->sdom; + + BUG_ON( svc->unit != unit ); + BUG_ON( sdom != csched2_dom(unit->domain) ); + if ( sdom ) + { + BUG_ON( is_idle_unit(unit) ); + BUG_ON( sdom->dom != unit->domain ); + } + else + { + BUG_ON( !is_idle_unit(unit) ); + } + SCHED_STAT_CRANK(unit_check); +} +#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit)) +#else +#define CSCHED2_UNIT_CHECK(unit) +#endif + +static void * +csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + struct csched2_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct csched2_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->rqd_elem); + INIT_LIST_HEAD(&svc->runq_elem); + + svc->sdom = dd; + svc->unit = unit; + svc->flags = 0U; + + if ( ! 
is_idle_unit(unit) ) + { + ASSERT(svc->sdom != NULL); + svc->credit = CSCHED2_CREDIT_INIT; + svc->weight = svc->sdom->weight; + /* Starting load of 50% */ + svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); + svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; + } + else + { + ASSERT(svc->sdom == NULL); + svc->credit = CSCHED2_IDLE_CREDIT; + svc->weight = 0; + } + svc->tickled_cpu = -1; + + svc->budget = STIME_MAX; + svc->budget_quota = 0; + INIT_LIST_HEAD(&svc->parked_elem); + + SCHED_STAT_CRANK(unit_alloc); + + return svc; +} + +static void +csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + + ASSERT(!is_idle_unit(unit)); + SCHED_STAT_CRANK(unit_sleep); + + if ( curr_on_cpu(sched_unit_master(unit)) == unit ) + { + tickle_cpu(sched_unit_master(unit), svc->rqd); + } + else if ( unit_on_runq(svc) ) + { + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + update_load(ops, svc->rqd, svc, -1, NOW()); + runq_remove(svc); + } + else + __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); +} + +static void +csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + unsigned int cpu = sched_unit_master(unit); + s_time_t now; + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + ASSERT(!is_idle_unit(unit)); + + if ( unlikely(curr_on_cpu(cpu) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + goto out; + } + + if ( unlikely(unit_on_runq(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + goto out; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* If the context hasn't been saved for this unit yet, we can't put it on + * another runqueue. Instead, we set a flag so that it will be put on the runqueue + * after the context has been saved. */ + if ( unlikely(svc->flags & CSFLAG_scheduled) ) + { + __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); + goto out; + } + + /* Add into the new runqueue if necessary */ + if ( svc->rqd == NULL ) + runq_assign(ops, unit); + else + ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd ); + + now = NOW(); + + update_load(ops, svc->rqd, svc, 1, now); + + /* Put the UNIT on the runq */ + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + +out: + return; +} + +static void +csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + + __set_bit(__CSFLAG_unit_yield, &svc->flags); +} + +static void +csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock_irq(unit); + s_time_t now = NOW(); + LIST_HEAD(were_parked); + + BUG_ON( !is_idle_unit(unit) && + svc->rqd != c2rqd(ops, sched_unit_master(unit))); + ASSERT(is_idle_unit(unit) || + svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + /* This unit is now eligible to be put on the runqueue again */ + __clear_bit(__CSFLAG_scheduled, &svc->flags); + + if ( unlikely(has_cap(svc) && svc->budget > 0) ) + unit_return_budget(svc, &were_parked); + + /* If someone wants it on the runqueue, put it there. */ + /* + * NB: We can get rid of CSFLAG_scheduled by checking for + * vc->is_running and unit_on_runq(svc) here. 
However, + * since we're accessing the flags cacheline anyway, + * it seems a bit pointless; especially as we have plenty of + * bits free. + */ + if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) + && likely(unit_runnable(unit)) ) + { + ASSERT(!unit_on_runq(svc)); + + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + } + else if ( !is_idle_unit(unit) ) + update_load(ops, svc->rqd, svc, -1, now); + + unit_schedule_unlock_irq(lock, unit); + + unpark_parked_units(ops, &were_parked); +} + +#define MAX_LOAD (STIME_MAX) +static struct sched_resource * +csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + struct csched2_private *prv = csched2_priv(ops); + int i, min_rqi = -1, min_s_rqi = -1; + unsigned int new_cpu, cpu = sched_unit_master(unit); + struct csched2_unit *svc = csched2_unit(unit); + s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; + bool has_soft; + + ASSERT(!cpumask_empty(&prv->active_queues)); + + SCHED_STAT_CRANK(pick_resource); + + /* Locking: + * - Runqueue lock of vc->processor is already locked + * - Need to grab prv lock to make sure active runqueues don't + * change + * - Need to grab locks for other runqueues while checking + * avgload + * Locking constraint is: + * - Lock prv before runqueue locks + * - Trylock between runqueue locks (no ordering) + * + * Since one of the runqueue locks is already held, we can't + * just grab the prv lock. Instead, we'll have to trylock, and + * do something else reasonable if we fail. + */ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + if ( !read_trylock(&prv->lock) ) + { + /* We may be here because someone requested us to migrate. */ + __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); + new_cpu = get_fallback_cpu(svc); + /* + * Tracing of runq and its load won't be accurate, since we could + * not get the lock, but at least we will output the chosen pcpu. + */ + goto out; + } + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + /* + * First check to see if we're here because someone else suggested a place + * for us to move. + */ + if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) + { + if ( unlikely(svc->migrate_rqd->id < 0) ) + { + printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", + __func__); + } + else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), + &svc->migrate_rqd->active) ) + { + /* + * If we've been asked to move to migrate_rqd, we should just do + * that, which we actually do by returning one cpu from that runq. + * There is no need to take care of soft affinity, as that will + * happen in runq_tickle(). + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &svc->migrate_rqd->active); + new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, + cpumask_scratch_cpu(cpu)); + + svc->migrate_rqd->pick_bias = new_cpu; + goto out_up; + } + /* Fall-through to normal cpu pick */ + } + + /* + * What we want is: + * - if we have soft affinity, the runqueue with the lowest average + * load, among the ones that contain cpus in our soft affinity; this + * represents the best runq on which we would want to run. + * - the runqueue with the lowest average load among the ones that + * contains cpus in our hard affinity; this represent the best runq + * on which we can run. + * + * Find both runqueues in one pass. 
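+ * Failing the trylock on a remote runqueue below simply means that runqueue
+ * keeps its MAX_LOAD placeholder, and is therefore skipped for this pick.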
+ */ + has_soft = has_soft_affinity(unit); + for_each_cpu(i, &prv->active_queues) + { + struct csched2_runqueue_data *rqd; + s_time_t rqd_avgload = MAX_LOAD; + + rqd = prv->rqd + i; + + /* + * If none of the cpus of this runqueue is in svc's hard-affinity, + * skip the runqueue. + * + * Note that, in case svc's hard-affinity has changed, this is the + * first time when we see such change, so it is indeed possible + * that we end up skipping svc's current runqueue. + */ + if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) + continue; + + /* + * If checking a different runqueue, grab the lock, read the avg, + * and then release the lock. + * + * If on our own runqueue, don't grab or release the lock; + * but subtract our own load from the runqueue load to simulate + * impartiality. + */ + if ( rqd == svc->rqd ) + { + rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); + } + else if ( spin_trylock(&rqd->lock) ) + { + rqd_avgload = rqd->b_avgload; + spin_unlock(&rqd->lock); + } + + /* + * if svc has a soft-affinity, and some cpus of rqd are part of it, + * see if we need to update the "soft-affinity minimum". + */ + if ( has_soft && + rqd_avgload < min_s_avgload ) + { + cpumask_t mask; + + cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); + if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) ) + { + min_s_avgload = rqd_avgload; + min_s_rqi = i; + } + } + /* In any case, keep the "hard-affinity minimum" updated too. */ + if ( rqd_avgload < min_avgload ) + { + min_avgload = rqd_avgload; + min_rqi = i; + } + } + + if ( has_soft && min_s_rqi != -1 ) + { + /* + * We have soft affinity, and we have a candidate runq, so go for it. + * + * Note that, to obtain the soft-affinity mask, we "just" put what we + * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is + * ok because: + * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have + * a non-empty intersection (because has_soft is true); + * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask() + * already in cpumask_scratch, we do save a lot doing like this. + * + * It's kind of like open coding affinity_balance_cpumask() but, in + * this specific case, calling that would mean a lot of (unnecessary) + * cpumask operations. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + unit->cpu_soft_affinity); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->rqd[min_s_rqi].active); + } + else if ( min_rqi != -1 ) + { + /* + * Either we don't have soft-affinity, or we do, but we did not find + * any suitable runq. But we did find one when considering hard + * affinity, so go for it. + * + * cpumask_scratch already has unit->cpu_hard_affinity & + * cpupool_domain_master_cpumask() in it, so it's enough that we filter + * with the cpus of the runq. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->rqd[min_rqi].active); + } + else + { + /* + * We didn't find anyone at all (most likely because of spinlock + * contention). 
+ */ + new_cpu = get_fallback_cpu(svc); + min_rqi = c2r(new_cpu); + min_avgload = prv->rqd[min_rqi].b_avgload; + goto out_up; + } + + new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias, + cpumask_scratch_cpu(cpu)); + prv->rqd[min_rqi].pick_bias = new_cpu; + BUG_ON(new_cpu >= nr_cpu_ids); + + out_up: + read_unlock(&prv->lock); + out: + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t b_avgload; + unsigned unit:16, dom:16; + unsigned rq_id:16, new_cpu:16; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.rq_id = min_rqi; + d.b_avgload = min_avgload; + d.new_cpu = new_cpu; + __trace_var(TRC_CSCHED2_PICKED_CPU, 1, + sizeof(d), + (unsigned char *)&d); + } + + return get_sched_res(new_cpu); +} + +/* Working state of the load-balancing algorithm */ +typedef struct { + /* NB: Modified by consider() */ + s_time_t load_delta; + struct csched2_unit * best_push_svc, *best_pull_svc; + /* NB: Read by consider() */ + struct csched2_runqueue_data *lrqd; + struct csched2_runqueue_data *orqd; +} balance_state_t; + +static void consider(balance_state_t *st, + struct csched2_unit *push_svc, + struct csched2_unit *pull_svc) +{ + s_time_t l_load, o_load, delta; + + l_load = st->lrqd->b_avgload; + o_load = st->orqd->b_avgload; + if ( push_svc ) + { + /* What happens to the load on both if we push? */ + l_load -= push_svc->avgload; + o_load += push_svc->avgload; + } + if ( pull_svc ) + { + /* What happens to the load on both if we pull? */ + l_load += pull_svc->avgload; + o_load -= pull_svc->avgload; + } + + delta = l_load - o_load; + if ( delta < 0 ) + delta = -delta; + + if ( delta < st->load_delta ) + { + st->load_delta = delta; + st->best_push_svc=push_svc; + st->best_pull_svc=pull_svc; + } +} + + +static void migrate(const struct scheduler *ops, + struct csched2_unit *svc, + struct csched2_runqueue_data *trqd, + s_time_t now) +{ + struct sched_unit *unit = svc->unit; + int cpu = sched_unit_master(unit); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned rqi:16, trqi:16; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.rqi = svc->rqd->id; + d.trqi = trqd->id; + __trace_var(TRC_CSCHED2_MIGRATE, 1, + sizeof(d), + (unsigned char *)&d); + } + + if ( svc->flags & CSFLAG_scheduled ) + { + /* It's running; mark it to migrate. */ + svc->migrate_rqd = trqd; + sched_set_pause_flags(unit, _VPF_migrating); + __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); + SCHED_STAT_CRANK(migrate_requested); + tickle_cpu(cpu, svc->rqd); + } + else + { + int on_runq = 0; + /* It's not running; just move it */ + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(ops, svc->rqd, NULL, -1, now); + on_runq = 1; + } + _runq_deassign(svc); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &trqd->active); + sched_set_res(unit, + get_sched_res(cpumask_cycle(trqd->pick_bias, + cpumask_scratch_cpu(cpu)))); + trqd->pick_bias = sched_unit_master(unit); + ASSERT(sched_unit_master(unit) < nr_cpu_ids); + + _runq_assign(svc, trqd); + if ( on_runq ) + { + update_load(ops, svc->rqd, NULL, 1, now); + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + SCHED_STAT_CRANK(migrate_on_runq); + } + else + SCHED_STAT_CRANK(migrate_no_runq); + } +} + +/* + * It makes sense considering migrating svc to rqd, if: + * - svc is not already flagged to migrate, + * - if svc is allowed to run on at least one of the pcpus of rqd. 
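consider() above scores a candidate push/pull pair purely by the absolute difference between the two runqueues' average loads after the hypothetical move, keeping the pair with the smallest such delta. The same arithmetic in a self-contained form, with scalar loads and made-up numbers standing in for csched2_unit and runqueue state:

    #include <stdint.h>
    #include <stdio.h>

    /* |local - other| after optionally pushing and/or pulling a unit. */
    static int64_t delta_after(int64_t l_load, int64_t o_load,
                               int64_t push_load, int64_t pull_load)
    {
        l_load -= push_load;   /* pushed unit leaves the local runqueue */
        o_load += push_load;   /* ... and lands on the other one        */
        l_load += pull_load;   /* pulled unit arrives locally           */
        o_load -= pull_load;

        return l_load > o_load ? l_load - o_load : o_load - l_load;
    }

    int main(void)
    {
        const int64_t lrqd = 1000, orqd = 400;   /* current delta: 600 */

        /* Push-only shrinks the gap to |750 - 650| = 100, push+pull gives
         * |850 - 550| = 300, so the push-only option wins the comparison. */
        printf("push only: %lld\n", (long long)delta_after(lrqd, orqd, 250, 0));
        printf("push+pull: %lld\n", (long long)delta_after(lrqd, orqd, 250, 100));
        return 0;
    }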
+ */ +static bool unit_is_migrateable(struct csched2_unit *svc, + struct csched2_runqueue_data *rqd) +{ + struct sched_unit *unit = svc->unit; + int cpu = sched_unit_master(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + return !(svc->flags & CSFLAG_runq_migrate_request) && + cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); +} + +static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + int i, max_delta_rqi; + struct list_head *push_iter, *pull_iter; + bool inner_load_updated = 0; + + balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; + + /* + * Basic algorithm: Push, pull, or swap. + * - Find the runqueue with the furthest load distance + * - Find a pair that makes the difference the least (where one + * on either side may be empty). + */ + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + st.lrqd = c2rqd(ops, cpu); + + update_runq_load(ops, st.lrqd, 0, now); + +retry: + max_delta_rqi = -1; + if ( !read_trylock(&prv->lock) ) + return; + + st.load_delta = 0; + + for_each_cpu(i, &prv->active_queues) + { + s_time_t delta; + + st.orqd = prv->rqd + i; + + if ( st.orqd == st.lrqd + || !spin_trylock(&st.orqd->lock) ) + continue; + + update_runq_load(ops, st.orqd, 0, now); + + delta = st.lrqd->b_avgload - st.orqd->b_avgload; + if ( delta < 0 ) + delta = -delta; + + if ( delta > st.load_delta ) + { + st.load_delta = delta; + max_delta_rqi = i; + } + + spin_unlock(&st.orqd->lock); + } + + /* Minimize holding the private scheduler lock. */ + read_unlock(&prv->lock); + if ( max_delta_rqi == -1 ) + goto out; + + { + s_time_t load_max; + int cpus_max; + + + load_max = st.lrqd->b_avgload; + if ( st.orqd->b_avgload > load_max ) + load_max = st.orqd->b_avgload; + + cpus_max = st.lrqd->nr_cpus; + i = st.orqd->nr_cpus; + if ( i > cpus_max ) + cpus_max = i; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned lrq_id:16, orq_id:16; + unsigned load_delta; + } d; + d.lrq_id = st.lrqd->id; + d.orq_id = st.orqd->id; + d.load_delta = st.load_delta; + __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* + * If we're under 100% capacaty, only shift if load difference + * is > 1. otherwise, shift if under 12.5% + */ + if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) + { + if ( st.load_delta < (1ULL << (prv->load_precision_shift + + opt_underload_balance_tolerance)) ) + goto out; + } + else + if ( st.load_delta < (1ULL << (prv->load_precision_shift + + opt_overload_balance_tolerance)) ) + goto out; + } + + /* Try to grab the other runqueue lock; if it's been taken in the + * meantime, try the process over again. This can't deadlock + * because if it doesn't get any other rqd locks, it will simply + * give up and return. 
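The under/overload check above compares the measured delta against 1 << (load_precision_shift + tolerance), where 1 << load_precision_shift represents one fully busy pCPU. The worked numbers below show how that becomes the "difference of 1" and "12.5%" thresholds the comment refers to; the shift and tolerance values are plausible examples, not necessarily the build's defaults:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative parameters: 2^18 == "one pCPU worth of load". */
        const int load_precision_shift = 18;
        const int underload_tolerance  = 0;    /* example value */
        const int overload_tolerance   = -3;   /* example value */

        uint64_t one_pcpu = 1ULL << load_precision_shift;
        uint64_t under = 1ULL << (load_precision_shift + underload_tolerance);
        uint64_t over  = 1ULL << (load_precision_shift + overload_tolerance);

        /* 262144 == 1.0 pCPUs of load; 32768 == 0.125 pCPUs, i.e. 12.5%. */
        printf("underloaded: balance only if delta > %llu (%.3f pCPUs)\n",
               (unsigned long long)under, (double)under / one_pcpu);
        printf("overloaded:  balance only if delta > %llu (%.3f pCPUs)\n",
               (unsigned long long)over, (double)over / one_pcpu);
        return 0;
    }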
*/ + st.orqd = prv->rqd + max_delta_rqi; + if ( !spin_trylock(&st.orqd->lock) ) + goto retry; + + /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ + if ( unlikely(st.orqd->id < 0) ) + goto out_up; + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t lb_avgload, ob_avgload; + unsigned lrq_id:16, orq_id:16; + } d; + d.lrq_id = st.lrqd->id; + d.lb_avgload = st.lrqd->b_avgload; + d.orq_id = st.orqd->id; + d.ob_avgload = st.orqd->b_avgload; + __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, + sizeof(d), + (unsigned char *)&d); + } + + SCHED_STAT_CRANK(acct_load_balance); + + /* Look for "swap" which gives the best load average + * FIXME: O(n^2)! */ + + /* Reuse load delta (as we're trying to minimize it) */ + list_for_each( push_iter, &st.lrqd->svc ) + { + struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem); + + update_svc_load(ops, push_svc, 0, now); + + if ( !unit_is_migrateable(push_svc, st.orqd) ) + continue; + + list_for_each( pull_iter, &st.orqd->svc ) + { + struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); + + if ( !inner_load_updated ) + update_svc_load(ops, pull_svc, 0, now); + + if ( !unit_is_migrateable(pull_svc, st.lrqd) ) + continue; + + consider(&st, push_svc, pull_svc); + } + + inner_load_updated = 1; + + /* Consider push only */ + consider(&st, push_svc, NULL); + } + + list_for_each( pull_iter, &st.orqd->svc ) + { + struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); + + if ( !unit_is_migrateable(pull_svc, st.lrqd) ) + continue; + + /* Consider pull only */ + consider(&st, NULL, pull_svc); + } + + /* OK, now we have some candidates; do the moving */ + if ( st.best_push_svc ) + migrate(ops, st.best_push_svc, st.orqd, now); + if ( st.best_pull_svc ) + migrate(ops, st.best_pull_svc, st.lrqd, now); + + out_up: + spin_unlock(&st.orqd->lock); + out: + return; +} + +static void +csched2_unit_migrate( + const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu) +{ + struct domain *d = unit->domain; + struct csched2_unit * const svc = csched2_unit(unit); + struct csched2_runqueue_data *trqd; + s_time_t now = NOW(); + + /* + * Being passed a target pCPU which is outside of our cpupool is only + * valid if we are shutting down (or doing ACPI suspend), and we are + * moving everyone to BSP, no matter whether or not BSP is inside our + * cpupool. + * + * And since there indeed is the chance that it is not part of it, all + * we must do is remove _and_ unassign the unit from any runqueue, as + * well as updating v->processor with the target, so that the suspend + * process can continue. + * + * It will then be during resume that a new, meaningful, value for + * v->processor will be chosen, and during actual domain unpause that + * the unit will be assigned to and added to the proper runqueue. + */ + if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) ) + { + ASSERT(system_state == SYS_STATE_suspend); + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(ops, svc->rqd, NULL, -1, now); + } + _runq_deassign(svc); + sched_set_res(unit, get_sched_res(new_cpu)); + return; + } + + /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */ + ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); + ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity)); + + trqd = c2rqd(ops, new_cpu); + + /* + * Do the actual movement toward new_cpu, and update vc->processor. 
+ * If we are changing runqueue, migrate() takes care of everything. + * If we are not changing runqueue, we need to update vc->processor + * here. In fact, if, for instance, we are here because the unit's + * hard affinity changed, we don't want to risk leaving vc->processor + * pointing to a pcpu where we can't run any longer. + */ + if ( trqd != svc->rqd ) + migrate(ops, svc, trqd, now); + else + sched_set_res(unit, get_sched_res(new_cpu)); +} + +static int +csched2_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct csched2_dom * const sdom = csched2_dom(d); + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + struct sched_unit *unit; + int rc = 0; + + /* + * Locking: + * - we must take the private lock for accessing the weights of the + * units of d, and/or the cap; + * - in the putinfo case, we also need the runqueue lock(s), for + * updating the max waight of the runqueue(s). + * If changing the cap, we also need the budget_lock, for updating + * the value of the domain budget pool (and the runqueue lock, + * for adjusting the parameters and rescheduling any unit that is + * running at the time of the change). + */ + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + read_lock_irqsave(&prv->lock, flags); + op->u.credit2.weight = sdom->weight; + op->u.credit2.cap = sdom->cap; + read_unlock_irqrestore(&prv->lock, flags); + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + write_lock_irqsave(&prv->lock, flags); + /* Weight */ + if ( op->u.credit2.weight != 0 ) + { + int old_weight; + + old_weight = sdom->weight; + + sdom->weight = op->u.credit2.weight; + + /* Update weights for units, and max_weight for runqueues on which they reside */ + for_each_sched_unit ( d, unit ) + { + struct csched2_unit *svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock(unit); + + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + svc->weight = sdom->weight; + update_max_weight(svc->rqd, svc->weight, old_weight); + + unit_schedule_unlock(lock, unit); + } + } + /* Cap */ + if ( op->u.credit2.cap != 0 ) + { + struct csched2_unit *svc; + spinlock_t *lock; + + /* Cap is only valid if it's below 100 * nr_of_units */ + if ( op->u.credit2.cap > 100 * sdom->nr_units ) + { + rc = -EINVAL; + write_unlock_irqrestore(&prv->lock, flags); + break; + } + + spin_lock(&sdom->budget_lock); + sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); + sdom->tot_budget /= 100; + spin_unlock(&sdom->budget_lock); + + /* + * When trying to get some budget and run, each unit will grab + * from the pool 1/N (with N = nr of units of the domain) of + * the total budget. Roughly speaking, this means each unit will + * have at least one chance to run during every period. + */ + for_each_sched_unit ( d, unit ) + { + svc = csched2_unit(unit); + lock = unit_schedule_lock(unit); + /* + * Too small quotas would in theory cause a lot of overhead, + * which then won't happen because, in csched2_runtime(), + * CSCHED2_MIN_TIMER is what would be used anyway. + */ + svc->budget_quota = max(sdom->tot_budget / sdom->nr_units, + CSCHED2_MIN_TIMER); + unit_schedule_unlock(lock, unit); + } + + if ( sdom->cap == 0 ) + { + /* + * We give to the domain the budget to which it is entitled, + * and queue its first replenishment event. + * + * Since cap is currently disabled for this domain, we + * know no unit is messing with the domain's budget, and + * the replenishment timer is still off. 
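To put numbers on the budget scheme configured above: the domain's per-period pool is cap percent of one replenishment period, and each unit's grab quota is that pool split across the domain's units, floored at the minimum timer. A worked example with invented values for the period, minimum timer, cap and unit count:

    #include <stdint.h>
    #include <stdio.h>

    #define MS(x) ((int64_t)(x) * 1000000)   /* milliseconds, expressed in ns */

    static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

    int main(void)
    {
        /* Illustrative parameters only. */
        const int64_t repl_period = MS(10);      /* example replenishment period */
        const int64_t min_timer   = MS(1) / 2;   /* example minimum timeslice    */
        const unsigned int cap = 150;            /* 150% == 1.5 pCPUs of time    */
        const unsigned int nr_units = 4;

        int64_t tot_budget   = repl_period * cap / 100;                 /* 15 ms   */
        int64_t budget_quota = max64(tot_budget / nr_units, min_timer); /* 3.75 ms */

        printf("per-period pool: %lld ns, per-unit quota: %lld ns\n",
               (long long)tot_budget, (long long)budget_quota);
        return 0;
    }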
+ * For these reasons, it is safe to do the following without + * taking the budget_lock. + */ + sdom->budget = sdom->tot_budget; + sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; + set_timer(&sdom->repl_timer, sdom->next_repl); + + /* + * Now, let's enable budget accounting for all the units. + * For making sure that they will start to honour the domain's + * cap, we set their budget to 0. + * This way, as soon as they will try to run, they will have + * to get some budget. + * + * For the units that are already running, we trigger the + * scheduler on their pCPU. When, as a consequence of this, + * csched2_schedule() will run, it will figure out there is + * no budget, and the unit will try to get some (and be parked, + * if there's none, and we'll switch to someone else). + */ + for_each_sched_unit ( d, unit ) + { + svc = csched2_unit(unit); + lock = unit_schedule_lock(unit); + if ( unit->is_running ) + { + unsigned int cpu = sched_unit_master(unit); + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + + ASSERT(curr_on_cpu(cpu) == unit); + + /* + * We are triggering a reschedule on the unit's + * pCPU. That will run burn_credits() and, since + * the unit is capped now, it would charge all the + * execution time of this last round as budget as + * well. That will make the unit budget go negative, + * potentially by a large amount, and it's unfair. + * + * To avoid that, call burn_credit() here, to do the + * accounting of this current running instance now, + * with budgetting still disabled. This does not + * prevent some small amount of budget being charged + * to the unit (i.e., the amount of time it runs from + * now, to when scheduling happens). The budget will + * also go below 0, but a lot less than how it would + * if we don't do this. + */ + burn_credits(rqd, svc, NOW()); + __cpumask_set_cpu(cpu, &rqd->tickled); + ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + svc->budget = 0; + unit_schedule_unlock(lock, unit); + } + } + + sdom->cap = op->u.credit2.cap; + } + else if ( sdom->cap != 0 ) + { + LIST_HEAD(parked); + + stop_timer(&sdom->repl_timer); + + /* Disable budget accounting for all the units. */ + for_each_sched_unit ( d, unit ) + { + struct csched2_unit *svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock(unit); + + svc->budget = STIME_MAX; + svc->budget_quota = 0; + + unit_schedule_unlock(lock, unit); + } + sdom->cap = 0; + /* + * We are disabling the cap for this domain, which may have + * units waiting for a replenishment, so we unpark them all. + * Note that, since we have already disabled budget accounting + * for all the units of the domain, no currently running unit + * will be added to the parked units list any longer. + */ + spin_lock(&sdom->budget_lock); + list_splice_init(&sdom->parked_units, &parked); + spin_unlock(&sdom->budget_lock); + + unpark_parked_units(ops, &parked); + } + write_unlock_irqrestore(&prv->lock, flags); + break; + default: + rc = -EINVAL; + break; + } + + + return rc; +} + +static void +csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, + const cpumask_t *hard, const cpumask_t *soft) +{ + struct csched2_unit *svc = csched2_unit(unit); + + if ( !hard ) + return; + + /* Are we becoming exclusively pinned? 
*/ + if ( cpumask_weight(hard) == 1 ) + __set_bit(__CSFLAG_pinned, &svc->flags); + else + __clear_bit(__CSFLAG_pinned, &svc->flags); +} + +static int csched2_sys_cntl(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + + switch (sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( params->ratelimit_us && + (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || + params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) + return -EINVAL; + + write_lock_irqsave(&prv->lock, flags); + if ( !prv->ratelimit_us && params->ratelimit_us ) + printk(XENLOG_INFO "Enabling context switch rate limiting\n"); + else if ( prv->ratelimit_us && !params->ratelimit_us ) + printk(XENLOG_INFO "Disabling context switch rate limiting\n"); + prv->ratelimit_us = params->ratelimit_us; + write_unlock_irqrestore(&prv->lock, flags); + + /* FALLTHRU */ + case XEN_SYSCTL_SCHEDOP_getinfo: + params->ratelimit_us = prv->ratelimit_us; + break; + } + + return 0; +} + +static void * +csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + struct csched2_private *prv = csched2_priv(ops); + struct csched2_dom *sdom; + unsigned long flags; + + sdom = xzalloc(struct csched2_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + /* Initialize credit, cap and weight */ + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + sdom->weight = CSCHED2_DEFAULT_WEIGHT; + sdom->cap = 0U; + sdom->nr_units = 0; + + init_timer(&sdom->repl_timer, replenish_domain_budget, sdom, + cpumask_any(cpupool_domain_master_cpumask(dom))); + spin_lock_init(&sdom->budget_lock); + INIT_LIST_HEAD(&sdom->parked_units); + + write_lock_irqsave(&prv->lock, flags); + + list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); + + write_unlock_irqrestore(&prv->lock, flags); + + return sdom; +} + +static void +csched2_free_domdata(const struct scheduler *ops, void *data) +{ + struct csched2_dom *sdom = data; + struct csched2_private *prv = csched2_priv(ops); + + if ( sdom ) + { + unsigned long flags; + + kill_timer(&sdom->repl_timer); + + write_lock_irqsave(&prv->lock, flags); + list_del_init(&sdom->sdom_elem); + write_unlock_irqrestore(&prv->lock, flags); + + xfree(sdom); + } +} + +static void +csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + struct csched2_dom * const sdom = svc->sdom; + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + ASSERT(list_empty(&svc->runq_elem)); + + /* csched2_res_pick() expects the pcpu lock to be held */ + lock = unit_schedule_lock_irq(unit); + + sched_set_res(unit, csched2_res_pick(ops, unit)); + + spin_unlock_irq(lock); + + lock = unit_schedule_lock_irq(unit); + + /* Add unit to runqueue of initial processor */ + runq_assign(ops, unit); + + unit_schedule_unlock_irq(lock, unit); + + sdom->nr_units++; + + SCHED_STAT_CRANK(unit_insert); + + CSCHED2_UNIT_CHECK(unit); +} + +static void +csched2_free_udata(const struct scheduler *ops, void *priv) +{ + struct csched2_unit *svc = priv; + + xfree(svc); +} + +static void +csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + ASSERT(list_empty(&svc->runq_elem)); + + SCHED_STAT_CRANK(unit_remove); + + /* Remove from runqueue */ + lock = unit_schedule_lock_irq(unit); + + runq_deassign(ops, unit); + + 
unit_schedule_unlock_irq(lock, unit); + + svc->sdom->nr_units--; +} + +/* How long should we let this unit run for? */ +static s_time_t +csched2_runtime(const struct scheduler *ops, int cpu, + struct csched2_unit *snext, s_time_t now) +{ + s_time_t time, min_time; + int rt_credit; /* Proposed runtime measured in credits */ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct list_head *runq = &rqd->runq; + struct csched2_private *prv = csched2_priv(ops); + + /* + * If we're idle, just stay so. Others (or external events) + * will poke us when necessary. + */ + if ( is_idle_unit(snext->unit) ) + return -1; + + /* General algorithm: + * 1) Run until snext's credit will be 0. + * 2) But if someone is waiting, run until snext's credit is equal + * to his. + * 3) But, if we are capped, never run more than our budget. + * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or + * the ratelimit time. + */ + + /* Calculate mintime */ + min_time = CSCHED2_MIN_TIMER; + if ( prv->ratelimit_us ) + { + s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); + if ( snext->unit->is_running ) + ratelimit_min = snext->unit->state_entry_time + + MICROSECS(prv->ratelimit_us) - now; + if ( ratelimit_min > min_time ) + min_time = ratelimit_min; + } + + /* 1) Run until snext's credit will be 0. */ + rt_credit = snext->credit; + + /* + * 2) If there's someone waiting whose credit is positive, + * run until your credit ~= his. + */ + if ( ! list_empty(runq) ) + { + struct csched2_unit *swait = runq_elem(runq->next); + + if ( ! is_idle_unit(swait->unit) + && swait->credit > 0 ) + { + rt_credit = snext->credit - swait->credit; + } + } + + /* + * The next guy on the runqueue may actually have a higher credit, + * if we've tried to avoid migrating him from a different cpu. + * Setting time=0 will ensure the minimum timeslice is chosen. + * + * FIXME: See if we can eliminate this conversion if we know time + * will be outside (MIN,MAX). Probably requires pre-calculating + * credit values of MIN,MAX per unit, since each unit burns credit + * at a different rate. + */ + if ( rt_credit > 0 ) + time = c2t(rqd, rt_credit, snext); + else + time = 0; + + /* + * 3) But, if capped, never run more than our budget. + */ + if ( has_cap(snext) ) + time = snext->budget < time ? snext->budget : time; + + /* + * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or + * the rate_limit time. + */ + if ( time < min_time ) + { + time = min_time; + SCHED_STAT_CRANK(runtime_min_timer); + } + else if (time > CSCHED2_MAX_TIMER) + { + time = CSCHED2_MAX_TIMER; + SCHED_STAT_CRANK(runtime_max_timer); + } + + return time; +} + +/* + * Find a candidate. + */ +static struct csched2_unit * +runq_candidate(struct csched2_runqueue_data *rqd, + struct csched2_unit *scurr, + int cpu, s_time_t now, + unsigned int *skipped) +{ + struct list_head *iter, *temp; + struct sched_resource *sr = get_sched_res(cpu); + struct csched2_unit *snext = NULL; + struct csched2_private *prv = csched2_priv(sr->scheduler); + bool yield = false, soft_aff_preempt = false; + + *skipped = 0; + + if ( unlikely(is_idle_unit(scurr->unit)) ) + { + snext = scurr; + goto check_runq; + } + + yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags); + + /* + * Return the current unit if it has executed for less than ratelimit. + * Adjuststment for the selected unit's credit and decision + * for how long it will run will be taken in csched2_runtime. + * + * Note that, if scurr is yielding, we don't let rate limiting kick in. 
+ * In fact, it may be the case that scurr is about to spin, and there's + * no point forcing it to do so until rate limiting expires. + */ + if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) && + (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) ) + { + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned runtime; + } d; + d.dom = scurr->unit->domain->domain_id; + d.unit = scurr->unit->unit_id; + d.runtime = now - scurr->unit->state_entry_time; + __trace_var(TRC_CSCHED2_RATELIMIT, 1, + sizeof(d), + (unsigned char *)&d); + } + return scurr; + } + + /* If scurr has a soft-affinity, let's check whether cpu is part of it */ + if ( has_soft_affinity(scurr->unit) ) + { + affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY, + cpumask_scratch); + if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) + { + cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain); + + /* Ok, is any of the pcpus in scurr soft-affinity idle? */ + cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); + cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); + soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); + } + } + + /* + * If scurr is runnable, and this cpu is in its soft-affinity, default to + * it. We also default to it, even if cpu is not in its soft-affinity, if + * there aren't any idle and not tickled cpu in its soft-affinity. In + * fact, we don't want to risk leaving scurr in the runq and this cpu idle + * only because scurr is running outside of its soft-affinity. + * + * On the other hand, if cpu is not in scurr's soft-affinity, and there + * looks to be better options, go for them. That happens by defaulting to + * idle here, which means scurr will be preempted, put back in runq, and + * one of those idle and not tickled cpus from its soft-affinity will be + * tickled to pick it up. + * + * Finally, if scurr does not have a valid soft-affinity, we also let it + * continue to run here (in fact, soft_aff_preempt will still be false, + * in this case). + * + * Of course, we also default to idle also if scurr is not runnable. + */ + if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt ) + snext = scurr; + else + snext = csched2_unit(sched_idle_unit(cpu)); + + check_runq: + list_for_each_safe( iter, temp, &rqd->runq ) + { + struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* Only consider units that are allowed to run on this processor. */ + if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) ) + { + (*skipped)++; + continue; + } + + /* + * If an unit is meant to be picked up by another processor, and such + * processor has not scheduled yet, leave it in the runqueue for him. + */ + if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && + cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) + { + (*skipped)++; + SCHED_STAT_CRANK(deferred_to_tickled_cpu); + continue; + } + + /* + * If this is on a different processor, don't pull it unless + * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
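The resistance check just described only pulls a unit queued on another pCPU when its credit lead over the locally chosen candidate exceeds a margin, so marginal differences do not make units bounce between caches. Restated compactly (the margin value is a placeholder, not the scheduler's actual constant):

    #include <stdbool.h>
    #include <stdio.h>

    #define MIGRATE_RESIST 500   /* placeholder margin, in credit units */

    /* Pull a remote candidate only if its credit lead beats the margin. */
    static bool worth_pulling(int remote_credit, int local_credit, bool remote)
    {
        if ( !remote )
            return true;                 /* already local: no resistance */
        return remote_credit >= local_credit + MIGRATE_RESIST;
    }

    int main(void)
    {
        printf("%d\n", worth_pulling(10200, 10000, true));  /* 0: lead too small */
        printf("%d\n", worth_pulling(10600, 10000, true));  /* 1: worth pulling  */
        return 0;
    }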
+ */ + if ( sched_unit_master(svc->unit) != cpu + && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) + { + (*skipped)++; + SCHED_STAT_CRANK(migrate_resisted); + continue; + } + + /* + * If the one in the runqueue has more credit than current (or idle, + * if current is not runnable), or if current is yielding, and also + * if the one in runqueue either is not capped, or is capped but has + * some budget, then choose it. + */ + if ( (yield || svc->credit > snext->credit) && + (!has_cap(svc) || unit_grab_budget(svc)) && + unit_runnable_state(svc->unit) ) + snext = svc; + + /* In any case, if we got this far, break. */ + break; + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned tickled_cpu, skipped; + int credit; + } d; + d.dom = snext->unit->domain->domain_id; + d.unit = snext->unit->unit_id; + d.credit = snext->credit; + d.tickled_cpu = snext->tickled_cpu; + d.skipped = *skipped; + __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, + sizeof(d), + (unsigned char *)&d); + } + + if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) + SCHED_STAT_CRANK(tickled_cpu_overridden); + + /* + * If snext is from a capped domain, it must have budget (or it + * wouldn't have been in the runq). If it is not, it'd be STIME_MAX, + * which still is >= 0. + */ + ASSERT(snext->budget >= 0); + + return snext; +} + +/* + * This function is in the critical path. It is designed to be simple and + * fast for the common case. + */ +static void csched2_schedule( + const struct scheduler *ops, struct sched_unit *currunit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct csched2_runqueue_data *rqd; + struct csched2_unit * const scurr = csched2_unit(currunit); + struct csched2_unit *snext = NULL; + unsigned int skipped_units = 0; + bool tickled; + bool migrated = false; + + SCHED_STAT_CRANK(schedule); + CSCHED2_UNIT_CHECK(currunit); + + BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized)); + + rqd = c2rqd(ops, sched_cpu); + BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active)); + + ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock)); + + BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd); + + /* Clear "tickled" bit now that we've been scheduled */ + tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled); + if ( tickled ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->tickled); + cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); + smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, rq_id:16; + unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; + } d; + d.cpu = cur_cpu; + d.rq_id = c2r(sched_cpu); + d.tasklet = tasklet_work_scheduled; + d.idle = is_idle_unit(currunit); + d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle); + d.tickled = tickled; + __trace_var(TRC_CSCHED2_SCHEDULE, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* Update credits (and budget, if necessary). */ + burn_credits(rqd, scurr, now); + + /* + * Below 0, means that we are capped and we have overrun our budget. + * Let's try to get some more but, if we fail (e.g., because of the + * other running units), we will be parked. + */ + if ( unlikely(scurr->budget <= 0) ) + unit_grab_budget(scurr); + + /* + * Select next runnable local UNIT (ie top of local runq). 
+ * + * If the current unit is runnable, and has higher credit than + * the next guy on the queue (or there is noone else), we want to + * run him again. + * + * If there's tasklet work to do, we want to chose the idle unit + * for this processor, and mark the current for delayed runqueue + * add. + * + * If the current unit is runnable, and there's another runnable + * candidate, we want to mark current for delayed runqueue add, + * and remove the next guy from the queue. + * + * If the current unit is not runnable, we want to chose the idle + * unit for this processor. + */ + if ( tasklet_work_scheduled ) + { + __clear_bit(__CSFLAG_unit_yield, &scurr->flags); + trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); + snext = csched2_unit(sched_idle_unit(sched_cpu)); + } + else + snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units); + + /* If switching from a non-idle runnable unit, put it + * back on the runqueue. */ + if ( snext != scurr + && !is_idle_unit(currunit) + && unit_runnable(currunit) ) + __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); + + /* Accounting for non-idle tasks */ + if ( !is_idle_unit(snext->unit) ) + { + /* If switching, remove this from the runqueue and mark it scheduled */ + if ( snext != scurr ) + { + ASSERT(snext->rqd == rqd); + ASSERT(!snext->unit->is_running); + + runq_remove(snext); + __set_bit(__CSFLAG_scheduled, &snext->flags); + } + + /* Clear the idle mask if necessary */ + if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->idle); + smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); + } + + /* + * The reset condition is "has a scheduler epoch come to an end?". + * The way this is enforced is checking whether the unit at the top + * of the runqueue has negative credits. This means the epochs have + * variable length, as in one epoch expores when: + * 1) the unit at the top of the runqueue has executed for + * around 10 ms (with default parameters); + * 2) no other unit with higher credits wants to run. + * + * Here, where we want to check for reset, we need to make sure the + * proper unit is being used. In fact, runqueue_candidate() may have + * not returned the first unit in the runqueue, for various reasons + * (e.g., affinity). Only trigger a reset when it does. + */ + if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) + { + reset_credit(ops, sched_cpu, now, snext); + balance_load(ops, sched_cpu, now); + } + + snext->start_time = now; + snext->tickled_cpu = -1; + + /* Safe because lock for old processor is held */ + if ( sched_unit_master(snext->unit) != sched_cpu ) + { + snext->credit += CSCHED2_MIGRATE_COMPENSATION; + sched_set_res(snext->unit, get_sched_res(sched_cpu)); + SCHED_STAT_CRANK(migrated); + migrated = true; + } + } + else + { + /* + * Update the idle mask if necessary. Note that, if we're scheduling + * idle in order to carry on some tasklet work, we want to play busy! + */ + if ( tasklet_work_scheduled ) + { + if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->idle); + smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); + } + } + else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_set_cpu(sched_cpu, &rqd->idle); + cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); + smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); + } + /* Make sure avgload gets updated periodically even + * if there's no activity */ + update_load(ops, rqd, NULL, 0, now); + } + + /* + * Return task to run next... 
+ */ + currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now); + currunit->next_task = snext->unit; + snext->unit->migrated = migrated; + + CSCHED2_UNIT_CHECK(currunit->next_task); +} + +static void +csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc) +{ + printk("[%i.%i] flags=%x cpu=%i", + svc->unit->domain->domain_id, + svc->unit->unit_id, + svc->flags, + sched_unit_master(svc->unit)); + + printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); + + if ( has_cap(svc) ) + printk(" budget=%"PRI_stime"(%"PRI_stime")", + svc->budget, svc->budget_quota); + + printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, + (svc->avgload * 100) >> prv->load_precision_shift); + + printk("\n"); +} + +static inline void +dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct csched2_private *prv = csched2_priv(ops); + struct csched2_unit *svc; + + printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n", + cpu, c2r(cpu), + CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + + /* current UNIT (nothing to say if that's the idle unit) */ + svc = csched2_unit(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + printk("\trun: "); + csched2_dump_unit(prv, svc); + } +} + +static void +csched2_dump(const struct scheduler *ops) +{ + struct list_head *iter_sdom; + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + unsigned int i, j, loop; + + /* + * We need the private scheduler lock as we access global + * scheduler data and (below) the list of active domains. + */ + read_lock_irqsave(&prv->lock, flags); + + printk("Active queues: %d\n" + "\tdefault-weight = %d\n", + cpumask_weight(&prv->active_queues), + CSCHED2_DEFAULT_WEIGHT); + for_each_cpu(i, &prv->active_queues) + { + s_time_t fraction; + + fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift; + + printk("Runqueue %d:\n" + "\tncpus = %u\n" + "\tcpus = %*pbl\n" + "\tmax_weight = %u\n" + "\tpick_bias = %u\n" + "\tinstload = %d\n" + "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", + i, + prv->rqd[i].nr_cpus, + CPUMASK_PR(&prv->rqd[i].active), + prv->rqd[i].max_weight, + prv->rqd[i].pick_bias, + prv->rqd[i].load, + prv->rqd[i].avgload, + fraction); + + printk("\tidlers: %*pb\n" + "\ttickled: %*pb\n" + "\tfully idle cores: %*pb\n", + CPUMASK_PR(&prv->rqd[i].idle), + CPUMASK_PR(&prv->rqd[i].tickled), + CPUMASK_PR(&prv->rqd[i].smt_idle)); + } + + printk("Domain info:\n"); + loop = 0; + list_for_each( iter_sdom, &prv->sdom ) + { + struct csched2_dom *sdom; + struct sched_unit *unit; + + sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); + + printk("\tDomain: %d w %d c %u v %d\n", + sdom->dom->domain_id, + sdom->weight, + sdom->cap, + sdom->nr_units); + + for_each_sched_unit ( sdom->dom, unit ) + { + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock; + + lock = unit_schedule_lock(unit); + + printk("\t%3d: ", ++loop); + csched2_dump_unit(prv, svc); + + unit_schedule_unlock(lock, unit); + } + } + + for_each_cpu(i, &prv->active_queues) + { + struct csched2_runqueue_data *rqd = prv->rqd + i; + struct list_head *iter, *runq = &rqd->runq; + int loop = 0; + + /* We need the lock to scan the runqueue. 
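csched2_dump_unit() above turns the fixed-point average load into a percentage with (avgload * 100) >> load_precision_shift. With the same illustrative shift of 18 used earlier, the 50% starting load given to a freshly allocated unit prints exactly as expected:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned int shift = 18;              /* illustrative value    */
        uint64_t avgload = 1ULL << (shift - 1);     /* the 50% starting load */

        /* (131072 * 100) >> 18 == 50 */
        printf("avgload=%llu -> ~%llu%%\n",
               (unsigned long long)avgload,
               (unsigned long long)((avgload * 100) >> shift));
        return 0;
    }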
*/ + spin_lock(&rqd->lock); + + printk("Runqueue %d:\n", i); + + for_each_cpu(j, &rqd->active) + dump_pcpu(ops, j); + + printk("RUNQ:\n"); + list_for_each( iter, runq ) + { + struct csched2_unit *svc = runq_elem(iter); + + if ( svc ) + { + printk("\t%3d: ", loop++); + csched2_dump_unit(prv, svc); + } + } + spin_unlock(&rqd->lock); + } + + read_unlock_irqrestore(&prv->lock, flags); +} + +static void * +csched2_alloc_pdata(const struct scheduler *ops, int cpu) +{ + struct csched2_pcpu *spc; + + spc = xzalloc(struct csched2_pcpu); + if ( spc == NULL ) + return ERR_PTR(-ENOMEM); + + /* Not in any runqueue yet */ + spc->runq_id = -1; + + return spc; +} + +/* Returns the ID of the runqueue the cpu is assigned to. */ +static unsigned +init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc, + unsigned int cpu) +{ + struct csched2_runqueue_data *rqd; + unsigned int rcpu; + + ASSERT(rw_is_write_locked(&prv->lock)); + ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); + /* CPU data needs to be allocated, but still uninitialized. */ + ASSERT(spc && spc->runq_id == -1); + + /* Figure out which runqueue to put it in */ + spc->runq_id = cpu_to_runqueue(prv, cpu); + + rqd = prv->rqd + spc->runq_id; + + printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id); + if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) ) + { + printk(XENLOG_INFO " First cpu on runqueue, activating\n"); + activate_runqueue(prv, spc->runq_id); + } + + __cpumask_set_cpu(cpu, &spc->sibling_mask); + + if ( rqd->nr_cpus > 0 ) + for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) ) + if ( cpumask_test_cpu(rcpu, &rqd->active) ) + { + __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); + __cpumask_set_cpu(rcpu, &spc->sibling_mask); + } + + __cpumask_set_cpu(cpu, &rqd->idle); + __cpumask_set_cpu(cpu, &rqd->active); + __cpumask_set_cpu(cpu, &prv->initialized); + __cpumask_set_cpu(cpu, &rqd->smt_idle); + + rqd->nr_cpus++; + ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); + + if ( rqd->nr_cpus == 1 ) + rqd->pick_bias = cpu; + + return spc->runq_id; +} + +static void +csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct csched2_private *prv = csched2_priv(ops); + spinlock_t *old_lock; + unsigned long flags; + unsigned rqi; + + write_lock_irqsave(&prv->lock, flags); + old_lock = pcpu_schedule_lock(cpu); + + rqi = init_pdata(prv, pdata, cpu); + /* Move the scheduler lock to the new runq lock. */ + get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ + spin_unlock(old_lock); + write_unlock_irqrestore(&prv->lock, flags); +} + +/* Change the scheduler of cpu to us (Credit2). */ +static spinlock_t * +csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct csched2_private *prv = csched2_priv(new_ops); + struct csched2_unit *svc = vdata; + unsigned rqi; + + ASSERT(pdata && svc && is_idle_unit(svc->unit)); + + /* + * We own one runqueue lock already (from schedule_cpu_switch()). This + * looks like it violates this scheduler's locking rules, but it does + * not, as what we own is the lock of another scheduler, that hence has + * no particular (ordering) relationship with our private global lock. + * And owning exactly that one (the lock of the old scheduler of this + * cpu) is what is necessary to prevent races. 
+ */ + ASSERT(!local_irq_is_enabled()); + write_lock(&prv->lock); + + sched_idle_unit(cpu)->priv = vdata; + + rqi = init_pdata(prv, pdata, cpu); + + /* + * Now that we know what runqueue we'll go in, double check what's said + * above: the lock we already hold is not the one of this runqueue of + * this scheduler, and so it's safe to have taken it /before/ our + * private global lock. + */ + ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock); + + write_unlock(&prv->lock); + + return &prv->rqd[rqi].lock; +} + +static void +csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + unsigned long flags; + struct csched2_private *prv = csched2_priv(ops); + struct csched2_runqueue_data *rqd; + struct csched2_pcpu *spc = pcpu; + unsigned int rcpu; + + write_lock_irqsave(&prv->lock, flags); + + /* + * alloc_pdata is not implemented, so pcpu must be NULL. On the other + * hand, init_pdata must have been called for this pCPU. + */ + /* + * Scheduler specific data for this pCPU must still be there and and be + * valid. In fact, if we are here: + * 1. alloc_pdata must have been called for this cpu, and free_pdata + * must not have been called on it before us, + * 2. init_pdata must have been called on this cpu, and deinit_pdata + * (us!) must not have been called on it already. + */ + ASSERT(spc && spc->runq_id != -1); + ASSERT(cpumask_test_cpu(cpu, &prv->initialized)); + + /* Find the old runqueue and remove this cpu from it */ + rqd = prv->rqd + spc->runq_id; + + /* No need to save IRQs here, they're already disabled */ + spin_lock(&rqd->lock); + + printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id); + + __cpumask_clear_cpu(cpu, &rqd->idle); + __cpumask_clear_cpu(cpu, &rqd->smt_idle); + __cpumask_clear_cpu(cpu, &rqd->active); + + for_each_cpu ( rcpu, &rqd->active ) + __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); + + rqd->nr_cpus--; + ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); + + if ( rqd->nr_cpus == 0 ) + { + printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); + deactivate_runqueue(prv, spc->runq_id); + } + else if ( rqd->pick_bias == cpu ) + rqd->pick_bias = cpumask_first(&rqd->active); + + spc->runq_id = -1; + + spin_unlock(&rqd->lock); + + __cpumask_clear_cpu(cpu, &prv->initialized); + + write_unlock_irqrestore(&prv->lock, flags); + + return; +} + +static void +csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched2_pcpu *spc = pcpu; + + /* + * pcpu either points to a valid struct csched2_pcpu, or is NULL (if + * CPU bringup failed, and we're beeing called from CPU_UP_CANCELLED). + * xfree() does not really mind, but we want to be sure that either + * init_pdata has never been called, or deinit_pdata has been called + * already. 
+ */ + ASSERT(!pcpu || spc->runq_id == -1); + ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized)); + + xfree(pcpu); +} + +static int __init +csched2_global_init(void) +{ + if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) + { + printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n", + __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); + opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; + } + + if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) + { + printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n", + __func__, opt_load_window_shift); + opt_load_window_shift = LOADAVG_WINDOW_SHIFT; + } + + if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) + { + printk("WARNING: %s: opt_cap_period %u too small, resetting\n", + __func__, opt_cap_period); + opt_cap_period = 10; /* ms */ + } + + return 0; +} + +static int +csched2_init(struct scheduler *ops) +{ + int i; + struct csched2_private *prv; + + printk("Initializing Credit2 scheduler\n"); + + printk(XENLOG_INFO " load_precision_shift: %d\n" + XENLOG_INFO " load_window_shift: %d\n" + XENLOG_INFO " underload_balance_tolerance: %d\n" + XENLOG_INFO " overload_balance_tolerance: %d\n" + XENLOG_INFO " runqueues arrangement: %s\n" + XENLOG_INFO " cap enforcement granularity: %dms\n", + opt_load_precision_shift, + opt_load_window_shift, + opt_underload_balance_tolerance, + opt_overload_balance_tolerance, + opt_runqueue_str[opt_runqueue], + opt_cap_period); + + printk(XENLOG_INFO "load tracking window length %llu ns\n", + 1ULL << opt_load_window_shift); + + /* + * Basically no CPU information is available at this point; just + * set up basic structures, and a callback when the CPU info is + * available. + */ + + prv = xzalloc(struct csched2_private); + if ( prv == NULL ) + return -ENOMEM; + ops->sched_data = prv; + + rwlock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); + + /* Allocate all runqueues and mark them as un-initialized */ + prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids); + if ( !prv->rqd ) + { + xfree(prv); + return -ENOMEM; + } + for ( i = 0; i < nr_cpu_ids; i++ ) + prv->rqd[i].id = -1; + + /* initialize ratelimit */ + prv->ratelimit_us = sched_ratelimit_us; + + prv->load_precision_shift = opt_load_precision_shift; + prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT; + ASSERT(opt_load_window_shift > 0); + + return 0; +} + +static void +csched2_deinit(struct scheduler *ops) +{ + struct csched2_private *prv; + + prv = csched2_priv(ops); + ops->sched_data = NULL; + if ( prv ) + xfree(prv->rqd); + xfree(prv); +} + +static const struct scheduler sched_credit2_def = { + .name = "SMP Credit Scheduler rev2", + .opt_name = "credit2", + .sched_id = XEN_SCHEDULER_CREDIT2, + .sched_data = NULL, + + .global_init = csched2_global_init, + + .insert_unit = csched2_unit_insert, + .remove_unit = csched2_unit_remove, + + .sleep = csched2_unit_sleep, + .wake = csched2_unit_wake, + .yield = csched2_unit_yield, + + .adjust = csched2_dom_cntl, + .adjust_affinity= csched2_aff_cntl, + .adjust_global = csched2_sys_cntl, + + .pick_resource = csched2_res_pick, + .migrate = csched2_unit_migrate, + .do_schedule = csched2_schedule, + .context_saved = csched2_context_saved, + + .dump_settings = csched2_dump, + .init = csched2_init, + .deinit = csched2_deinit, + .alloc_udata = csched2_alloc_udata, + .free_udata = csched2_free_udata, + .alloc_pdata = csched2_alloc_pdata, + .init_pdata = csched2_init_pdata, + .deinit_pdata = 
csched2_deinit_pdata, + .free_pdata = csched2_free_pdata, + .switch_sched = csched2_switch_sched, + .alloc_domdata = csched2_alloc_domdata, + .free_domdata = csched2_free_domdata, +}; + +REGISTER_SCHEDULER(sched_credit2_def); diff --git a/xen/common/sched/null.c b/xen/common/sched/null.c new file mode 100644 index 0000000000..3f3418c9b1 --- /dev/null +++ b/xen/common/sched/null.c @@ -0,0 +1,1034 @@ +/* + * xen/common/sched_null.c + * + * Copyright (c) 2017, Dario Faggioli, Citrix Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; If not, see . + */ + +/* + * The 'null' scheduler always choose to run, on each pCPU, either nothing + * (i.e., the pCPU stays idle) or always the same unit. + * + * It is aimed at supporting static scenarios, where there always are + * less units than pCPUs (and the units don't need to move among pCPUs + * for any reason) with the least possible overhead. + * + * Typical usecase are embedded applications, but also HPC, especially + * if the scheduler is used inside a cpupool. + */ + +#include +#include +#include +#include + +/* + * null tracing events. Check include/public/trace.h for more details. + */ +#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1) +#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2) +#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3) +#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4) +#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5) +#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6) + +/* + * Locking: + * - Scheduler-lock (a.k.a. runqueue lock): + * + is per-pCPU; + * + serializes assignment and deassignment of units to a pCPU. + * - Private data lock (a.k.a. private scheduler lock): + * + is scheduler-wide; + * + serializes accesses to the list of domains in this scheduler. + * - Waitqueue lock: + * + is scheduler-wide; + * + serialize accesses to the list of units waiting to be assigned + * to pCPUs. + * + * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH, + * waitqueue lock nests inside runqueue lock which nests inside private + * lock. More specifically: + * + if we need both runqueue and private locks, we must acquire the + * private lock for first; + * + if we need both runqueue and waitqueue locks, we must acquire + * the runqueue lock for first; + * + if we need both private and waitqueue locks, we must acquire + * the private lock for first; + * + if we already own a runqueue lock, we must never acquire + * the private lock; + * + if we already own the waitqueue lock, we must never acquire + * the runqueue lock or the private lock. 
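The rules above reduce to a single ordering: private lock outermost, then a runqueue lock, then the waitqueue lock, and never the reverse. A self-contained sketch of the only legal full nesting under that discipline, with pthread mutexes standing in for the scheduler's spinlocks:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t priv_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t runq_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t waitq_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Legal: private outermost, then runqueue, waitqueue innermost. */
    static void full_nesting(void)
    {
        pthread_mutex_lock(&priv_lock);
        pthread_mutex_lock(&runq_lock);
        pthread_mutex_lock(&waitq_lock);

        /* ... e.g. move a waiting unit onto a freed pCPU ... */

        pthread_mutex_unlock(&waitq_lock);
        pthread_mutex_unlock(&runq_lock);
        pthread_mutex_unlock(&priv_lock);
    }

    /* Paths that already hold an inner lock and need an outer one must
     * either drop the inner lock first or use trylock and back off. */

    int main(void)
    {
        full_nesting();
        printf("lock ordering respected\n");
        return 0;
    }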
+ */ + +/* + * System-wide private data + */ +struct null_private { + spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */ + struct list_head ndom; /* Domains of this scheduler */ + struct list_head waitq; /* units not assigned to any pCPU */ + spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */ + cpumask_t cpus_free; /* CPUs without a unit associated to them */ +}; + +/* + * Physical CPU + */ +struct null_pcpu { + struct sched_unit *unit; +}; +DEFINE_PER_CPU(struct null_pcpu, npc); + +/* + * Schedule unit + */ +struct null_unit { + struct list_head waitq_elem; + struct sched_unit *unit; +}; + +/* + * Domain + */ +struct null_dom { + struct list_head ndom_elem; + struct domain *dom; +}; + +/* + * Accessor helpers functions + */ +static inline struct null_private *null_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct null_unit *null_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline bool unit_check_affinity(struct sched_unit *unit, + unsigned int cpu, + unsigned int balance_step) +{ + affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + cpupool_domain_master_cpumask(unit->domain)); + + return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu)); +} + +static int null_init(struct scheduler *ops) +{ + struct null_private *prv; + + printk("Initializing null scheduler\n" + "WARNING: This is experimental software in development.\n" + "Use at your own risk.\n"); + + prv = xzalloc(struct null_private); + if ( prv == NULL ) + return -ENOMEM; + + spin_lock_init(&prv->lock); + spin_lock_init(&prv->waitq_lock); + INIT_LIST_HEAD(&prv->ndom); + INIT_LIST_HEAD(&prv->waitq); + + ops->sched_data = prv; + + return 0; +} + +static void null_deinit(struct scheduler *ops) +{ + xfree(ops->sched_data); + ops->sched_data = NULL; +} + +static void init_pdata(struct null_private *prv, unsigned int cpu) +{ + /* Mark the pCPU as free, and with no unit assigned */ + cpumask_set_cpu(cpu, &prv->cpus_free); + per_cpu(npc, cpu).unit = NULL; +} + +static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct null_private *prv = null_priv(ops); + + /* alloc_pdata is not implemented, so we want this to be NULL. 
*/ + ASSERT(!pdata); + + init_pdata(prv, cpu); +} + +static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct null_private *prv = null_priv(ops); + + /* alloc_pdata not implemented, so this must have stayed NULL */ + ASSERT(!pcpu); + + cpumask_clear_cpu(cpu, &prv->cpus_free); + per_cpu(npc, cpu).unit = NULL; +} + +static void *null_alloc_udata(const struct scheduler *ops, + struct sched_unit *unit, void *dd) +{ + struct null_unit *nvc; + + nvc = xzalloc(struct null_unit); + if ( nvc == NULL ) + return NULL; + + INIT_LIST_HEAD(&nvc->waitq_elem); + nvc->unit = unit; + + SCHED_STAT_CRANK(unit_alloc); + + return nvc; +} + +static void null_free_udata(const struct scheduler *ops, void *priv) +{ + struct null_unit *nvc = priv; + + xfree(nvc); +} + +static void * null_alloc_domdata(const struct scheduler *ops, + struct domain *d) +{ + struct null_private *prv = null_priv(ops); + struct null_dom *ndom; + unsigned long flags; + + ndom = xzalloc(struct null_dom); + if ( ndom == NULL ) + return ERR_PTR(-ENOMEM); + + ndom->dom = d; + + spin_lock_irqsave(&prv->lock, flags); + list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom); + spin_unlock_irqrestore(&prv->lock, flags); + + return ndom; +} + +static void null_free_domdata(const struct scheduler *ops, void *data) +{ + struct null_dom *ndom = data; + struct null_private *prv = null_priv(ops); + + if ( ndom ) + { + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + list_del_init(&ndom->ndom_elem); + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(ndom); + } +} + +/* + * unit to pCPU assignment and placement. This _only_ happens: + * - on insert, + * - on migrate. + * + * Insert occurs when a unit joins this scheduler for the first time + * (e.g., when the domain it's part of is moved to the scheduler's + * cpupool). + * + * Migration may be necessary if a pCPU (with a unit assigned to it) + * is removed from the scheduler's cpupool. + * + * So this is not part of any hot path. + */ +static struct sched_resource * +pick_res(struct null_private *prv, const struct sched_unit *unit) +{ + unsigned int bs; + unsigned int cpu = sched_unit_master(unit), new_cpu; + cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + for_each_affinity_balance_step( bs ) + { + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus); + + /* + * If our processor is free, or we are assigned to it, and it is also + * still valid and part of our affinity, just go for it. + * (Note that we may call unit_check_affinity(), but we deliberately + * don't, so we get to keep in the scratch cpumask what we have just + * put in it.) + */ + if ( likely((per_cpu(npc, cpu).unit == NULL || + per_cpu(npc, cpu).unit == unit) + && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) + { + new_cpu = cpu; + goto out; + } + + /* If not, just go for a free pCPU, within our affinity, if any */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->cpus_free); + new_cpu = cpumask_first(cpumask_scratch_cpu(cpu)); + + if ( likely(new_cpu != nr_cpu_ids) ) + goto out; + } + + /* + * If we didn't find any free pCPU, just pick any valid pcpu, even if + * it has another unit assigned. 
This will happen during shutdown and + * suspend/resume, but it may also happen during "normal operation", if + * all the pCPUs are busy. + * + * In fact, there must always be something sane in v->processor, or + * unit_schedule_lock() and friends won't work. This is not a problem, + * as we will actually assign the unit to the pCPU we return from here, + * only if the pCPU is free. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity); + new_cpu = cpumask_any(cpumask_scratch_cpu(cpu)); + + out: + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t new_cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.new_cpu = new_cpu; + __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d); + } + + return get_sched_res(new_cpu); +} + +static void unit_assign(struct null_private *prv, struct sched_unit *unit, + unsigned int cpu) +{ + ASSERT(is_unit_online(unit)); + + per_cpu(npc, cpu).unit = unit; + sched_set_res(unit, get_sched_res(cpu)); + cpumask_clear_cpu(cpu, &prv->cpus_free); + + dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = cpu; + __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d); + } +} + +/* Returns true if a cpu was tickled */ +static bool unit_deassign(struct null_private *prv, struct sched_unit *unit) +{ + unsigned int bs; + unsigned int cpu = sched_unit_master(unit); + struct null_unit *wvc; + + ASSERT(list_empty(&null_unit(unit)->waitq_elem)); + ASSERT(per_cpu(npc, cpu).unit == unit); + ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free)); + + per_cpu(npc, cpu).unit = NULL; + cpumask_set_cpu(cpu, &prv->cpus_free); + + dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain, + unit->unit_id); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = cpu; + __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d); + } + + spin_lock(&prv->waitq_lock); + + /* + * If unit is assigned to a pCPU, let's see if there is someone waiting, + * suitable to be assigned to it (prioritizing units that have + * soft-affinity with cpu). + */ + for_each_affinity_balance_step( bs ) + { + list_for_each_entry( wvc, &prv->waitq, waitq_elem ) + { + if ( bs == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(wvc->unit) ) + continue; + + if ( unit_check_affinity(wvc->unit, cpu, bs) ) + { + list_del_init(&wvc->waitq_elem); + unit_assign(prv, wvc->unit, cpu); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + spin_unlock(&prv->waitq_lock); + return true; + } + } + } + spin_unlock(&prv->waitq_lock); + + return false; +} + +/* Change the scheduler of cpu to us (null). */ +static spinlock_t *null_switch_sched(struct scheduler *new_ops, + unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct null_private *prv = null_priv(new_ops); + struct null_unit *nvc = vdata; + + ASSERT(nvc && is_idle_unit(nvc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It actually may or may not be the 'right' + * one for this cpu, but that is ok for preventing races. 
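+     * (The &sr->_lock we return below is what the common code will install
+     * as this cpu's schedule_lock once the switch is complete.)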
+ */ + ASSERT(!local_irq_is_enabled()); + + init_pdata(prv, cpu); + + return &sr->_lock; +} + +static void null_unit_insert(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + unsigned int cpu; + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + + lock = unit_schedule_lock_irq(unit); + + if ( unlikely(!is_unit_online(unit)) ) + { + unit_schedule_unlock_irq(lock, unit); + return; + } + + retry: + sched_set_res(unit, pick_res(prv, unit)); + cpu = sched_unit_master(unit); + + spin_unlock(lock); + + lock = unit_schedule_lock(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + /* If the pCPU is free, we assign unit to it */ + if ( likely(per_cpu(npc, cpu).unit == NULL) ) + { + /* + * Insert is followed by vcpu_wake(), so there's no need to poke + * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that. + */ + unit_assign(prv, unit, cpu); + } + else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) ) + { + /* + * If the pCPU is not free (e.g., because we raced with another + * insert or a migrate), but there are other free pCPUs, we can + * try to pick again. + */ + goto retry; + } + else + { + /* + * If the pCPU is not free, and there aren't any (valid) others, + * we have no alternatives than to go into the waitqueue. + */ + spin_lock(&prv->waitq_lock); + list_add_tail(&nvc->waitq_elem, &prv->waitq); + dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n", + unit->domain, unit->unit_id); + spin_unlock(&prv->waitq_lock); + } + spin_unlock_irq(lock); + + SCHED_STAT_CRANK(unit_insert); +} + +static void null_unit_remove(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + + lock = unit_schedule_lock_irq(unit); + + /* If offline, the unit shouldn't be assigned, nor in the waitqueue */ + if ( unlikely(!is_unit_online(unit)) ) + { + ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit); + ASSERT(list_empty(&nvc->waitq_elem)); + goto out; + } + + /* If unit is in waitqueue, just get it out of there and bail */ + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + + goto out; + } + + unit_deassign(prv, unit); + + out: + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_remove); +} + +static void null_unit_wake(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + unsigned int cpu = sched_unit_master(unit); + + ASSERT(!is_idle_unit(unit)); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + /* Not exactly "on runq", but close enough for reusing the counter */ + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + if ( likely(per_cpu(npc, cpu).unit == unit) ) + { + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + return; + } + + /* + * If a unit is neither on a pCPU nor in the waitqueue, it means it was + * offline, and that it is now coming back being online. 
If we're lucky, + * and its previous resource is free (and affinities match), we can just + * assign the unit to it (we own the proper lock already) and be done. + */ + if ( per_cpu(npc, cpu).unit == NULL && + unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) ) + { + if ( !has_soft_affinity(unit) || + unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) ) + { + unit_assign(prv, unit, cpu); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + return; + } + } + + /* + * If the resource is not free (or affinities do not match) we need + * to assign unit to some other one, but we can't do it here, as: + * - we don't own the proper lock, + * - we can't change v->processor under vcpu_wake()'s feet. + * So we add it to the waitqueue, and tickle all the free CPUs (if any) + * on which unit can run. The first one that schedules will pick it up. + */ + spin_lock(&prv->waitq_lock); + list_add_tail(&nvc->waitq_elem, &prv->waitq); + spin_unlock(&prv->waitq_lock); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->cpus_free); + + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", + unit->domain->domain_id, unit->unit_id); + else + cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ); +} + +static void null_unit_sleep(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + unsigned int cpu = sched_unit_master(unit); + bool tickled = false; + + ASSERT(!is_idle_unit(unit)); + + /* + * Check if the unit is in the process of being offlined. If yes, + * we need to remove it from either its pCPU or the waitqueue. + */ + if ( unlikely(!is_unit_online(unit)) ) + { + struct null_unit *nvc = null_unit(unit); + + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + } + else if ( per_cpu(npc, cpu).unit == unit ) + tickled = unit_deassign(prv, unit); + } + + /* If unit is not assigned to a pCPU, or is not running, no need to bother */ + if ( likely(!tickled && curr_on_cpu(cpu) == unit) ) + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + + SCHED_STAT_CRANK(unit_sleep); +} + +static struct sched_resource * +null_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + ASSERT(!is_idle_unit(unit)); + return pick_res(null_priv(ops), unit); +} + +static void null_unit_migrate(const struct scheduler *ops, + struct sched_unit *unit, unsigned int new_cpu) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + + ASSERT(!is_idle_unit(unit)); + + if ( sched_unit_master(unit) == new_cpu ) + return; + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint16_t cpu, new_cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = sched_unit_master(unit); + d.new_cpu = new_cpu; + __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d); + } + + /* + * If unit is assigned to a pCPU, then such pCPU becomes free, and we + * should look in the waitqueue if anyone else can be assigned to it. 
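+     * (unit_deassign() below does exactly that: it scans the waitqueue and
+     * tickles the now-free cpu if it finds a suitable unit for it.)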
+ */ + if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) ) + { + unit_deassign(prv, unit); + SCHED_STAT_CRANK(migrate_running); + } + else if ( !list_empty(&nvc->waitq_elem) ) + SCHED_STAT_CRANK(migrate_on_runq); + + SCHED_STAT_CRANK(migrated); + + /* + * If a unit is (going) offline, we want it to be neither assigned + * to a pCPU, nor in the waitqueue. + * + * If it was on a cpu, we've removed it from there above. If it is + * in the waitqueue, we remove it from there now. And then we bail. + */ + if ( unlikely(!is_unit_online(unit)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + goto out; + } + + /* + * Let's now consider new_cpu, which is where unit is being sent. It can be + * either free, or have a unit already assigned to it. + * + * In the former case we should assign unit to it, and try to get it to run, + * if possible, according to affinity. + * + * In latter, all we can do is to park unit in the waitqueue. + */ + if ( per_cpu(npc, new_cpu).unit == NULL && + unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) ) + { + /* unit might have been in the waitqueue, so remove it */ + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + + unit_assign(prv, unit, new_cpu); + } + else + { + /* Put unit in the waitqueue, if it wasn't there already */ + spin_lock(&prv->waitq_lock); + if ( list_empty(&nvc->waitq_elem) ) + { + list_add_tail(&nvc->waitq_elem, &prv->waitq); + dprintk(XENLOG_G_WARNING, + "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain, + unit->unit_id); + } + spin_unlock(&prv->waitq_lock); + } + + /* + * Whatever all the above, we always at least override v->processor. + * This is especially important for shutdown or suspend/resume paths, + * when it is important to let our caller (cpu_disable_scheduler()) + * know that the migration did happen, to the best of our possibilities, + * at least. In case of suspend, any temporary inconsistency caused + * by this, will be fixed-up during resume. + */ + out: + sched_set_res(unit, get_sched_res(new_cpu)); +} + +#ifndef NDEBUG +static inline void null_unit_check(struct sched_unit *unit) +{ + struct null_unit * const nvc = null_unit(unit); + struct null_dom * const ndom = unit->domain->sched_priv; + + BUG_ON(nvc->unit != unit); + + if ( ndom ) + BUG_ON(is_idle_unit(unit)); + else + BUG_ON(!is_idle_unit(unit)); + + SCHED_STAT_CRANK(unit_check); +} +#define NULL_UNIT_CHECK(unit) (null_unit_check(unit)) +#else +#define NULL_UNIT_CHECK(unit) +#endif + + +/* + * The most simple scheduling function of all times! We either return: + * - the unit assigned to the pCPU, if there's one and it can run; + * - the idle unit, otherwise. 
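+ * (In code terms, tasklet work and the waitqueue scan aside: prev->next_task
+ * is per_cpu(npc, sched_cpu).unit when that is set and runnable, and
+ * sched_idle_unit(sched_cpu) otherwise.)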
+ */ +static void null_schedule(const struct scheduler *ops, struct sched_unit *prev, + s_time_t now, bool tasklet_work_scheduled) +{ + unsigned int bs; + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct null_private *prv = null_priv(ops); + struct null_unit *wvc; + + SCHED_STAT_CRANK(schedule); + NULL_UNIT_CHECK(current->sched_unit); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t tasklet, cpu; + int16_t unit, dom; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + if ( per_cpu(npc, sched_cpu).unit == NULL ) + { + d.unit = d.dom = -1; + } + else + { + d.unit = per_cpu(npc, sched_cpu).unit->unit_id; + d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id; + } + __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d); + } + + if ( tasklet_work_scheduled ) + { + trace_var(TRC_SNULL_TASKLET, 1, 0, NULL); + prev->next_task = sched_idle_unit(sched_cpu); + } + else + prev->next_task = per_cpu(npc, sched_cpu).unit; + prev->next_time = -1; + + /* + * We may be new in the cpupool, or just coming back online. In which + * case, there may be units in the waitqueue that we can assign to us + * and run. + */ + if ( unlikely(prev->next_task == NULL) ) + { + bool unit_found; + + spin_lock(&prv->waitq_lock); + + if ( list_empty(&prv->waitq) ) + goto unlock; + + /* + * We scan the waitqueue twice, for prioritizing units that have + * soft-affinity with cpu. This may look like something expensive to + * do here in null_schedule(), but it's actually fine, because we do + * it only in cases where a pcpu has no unit associated (e.g., as + * said above, the cpu has just joined a cpupool). + */ + unit_found = false; + for_each_affinity_balance_step( bs ) + { + list_for_each_entry( wvc, &prv->waitq, waitq_elem ) + { + if ( bs == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(wvc->unit) ) + continue; + + if ( unit_check_affinity(wvc->unit, sched_cpu, bs) ) + { + spinlock_t *lock; + + unit_found = true; + + /* + * If the unit in the waitqueue has just come up online, + * we risk racing with vcpu_wake(). To avoid this, sync + * on the spinlock that vcpu_wake() holds, but only with + * trylock, to avoid deadlock). + */ + lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit)); + + /* + * We know the vcpu's lock is not this resource's lock. In + * fact, if it were, since this cpu is free, vcpu_wake() + * would have assigned the unit to here directly. + */ + ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock); + + if ( lock ) { + unit_assign(prv, wvc->unit, sched_cpu); + list_del_init(&wvc->waitq_elem); + prev->next_task = wvc->unit; + spin_unlock(lock); + goto unlock; + } + } + } + } + /* + * If we did find a unit with suitable affinity in the waitqueue, but + * we could not pick it up (due to lock contention), and hence we are + * still free, plan for another try. In fact, we don't want such unit + * to be stuck in the waitqueue, when there are free cpus where it + * could run. 
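+     * (Raising SCHEDULE_SOFTIRQ on ourselves simply gets null_schedule()
+     * invoked again on this cpu, so we can retry grabbing that unit's lock.)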
+ */ + if ( unlikely( unit_found && prev->next_task == NULL && + !list_empty(&prv->waitq)) ) + cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ); + unlock: + spin_unlock(&prv->waitq_lock); + + if ( prev->next_task == NULL && + !cpumask_test_cpu(sched_cpu, &prv->cpus_free) ) + cpumask_set_cpu(sched_cpu, &prv->cpus_free); + } + + if ( unlikely(prev->next_task == NULL || + !unit_runnable_state(prev->next_task)) ) + prev->next_task = sched_idle_unit(sched_cpu); + + NULL_UNIT_CHECK(prev->next_task); + + prev->next_task->migrated = false; +} + +static inline void dump_unit(struct null_private *prv, struct null_unit *nvc) +{ + printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id, + nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ? + sched_unit_master(nvc->unit) : -1); +} + +static void null_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc; + spinlock_t *lock; + unsigned long flags; + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}", + cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + if ( per_cpu(npc, cpu).unit != NULL ) + printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain, + per_cpu(npc, cpu).unit->unit_id); + printk("\n"); + + /* current unit (nothing to say if that's the idle unit) */ + nvc = null_unit(curr_on_cpu(cpu)); + if ( nvc && !is_idle_unit(nvc->unit) ) + { + printk("\trun: "); + dump_unit(prv, nvc); + printk("\n"); + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); +} + +static void null_dump(const struct scheduler *ops) +{ + struct null_private *prv = null_priv(ops); + struct list_head *iter; + unsigned long flags; + unsigned int loop; + + spin_lock_irqsave(&prv->lock, flags); + + printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free)); + + printk("Domain info:\n"); + loop = 0; + list_for_each( iter, &prv->ndom ) + { + struct null_dom *ndom; + struct sched_unit *unit; + + ndom = list_entry(iter, struct null_dom, ndom_elem); + + printk("\tDomain: %d\n", ndom->dom->domain_id); + for_each_sched_unit( ndom->dom, unit ) + { + struct null_unit * const nvc = null_unit(unit); + spinlock_t *lock; + + lock = unit_schedule_lock(unit); + + printk("\t%3d: ", ++loop); + dump_unit(prv, nvc); + printk("\n"); + + unit_schedule_unlock(lock, unit); + } + } + + printk("Waitqueue: "); + loop = 0; + spin_lock(&prv->waitq_lock); + list_for_each( iter, &prv->waitq ) + { + struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem); + + if ( loop++ != 0 ) + printk(", "); + if ( loop % 24 == 0 ) + printk("\n\t"); + printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id); + } + printk("\n"); + spin_unlock(&prv->waitq_lock); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static const struct scheduler sched_null_def = { + .name = "null Scheduler", + .opt_name = "null", + .sched_id = XEN_SCHEDULER_NULL, + .sched_data = NULL, + + .init = null_init, + .deinit = null_deinit, + .init_pdata = null_init_pdata, + .switch_sched = null_switch_sched, + .deinit_pdata = null_deinit_pdata, + + .alloc_udata = null_alloc_udata, + .free_udata = null_free_udata, + .alloc_domdata = null_alloc_domdata, + .free_domdata = null_free_domdata, + + .insert_unit = null_unit_insert, + .remove_unit = null_unit_remove, + + .wake = null_unit_wake, + .sleep = null_unit_sleep, + .pick_resource = null_res_pick, + .migrate = null_unit_migrate, + .do_schedule = null_schedule, + + .dump_cpu_state = null_dump_pcpu, + .dump_settings = null_dump, +}; + 
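+
+/*
+ * Usage note (illustrative only, not exercised by this file): the scheduler
+ * is selected by its opt_name, so "sched=null" on the Xen command line makes
+ * it the system-wide default, while a dedicated cpupool can be set up with
+ * the xl toolstack along the lines of:
+ *
+ *   xl cpupool-create name="pool-null" sched="null" cpus=["2","3"]
+ *   xl cpupool-migrate <domain> pool-null
+ */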
+REGISTER_SCHEDULER(sched_null_def); diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c new file mode 100644 index 0000000000..c40a7e4990 --- /dev/null +++ b/xen/common/sched/rt.c @@ -0,0 +1,1571 @@ +/***************************************************************************** + * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen + * EDF scheduling is a real-time scheduling algorithm used in embedded field. + * + * by Sisu Xi, 2013, Washington University in Saint Louis + * Meng Xu, 2014-2016, University of Pennsylvania + * + * Conversion toward event driven model by Tianyang Chen + * and Dagaen Golomb, 2016, University of Pennsylvania + * + * based on the code of credit Scheduler + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TODO: + * + * Migration compensation and resist like credit2 to better use cache; + * Lock Holder Problem, using yield? + * Self switch problem: UNITs of the same domain may preempt each other; + */ + +/* + * Design: + * + * This scheduler follows the Preemptive Global Earliest Deadline First (EDF) + * theory in real-time field. + * At any scheduling point, the UNIT with earlier deadline has higher priority. + * The scheduler always picks highest priority UNIT to run on a feasible PCPU. + * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle or + * has a lower-priority UNIT running on it.) + * + * Each UNIT has a dedicated period, budget and a extratime flag + * The deadline of an UNIT is at the end of each period; + * An UNIT has its budget replenished at the beginning of each period; + * While scheduled, an UNIT burns its budget. + * The UNIT needs to finish its budget before its deadline in each period; + * The UNIT discards its unused budget at the end of each period. + * When an UNIT runs out of budget in a period, if its extratime flag is set, + * the UNIT increases its priority_level by 1 and refills its budget; otherwise, + * it has to wait until next period. + * + * Each UNIT is implemented as a deferable server. + * When an UNIT has a task running on it, its budget is continuously burned; + * When an UNIT has no task but with budget left, its budget is preserved. + * + * Queue scheme: + * A global runqueue and a global depletedqueue for each CPU pool. + * The runqueue holds all runnable UNITs with budget, + * sorted by priority_level and deadline; + * The depletedqueue holds all UNITs without budget, unsorted; + * + * Note: cpumask and cpupool is supported. + */ + +/* + * Locking: + * A global system lock is used to protect the RunQ and DepletedQ. + * The global lock is referenced by sched_res->schedule_lock + * from all physical cpus. + * + * The lock is already grabbed when calling wake/sleep/schedule/ functions + * in schedule.c + * + * The functions involes RunQ and needs to grab locks are: + * unit_insert, unit_remove, context_saved, runq_insert + */ + + +/* + * Default parameters: + * Period and budget in default is 10 and 4 ms, respectively + */ +#define RTDS_DEFAULT_PERIOD (MICROSECS(10000)) +#define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) + +/* + * Max period: max delta of time type, because period is added to the time + * an unit activates, so this must not overflow. + * Min period: 10 us, considering the scheduling overhead (when period is + * too low, scheduling is invoked too frequently, causing high overhead). 
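+ *
+ * For example, with the defaults above (10 ms period, 4 ms budget) a unit is
+ * reserved 40% of a pCPU in every 10 ms window; rt_dom_cntl() rejects, with
+ * -EINVAL, any request with period < RTDS_MIN_PERIOD, period > RTDS_MAX_PERIOD,
+ * budget < RTDS_MIN_BUDGET or budget > period.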
+ */ +#define RTDS_MAX_PERIOD (STIME_DELTA_MAX) +#define RTDS_MIN_PERIOD (MICROSECS(10)) + +/* + * Min budget: 10 us, considering the scheduling overhead (when budget is + * consumed too fast, scheduling is invoked too frequently, causing + * high overhead). + */ +#define RTDS_MIN_BUDGET (MICROSECS(10)) + +/* + * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding + * the next deadline, performing addition could be faster if the difference + * between cur_deadline and now is small. If the difference is bigger than + * 1024 * period, use multiplication. + */ +#define UPDATE_LIMIT_SHIFT 10 + +/* + * Flags + */ +/* + * RTDS_scheduled: Is this unit either running on, or context-switching off, + * a physical cpu? + * + Accessed only with global lock held. + * + Set when chosen as next in rt_schedule(). + * + Cleared after context switch has been saved in rt_context_saved() + * + Checked in unit_wake to see if we can add to the Runqueue, or if we should + * set RTDS_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __RTDS_scheduled 1 +#define RTDS_scheduled (1<<__RTDS_scheduled) +/* + * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ + * once it's done being context switching out? + * + Set when scheduling out in rt_schedule() if prev is runable + * + Set in rt_unit_wake if it finds RTDS_scheduled set + * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ + * and clears the bit. + */ +#define __RTDS_delayed_runq_add 2 +#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) + +/* + * RTDS_depleted: Does this vcp run out of budget? + * This flag is + * + set in burn_budget() if an unit has zero budget left; + * + cleared and checked in the repenishment handler, + * for the units that are being replenished. + */ +#define __RTDS_depleted 3 +#define RTDS_depleted (1<<__RTDS_depleted) + +/* + * RTDS_extratime: Can the unit run in the time that is + * not part of any real-time reservation, and would therefore + * be otherwise left idle? + */ +#define __RTDS_extratime 4 +#define RTDS_extratime (1<<__RTDS_extratime) + +/* + * rt tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. + */ +#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1) +#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2) +#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3) +#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4) +#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5) +#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6) + +static void repl_timer_handler(void *data); + +/* + * System-wide private data, include global RunQueue/DepletedQ + * Global lock is referenced by sched_res->schedule_lock from all + * physical cpus. 
It can be grabbed via unit_schedule_lock_irq() + */ +struct rt_private { + spinlock_t lock; /* the global coarse-grained lock */ + struct list_head sdom; /* list of availalbe domains, used for dump */ + + struct list_head runq; /* ordered list of runnable units */ + struct list_head depletedq; /* unordered list of depleted units */ + + struct timer repl_timer; /* replenishment timer */ + struct list_head replq; /* ordered list of units that need replenishment */ + + cpumask_t tickled; /* cpus been tickled */ +}; + +/* + * Virtual CPU + */ +struct rt_unit { + struct list_head q_elem; /* on the runq/depletedq list */ + struct list_head replq_elem; /* on the replenishment events list */ + + /* UNIT parameters, in nanoseconds */ + s_time_t period; + s_time_t budget; + + /* UNIT current information in nanosecond */ + s_time_t cur_budget; /* current budget */ + s_time_t last_start; /* last start time */ + s_time_t cur_deadline; /* current deadline for EDF */ + + /* Up-pointers */ + struct rt_dom *sdom; + struct sched_unit *unit; + + unsigned priority_level; + + unsigned flags; /* mark __RTDS_scheduled, etc.. */ +}; + +/* + * Domain + */ +struct rt_dom { + struct list_head sdom_elem; /* link list on rt_priv */ + struct domain *dom; /* pointer to upper domain */ +}; + +/* + * Useful inline functions + */ +static inline struct rt_private *rt_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct rt_unit *rt_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline struct list_head *rt_runq(const struct scheduler *ops) +{ + return &rt_priv(ops)->runq; +} + +static inline struct list_head *rt_depletedq(const struct scheduler *ops) +{ + return &rt_priv(ops)->depletedq; +} + +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ + return &rt_priv(ops)->replq; +} + +static inline bool has_extratime(const struct rt_unit *svc) +{ + return svc->flags & RTDS_extratime; +} + +/* + * Helper functions for manipulating the runqueue, the depleted queue, + * and the replenishment events queue. + */ +static int +unit_on_q(const struct rt_unit *svc) +{ + return !list_empty(&svc->q_elem); +} + +static struct rt_unit * +q_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_unit, q_elem); +} + +static struct rt_unit * +replq_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_unit, replq_elem); +} + +static int +unit_on_replq(const struct rt_unit *svc) +{ + return !list_empty(&svc->replq_elem); +} + +/* + * If v1 priority >= v2 priority, return value > 0 + * Otherwise, return value < 0 + */ +static s_time_t +compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2) +{ + int prio = v2->priority_level - v1->priority_level; + + if ( prio == 0 ) + return v2->cur_deadline - v1->cur_deadline; + + return prio; +} + +/* + * Debug related code, dump unit/cpu information + */ +static void +rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc) +{ + cpumask_t *cpupool_mask, *mask; + + ASSERT(svc != NULL); + /* idle unit */ + if( svc->sdom == NULL ) + { + printk("\n"); + return; + } + + /* + * We can't just use 'cpumask_scratch' because the dumping can + * happen from a pCPU outside of this scheduler's cpupool, and + * hence it's not right to use its pCPU's scratch mask. + * On the other hand, it is safe to use sched_unit_master(svc->unit)'s + * own scratch space, since we hold the runqueue lock. 
+ */ + mask = cpumask_scratch_cpu(sched_unit_master(svc->unit)); + + cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain); + cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity); + printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," + " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n" + " \t\t priority_level=%d has_extratime=%d\n" + " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n", + svc->unit->domain->domain_id, + svc->unit->unit_id, + sched_unit_master(svc->unit), + svc->period, + svc->budget, + svc->cur_budget, + svc->cur_deadline, + svc->last_start, + svc->priority_level, + has_extratime(svc), + unit_on_q(svc), + unit_runnable(svc->unit), + svc->flags, CPUMASK_PR(mask)); +} + +static void +rt_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + printk("CPU[%02d]\n", cpu); + /* current UNIT (nothing to say if that's the idle unit). */ + svc = rt_unit(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + rt_dump_unit(ops, svc); + } + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void +rt_dump(const struct scheduler *ops) +{ + struct list_head *runq, *depletedq, *replq, *iter; + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + struct rt_dom *sdom; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + if ( list_empty(&prv->sdom) ) + goto out; + + runq = rt_runq(ops); + depletedq = rt_depletedq(ops); + replq = rt_replq(ops); + + printk("Global RunQueue info:\n"); + list_for_each ( iter, runq ) + { + svc = q_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Global DepletedQueue info:\n"); + list_for_each ( iter, depletedq ) + { + svc = q_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Global Replenishment Events info:\n"); + list_for_each ( iter, replq ) + { + svc = replq_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Domain info:\n"); + list_for_each ( iter, &prv->sdom ) + { + struct sched_unit *unit; + + sdom = list_entry(iter, struct rt_dom, sdom_elem); + printk("\tdomain: %d\n", sdom->dom->domain_id); + + for_each_sched_unit ( sdom->dom, unit ) + { + svc = rt_unit(unit); + rt_dump_unit(ops, svc); + } + } + + out: + spin_unlock_irqrestore(&prv->lock, flags); +} + +/* + * update deadline and budget when now >= cur_deadline + * it needs to be updated to the deadline of the current period + */ +static void +rt_update_deadline(s_time_t now, struct rt_unit *svc) +{ + ASSERT(now >= svc->cur_deadline); + ASSERT(svc->period != 0); + + if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now ) + { + do + svc->cur_deadline += svc->period; + while ( svc->cur_deadline <= now ); + } + else + { + long count = ((now - svc->cur_deadline) / svc->period) + 1; + svc->cur_deadline += count * svc->period; + } + + /* + * svc may be scheduled to run immediately after it misses deadline + * Then rt_update_deadline is called before rt_schedule, which + * should only deduct the time spent in current period from the budget + */ + svc->last_start = now; + svc->cur_budget = svc->budget; + svc->priority_level = 0; + + /* TRACE */ + { + struct __packed { + unsigned unit:16, dom:16; + unsigned priority_level; + uint64_t cur_deadline, cur_budget; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.priority_level = svc->priority_level; + d.cur_deadline = (uint64_t) svc->cur_deadline; + d.cur_budget = (uint64_t) svc->cur_budget; 
+ trace_var(TRC_RTDS_BUDGET_REPLENISH, 1, + sizeof(d), + (unsigned char *) &d); + } + + return; +} + +/* + * Helpers for removing and inserting an unit in a queue + * that is being kept ordered by the units' deadlines (as EDF + * mandates). + * + * For callers' convenience, the unit removing helper returns + * true if the unit removed was the one at the front of the + * queue; similarly, the inserting helper returns true if the + * inserted ended at the front of the queue (i.e., in both + * cases, if the unit with the earliest deadline is what we + * are dealing with). + */ +static inline bool +deadline_queue_remove(struct list_head *queue, struct list_head *elem) +{ + int pos = 0; + + if ( queue->next != elem ) + pos = 1; + + list_del_init(elem); + return !pos; +} + +static inline bool +deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *), + struct rt_unit *svc, struct list_head *elem, + struct list_head *queue) +{ + struct list_head *iter; + int pos = 0; + + list_for_each ( iter, queue ) + { + struct rt_unit * iter_svc = (*qelem)(iter); + if ( compare_unit_priority(svc, iter_svc) > 0 ) + break; + pos++; + } + list_add_tail(elem, iter); + return !pos; +} +#define deadline_runq_insert(...) \ + deadline_queue_insert(&q_elem, ##__VA_ARGS__) +#define deadline_replq_insert(...) \ + deadline_queue_insert(&replq_elem, ##__VA_ARGS__) + +static inline void +q_remove(struct rt_unit *svc) +{ + ASSERT( unit_on_q(svc) ); + list_del_init(&svc->q_elem); +} + +static inline void +replq_remove(const struct scheduler *ops, struct rt_unit *svc) +{ + struct rt_private *prv = rt_priv(ops); + struct list_head *replq = rt_replq(ops); + + ASSERT( unit_on_replq(svc) ); + + if ( deadline_queue_remove(replq, &svc->replq_elem) ) + { + /* + * The replenishment timer needs to be set to fire when a + * replenishment for the unit at the front of the replenishment + * queue is due. If it is such unit that we just removed, we may + * need to reprogram the timer. + */ + if ( !list_empty(replq) ) + { + struct rt_unit *svc_next = replq_elem(replq->next); + set_timer(&prv->repl_timer, svc_next->cur_deadline); + } + else + stop_timer(&prv->repl_timer); + } +} + +/* + * Insert svc with budget in RunQ according to EDF: + * units with smaller deadlines go first. + * Insert svc without budget in DepletedQ unsorted; + */ +static void +runq_insert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct rt_private *prv = rt_priv(ops); + struct list_head *runq = rt_runq(ops); + + ASSERT( spin_is_locked(&prv->lock) ); + ASSERT( !unit_on_q(svc) ); + ASSERT( unit_on_replq(svc) ); + + /* add svc to runq if svc still has budget or its extratime is set */ + if ( svc->cur_budget > 0 || + has_extratime(svc) ) + deadline_runq_insert(svc, &svc->q_elem, runq); + else + list_add(&svc->q_elem, &prv->depletedq); +} + +static void +replq_insert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct list_head *replq = rt_replq(ops); + struct rt_private *prv = rt_priv(ops); + + ASSERT( !unit_on_replq(svc) ); + + /* + * The timer may be re-programmed if svc is inserted + * at the front of the event list. + */ + if ( deadline_replq_insert(svc, &svc->replq_elem, replq) ) + set_timer(&prv->repl_timer, svc->cur_deadline); +} + +/* + * Removes and re-inserts an event to the replenishment queue. + * The aim is to update its position inside the queue, as its + * deadline (and hence its replenishment time) could have + * changed. 
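+ * (rt_unit_wake() relies on this when a unit that is still flagged
+ * RTDS_scheduled wakes up after having missed its deadline.)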
+ */ +static void +replq_reinsert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct list_head *replq = rt_replq(ops); + struct rt_unit *rearm_svc = svc; + bool_t rearm = 0; + + ASSERT( unit_on_replq(svc) ); + + /* + * If svc was at the front of the replenishment queue, we certainly + * need to re-program the timer, and we want to use the deadline of + * the unit which is now at the front of the queue (which may still + * be svc or not). + * + * We may also need to re-program, if svc has been put at the front + * of the replenishment queue when being re-inserted. + */ + if ( deadline_queue_remove(replq, &svc->replq_elem) ) + { + deadline_replq_insert(svc, &svc->replq_elem, replq); + rearm_svc = replq_elem(replq->next); + rearm = 1; + } + else + rearm = deadline_replq_insert(svc, &svc->replq_elem, replq); + + if ( rearm ) + set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline); +} + +/* + * Pick a valid resource for the unit vc + * Valid resource of an unit is intesection of unit's affinity + * and available resources + */ +static struct sched_resource * +rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + cpumask_t cpus; + cpumask_t *online; + int cpu; + + online = cpupool_domain_master_cpumask(unit->domain); + cpumask_and(&cpus, online, unit->cpu_hard_affinity); + + cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus) + ? sched_unit_master(unit) + : cpumask_cycle(sched_unit_master(unit), &cpus); + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); + + return get_sched_res(cpu); +} + +/* + * Init/Free related code + */ +static int +rt_init(struct scheduler *ops) +{ + int rc = -ENOMEM; + struct rt_private *prv = xzalloc(struct rt_private); + + printk("Initializing RTDS scheduler\n" + "WARNING: This is experimental software in development.\n" + "Use at your own risk.\n"); + + if ( prv == NULL ) + goto err; + + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); + INIT_LIST_HEAD(&prv->runq); + INIT_LIST_HEAD(&prv->depletedq); + INIT_LIST_HEAD(&prv->replq); + + ops->sched_data = prv; + rc = 0; + + err: + if ( rc ) + xfree(prv); + + return rc; +} + +static void +rt_deinit(struct scheduler *ops) +{ + struct rt_private *prv = rt_priv(ops); + + ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid || + prv->repl_timer.status == TIMER_STATUS_killed); + + ops->sched_data = NULL; + xfree(prv); +} + +/* + * Point per_cpu spinlock to the global system lock; + * All cpu have same global system lock + */ +static void +rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct rt_private *prv = rt_priv(ops); + spinlock_t *old_lock; + unsigned long flags; + + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + /* + * TIMER_STATUS_invalid means we are the first cpu that sees the timer + * allocated but not initialized, and so it's up to us to initialize it. + */ + if ( prv->repl_timer.status == TIMER_STATUS_invalid ) + { + init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu); + dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); + } + + /* Move the scheduler lock to our global runqueue lock. */ + get_sched_res(cpu)->schedule_lock = &prv->lock; + + /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */ + spin_unlock_irqrestore(old_lock, flags); +} + +/* Change the scheduler of cpu to us (RTDS). 
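Called via the switch_sched hook; the &prv->lock returned below becomes this cpu's schedule_lock.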
*/ +static spinlock_t * +rt_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct rt_private *prv = rt_priv(new_ops); + struct rt_unit *svc = vdata; + + ASSERT(!pdata && svc && is_idle_unit(svc->unit)); + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It's actually the runqueue lock of + * another scheduler, but that is how things need to be, for + * preventing races. + */ + ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock); + + /* + * If we are the absolute first cpu being switched toward this + * scheduler (in which case we'll see TIMER_STATUS_invalid), or the + * first one that is added back to the cpupool that had all its cpus + * removed (in which case we'll see TIMER_STATUS_killed), it's our + * job to (re)initialize the timer. + */ + if ( prv->repl_timer.status == TIMER_STATUS_invalid || + prv->repl_timer.status == TIMER_STATUS_killed ) + { + init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu); + dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); + } + + sched_idle_unit(cpu)->priv = vdata; + + return &prv->lock; +} + +static void +rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + unsigned long flags; + struct rt_private *prv = rt_priv(ops); + + spin_lock_irqsave(&prv->lock, flags); + + if ( prv->repl_timer.cpu == cpu ) + { + cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid; + unsigned int new_cpu = cpumask_cycle(cpu, online); + + /* + * Make sure the timer run on one of the cpus that are still available + * to this scheduler. If there aren't any left, it means it's the time + * to just kill it. + */ + if ( new_cpu >= nr_cpu_ids ) + { + kill_timer(&prv->repl_timer); + dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu); + } + else + { + migrate_timer(&prv->repl_timer, new_cpu); + } + } + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void * +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + unsigned long flags; + struct rt_dom *sdom; + struct rt_private * prv = rt_priv(ops); + + sdom = xzalloc(struct rt_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + + /* spinlock here to insert the dom */ + spin_lock_irqsave(&prv->lock, flags); + list_add_tail(&sdom->sdom_elem, &(prv->sdom)); + spin_unlock_irqrestore(&prv->lock, flags); + + return sdom; +} + +static void +rt_free_domdata(const struct scheduler *ops, void *data) +{ + struct rt_dom *sdom = data; + struct rt_private *prv = rt_priv(ops); + + if ( sdom ) + { + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + list_del_init(&sdom->sdom_elem); + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(sdom); + } +} + +static void * +rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd) +{ + struct rt_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct rt_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->q_elem); + INIT_LIST_HEAD(&svc->replq_elem); + svc->flags = 0U; + svc->sdom = dd; + svc->unit = unit; + svc->last_start = 0; + + __set_bit(__RTDS_extratime, &svc->flags); + svc->priority_level = 0; + svc->period = RTDS_DEFAULT_PERIOD; + if ( !is_idle_unit(unit) ) + svc->budget = RTDS_DEFAULT_BUDGET; + + SCHED_STAT_CRANK(unit_alloc); + + return svc; +} + +static void +rt_free_udata(const struct scheduler *ops, void *priv) +{ + struct rt_unit *svc = priv; + + xfree(svc); +} + +/* + * It is called in 
sched_move_domain() and sched_init_vcpu + * in schedule.c. + * When move a domain to a new cpupool. + * It inserts units of moving domain to the scheduler's RunQ in + * dest. cpupool. + */ +static void +rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit *svc = rt_unit(unit); + s_time_t now; + spinlock_t *lock; + + BUG_ON( is_idle_unit(unit) ); + + /* This is safe because unit isn't yet being scheduled */ + sched_set_res(unit, rt_res_pick(ops, unit)); + + lock = unit_schedule_lock_irq(unit); + + now = NOW(); + if ( now >= svc->cur_deadline ) + rt_update_deadline(now, svc); + + if ( !unit_on_q(svc) && unit_runnable(unit) ) + { + replq_insert(ops, svc); + + if ( !unit->is_running ) + runq_insert(ops, svc); + } + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_insert); +} + +/* + * Remove rt_unit svc from the old scheduler in source cpupool. + */ +static void +rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + struct rt_dom * const sdom = svc->sdom; + spinlock_t *lock; + + SCHED_STAT_CRANK(unit_remove); + + BUG_ON( sdom == NULL ); + + lock = unit_schedule_lock_irq(unit); + if ( unit_on_q(svc) ) + q_remove(svc); + + if ( unit_on_replq(svc) ) + replq_remove(ops,svc); + + unit_schedule_unlock_irq(lock, unit); +} + +/* + * Burn budget in nanosecond granularity + */ +static void +burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now) +{ + s_time_t delta; + + /* don't burn budget for idle UNIT */ + if ( is_idle_unit(svc->unit) ) + return; + + /* burn at nanoseconds level */ + delta = now - svc->last_start; + /* + * delta < 0 only happens in nested virtualization; + * TODO: how should we handle delta < 0 in a better way? + */ + if ( delta < 0 ) + { + printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n", + __func__, delta); + svc->last_start = now; + return; + } + + svc->cur_budget -= delta; + svc->last_start = now; + + if ( svc->cur_budget <= 0 ) + { + if ( has_extratime(svc) ) + { + svc->priority_level++; + svc->cur_budget = svc->budget; + } + else + { + svc->cur_budget = 0; + __set_bit(__RTDS_depleted, &svc->flags); + } + } + + /* TRACE */ + { + struct __packed { + unsigned unit:16, dom:16; + uint64_t cur_budget; + int delta; + unsigned priority_level; + bool has_extratime; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.cur_budget = (uint64_t) svc->cur_budget; + d.delta = delta; + d.priority_level = svc->priority_level; + d.has_extratime = svc->flags & RTDS_extratime; + trace_var(TRC_RTDS_BUDGET_BURN, 1, + sizeof(d), + (unsigned char *) &d); + } +} + +/* + * RunQ is sorted. Pick first one within cpumask. 
If no one, return NULL + * lock is grabbed before calling this function + */ +static struct rt_unit * +runq_pick(const struct scheduler *ops, const cpumask_t *mask) +{ + struct list_head *runq = rt_runq(ops); + struct list_head *iter; + struct rt_unit *svc = NULL; + struct rt_unit *iter_svc = NULL; + cpumask_t cpu_common; + cpumask_t *online; + + list_for_each ( iter, runq ) + { + iter_svc = q_elem(iter); + + /* mask cpu_hard_affinity & cpupool & mask */ + online = cpupool_domain_master_cpumask(iter_svc->unit->domain); + cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity); + cpumask_and(&cpu_common, mask, &cpu_common); + if ( cpumask_empty(&cpu_common) ) + continue; + + ASSERT( iter_svc->cur_budget > 0 ); + + svc = iter_svc; + break; + } + + /* TRACE */ + { + if( svc != NULL ) + { + struct __packed { + unsigned unit:16, dom:16; + uint64_t cur_deadline, cur_budget; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.cur_deadline = (uint64_t) svc->cur_deadline; + d.cur_budget = (uint64_t) svc->cur_budget; + trace_var(TRC_RTDS_RUNQ_PICK, 1, + sizeof(d), + (unsigned char *) &d); + } + } + + return svc; +} + +/* + * schedule function for rt scheduler. + * The lock is already grabbed in schedule.c, no need to lock here + */ +static void +rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, + s_time_t now, bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct rt_private *prv = rt_priv(ops); + struct rt_unit *const scurr = rt_unit(currunit); + struct rt_unit *snext = NULL; + bool migrated = false; + + /* TRACE */ + { + struct __packed { + unsigned cpu:16, tasklet:8, tickled:4, idle:4; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled); + d.idle = is_idle_unit(currunit); + trace_var(TRC_RTDS_SCHEDULE, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* clear ticked bit now that we've been scheduled */ + cpumask_clear_cpu(sched_cpu, &prv->tickled); + + /* burn_budget would return for IDLE UNIT */ + burn_budget(ops, scurr, now); + + if ( tasklet_work_scheduled ) + { + trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL); + snext = rt_unit(sched_idle_unit(sched_cpu)); + } + else + { + snext = runq_pick(ops, cpumask_of(sched_cpu)); + + if ( snext == NULL ) + snext = rt_unit(sched_idle_unit(sched_cpu)); + else if ( !unit_runnable_state(snext->unit) ) + { + q_remove(snext); + snext = rt_unit(sched_idle_unit(sched_cpu)); + } + + /* if scurr has higher priority and budget, still pick scurr */ + if ( !is_idle_unit(currunit) && + unit_runnable_state(currunit) && + scurr->cur_budget > 0 && + ( is_idle_unit(snext->unit) || + compare_unit_priority(scurr, snext) > 0 ) ) + snext = scurr; + } + + if ( snext != scurr && + !is_idle_unit(currunit) && + unit_runnable(currunit) ) + __set_bit(__RTDS_delayed_runq_add, &scurr->flags); + + snext->last_start = now; + currunit->next_time = -1; /* if an idle unit is picked */ + if ( !is_idle_unit(snext->unit) ) + { + if ( snext != scurr ) + { + q_remove(snext); + __set_bit(__RTDS_scheduled, &snext->flags); + } + if ( sched_unit_master(snext->unit) != sched_cpu ) + { + sched_set_res(snext->unit, get_sched_res(sched_cpu)); + migrated = true; + } + /* Invoke the scheduler next time. 
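That is, program the next scheduling decision for the point where snext's remaining budget runs out.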
*/ + currunit->next_time = snext->cur_budget; + } + currunit->next_task = snext->unit; + snext->unit->migrated = migrated; +} + +/* + * Remove UNIT from RunQ + * The lock is already grabbed in schedule.c, no need to lock here + */ +static void +rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + + BUG_ON( is_idle_unit(unit) ); + SCHED_STAT_CRANK(unit_sleep); + + if ( curr_on_cpu(sched_unit_master(unit)) == unit ) + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); + else if ( unit_on_q(svc) ) + { + q_remove(svc); + replq_remove(ops, svc); + } + else if ( svc->flags & RTDS_delayed_runq_add ) + __clear_bit(__RTDS_delayed_runq_add, &svc->flags); +} + +/* + * Pick a cpu where to run an unit, + * possibly kicking out the unit running there + * Called by wake() and context_saved() + * We have a running candidate here, the kick logic is: + * Among all the cpus that are within the cpu affinity + * 1) if there are any idle CPUs, kick one. + For cache benefit, we check new->cpu as first + * 2) now all pcpus are busy; + * among all the running units, pick lowest priority one + * if snext has higher priority, kick it. + * + * TODO: + * 1) what if these two units belongs to the same domain? + * replace an unit belonging to the same domain introduces more overhead + * + * lock is grabbed before calling this function + */ +static void +runq_tickle(const struct scheduler *ops, struct rt_unit *new) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */ + struct rt_unit *iter_svc; + struct sched_unit *iter_unit; + int cpu = 0, cpu_to_tickle = 0; + cpumask_t not_tickled; + cpumask_t *online; + + if ( new == NULL || is_idle_unit(new->unit) ) + return; + + online = cpupool_domain_master_cpumask(new->unit->domain); + cpumask_and(¬_tickled, online, new->unit->cpu_hard_affinity); + cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); + + /* + * 1) If there are any idle CPUs, kick one. + * For cache benefit,we first search new->cpu. + * The same loop also find the one with lowest priority. + */ + cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), ¬_tickled); + while ( cpu!= nr_cpu_ids ) + { + iter_unit = curr_on_cpu(cpu); + if ( is_idle_unit(iter_unit) ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + cpu_to_tickle = cpu; + goto out; + } + iter_svc = rt_unit(iter_unit); + if ( latest_deadline_unit == NULL || + compare_unit_priority(iter_svc, latest_deadline_unit) < 0 ) + latest_deadline_unit = iter_svc; + + cpumask_clear_cpu(cpu, ¬_tickled); + cpu = cpumask_cycle(cpu, ¬_tickled); + } + + /* 2) candicate has higher priority, kick out lowest priority unit */ + if ( latest_deadline_unit != NULL && + compare_unit_priority(latest_deadline_unit, new) < 0 ) + { + SCHED_STAT_CRANK(tickled_busy_cpu); + cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit); + goto out; + } + + /* didn't tickle any cpu */ + SCHED_STAT_CRANK(tickled_no_cpu); + return; + out: + /* TRACE */ + { + struct { + unsigned cpu:16, pad:16; + } d; + d.cpu = cpu_to_tickle; + d.pad = 0; + trace_var(TRC_RTDS_TICKLE, 1, + sizeof(d), + (unsigned char *)&d); + } + + cpumask_set_cpu(cpu_to_tickle, &prv->tickled); + cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); + return; +} + +/* + * Should always wake up runnable unit, put it back to RunQ. + * Check priority to raise interrupt + * The lock is already grabbed in schedule.c, no need to lock here + * TODO: what if these two units belongs to the same domain? 
+ */ +static void +rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + s_time_t now; + bool_t missed; + + BUG_ON( is_idle_unit(unit) ); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + + /* on RunQ/DepletedQ, just update info is ok */ + if ( unlikely(unit_on_q(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* + * If a deadline passed while svc was asleep/blocked, we need new + * scheduling parameters (a new deadline and full budget). + */ + now = NOW(); + + missed = ( now >= svc->cur_deadline ); + if ( missed ) + rt_update_deadline(now, svc); + + /* + * If context hasn't been saved for this unit yet, we can't put it on + * the run-queue/depleted-queue. Instead, we set the appropriate flag, + * the unit will be put back on queue after the context has been saved + * (in rt_context_save()). + */ + if ( unlikely(svc->flags & RTDS_scheduled) ) + { + __set_bit(__RTDS_delayed_runq_add, &svc->flags); + /* + * The unit is waking up already, and we didn't even had the time to + * remove its next replenishment event from the replenishment queue + * when it blocked! No big deal. If we did not miss the deadline in + * the meantime, let's just leave it there. If we did, let's remove it + * and queue a new one (to occur at our new deadline). + */ + if ( missed ) + replq_reinsert(ops, svc); + return; + } + + /* Replenishment event got cancelled when we blocked. Add it back. */ + replq_insert(ops, svc); + /* insert svc to runq/depletedq because svc is not in queue now */ + runq_insert(ops, svc); + + runq_tickle(ops, svc); +} + +/* + * scurr has finished context switch, insert it back to the RunQ, + * and then pick the highest priority unit from runq to run + */ +static void +rt_context_saved(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit *svc = rt_unit(unit); + spinlock_t *lock = unit_schedule_lock_irq(unit); + + __clear_bit(__RTDS_scheduled, &svc->flags); + /* not insert idle unit to runq */ + if ( is_idle_unit(unit) ) + goto out; + + if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && + likely(unit_runnable(unit)) ) + { + runq_insert(ops, svc); + runq_tickle(ops, svc); + } + else + replq_remove(ops, svc); + +out: + unit_schedule_unlock_irq(lock, unit); +} + +/* + * set/get each unit info of each domain + */ +static int +rt_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + struct sched_unit *unit; + unsigned long flags; + int rc = 0; + struct xen_domctl_schedparam_vcpu local_sched; + s_time_t period, budget; + uint32_t index = 0; + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + /* Return the default parameters. 
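(10000 us period and 4000 us budget, i.e. RTDS_DEFAULT_PERIOD and RTDS_DEFAULT_BUDGET expressed in microseconds.)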
*/ + op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1); + op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1); + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 ) + { + rc = -EINVAL; + break; + } + spin_lock_irqsave(&prv->lock, flags); + for_each_sched_unit ( d, unit ) + { + svc = rt_unit(unit); + svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */ + svc->budget = MICROSECS(op->u.rtds.budget); + } + spin_unlock_irqrestore(&prv->lock, flags); + break; + case XEN_DOMCTL_SCHEDOP_getvcpuinfo: + case XEN_DOMCTL_SCHEDOP_putvcpuinfo: + while ( index < op->u.v.nr_vcpus ) + { + if ( copy_from_guest_offset(&local_sched, + op->u.v.vcpus, index, 1) ) + { + rc = -EFAULT; + break; + } + if ( local_sched.vcpuid >= d->max_vcpus || + d->vcpu[local_sched.vcpuid] == NULL ) + { + rc = -EINVAL; + break; + } + + if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo ) + { + spin_lock_irqsave(&prv->lock, flags); + svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); + local_sched.u.rtds.budget = svc->budget / MICROSECS(1); + local_sched.u.rtds.period = svc->period / MICROSECS(1); + if ( has_extratime(svc) ) + local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra; + else + local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra; + spin_unlock_irqrestore(&prv->lock, flags); + + if ( copy_to_guest_offset(op->u.v.vcpus, index, + &local_sched, 1) ) + { + rc = -EFAULT; + break; + } + } + else + { + period = MICROSECS(local_sched.u.rtds.period); + budget = MICROSECS(local_sched.u.rtds.budget); + if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET || + budget > period || period < RTDS_MIN_PERIOD ) + { + rc = -EINVAL; + break; + } + + spin_lock_irqsave(&prv->lock, flags); + svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); + svc->period = period; + svc->budget = budget; + if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra ) + __set_bit(__RTDS_extratime, &svc->flags); + else + __clear_bit(__RTDS_extratime, &svc->flags); + spin_unlock_irqrestore(&prv->lock, flags); + } + /* Process a most 64 vCPUs without checking for preemptions. */ + if ( (++index > 63) && hypercall_preempt_check() ) + break; + } + if ( !rc ) + /* notify upper caller how many units have been processed. */ + op->u.v.nr_vcpus = index; + break; + } + + return rc; +} + +/* + * The replenishment timer handler picks units + * from the replq and does the actual replenishment. + */ +static void repl_timer_handler(void *data){ + s_time_t now; + struct scheduler *ops = data; + struct rt_private *prv = rt_priv(ops); + struct list_head *replq = rt_replq(ops); + struct list_head *runq = rt_runq(ops); + struct list_head *iter, *tmp; + struct rt_unit *svc; + LIST_HEAD(tmp_replq); + + spin_lock_irq(&prv->lock); + + now = NOW(); + + /* + * Do the replenishment and move replenished units + * to the temporary list to tickle. + * If svc is on run queue, we need to put it at + * the correct place since its deadline changes. + */ + list_for_each_safe ( iter, tmp, replq ) + { + svc = replq_elem(iter); + + if ( now < svc->cur_deadline ) + break; + + list_del(&svc->replq_elem); + rt_update_deadline(now, svc); + list_add(&svc->replq_elem, &tmp_replq); + + if ( unit_on_q(svc) ) + { + q_remove(svc); + runq_insert(ops, svc); + } + } + + /* + * Iterate through the list of updated units. + * If an updated unit is running, tickle the head of the + * runqueue if it has a higher priority. + * If an updated unit was depleted and on the runqueue, tickle it. 
+ * Finally, reinsert the units back to replenishement events list. + */ + list_for_each_safe ( iter, tmp, &tmp_replq ) + { + svc = replq_elem(iter); + + if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit && + !list_empty(runq) ) + { + struct rt_unit *next_on_runq = q_elem(runq->next); + + if ( compare_unit_priority(svc, next_on_runq) < 0 ) + runq_tickle(ops, next_on_runq); + } + else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) && + unit_on_q(svc) ) + runq_tickle(ops, svc); + + list_del(&svc->replq_elem); + deadline_replq_insert(svc, &svc->replq_elem, replq); + } + + /* + * If there are units left in the replenishment event list, + * set the next replenishment to happen at the deadline of + * the one in the front. + */ + if ( !list_empty(replq) ) + set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline); + + spin_unlock_irq(&prv->lock); +} + +static const struct scheduler sched_rtds_def = { + .name = "SMP RTDS Scheduler", + .opt_name = "rtds", + .sched_id = XEN_SCHEDULER_RTDS, + .sched_data = NULL, + + .dump_cpu_state = rt_dump_pcpu, + .dump_settings = rt_dump, + .init = rt_init, + .deinit = rt_deinit, + .init_pdata = rt_init_pdata, + .switch_sched = rt_switch_sched, + .deinit_pdata = rt_deinit_pdata, + .alloc_domdata = rt_alloc_domdata, + .free_domdata = rt_free_domdata, + .alloc_udata = rt_alloc_udata, + .free_udata = rt_free_udata, + .insert_unit = rt_unit_insert, + .remove_unit = rt_unit_remove, + + .adjust = rt_dom_cntl, + + .pick_resource = rt_res_pick, + .do_schedule = rt_schedule, + .sleep = rt_unit_sleep, + .wake = rt_unit_wake, + .context_saved = rt_context_saved, +}; + +REGISTER_SCHEDULER(sched_rtds_def); diff --git a/xen/common/sched_arinc653.c b/xen/common/sched_arinc653.c deleted file mode 100644 index 565575c326..0000000000 --- a/xen/common/sched_arinc653.c +++ /dev/null @@ -1,739 +0,0 @@ -/****************************************************************************** - * sched_arinc653.c - * - * An ARINC653-compatible scheduling algorithm for use in Xen. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Copyright (c) 2010, DornerWorks, Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/************************************************************************** - * Private Macros * - **************************************************************************/ - -/** - * Default timeslice for domain 0. 
- */ -#define DEFAULT_TIMESLICE MILLISECS(10) - -/** - * Retrieve the idle UNIT for a given physical CPU - */ -#define IDLETASK(cpu) (sched_idle_unit(cpu)) - -/** - * Return a pointer to the ARINC 653-specific scheduler data information - * associated with the given UNIT (unit) - */ -#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv) - -/** - * Return the global scheduler private data given the scheduler ops pointer - */ -#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) - -/************************************************************************** - * Private Type Definitions * - **************************************************************************/ - -/** - * The arinc653_unit_t structure holds ARINC 653-scheduler-specific - * information for all non-idle UNITs - */ -typedef struct arinc653_unit_s -{ - /* unit points to Xen's struct sched_unit so we can get to it from an - * arinc653_unit_t pointer. */ - struct sched_unit * unit; - /* awake holds whether the UNIT has been woken with vcpu_wake() */ - bool_t awake; - /* list holds the linked list information for the list this UNIT - * is stored in */ - struct list_head list; -} arinc653_unit_t; - -/** - * The sched_entry_t structure holds a single entry of the - * ARINC 653 schedule. - */ -typedef struct sched_entry_s -{ - /* dom_handle holds the handle ("UUID") for the domain that this - * schedule entry refers to. */ - xen_domain_handle_t dom_handle; - /* unit_id holds the UNIT number for the UNIT that this schedule - * entry refers to. */ - int unit_id; - /* runtime holds the number of nanoseconds that the UNIT for this - * schedule entry should be allowed to run per major frame. */ - s_time_t runtime; - /* unit holds a pointer to the Xen sched_unit structure */ - struct sched_unit * unit; -} sched_entry_t; - -/** - * This structure defines data that is global to an instance of the scheduler - */ -typedef struct a653sched_priv_s -{ - /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ - spinlock_t lock; - - /** - * This array holds the active ARINC 653 schedule. - * - * When the system tries to start a new UNIT, this schedule is scanned - * to look for a matching (handle, UNIT #) pair. If both the handle (UUID) - * and UNIT number match, then the UNIT is allowed to run. Its run time - * (per major frame) is given in the third entry of the schedule. - */ - sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; - - /** - * This variable holds the number of entries that are valid in - * the arinc653_schedule table. - * - * This is not necessarily the same as the number of domains in the - * schedule. A domain could be listed multiple times within the schedule, - * or a domain with multiple UNITs could have a different - * schedule entry for each UNIT. - */ - unsigned int num_schedule_entries; - - /** - * the major frame time for the ARINC 653 schedule. - */ - s_time_t major_frame; - - /** - * the time that the next major frame starts - */ - s_time_t next_major_frame; - - /** - * pointers to all Xen UNIT structures for iterating through - */ - struct list_head unit_list; -} a653sched_priv_t; - -/************************************************************************** - * Helper functions * - **************************************************************************/ - -/** - * This function compares two domain handles. - * - * @param h1 Pointer to handle 1 - * @param h2 Pointer to handle 2 - * - * @return
- *     <0: handle 1 is less than handle 2 - *      0: handle 1 is equal to handle 2 - *     >0: handle 1 is greater than handle 2 - *
- */ -static int dom_handle_cmp(const xen_domain_handle_t h1, - const xen_domain_handle_t h2) -{ - return memcmp(h1, h2, sizeof(xen_domain_handle_t)); -} - -/** - * This function searches the unit list to find a UNIT that matches - * the domain handle and UNIT ID specified. - * - * @param ops Pointer to this instance of the scheduler structure - * @param handle Pointer to handler - * @param unit_id UNIT ID - * - * @return
- *     Pointer to the matching UNIT if one is found - *     NULL otherwise - *
- */ -static struct sched_unit *find_unit( - const struct scheduler *ops, - xen_domain_handle_t handle, - int unit_id) -{ - arinc653_unit_t *aunit; - - /* loop through the unit_list looking for the specified UNIT */ - list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list ) - if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0) - && (unit_id == aunit->unit->unit_id) ) - return aunit->unit; - - return NULL; -} - -/** - * This function updates the pointer to the Xen UNIT structure for each entry - * in the ARINC 653 schedule. - * - * @param ops Pointer to this instance of the scheduler structure - * @return - */ -static void update_schedule_units(const struct scheduler *ops) -{ - unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; - - for ( i = 0; i < n_entries; i++ ) - SCHED_PRIV(ops)->schedule[i].unit = - find_unit(ops, - SCHED_PRIV(ops)->schedule[i].dom_handle, - SCHED_PRIV(ops)->schedule[i].unit_id); -} - -/** - * This function is called by the adjust_global scheduler hook to put - * in place a new ARINC653 schedule. - * - * @param ops Pointer to this instance of the scheduler structure - * - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -arinc653_sched_set( - const struct scheduler *ops, - struct xen_sysctl_arinc653_schedule *schedule) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - s_time_t total_runtime = 0; - unsigned int i; - unsigned long flags; - int rc = -EINVAL; - - spin_lock_irqsave(&sched_priv->lock, flags); - - /* Check for valid major frame and number of schedule entries. */ - if ( (schedule->major_frame <= 0) - || (schedule->num_sched_entries < 1) - || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) - goto fail; - - for ( i = 0; i < schedule->num_sched_entries; i++ ) - { - /* Check for a valid run time. */ - if ( schedule->sched_entries[i].runtime <= 0 ) - goto fail; - - /* Add this entry's run time to total run time. */ - total_runtime += schedule->sched_entries[i].runtime; - } - - /* - * Error if the major frame is not large enough to run all entries as - * indicated by comparing the total run time to the major frame length. - */ - if ( total_runtime > schedule->major_frame ) - goto fail; - - /* Copy the new schedule into place. */ - sched_priv->num_schedule_entries = schedule->num_sched_entries; - sched_priv->major_frame = schedule->major_frame; - for ( i = 0; i < schedule->num_sched_entries; i++ ) - { - memcpy(sched_priv->schedule[i].dom_handle, - schedule->sched_entries[i].dom_handle, - sizeof(sched_priv->schedule[i].dom_handle)); - sched_priv->schedule[i].unit_id = - schedule->sched_entries[i].vcpu_id; - sched_priv->schedule[i].runtime = - schedule->sched_entries[i].runtime; - } - update_schedule_units(ops); - - /* - * The newly-installed schedule takes effect immediately. We do not even - * wait for the current major frame to expire. - * - * Signal a new major frame to begin. The next major frame is set up by - * the do_schedule callback function when it is next invoked. - */ - sched_priv->next_major_frame = NOW(); - - rc = 0; - - fail: - spin_unlock_irqrestore(&sched_priv->lock, flags); - return rc; -} - -/** - * This function is called by the adjust_global scheduler hook to read the - * current ARINC 653 schedule - * - * @param ops Pointer to this instance of the scheduler structure - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -arinc653_sched_get( - const struct scheduler *ops, - struct xen_sysctl_arinc653_schedule *schedule) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - unsigned int i; - unsigned long flags; - - spin_lock_irqsave(&sched_priv->lock, flags); - - schedule->num_sched_entries = sched_priv->num_schedule_entries; - schedule->major_frame = sched_priv->major_frame; - for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) - { - memcpy(schedule->sched_entries[i].dom_handle, - sched_priv->schedule[i].dom_handle, - sizeof(sched_priv->schedule[i].dom_handle)); - schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id; - schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; - } - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - return 0; -} - -/************************************************************************** - * Scheduler callback functions * - **************************************************************************/ - -/** - * This function performs initialization for an instance of the scheduler. - * - * @param ops Pointer to this instance of the scheduler structure - * - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -a653sched_init(struct scheduler *ops) -{ - a653sched_priv_t *prv; - - prv = xzalloc(a653sched_priv_t); - if ( prv == NULL ) - return -ENOMEM; - - ops->sched_data = prv; - - prv->next_major_frame = 0; - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->unit_list); - - return 0; -} - -/** - * This function performs deinitialization for an instance of the scheduler - * - * @param ops Pointer to this instance of the scheduler structure - */ -static void -a653sched_deinit(struct scheduler *ops) -{ - xfree(SCHED_PRIV(ops)); - ops->sched_data = NULL; -} - -/** - * This function allocates scheduler-specific data for a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - * - * @return Pointer to the allocated data - */ -static void * -a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - arinc653_unit_t *svc; - unsigned int entry; - unsigned long flags; - - /* - * Allocate memory for the ARINC 653-specific scheduler data information - * associated with the given UNIT (unit). - */ - svc = xmalloc(arinc653_unit_t); - if ( svc == NULL ) - return NULL; - - spin_lock_irqsave(&sched_priv->lock, flags); - - /* - * Add every one of dom0's units to the schedule, as long as there are - * slots available. - */ - if ( unit->domain->domain_id == 0 ) - { - entry = sched_priv->num_schedule_entries; - - if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) - { - sched_priv->schedule[entry].dom_handle[0] = '\0'; - sched_priv->schedule[entry].unit_id = unit->unit_id; - sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; - sched_priv->schedule[entry].unit = unit; - - sched_priv->major_frame += DEFAULT_TIMESLICE; - ++sched_priv->num_schedule_entries; - } - } - - /* - * Initialize our ARINC 653 scheduler-specific information for the UNIT. - * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it - * will call the vcpu_wake scheduler callback function and our scheduler - * will mark the UNIT awake. - */ - svc->unit = unit; - svc->awake = 0; - if ( !is_idle_unit(unit) ) - list_add(&svc->list, &SCHED_PRIV(ops)->unit_list); - update_schedule_units(ops); - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - return svc; -} - -/** - * This function frees scheduler-specific UNIT data - * - * @param ops Pointer to this instance of the scheduler structure - */ -static void -a653sched_free_udata(const struct scheduler *ops, void *priv) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - arinc653_unit_t *av = priv; - unsigned long flags; - - if (av == NULL) - return; - - spin_lock_irqsave(&sched_priv->lock, flags); - - if ( !is_idle_unit(av->unit) ) - list_del(&av->list); - - xfree(av); - update_schedule_units(ops); - - spin_unlock_irqrestore(&sched_priv->lock, flags); -} - -/** - * Xen scheduler callback function to sleep a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - */ -static void -a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - if ( AUNIT(unit) != NULL ) - AUNIT(unit)->awake = 0; - - /* - * If the UNIT being put to sleep is the same one that is currently - * running, raise a softirq to invoke the scheduler to switch domains. 
- */ - if ( get_sched_res(sched_unit_master(unit))->curr == unit ) - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); -} - -/** - * Xen scheduler callback function to wake up a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - */ -static void -a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - if ( AUNIT(unit) != NULL ) - AUNIT(unit)->awake = 1; - - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); -} - -/** - * Xen scheduler callback function to select a UNIT to run. - * This is the main scheduler routine. - * - * @param ops Pointer to this instance of the scheduler structure - * @param now Current time - */ -static void -a653sched_do_schedule( - const struct scheduler *ops, - struct sched_unit *prev, - s_time_t now, - bool tasklet_work_scheduled) -{ - struct sched_unit *new_task = NULL; - static unsigned int sched_index = 0; - static s_time_t next_switch_time; - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - const unsigned int cpu = sched_get_resource_cpu(smp_processor_id()); - unsigned long flags; - - spin_lock_irqsave(&sched_priv->lock, flags); - - if ( sched_priv->num_schedule_entries < 1 ) - sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; - else if ( now >= sched_priv->next_major_frame ) - { - /* time to enter a new major frame - * the first time this function is called, this will be true */ - /* start with the first domain in the schedule */ - sched_index = 0; - sched_priv->next_major_frame = now + sched_priv->major_frame; - next_switch_time = now + sched_priv->schedule[0].runtime; - } - else - { - while ( (now >= next_switch_time) - && (sched_index < sched_priv->num_schedule_entries) ) - { - /* time to switch to the next domain in this major frame */ - sched_index++; - next_switch_time += sched_priv->schedule[sched_index].runtime; - } - } - - /* - * If we exhausted the domains in the schedule and still have time left - * in the major frame then switch next at the next major frame. - */ - if ( sched_index >= sched_priv->num_schedule_entries ) - next_switch_time = sched_priv->next_major_frame; - - /* - * If there are more domains to run in the current major frame, set - * new_task equal to the address of next domain's sched_unit structure. - * Otherwise, set new_task equal to the address of the idle task's - * sched_unit structure. - */ - new_task = (sched_index < sched_priv->num_schedule_entries) - ? sched_priv->schedule[sched_index].unit - : IDLETASK(cpu); - - /* Check to see if the new task can be run (awake & runnable). */ - if ( !((new_task != NULL) - && (AUNIT(new_task) != NULL) - && AUNIT(new_task)->awake - && unit_runnable_state(new_task)) ) - new_task = IDLETASK(cpu); - BUG_ON(new_task == NULL); - - /* - * Check to make sure we did not miss a major frame. - * This is a good test for robust partitioning. - */ - BUG_ON(now >= sched_priv->next_major_frame); - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - /* Tasklet work (which runs in idle UNIT context) overrides all else. */ - if ( tasklet_work_scheduled ) - new_task = IDLETASK(cpu); - - /* Running this task would result in a migration */ - if ( !is_idle_unit(new_task) - && (sched_unit_master(new_task) != cpu) ) - new_task = IDLETASK(cpu); - - /* - * Return the amount of time the next domain has to run and the address - * of the selected task's UNIT structure. 
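The ARINC 653 slot selection in a653sched_do_schedule() is purely table-driven: given the fixed per-entry runtimes and the start of the current major frame, the scheduler walks the table until it finds the window containing 'now'. A condensed, stateless sketch of that walk (the in-tree code instead keeps sched_index and next_switch_time as running state across invocations):

    #include <stdint.h>

    #define N_ENTRIES 3

    /* Per-slot runtime, ns; the sum is the major frame length. */
    static const int64_t slot_runtime[N_ENTRIES] = { 10000000, 5000000, 5000000 };

    /*
     * Return the index of the schedule entry whose window contains 'now'
     * and report when the next switch is due.  Returns N_ENTRIES if 'now'
     * has run past every entry, matching the "switch next at the next
     * major frame" case above.
     */
    static unsigned int find_slot(int64_t now, int64_t frame_start,
                                  int64_t *next_switch)
    {
        int64_t t = frame_start;
        unsigned int i;

        for ( i = 0; i < N_ENTRIES; i++ )
        {
            t += slot_runtime[i];
            if ( now < t )
            {
                *next_switch = t;
                return i;
            }
        }

        *next_switch = t;       /* end of the major frame */
        return i;
    }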
- */ - prev->next_time = next_switch_time - now; - prev->next_task = new_task; - new_task->migrated = false; - - BUG_ON(prev->next_time <= 0); -} - -/** - * Xen scheduler callback function to select a resource for the UNIT to run on - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - * - * @return Scheduler resource to run on - */ -static struct sched_resource * -a653sched_pick_resource(const struct scheduler *ops, - const struct sched_unit *unit) -{ - cpumask_t *online; - unsigned int cpu; - - /* - * If present, prefer unit's current processor, else - * just find the first valid unit. - */ - online = cpupool_domain_master_cpumask(unit->domain); - - cpu = cpumask_first(online); - - if ( cpumask_test_cpu(sched_unit_master(unit), online) - || (cpu >= nr_cpu_ids) ) - cpu = sched_unit_master(unit); - - return get_sched_res(cpu); -} - -/** - * Xen scheduler callback to change the scheduler of a cpu - * - * @param new_ops Pointer to this instance of the scheduler structure - * @param cpu The cpu that is changing scheduler - * @param pdata scheduler specific PCPU data (we don't have any) - * @param vdata scheduler specific UNIT data of the idle unit - */ -static spinlock_t * -a653_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - arinc653_unit_t *svc = vdata; - - ASSERT(!pdata && svc && is_idle_unit(svc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - return &sr->_lock; -} - -/** - * Xen scheduler callback function to perform a global (not domain-specific) - * adjustment. It is used by the ARINC 653 scheduler to put in place a new - * ARINC 653 schedule or to retrieve the schedule currently in place. - * - * @param ops Pointer to this instance of the scheduler structure - * @param sc Pointer to the scheduler operation specified by Domain 0 - */ -static int -a653sched_adjust_global(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - struct xen_sysctl_arinc653_schedule local_sched; - int rc = -EINVAL; - - switch ( sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) - { - rc = -EFAULT; - break; - } - - rc = arinc653_sched_set(ops, &local_sched); - break; - case XEN_SYSCTL_SCHEDOP_getinfo: - memset(&local_sched, -1, sizeof(local_sched)); - rc = arinc653_sched_get(ops, &local_sched); - if ( rc ) - break; - - if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) - rc = -EFAULT; - break; - } - - return rc; -} - -/** - * This structure defines our scheduler for Xen. - * The entries tell Xen where to find our scheduler-specific - * callback functions. - * The symbol must be visible to the rest of Xen at link time. 
- */ -static const struct scheduler sched_arinc653_def = { - .name = "ARINC 653 Scheduler", - .opt_name = "arinc653", - .sched_id = XEN_SCHEDULER_ARINC653, - .sched_data = NULL, - - .init = a653sched_init, - .deinit = a653sched_deinit, - - .free_udata = a653sched_free_udata, - .alloc_udata = a653sched_alloc_udata, - - .insert_unit = NULL, - .remove_unit = NULL, - - .sleep = a653sched_unit_sleep, - .wake = a653sched_unit_wake, - .yield = NULL, - .context_saved = NULL, - - .do_schedule = a653sched_do_schedule, - - .pick_resource = a653sched_pick_resource, - - .switch_sched = a653_switch_sched, - - .adjust = NULL, - .adjust_global = a653sched_adjust_global, - - .dump_settings = NULL, - .dump_cpu_state = NULL, -}; - -REGISTER_SCHEDULER(sched_arinc653_def); - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c deleted file mode 100644 index aa41a3301b..0000000000 --- a/xen/common/sched_credit.c +++ /dev/null @@ -1,2284 +0,0 @@ -/**************************************************************************** - * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. - **************************************************************************** - * - * File: common/csched_credit.c - * Author: Emmanuel Ackaouy - * - * Description: Credit-based SMP CPU scheduler - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Locking: - * - Scheduler-lock (a.k.a. runqueue lock): - * + is per-runqueue, and there is one runqueue per-cpu; - * + serializes all runqueue manipulation operations; - * - Private data lock (a.k.a. private scheduler lock): - * + serializes accesses to the scheduler global state (weight, - * credit, balance_credit, etc); - * + serializes updates to the domains' scheduling parameters. - * - * Ordering is "private lock always comes first": - * + if we need both locks, we must acquire the private - * scheduler lock for first; - * + if we already own a runqueue lock, we must never acquire - * the private scheduler lock. - */ - -/* - * Basic constants - */ -#define CSCHED_DEFAULT_WEIGHT 256 -#define CSCHED_TICKS_PER_TSLICE 3 -/* Default timeslice: 30ms */ -#define CSCHED_DEFAULT_TSLICE_MS 30 -#define CSCHED_CREDITS_PER_MSEC 10 -/* Never set a timer shorter than this value. */ -#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN - - -/* - * Priorities - */ -#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ -#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ -#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ -#define CSCHED_PRI_IDLE -64 /* idle */ - - -/* - * Flags - * - * Note that svc->flags (where these flags live) is protected by an - * inconsistent set of locks. Therefore atomic-safe bit operations must - * be used for accessing it. 
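Because svc->flags can be updated by paths holding different locks, every update has to be a single atomic read-modify-write. A sketch of the same discipline in plain C11 atomics; the flag mask here is hypothetical, since the file itself stores bit numbers and manipulates them with set_bit()/clear_bit()/test_bit():

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Hypothetical mask form of one per-unit flag. */
    #define TOY_FLAG_YIELD  (1u << 1)

    struct toy_flags {
        atomic_uint flags;
    };

    /*
     * Each update is one atomic read-modify-write, so two paths holding
     * different locks can safely flip different bits of the same word;
     * a plain "f->flags |= TOY_FLAG_YIELD" could lose a concurrent
     * update to another bit.
     */
    static void toy_set_yield(struct toy_flags *f)
    {
        atomic_fetch_or(&f->flags, TOY_FLAG_YIELD);
    }

    static bool toy_test_and_clear_yield(struct toy_flags *f)
    {
        return atomic_fetch_and(&f->flags, ~TOY_FLAG_YIELD) & TOY_FLAG_YIELD;
    }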
- */ -#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */ -#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */ -#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */ -#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */ - - -/* - * Useful macros - */ -#define CSCHED_PRIV(_ops) \ - ((struct csched_private *)((_ops)->sched_data)) -#define CSCHED_PCPU(_c) \ - ((struct csched_pcpu *)get_sched_res(_c)->sched_priv) -#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv) -#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) -#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) - - -/* - * CSCHED_STATS - * - * Manage very basic per-unit counters and stats. - * - * Useful for debugging live systems. The stats are displayed - * with runq dumps ('r' on the Xen console). - */ -#ifdef SCHED_STATS - -#define CSCHED_STATS - -#define SCHED_UNIT_STATS_RESET(_V) \ - do \ - { \ - memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ - } while ( 0 ) - -#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++) - -#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) - -#else /* !SCHED_STATS */ - -#undef CSCHED_STATS - -#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 ) -#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 ) -#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 ) - -#endif /* SCHED_STATS */ - - -/* - * Credit tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. - */ -#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) -#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) -#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) -#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4) -#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) -#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) -#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) -#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) -#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) -#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) -#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) - -/* - * Boot parameters - */ -static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; -integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); - -/* - * Physical CPU - */ -struct csched_pcpu { - struct list_head runq; - uint32_t runq_sort_last; - - unsigned int idle_bias; - unsigned int nr_runnable; - - unsigned int tick; - struct timer ticker; -}; - -/* - * Virtual UNIT - */ -struct csched_unit { - struct list_head runq_elem; - struct list_head active_unit_elem; - - /* Up-pointers */ - struct csched_dom *sdom; - struct sched_unit *unit; - - s_time_t start_time; /* When we were scheduled (used for credit) */ - unsigned flags; - int pri; - - atomic_t credit; - unsigned int residual; - - s_time_t last_sched_time; - -#ifdef CSCHED_STATS - struct { - int credit_last; - uint32_t credit_incr; - uint32_t state_active; - uint32_t state_idle; - uint32_t migrate_q; - uint32_t migrate_r; - uint32_t kicked_away; - } stats; -#endif -}; - -/* - * Domain - */ -struct csched_dom { - struct list_head active_unit; - struct list_head active_sdom_elem; - struct domain *dom; - uint16_t active_unit_count; - uint16_t weight; - uint16_t cap; -}; - -/* - * System-wide private data - */ -struct csched_private { - /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ - spinlock_t lock; - - cpumask_var_t 
idlers; - cpumask_var_t cpus; - uint32_t *balance_bias; - uint32_t runq_sort; - uint32_t ncpus; - - /* Period of master and tick in milliseconds */ - unsigned int tick_period_us, ticks_per_tslice; - s_time_t ratelimit, tslice, unit_migr_delay; - - struct list_head active_sdom; - uint32_t weight; - uint32_t credit; - int credit_balance; - unsigned int credits_per_tslice; - - unsigned int master; - struct timer master_ticker; -}; - -static void csched_tick(void *_cpu); -static void csched_acct(void *dummy); - -static inline int -__unit_on_runq(struct csched_unit *svc) -{ - return !list_empty(&svc->runq_elem); -} - -static inline struct csched_unit * -__runq_elem(struct list_head *elem) -{ - return list_entry(elem, struct csched_unit, runq_elem); -} - -/* Is the first element of cpu's runq (if any) cpu's idle unit? */ -static inline bool_t is_runq_idle(unsigned int cpu) -{ - /* - * We're peeking at cpu's runq, we must hold the proper lock. - */ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - return list_empty(RUNQ(cpu)) || - is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit); -} - -static inline void -inc_nr_runnable(unsigned int cpu) -{ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - CSCHED_PCPU(cpu)->nr_runnable++; - -} - -static inline void -dec_nr_runnable(unsigned int cpu) -{ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); - CSCHED_PCPU(cpu)->nr_runnable--; -} - -static inline void -__runq_insert(struct csched_unit *svc) -{ - unsigned int cpu = sched_unit_master(svc->unit); - const struct list_head * const runq = RUNQ(cpu); - struct list_head *iter; - - BUG_ON( __unit_on_runq(svc) ); - - list_for_each( iter, runq ) - { - const struct csched_unit * const iter_svc = __runq_elem(iter); - if ( svc->pri > iter_svc->pri ) - break; - } - - /* If the unit yielded, try to put it behind one lower-priority - * runnable unit if we can. The next runq_sort will bring it forward - * within 30ms if the queue too long. 
*/ - if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags) - && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) - { - iter=iter->next; - - /* Some sanity checks */ - BUG_ON(iter == runq); - } - - list_add_tail(&svc->runq_elem, iter); -} - -static inline void -runq_insert(struct csched_unit *svc) -{ - __runq_insert(svc); - inc_nr_runnable(sched_unit_master(svc->unit)); -} - -static inline void -__runq_remove(struct csched_unit *svc) -{ - BUG_ON( !__unit_on_runq(svc) ); - list_del_init(&svc->runq_elem); -} - -static inline void -runq_remove(struct csched_unit *svc) -{ - dec_nr_runnable(sched_unit_master(svc->unit)); - __runq_remove(svc); -} - -static void burn_credits(struct csched_unit *svc, s_time_t now) -{ - s_time_t delta; - uint64_t val; - unsigned int credits; - - /* Assert svc is current */ - ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) ); - - if ( (delta = now - svc->start_time) <= 0 ) - return; - - val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; - svc->residual = do_div(val, MILLISECS(1)); - credits = val; - ASSERT(credits == val); /* make sure we haven't truncated val */ - atomic_sub(credits, &svc->credit); - svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; -} - -static bool_t __read_mostly opt_tickle_one_idle = 1; -boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); - -DEFINE_PER_CPU(unsigned int, last_tickle_cpu); - -static inline void __runq_tickle(struct csched_unit *new) -{ - unsigned int cpu = sched_unit_master(new->unit); - struct sched_resource *sr = get_sched_res(cpu); - struct sched_unit *unit = new->unit; - struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu)); - struct csched_private *prv = CSCHED_PRIV(sr->scheduler); - cpumask_t mask, idle_mask, *online; - int balance_step, idlers_empty; - - ASSERT(cur); - cpumask_clear(&mask); - - online = cpupool_domain_master_cpumask(new->sdom->dom); - cpumask_and(&idle_mask, prv->idlers, online); - idlers_empty = cpumask_empty(&idle_mask); - - /* - * Exclusive pinning is when a unit has hard-affinity with only one - * cpu, and there is no other unit that has hard-affinity with that - * same cpu. This is infrequent, but if it happens, is for achieving - * the most possible determinism, and least possible overhead for - * the units in question. - * - * Try to identify the vast majority of these situations, and deal - * with them quickly. - */ - if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) && - cpumask_test_cpu(cpu, &idle_mask)) ) - { - ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); - SCHED_STAT_CRANK(tickled_idle_cpu_excl); - __cpumask_set_cpu(cpu, &mask); - goto tickle; - } - - /* - * If the pcpu is idle, or there are no idlers and the new - * unit is a higher priority than the old unit, run it here. - * - * If there are idle cpus, first try to find one suitable to run - * new, so we can avoid preempting cur. If we cannot find a - * suitable idler on which to run new, run it here, but try to - * find a suitable idler on which to run cur instead. - */ - if ( cur->pri == CSCHED_PRI_IDLE - || (idlers_empty && new->pri > cur->pri) ) - { - if ( cur->pri != CSCHED_PRI_IDLE ) - SCHED_STAT_CRANK(tickled_busy_cpu); - else - SCHED_STAT_CRANK(tickled_idle_cpu); - __cpumask_set_cpu(cpu, &mask); - } - else if ( !idlers_empty ) - { - /* - * Soft and hard affinity balancing loop. For units without - * a useful soft affinity, consider hard affinity only. 
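The burn_credits() helper above converts elapsed wall-clock time into spent credits, carrying the sub-millisecond remainder forward so nothing is lost to rounding. A self-contained sketch of that arithmetic on toy fields (do_div() is replaced by plain division and modulo):

    #include <stdint.h>

    #define CREDITS_PER_MSEC  10
    #define NSEC_PER_MSEC     1000000LL

    struct toy_csched_unit {
        int64_t  start_time;   /* when the unit was last scheduled, ns */
        int64_t  credit;       /* remaining credit */
        int64_t  residual;     /* sub-millisecond remainder carried over */
    };

    static void toy_burn_credits(struct toy_csched_unit *u, int64_t now)
    {
        int64_t delta = now - u->start_time;
        int64_t val, credits;

        if ( delta <= 0 )
            return;

        val = delta * CREDITS_PER_MSEC + u->residual;
        credits = val / NSEC_PER_MSEC;
        u->residual = val % NSEC_PER_MSEC;

        u->credit -= credits;
        u->start_time += credits * NSEC_PER_MSEC / CREDITS_PER_MSEC;
    }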
- */ - for_each_affinity_balance_step( balance_step ) - { - int new_idlers_empty; - - if ( balance_step == BALANCE_SOFT_AFFINITY - && !has_soft_affinity(unit) ) - continue; - - /* Are there idlers suitable for new (for this balance step)? */ - affinity_balance_cpumask(unit, balance_step, - cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), - cpumask_scratch_cpu(cpu), &idle_mask); - new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); - - /* - * Let's not be too harsh! If there aren't idlers suitable - * for new in its soft affinity mask, make sure we check its - * hard affinity as well, before taking final decisions. - */ - if ( new_idlers_empty - && balance_step == BALANCE_SOFT_AFFINITY ) - continue; - - /* - * If there are no suitable idlers for new, and it's higher - * priority than cur, check whether we can migrate cur away. - * We have to do it indirectly, via _VPF_migrating (instead - * of just tickling any idler suitable for cur) because cur - * is running. - * - * If there are suitable idlers for new, no matter priorities, - * leave cur alone (as it is running and is, likely, cache-hot) - * and wake some of them (which is waking up and so is, likely, - * cache cold anyway). - */ - if ( new_idlers_empty && new->pri > cur->pri ) - { - if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) ) - { - SCHED_UNIT_STAT_CRANK(cur, kicked_away); - SCHED_UNIT_STAT_CRANK(cur, migrate_r); - SCHED_STAT_CRANK(migrate_kicked_away); - sched_set_pause_flags_atomic(cur->unit, _VPF_migrating); - } - /* Tickle cpu anyway, to let new preempt cur. */ - SCHED_STAT_CRANK(tickled_busy_cpu); - __cpumask_set_cpu(cpu, &mask); - } - else if ( !new_idlers_empty ) - { - /* Which of the idlers suitable for new shall we wake up? */ - SCHED_STAT_CRANK(tickled_idle_cpu); - if ( opt_tickle_one_idle ) - { - this_cpu(last_tickle_cpu) = - cpumask_cycle(this_cpu(last_tickle_cpu), - cpumask_scratch_cpu(cpu)); - __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); - } - else - cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); - } - - /* Did we find anyone? */ - if ( !cpumask_empty(&mask) ) - break; - } - } - - tickle: - if ( !cpumask_empty(&mask) ) - { - if ( unlikely(tb_init_done) ) - { - /* Avoid TRACE_*: saves checking !tb_init_done each step */ - for_each_cpu(cpu, &mask) - __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); - } - - /* - * Mark the designated CPUs as busy and send them all the scheduler - * interrupt. We need the for_each_cpu for dealing with the - * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and - * can't use cpumask_andnot(), because prv->idlers needs atomic access. - * - * In the default (and most common) case, when opt_rickle_one_idle is - * true, the loop does only one step, and only one bit is cleared. - */ - for_each_cpu(cpu, &mask) - cpumask_clear_cpu(cpu, prv->idlers); - cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); - } - else - SCHED_STAT_CRANK(tickled_no_cpu); -} - -static void -csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - - /* - * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're - * beeing called from CPU_UP_CANCELLED, because bringing up a pCPU failed - * very early. xfree() does not really mind, but we want to be sure that, - * when we get here, either init_pdata has never been called, or - * deinit_pdata has been called already. 
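The ordering constraints spelled out in the free_pdata/deinit_pdata comments amount to a small lifecycle: alloc_pdata, then init_pdata, then deinit_pdata, then free_pdata, with free_pdata also reachable straight after alloc_pdata when bringing the pCPU up is cancelled early. A toy model of just that contract, with assertions standing in for the real checks:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct toy_pcpu {
        bool initialised;
    };

    static struct toy_pcpu *toy_alloc_pdata(void)
    {
        return calloc(1, sizeof(struct toy_pcpu));
    }

    static void toy_init_pdata(struct toy_pcpu *p)
    {
        assert(p && !p->initialised);
        p->initialised = true;
    }

    static void toy_deinit_pdata(struct toy_pcpu *p)
    {
        assert(p && p->initialised);
        p->initialised = false;
    }

    static void toy_free_pdata(struct toy_pcpu *p)
    {
        /* May see NULL, or a pcpu that was never initialised. */
        assert(!p || !p->initialised);
        free(p);
    }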
- */ - ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); - - xfree(pcpu); -} - -static void -csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_pcpu *spc = pcpu; - unsigned int node = cpu_to_node(cpu); - unsigned long flags; - - /* - * Scheduler specific data for this pCPU must still be there and and be - * valid. In fact, if we are here: - * 1. alloc_pdata must have been called for this cpu, and free_pdata - * must not have been called on it before us, - * 2. init_pdata must have been called on this cpu, and deinit_pdata - * (us!) must not have been called on it already. - */ - ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); - - spin_lock_irqsave(&prv->lock, flags); - - prv->credit -= prv->credits_per_tslice; - prv->ncpus--; - cpumask_clear_cpu(cpu, prv->idlers); - cpumask_clear_cpu(cpu, prv->cpus); - if ( (prv->master == cpu) && (prv->ncpus > 0) ) - { - prv->master = cpumask_first(prv->cpus); - migrate_timer(&prv->master_ticker, prv->master); - } - if ( prv->balance_bias[node] == cpu ) - { - cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); - if ( !cpumask_empty(cpumask_scratch) ) - prv->balance_bias[node] = cpumask_first(cpumask_scratch); - } - kill_timer(&spc->ticker); - if ( prv->ncpus == 0 ) - kill_timer(&prv->master_ticker); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void * -csched_alloc_pdata(const struct scheduler *ops, int cpu) -{ - struct csched_pcpu *spc; - - /* Allocate per-PCPU info */ - spc = xzalloc(struct csched_pcpu); - if ( spc == NULL ) - return ERR_PTR(-ENOMEM); - - return spc; -} - -static void -init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) -{ - ASSERT(spin_is_locked(&prv->lock)); - /* cpu data needs to be allocated, but STILL uninitialized. */ - ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); - - /* Initialize/update system-wide config */ - prv->credit += prv->credits_per_tslice; - prv->ncpus++; - cpumask_set_cpu(cpu, prv->cpus); - if ( prv->ncpus == 1 ) - { - prv->master = cpu; - init_timer(&prv->master_ticker, csched_acct, prv, cpu); - set_timer(&prv->master_ticker, NOW() + prv->tslice); - } - - cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); - if ( cpumask_weight(cpumask_scratch) == 1 ) - prv->balance_bias[cpu_to_node(cpu)] = cpu; - - init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); - set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); - - INIT_LIST_HEAD(&spc->runq); - spc->runq_sort_last = prv->runq_sort; - spc->idle_bias = nr_cpu_ids - 1; - - /* Start off idling... */ - BUG_ON(!is_idle_unit(curr_on_cpu(cpu))); - cpumask_set_cpu(cpu, prv->idlers); - spc->nr_runnable = 0; -} - -static void -csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - unsigned long flags; - struct csched_private *prv = CSCHED_PRIV(ops); - - spin_lock_irqsave(&prv->lock, flags); - init_pdata(prv, pdata, cpu); - spin_unlock_irqrestore(&prv->lock, flags); -} - -/* Change the scheduler of cpu to us (Credit). */ -static spinlock_t * -csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct csched_private *prv = CSCHED_PRIV(new_ops); - struct csched_unit *svc = vdata; - - ASSERT(svc && is_idle_unit(svc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). 
It actually may or may not be the 'right' - * one for this cpu, but that is ok for preventing races. - */ - ASSERT(!local_irq_is_enabled()); - spin_lock(&prv->lock); - init_pdata(prv, pdata, cpu); - spin_unlock(&prv->lock); - - return &sr->_lock; -} - -#ifndef NDEBUG -static inline void -__csched_unit_check(struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - struct csched_dom * const sdom = svc->sdom; - - BUG_ON( svc->unit != unit ); - BUG_ON( sdom != CSCHED_DOM(unit->domain) ); - if ( sdom ) - { - BUG_ON( is_idle_unit(unit) ); - BUG_ON( sdom->dom != unit->domain ); - } - else - { - BUG_ON( !is_idle_unit(unit) ); - } - - SCHED_STAT_CRANK(unit_check); -} -#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit)) -#else -#define CSCHED_UNIT_CHECK(unit) -#endif - -/* - * Delay, in microseconds, between migrations of a UNIT between PCPUs. - * This prevents rapid fluttering of a UNIT between CPUs, and reduces the - * implicit overheads such as cache-warming. 1ms (1000) has been measured - * as a good value. - */ -static unsigned int vcpu_migration_delay_us; -integer_param("vcpu_migration_delay", vcpu_migration_delay_us); - -static inline bool -__csched_vcpu_is_cache_hot(const struct csched_private *prv, - const struct csched_unit *svc) -{ - bool hot = prv->unit_migr_delay && - (NOW() - svc->last_sched_time) < prv->unit_migr_delay; - - if ( hot ) - SCHED_STAT_CRANK(unit_hot); - - return hot; -} - -static inline int -__csched_unit_is_migrateable(const struct csched_private *prv, - struct sched_unit *unit, - int dest_cpu, cpumask_t *mask) -{ - const struct csched_unit *svc = CSCHED_UNIT(unit); - /* - * Don't pick up work that's hot on peer PCPU, or that can't (or - * would prefer not to) run on cpu. - * - * The caller is supposed to have already checked that unit is also - * not running. - */ - ASSERT(!unit->is_running); - - return !__csched_vcpu_is_cache_hot(prv, svc) && - cpumask_test_cpu(dest_cpu, mask); -} - -static int -_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit, - bool_t commit) -{ - int cpu = sched_unit_master(unit); - /* We must always use cpu's scratch space */ - cpumask_t *cpus = cpumask_scratch_cpu(cpu); - cpumask_t idlers; - cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); - struct csched_pcpu *spc = NULL; - int balance_step; - - for_each_affinity_balance_step( balance_step ) - { - affinity_balance_cpumask(unit, balance_step, cpus); - cpumask_and(cpus, online, cpus); - /* - * We want to pick up a pcpu among the ones that are online and - * can accommodate vc. As far as hard affinity is concerned, there - * always will be at least one of these pcpus in the scratch cpumask, - * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below - * are ok. - * - * On the other hand, when considering soft affinity, it is possible - * that the mask is empty (for instance, if the domain has been put - * in a cpupool that does not contain any of the pcpus in its soft - * affinity), which would result in the ASSERT()-s inside cpumask_*() - * operations triggering (in debug builds). - * - * Therefore, if that is the case, we just skip the soft affinity - * balancing step all together. - */ - if ( balance_step == BALANCE_SOFT_AFFINITY && - (!has_soft_affinity(unit) || cpumask_empty(cpus)) ) - continue; - - /* If present, prefer vc's current processor */ - cpu = cpumask_test_cpu(sched_unit_master(unit), cpus) - ? 
sched_unit_master(unit) - : cpumask_cycle(sched_unit_master(unit), cpus); - ASSERT(cpumask_test_cpu(cpu, cpus)); - - /* - * Try to find an idle processor within the above constraints. - * - * In multi-core and multi-threaded CPUs, not all idle execution - * vehicles are equal! - * - * We give preference to the idle execution vehicle with the most - * idling neighbours in its grouping. This distributes work across - * distinct cores first and guarantees we don't do something stupid - * like run two UNITs on co-hyperthreads while there are idle cores - * or sockets. - * - * Notice that, when computing the "idleness" of cpu, we may want to - * discount unit. That is, iff unit is the currently running and the - * only runnable unit on cpu, we add cpu to the idlers. - */ - cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); - if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) ) - __cpumask_set_cpu(cpu, &idlers); - cpumask_and(cpus, &idlers, cpus); - - /* - * It is important that cpu points to an idle processor, if a suitable - * one exists (and we can use cpus to check and, possibly, choose a new - * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and - * cpu points to a busy thread with an idle sibling, both the threads - * will be considered the same, from the "idleness" calculation point - * of view", preventing unit from being moved to the thread that is - * actually idle. - * - * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so - * we check for it first. - */ - if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) ) - cpu = cpumask_cycle(cpu, cpus); - __cpumask_clear_cpu(cpu, cpus); - - while ( !cpumask_empty(cpus) ) - { - cpumask_t cpu_idlers; - cpumask_t nxt_idlers; - int nxt, weight_cpu, weight_nxt; - int migrate_factor; - - nxt = cpumask_cycle(cpu, cpus); - - if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) - { - /* We're on the same socket, so check the busy-ness of threads. - * Migrate if # of idlers is less at all */ - ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 1; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, - cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, - nxt)); - } - else - { - /* We're on different sockets, so check the busy-ness of cores. - * Migrate only if the other core is twice as idle */ - ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 2; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); - } - - weight_cpu = cpumask_weight(&cpu_idlers); - weight_nxt = cpumask_weight(&nxt_idlers); - /* smt_power_savings: consolidate work rather than spreading it */ - if ( sched_smt_power_savings ? 
- weight_cpu > weight_nxt : - weight_cpu * migrate_factor < weight_nxt ) - { - cpumask_and(&nxt_idlers, &nxt_idlers, cpus); - spc = CSCHED_PCPU(nxt); - cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); - cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu)); - } - else - { - cpumask_andnot(cpus, cpus, &nxt_idlers); - } - } - - /* Stop if cpu is idle */ - if ( cpumask_test_cpu(cpu, &idlers) ) - break; - } - - if ( commit && spc ) - spc->idle_bias = cpu; - - TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id, - cpu); - - return cpu; -} - -static struct sched_resource * -csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - struct csched_unit *svc = CSCHED_UNIT(unit); - - /* - * We have been called by vcpu_migrate() (in schedule.c), as part - * of the process of seeing if vc can be migrated to another pcpu. - * We make a note about this in svc->flags so that later, in - * csched_unit_wake() (still called from vcpu_migrate()) we won't - * get boosted, which we don't deserve as we are "only" migrating. - */ - set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); - return get_sched_res(_csched_cpu_pick(ops, unit, 1)); -} - -static inline void -__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - if ( list_empty(&svc->active_unit_elem) ) - { - SCHED_UNIT_STAT_CRANK(svc, state_active); - SCHED_STAT_CRANK(acct_unit_active); - - sdom->active_unit_count++; - list_add(&svc->active_unit_elem, &sdom->active_unit); - /* Make weight per-unit */ - prv->weight += sdom->weight; - if ( list_empty(&sdom->active_sdom_elem) ) - { - list_add(&sdom->active_sdom_elem, &prv->active_sdom); - } - } - - TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, - svc->unit->unit_id, sdom->active_unit_count); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static inline void -__csched_unit_acct_stop_locked(struct csched_private *prv, - struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - - BUG_ON( list_empty(&svc->active_unit_elem) ); - - SCHED_UNIT_STAT_CRANK(svc, state_idle); - SCHED_STAT_CRANK(acct_unit_idle); - - BUG_ON( prv->weight < sdom->weight ); - sdom->active_unit_count--; - list_del_init(&svc->active_unit_elem); - prv->weight -= sdom->weight; - if ( list_empty(&sdom->active_unit) ) - { - list_del_init(&sdom->active_sdom_elem); - } - - TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, - svc->unit->unit_id, sdom->active_unit_count); -} - -static void -csched_unit_acct(struct csched_private *prv, unsigned int cpu) -{ - struct sched_unit *currunit = current->sched_unit; - struct csched_unit * const svc = CSCHED_UNIT(currunit); - struct sched_resource *sr = get_sched_res(cpu); - const struct scheduler *ops = sr->scheduler; - - ASSERT( sched_unit_master(currunit) == cpu ); - ASSERT( svc->sdom != NULL ); - ASSERT( !is_idle_unit(svc->unit) ); - - /* - * If this UNIT's priority was boosted when it last awoke, reset it. - * If the UNIT is found here, then it's consuming a non-negligeable - * amount of CPU resources and should no longer be boosted. - */ - if ( svc->pri == CSCHED_PRI_TS_BOOST ) - { - svc->pri = CSCHED_PRI_TS_UNDER; - TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, - svc->unit->unit_id); - } - - /* - * Update credits - */ - burn_credits(svc, NOW()); - - /* - * Put this UNIT and domain back on the active list if it was - * idling. 
- */ - if ( list_empty(&svc->active_unit_elem) ) - { - __csched_unit_acct_start(prv, svc); - } - else - { - unsigned int new_cpu; - unsigned long flags; - spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags); - - /* - * If it's been active a while, check if we'd be better off - * migrating it to run elsewhere (see multi-core and multi-thread - * support in csched_res_pick()). - */ - new_cpu = _csched_cpu_pick(ops, currunit, 0); - - unit_schedule_unlock_irqrestore(lock, flags, currunit); - - if ( new_cpu != cpu ) - { - SCHED_UNIT_STAT_CRANK(svc, migrate_r); - SCHED_STAT_CRANK(migrate_running); - sched_set_pause_flags_atomic(currunit, _VPF_migrating); - /* - * As we are about to tickle cpu, we should clear its bit in - * idlers. But, if we are here, it means there is someone running - * on it, and hence the bit must be zero already. - */ - ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - } -} - -static void * -csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - struct csched_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct csched_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->runq_elem); - INIT_LIST_HEAD(&svc->active_unit_elem); - svc->sdom = dd; - svc->unit = unit; - svc->pri = is_idle_unit(unit) ? - CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; - SCHED_UNIT_STATS_RESET(svc); - SCHED_STAT_CRANK(unit_alloc); - return svc; -} - -static void -csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit *svc = unit->priv; - spinlock_t *lock; - - BUG_ON( is_idle_unit(unit) ); - - /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */ - lock = unit_schedule_lock_irq(unit); - - sched_set_res(unit, csched_res_pick(ops, unit)); - - spin_unlock_irq(lock); - - lock = unit_schedule_lock_irq(unit); - - if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running ) - runq_insert(svc); - - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_insert); -} - -static void -csched_free_udata(const struct scheduler *ops, void *priv) -{ - struct csched_unit *svc = priv; - - BUG_ON( !list_empty(&svc->runq_elem) ); - - xfree(svc); -} - -static void -csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_unit * const svc = CSCHED_UNIT(unit); - struct csched_dom * const sdom = svc->sdom; - - SCHED_STAT_CRANK(unit_remove); - - ASSERT(!__unit_on_runq(svc)); - - if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - SCHED_STAT_CRANK(unit_unpark); - sched_unit_unpause(svc->unit); - } - - spin_lock_irq(&prv->lock); - - if ( !list_empty(&svc->active_unit_elem) ) - __csched_unit_acct_stop_locked(prv, svc); - - spin_unlock_irq(&prv->lock); - - BUG_ON( sdom == NULL ); -} - -static void -csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - unsigned int cpu = sched_unit_master(unit); - struct sched_resource *sr = get_sched_res(cpu); - - SCHED_STAT_CRANK(unit_sleep); - - BUG_ON( is_idle_unit(unit) ); - - if ( curr_on_cpu(cpu) == unit ) - { - /* - * We are about to tickle cpu, so we should clear its bit in idlers. - * But, we are here because unit is going to sleep while running on cpu, - * so the bit must be zero already. 
- */ - ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - else if ( __unit_on_runq(svc) ) - runq_remove(svc); -} - -static void -csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - bool_t migrating; - - BUG_ON( is_idle_unit(unit) ); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - if ( unlikely(__unit_on_runq(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* - * We temporarily boost the priority of awaking UNITs! - * - * If this UNIT consumes a non negligible amount of CPU, it - * will eventually find itself in the credit accounting code - * path where its priority will be reset to normal. - * - * If on the other hand the UNIT consumes little CPU and is - * blocking and awoken a lot (doing I/O for example), its - * priority will remain boosted, optimizing it's wake-to-run - * latencies. - * - * This allows wake-to-run latency sensitive UNITs to preempt - * more CPU resource intensive UNITs without impacting overall - * system fairness. - * - * There are two cases, when we don't want to boost: - * - UNITs that are waking up after a migration, rather than - * after having block; - * - UNITs of capped domains unpausing after earning credits - * they had overspent. - */ - migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); - - if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && - !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id, - unit->unit_id); - SCHED_STAT_CRANK(unit_boost); - svc->pri = CSCHED_PRI_TS_BOOST; - } - - /* Put the UNIT on the runq and tickle CPUs */ - runq_insert(svc); - __runq_tickle(svc); -} - -static void -csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - - /* Let the scheduler know that this vcpu is trying to yield */ - set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags); -} - -static int -csched_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct csched_dom * const sdom = CSCHED_DOM(d); - struct csched_private *prv = CSCHED_PRIV(ops); - unsigned long flags; - int rc = 0; - - /* Protect both get and put branches with the pluggable scheduler - * lock. Runq lock not needed anywhere in here. */ - spin_lock_irqsave(&prv->lock, flags); - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - op->u.credit.weight = sdom->weight; - op->u.credit.cap = sdom->cap; - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - if ( op->u.credit.weight != 0 ) - { - if ( !list_empty(&sdom->active_sdom_elem) ) - { - prv->weight -= sdom->weight * sdom->active_unit_count; - prv->weight += op->u.credit.weight * sdom->active_unit_count; - } - sdom->weight = op->u.credit.weight; - } - - if ( op->u.credit.cap != (uint16_t)~0U ) - sdom->cap = op->u.credit.cap; - break; - default: - rc = -EINVAL; - break; - } - - spin_unlock_irqrestore(&prv->lock, flags); - - return rc; -} - -static void -csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, - const cpumask_t *hard, const cpumask_t *soft) -{ - struct csched_unit *svc = CSCHED_UNIT(unit); - - if ( !hard ) - return; - - /* Are we becoming exclusively pinned? 
*/ - if ( cpumask_weight(hard) == 1 ) - set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); - else - clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); -} - -static inline void -__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms) -{ - prv->tslice = MILLISECS(timeslice_ms); - prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; - if ( timeslice_ms < prv->ticks_per_tslice ) - prv->ticks_per_tslice = 1; - prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice; - prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms; - prv->credit = prv->credits_per_tslice * prv->ncpus; -} - -static int -csched_sys_cntl(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - int rc = -EINVAL; - struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; - struct csched_private *prv = CSCHED_PRIV(ops); - unsigned long flags; - - switch ( sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX - || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN - || (params->ratelimit_us - && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX - || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) - || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) - || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) - goto out; - - spin_lock_irqsave(&prv->lock, flags); - __csched_set_tslice(prv, params->tslice_ms); - if ( !prv->ratelimit && params->ratelimit_us ) - printk(XENLOG_INFO "Enabling context switch rate limiting\n"); - else if ( prv->ratelimit && !params->ratelimit_us ) - printk(XENLOG_INFO "Disabling context switch rate limiting\n"); - prv->ratelimit = MICROSECS(params->ratelimit_us); - prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us); - spin_unlock_irqrestore(&prv->lock, flags); - - /* FALLTHRU */ - case XEN_SYSCTL_SCHEDOP_getinfo: - params->tslice_ms = prv->tslice / MILLISECS(1); - params->ratelimit_us = prv->ratelimit / MICROSECS(1); - params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1); - rc = 0; - break; - } - out: - return rc; -} - -static void * -csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - struct csched_dom *sdom; - - sdom = xzalloc(struct csched_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - /* Initialize credit and weight */ - INIT_LIST_HEAD(&sdom->active_unit); - INIT_LIST_HEAD(&sdom->active_sdom_elem); - sdom->dom = dom; - sdom->weight = CSCHED_DEFAULT_WEIGHT; - - return sdom; -} - -static void -csched_free_domdata(const struct scheduler *ops, void *data) -{ - xfree(data); -} - -/* - * This is a O(n) optimized sort of the runq. - * - * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk - * through the runq and move up any UNDERs that are preceded by OVERS. We - * remember the last UNDER to make the move up operation O(1). 
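 *
 * [Editor's addition - an illustrative walk-through, not part of the
 *  original file.] As an example, take a runq that currently reads
 *      head -> O1 -> U1 -> O2 -> U2 -> O3
 *  (U = UNDER, O = OVER). One pass of the loop below moves U1 right
 *  after head, then U2 right after U1 (last_under tracks the tail of
 *  the UNDER prefix), giving
 *      head -> U1 -> U2 -> O1 -> O2 -> O3
 *  Each move is a constant-time list_del()/list_add() pair, and the
 *  relative order within each priority class is preserved.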
- */ -static void -csched_runq_sort(struct csched_private *prv, unsigned int cpu) -{ - struct csched_pcpu * const spc = CSCHED_PCPU(cpu); - struct list_head *runq, *elem, *next, *last_under; - struct csched_unit *svc_elem; - spinlock_t *lock; - unsigned long flags; - int sort_epoch; - - sort_epoch = prv->runq_sort; - if ( sort_epoch == spc->runq_sort_last ) - return; - - spc->runq_sort_last = sort_epoch; - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - runq = &spc->runq; - elem = runq->next; - last_under = runq; - - while ( elem != runq ) - { - next = elem->next; - svc_elem = __runq_elem(elem); - - if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) - { - /* does elem need to move up the runq? */ - if ( elem->prev != last_under ) - { - list_del(elem); - list_add(elem, last_under); - } - last_under = elem; - } - - elem = next; - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); -} - -static void -csched_acct(void* dummy) -{ - struct csched_private *prv = dummy; - unsigned long flags; - struct list_head *iter_unit, *next_unit; - struct list_head *iter_sdom, *next_sdom; - struct csched_unit *svc; - struct csched_dom *sdom; - uint32_t credit_total; - uint32_t weight_total; - uint32_t weight_left; - uint32_t credit_fair; - uint32_t credit_peak; - uint32_t credit_cap; - int credit_balance; - int credit_xtra; - int credit; - - - spin_lock_irqsave(&prv->lock, flags); - - weight_total = prv->weight; - credit_total = prv->credit; - - /* Converge balance towards 0 when it drops negative */ - if ( prv->credit_balance < 0 ) - { - credit_total -= prv->credit_balance; - SCHED_STAT_CRANK(acct_balance); - } - - if ( unlikely(weight_total == 0) ) - { - prv->credit_balance = 0; - spin_unlock_irqrestore(&prv->lock, flags); - SCHED_STAT_CRANK(acct_no_work); - goto out; - } - - SCHED_STAT_CRANK(acct_run); - - weight_left = weight_total; - credit_balance = 0; - credit_xtra = 0; - credit_cap = 0U; - - list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) - { - sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); - - BUG_ON( is_idle_domain(sdom->dom) ); - BUG_ON( sdom->active_unit_count == 0 ); - BUG_ON( sdom->weight == 0 ); - BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left ); - - weight_left -= ( sdom->weight * sdom->active_unit_count ); - - /* - * A domain's fair share is computed using its weight in competition - * with that of all other active domains. - * - * At most, a domain can use credits to run all its active UNITs - * for one full accounting period. We allow a domain to earn more - * only when the system-wide credit balance is negative. - */ - credit_peak = sdom->active_unit_count * prv->credits_per_tslice; - if ( prv->credit_balance < 0 ) - { - credit_peak += ( ( -prv->credit_balance - * sdom->weight - * sdom->active_unit_count) + - (weight_total - 1) - ) / weight_total; - } - - if ( sdom->cap != 0U ) - { - credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; - if ( credit_cap < credit_peak ) - credit_peak = credit_cap; - - /* FIXME -- set cap per-unit as well...? 
*/ - credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 ) - ) / sdom->active_unit_count; - } - - credit_fair = ( ( credit_total - * sdom->weight - * sdom->active_unit_count ) - + (weight_total - 1) - ) / weight_total; - - if ( credit_fair < credit_peak ) - { - credit_xtra = 1; - } - else - { - if ( weight_left != 0U ) - { - /* Give other domains a chance at unused credits */ - credit_total += ( ( ( credit_fair - credit_peak - ) * weight_total - ) + ( weight_left - 1 ) - ) / weight_left; - } - - if ( credit_xtra ) - { - /* - * Lazily keep domains with extra credits at the head of - * the queue to give others a chance at them in future - * accounting periods. - */ - SCHED_STAT_CRANK(acct_reorder); - list_del(&sdom->active_sdom_elem); - list_add(&sdom->active_sdom_elem, &prv->active_sdom); - } - - credit_fair = credit_peak; - } - - /* Compute fair share per UNIT */ - credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 ) - ) / sdom->active_unit_count; - - - list_for_each_safe( iter_unit, next_unit, &sdom->active_unit ) - { - svc = list_entry(iter_unit, struct csched_unit, active_unit_elem); - BUG_ON( sdom != svc->sdom ); - - /* Increment credit */ - atomic_add(credit_fair, &svc->credit); - credit = atomic_read(&svc->credit); - - /* - * Recompute priority or, if UNIT is idling, remove it from - * the active list. - */ - if ( credit < 0 ) - { - svc->pri = CSCHED_PRI_TS_OVER; - - /* Park running UNITs of capped-out domains */ - if ( sdom->cap != 0U && - credit < -credit_cap && - !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - SCHED_STAT_CRANK(unit_park); - sched_unit_pause_nosync(svc->unit); - } - - /* Lower bound on credits */ - if ( credit < -prv->credits_per_tslice ) - { - SCHED_STAT_CRANK(acct_min_credit); - credit = -prv->credits_per_tslice; - atomic_set(&svc->credit, credit); - } - } - else - { - svc->pri = CSCHED_PRI_TS_UNDER; - - /* Unpark any capped domains whose credits go positive */ - if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - /* - * It's important to unset the flag AFTER the unpause() - * call to make sure the UNIT's priority is not boosted - * if it is woken up here. - */ - SCHED_STAT_CRANK(unit_unpark); - sched_unit_unpause(svc->unit); - clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags); - } - - /* Upper bound on credits means UNIT stops earning */ - if ( credit > prv->credits_per_tslice ) - { - __csched_unit_acct_stop_locked(prv, svc); - /* Divide credits in half, so that when it starts - * accounting again, it starts a little bit "ahead" */ - credit /= 2; - atomic_set(&svc->credit, credit); - } - } - - SCHED_UNIT_STAT_SET(svc, credit_last, credit); - SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair); - credit_balance += credit; - } - } - - prv->credit_balance = credit_balance; - - spin_unlock_irqrestore(&prv->lock, flags); - - /* Inform each CPU that its runq needs to be sorted */ - prv->runq_sort++; - -out: - set_timer( &prv->master_ticker, NOW() + prv->tslice); -} - -static void -csched_tick(void *_cpu) -{ - unsigned int cpu = (unsigned long)_cpu; - struct sched_resource *sr = get_sched_res(cpu); - struct csched_pcpu *spc = CSCHED_PCPU(cpu); - struct csched_private *prv = CSCHED_PRIV(sr->scheduler); - - spc->tick++; - - /* - * Accounting for running UNIT - */ - if ( !is_idle_unit(current->sched_unit) ) - csched_unit_acct(prv, cpu); - - /* - * Check if runq needs to be sorted - * - * Every physical CPU resorts the runq after the accounting master has - * modified priorities. 
This is a special O(n) sort and runs at most - * once per accounting period (currently 30 milliseconds). - */ - csched_runq_sort(prv, cpu); - - set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); -} - -static struct csched_unit * -csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) -{ - struct sched_resource *sr = get_sched_res(cpu); - const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler); - const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); - struct csched_unit *speer; - struct list_head *iter; - struct sched_unit *unit; - - ASSERT(peer_pcpu != NULL); - - /* - * Don't steal from an idle CPU's runq because it's about to - * pick up work from it itself. - */ - if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) ) - goto out; - - list_for_each( iter, &peer_pcpu->runq ) - { - speer = __runq_elem(iter); - - /* - * If next available UNIT here is not of strictly higher - * priority than ours, this PCPU is useless to us. - */ - if ( speer->pri <= pri ) - break; - - /* Is this UNIT runnable on our PCPU? */ - unit = speer->unit; - BUG_ON( is_idle_unit(unit) ); - - /* - * If the unit is still in peer_cpu's scheduling tail, or if it - * has no useful soft affinity, skip it. - * - * In fact, what we want is to check if we have any "soft-affine - * work" to steal, before starting to look at "hard-affine work". - * - * Notice that, if not even one unit on this runq has a useful - * soft affinity, we could have avoid considering this runq for - * a soft balancing step in the first place. This, for instance, - * can be implemented by taking note of on what runq there are - * units with useful soft affinities in some sort of bitmap - * or counter. - */ - if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(unit)) ) - continue; - - affinity_balance_cpumask(unit, balance_step, cpumask_scratch); - if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) ) - { - /* We got a candidate. Grab it! */ - TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu, - unit->domain->domain_id, unit->unit_id); - SCHED_UNIT_STAT_CRANK(speer, migrate_q); - SCHED_STAT_CRANK(migrate_queued); - runq_remove(speer); - sched_set_res(unit, get_sched_res(cpu)); - /* - * speer will start executing directly on cpu, without having to - * go through runq_insert(). So we must update the runnable count - * for cpu here. - */ - inc_nr_runnable(cpu); - return speer; - } - } - out: - SCHED_STAT_CRANK(steal_peer_idle); - return NULL; -} - -static struct csched_unit * -csched_load_balance(struct csched_private *prv, int cpu, - struct csched_unit *snext, bool *stolen) -{ - struct cpupool *c = get_sched_res(cpu)->cpupool; - struct csched_unit *speer; - cpumask_t workers; - cpumask_t *online = c->res_valid; - int peer_cpu, first_cpu, peer_node, bstep; - int node = cpu_to_node(cpu); - - BUG_ON(get_sched_res(cpu) != snext->unit->res); - - /* - * If this CPU is going offline, or is not (yet) part of any cpupool - * (as it happens, e.g., during cpu bringup), we shouldn't steal work. - */ - if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) ) - goto out; - - if ( snext->pri == CSCHED_PRI_IDLE ) - SCHED_STAT_CRANK(load_balance_idle); - else if ( snext->pri == CSCHED_PRI_TS_OVER ) - SCHED_STAT_CRANK(load_balance_over); - else - SCHED_STAT_CRANK(load_balance_other); - - /* - * Let's look around for work to steal, taking both hard affinity - * and soft affinity into account. More specifically, we check all - * the non-idle CPUs' runq, looking for: - * 1. 
any "soft-affine work" to steal first, - * 2. if not finding anything, any "hard-affine work" to steal. - */ - for_each_affinity_balance_step( bstep ) - { - /* - * We peek at the non-idling CPUs in a node-wise fashion. In fact, - * it is more likely that we find some affine work on our same - * node, not to mention that migrating units within the same node - * could well expected to be cheaper than across-nodes (memory - * stays local, there might be some node-wide cache[s], etc.). - */ - peer_node = node; - do - { - /* Select the pCPUs in this node that have work we can steal. */ - cpumask_andnot(&workers, online, prv->idlers); - cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); - __cpumask_clear_cpu(cpu, &workers); - - first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); - if ( first_cpu >= nr_cpu_ids ) - goto next_node; - peer_cpu = first_cpu; - do - { - spinlock_t *lock; - - /* - * If there is only one runnable unit on peer_cpu, it means - * there's no one to be stolen in its runqueue, so skip it. - * - * Checking this without holding the lock is racy... But that's - * the whole point of this optimization! - * - * In more details: - * - if we race with dec_nr_runnable(), we may try to take the - * lock and call csched_runq_steal() for no reason. This is - * not a functional issue, and should be infrequent enough. - * And we can avoid that by re-checking nr_runnable after - * having grabbed the lock, if we want; - * - if we race with inc_nr_runnable(), we skip a pCPU that may - * have runnable units in its runqueue, but that's not a - * problem because: - * + if racing with csched_unit_insert() or csched_unit_wake(), - * __runq_tickle() will be called afterwords, so the unit - * won't get stuck in the runqueue for too long; - * + if racing with csched_runq_steal(), it may be that an - * unit that we could have picked up, stays in a runqueue - * until someone else tries to steal it again. But this is - * no worse than what can happen already (without this - * optimization), it the pCPU would schedule right after we - * have taken the lock, and hence block on it. - */ - if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) - { - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); - goto next_cpu; - } - - /* - * Get ahold of the scheduler lock for this peer CPU. - * - * Note: We don't spin on this lock but simply try it. Spinning - * could cause a deadlock if the peer CPU is also load - * balancing and trying to lock this CPU. - */ - lock = pcpu_schedule_trylock(peer_cpu); - SCHED_STAT_CRANK(steal_trylock); - if ( !lock ) - { - SCHED_STAT_CRANK(steal_trylock_failed); - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); - goto next_cpu; - } - - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); - - /* Any work over there to steal? */ - speer = cpumask_test_cpu(peer_cpu, online) ? - csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; - pcpu_schedule_unlock(lock, peer_cpu); - - /* As soon as one unit is found, balancing ends */ - if ( speer != NULL ) - { - *stolen = true; - /* - * Next time we'll look for work to steal on this node, we - * will start from the next pCPU, with respect to this one, - * so we don't risk stealing always from the same ones. 
- */ - prv->balance_bias[peer_node] = peer_cpu; - return speer; - } - - next_cpu: - peer_cpu = cpumask_cycle(peer_cpu, &workers); - - } while( peer_cpu != first_cpu ); - - next_node: - peer_node = cycle_node(peer_node, node_online_map); - } while( peer_node != node ); - } - - out: - /* Failed to find more important work elsewhere... */ - __runq_remove(snext); - return snext; -} - -/* - * This function is in the critical path. It is designed to be simple and - * fast for the common case. - */ -static void csched_schedule( - const struct scheduler *ops, struct sched_unit *unit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu); - struct list_head * const runq = RUNQ(sched_cpu); - struct csched_unit * const scurr = CSCHED_UNIT(unit); - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_unit *snext; - s_time_t runtime, tslice; - bool migrated = false; - - SCHED_STAT_CRANK(schedule); - CSCHED_UNIT_CHECK(unit); - - /* - * Here in Credit1 code, we usually just call TRACE_nD() helpers, and - * don't care about packing. But scheduling happens very often, so it - * actually is important that the record is as small as possible. - */ - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, tasklet:8, idle:8; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - d.idle = is_idle_unit(unit); - __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), - (unsigned char *)&d); - } - - runtime = now - unit->state_entry_time; - if ( runtime < 0 ) /* Does this ever happen? */ - runtime = 0; - - if ( !is_idle_unit(unit) ) - { - /* Update credits of a non-idle UNIT. */ - burn_credits(scurr, now); - scurr->start_time -= now; - scurr->last_sched_time = now; - } - else - { - /* Re-instate a boosted idle UNIT as normal-idle. */ - scurr->pri = CSCHED_PRI_IDLE; - } - - /* Choices, choices: - * - If we have a tasklet, we need to run the idle unit no matter what. - * - If sched rate limiting is in effect, and the current unit has - * run for less than that amount of time, continue the current one, - * but with a shorter timeslice and return it immediately - * - Otherwise, chose the one with the highest priority (which may - * be the one currently running) - * - If the currently running one is TS_OVER, see if there - * is a higher priority one waiting on the runqueue of another - * cpu and steal it. - */ - - /* - * If we have schedule rate limiting enabled, check to see - * how long we've run for. - * - * If scurr is yielding, however, we don't let rate limiting kick in. - * In fact, it may be the case that scurr is about to spin, and there's - * no point forcing it to do so until rate limiting expires. - */ - if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags) - && !tasklet_work_scheduled - && prv->ratelimit - && unit_runnable_state(unit) - && !is_idle_unit(unit) - && runtime < prv->ratelimit ) - { - snext = scurr; - snext->start_time += now; - perfc_incr(delay_ms); - /* - * Next timeslice must last just until we'll have executed for - * ratelimit. However, to avoid setting a really short timer, which - * will most likely be inaccurate and counterproductive, we never go - * below CSCHED_MIN_TIMER. 
- */ - tslice = prv->ratelimit - runtime; - if ( unlikely(runtime < CSCHED_MIN_TIMER) ) - tslice = CSCHED_MIN_TIMER; - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned runtime; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.runtime = runtime; - __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), - (unsigned char *)&d); - } - - goto out; - } - tslice = prv->tslice; - - /* - * Select next runnable local UNIT (ie top of local runq) - */ - if ( unit_runnable(unit) ) - __runq_insert(scurr); - else - { - BUG_ON( is_idle_unit(unit) || list_empty(runq) ); - /* Current has blocked. Update the runnable counter for this cpu. */ - dec_nr_runnable(sched_cpu); - } - - /* - * Clear YIELD flag before scheduling out - */ - clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags); - - do { - snext = __runq_elem(runq->next); - - /* Tasklet work (which runs in idle UNIT context) overrides all else. */ - if ( tasklet_work_scheduled ) - { - TRACE_0D(TRC_CSCHED_SCHED_TASKLET); - snext = CSCHED_UNIT(sched_idle_unit(sched_cpu)); - snext->pri = CSCHED_PRI_TS_BOOST; - } - - /* - * SMP Load balance: - * - * If the next highest priority local runnable UNIT has already eaten - * through its credits, look on other PCPUs to see if we have more - * urgent work... If not, csched_load_balance() will return snext, but - * already removed from the runq. - */ - if ( snext->pri > CSCHED_PRI_TS_OVER ) - __runq_remove(snext); - else - snext = csched_load_balance(prv, sched_cpu, snext, &migrated); - - } while ( !unit_runnable_state(snext->unit) ); - - /* - * Update idlers mask if necessary. When we're idling, other CPUs - * will tickle us when they get extra work. - */ - if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) - { - if ( !cpumask_test_cpu(sched_cpu, prv->idlers) ) - cpumask_set_cpu(sched_cpu, prv->idlers); - } - else if ( cpumask_test_cpu(sched_cpu, prv->idlers) ) - { - cpumask_clear_cpu(sched_cpu, prv->idlers); - } - - if ( !is_idle_unit(snext->unit) ) - snext->start_time += now; - -out: - /* - * Return task to run next... - */ - unit->next_time = (is_idle_unit(snext->unit) ? - -1 : tslice); - unit->next_task = snext->unit; - snext->unit->migrated = migrated; - - /* Stop credit tick when going to idle, restart it when coming from idle. 
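 *
 * [Editor's addition - worked example, not part of the original file.]
 * The restart expression below aligns the tick with the next period
 * boundary instead of waiting a full period from 'now': with a tick
 * period of 10000us and 'now' at 123456us on the same time base,
 * now % period = 3456us, so the timer is set to fire at
 * 123456 + 10000 - 3456 = 130000us, i.e. the next multiple of the
 * tick period.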
*/ - if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) ) - stop_timer(&spc->ticker); - if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) ) - set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) - - now % MICROSECS(prv->tick_period_us) ); - - CSCHED_UNIT_CHECK(unit->next_task); -} - -static void -csched_dump_unit(struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - - printk("[%i.%i] pri=%i flags=%x cpu=%i", - svc->unit->domain->domain_id, - svc->unit->unit_id, - svc->pri, - svc->flags, - sched_unit_master(svc->unit)); - - if ( sdom ) - { - printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), - sdom->weight, sdom->cap); -#ifdef CSCHED_STATS - printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", - svc->stats.credit_last, - svc->stats.credit_incr, - svc->stats.state_active, - svc->stats.state_idle, - svc->stats.migrate_q, - svc->stats.migrate_r, - svc->stats.kicked_away); -#endif - } - - printk("\n"); -} - -static void -csched_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct list_head *runq, *iter; - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_pcpu *spc; - struct csched_unit *svc; - spinlock_t *lock; - unsigned long flags; - int loop; - - /* - * We need both locks: - * - csched_dump_unit() wants to access domains' scheduling - * parameters, which are protected by the private scheduler lock; - * - we scan through the runqueue, so we need the proper runqueue - * lock (the one of the runqueue of this cpu). - */ - spin_lock_irqsave(&prv->lock, flags); - lock = pcpu_schedule_lock(cpu); - - spc = CSCHED_PCPU(cpu); - runq = &spc->runq; - - printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n", - cpu, spc->nr_runnable, spc->runq_sort_last, - CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - - /* current UNIT (nothing to say if that's the idle unit). 
*/ - svc = CSCHED_UNIT(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - printk("\trun: "); - csched_dump_unit(svc); - } - - loop = 0; - list_for_each( iter, runq ) - { - svc = __runq_elem(iter); - if ( svc ) - { - printk("\t%3d: ", ++loop); - csched_dump_unit(svc); - } - } - - pcpu_schedule_unlock(lock, cpu); - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void -csched_dump(const struct scheduler *ops) -{ - struct list_head *iter_sdom, *iter_svc; - struct csched_private *prv = CSCHED_PRIV(ops); - int loop; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - printk("info:\n" - "\tncpus = %u\n" - "\tmaster = %u\n" - "\tcredit = %u\n" - "\tcredit balance = %d\n" - "\tweight = %u\n" - "\trunq_sort = %u\n" - "\tdefault-weight = %d\n" - "\ttslice = %"PRI_stime"ms\n" - "\tratelimit = %"PRI_stime"us\n" - "\tcredits per msec = %d\n" - "\tticks per tslice = %d\n" - "\tmigration delay = %"PRI_stime"us\n", - prv->ncpus, - prv->master, - prv->credit, - prv->credit_balance, - prv->weight, - prv->runq_sort, - CSCHED_DEFAULT_WEIGHT, - prv->tslice / MILLISECS(1), - prv->ratelimit / MICROSECS(1), - CSCHED_CREDITS_PER_MSEC, - prv->ticks_per_tslice, - prv->unit_migr_delay/ MICROSECS(1)); - - printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers)); - - printk("active units:\n"); - loop = 0; - list_for_each( iter_sdom, &prv->active_sdom ) - { - struct csched_dom *sdom; - sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); - - list_for_each( iter_svc, &sdom->active_unit ) - { - struct csched_unit *svc; - spinlock_t *lock; - - svc = list_entry(iter_svc, struct csched_unit, active_unit_elem); - lock = unit_schedule_lock(svc->unit); - - printk("\t%3d: ", ++loop); - csched_dump_unit(svc); - - unit_schedule_unlock(lock, svc->unit); - } - } - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static int __init -csched_global_init(void) -{ - if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || - sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) - { - printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" - " Resetting to default %u\n", - XEN_SYSCTL_CSCHED_TSLICE_MIN, - XEN_SYSCTL_CSCHED_TSLICE_MAX, - CSCHED_DEFAULT_TSLICE_MS); - sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; - } - - if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) - printk("WARNING: sched_ratelimit_us >" - "sched_credit_tslice_ms is undefined\n" - "Setting ratelimit to tslice\n"); - - if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) - { - vcpu_migration_delay_us = 0; - printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n" - "Resetting to default: %u\n", - XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us); - } - - return 0; -} - -static int -csched_init(struct scheduler *ops) -{ - struct csched_private *prv; - - prv = xzalloc(struct csched_private); - if ( prv == NULL ) - return -ENOMEM; - - prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); - if ( prv->balance_bias == NULL ) - { - xfree(prv); - return -ENOMEM; - } - - if ( !zalloc_cpumask_var(&prv->cpus) || - !zalloc_cpumask_var(&prv->idlers) ) - { - free_cpumask_var(prv->cpus); - xfree(prv->balance_bias); - xfree(prv); - return -ENOMEM; - } - - ops->sched_data = prv; - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->active_sdom); - prv->master = UINT_MAX; - - __csched_set_tslice(prv, sched_credit_tslice_ms); - - if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) - prv->ratelimit = prv->tslice; - else - 
prv->ratelimit = MICROSECS(sched_ratelimit_us); - - prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us); - - return 0; -} - -static void -csched_deinit(struct scheduler *ops) -{ - struct csched_private *prv; - - prv = CSCHED_PRIV(ops); - if ( prv != NULL ) - { - ops->sched_data = NULL; - free_cpumask_var(prv->cpus); - free_cpumask_var(prv->idlers); - xfree(prv->balance_bias); - xfree(prv); - } -} - -static const struct scheduler sched_credit_def = { - .name = "SMP Credit Scheduler", - .opt_name = "credit", - .sched_id = XEN_SCHEDULER_CREDIT, - .sched_data = NULL, - - .global_init = csched_global_init, - - .insert_unit = csched_unit_insert, - .remove_unit = csched_unit_remove, - - .sleep = csched_unit_sleep, - .wake = csched_unit_wake, - .yield = csched_unit_yield, - - .adjust = csched_dom_cntl, - .adjust_affinity= csched_aff_cntl, - .adjust_global = csched_sys_cntl, - - .pick_resource = csched_res_pick, - .do_schedule = csched_schedule, - - .dump_cpu_state = csched_dump_pcpu, - .dump_settings = csched_dump, - .init = csched_init, - .deinit = csched_deinit, - .alloc_udata = csched_alloc_udata, - .free_udata = csched_free_udata, - .alloc_pdata = csched_alloc_pdata, - .init_pdata = csched_init_pdata, - .deinit_pdata = csched_deinit_pdata, - .free_pdata = csched_free_pdata, - .switch_sched = csched_switch_sched, - .alloc_domdata = csched_alloc_domdata, - .free_domdata = csched_free_domdata, -}; - -REGISTER_SCHEDULER(sched_credit_def); diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c deleted file mode 100644 index f7c477053c..0000000000 --- a/xen/common/sched_credit2.c +++ /dev/null @@ -1,4122 +0,0 @@ - -/**************************************************************************** - * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd - **************************************************************************** - * - * File: common/sched_credit2.c - * Author: George Dunlap - * - * Description: Credit-based SMP CPU scheduler - * Based on an earlier verson by Emmanuel Ackaouy. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Meant only for helping developers during debugging. */ -/* #define d2printk printk */ -#define d2printk(x...) - - -/* - * Credit2 tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. 
- */ -#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) -#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) -#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) -#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) -#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) -#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) -#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) -#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) -#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) -#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) -#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) -#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) -#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) -#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) -#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) -#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) -#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) -#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) -#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) -#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) -#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) -#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) - -/* - * TODO: - * + Hyperthreading - * - "Discount" time run on a thread with busy siblings - * + Algorithm: - * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., - * a flash animation in the background) can we schedule it with low enough latency - * so that audio doesn't skip? - * + Optimizing - * - Profiling, making new algorithms, making math more efficient (no long division) - */ - -/* - * Design: - * - * VMs "burn" credits based on their weight; higher weight means - * credits burn more slowly. The highest weight unit burns credits at - * a rate of 1 credit per nanosecond. Others burn proportionally - * more. - * - * units are inserted into the runqueue by credit order. - * - * Credits are "reset" when the next unit in the runqueue is less than - * or equal to zero. At that point, everyone's credits are "clipped" - * to a small value, and a fixed credit is added to everyone. - */ - -/* - * Utilization cap: - * - * Setting an pCPU utilization cap for a domain means the following: - * - * - a domain can have a cap, expressed in terms of % of physical CPU time. - * A domain that must not use more than 1/4 of _one_ physical CPU, will - * be given a cap of 25%; a domain that must not use more than 1+1/2 of - * physical CPU time, will be given a cap of 150%; - * - * - caps are per-domain (not per-unit). If a domain has only 1 unit, and - * a 40% cap, that one unit will use 40% of one pCPU. If a somain has 4 - * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be - * split among the v units. How much each of the units will actually get, - * during any given interval of time, is unspecified (as it depends on - * various aspects: workload, system load, etc.). For instance, it is - * possible that, during a given time interval, 2 units use 100% each, - * and the other two use nothing; while during another time interval, - * two units use 80%, one uses 10% and the other 30%; or that each use - * 50% (and so on and so forth). 
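 *
 * [Editor's addition - a worked figure, not part of the original
 *  file.] In terms of actual CPU time, with the default 10 ms
 *  replenishment period (see opt_cap_period below), a 40% cap
 *  corresponds to 4 ms of pCPU time per period for the whole domain,
 *  and a 200% cap to 20 ms per period, however many units the domain
 *  has.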
- * - * For implementing this, we use the following approach: - * - * - each domain is given a 'budget', an each domain has a timer, which - * replenishes the domain's budget periodically. The budget is the amount - * of time the units of the domain can use every 'period'; - * - * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains - * (but each domain has its own timer; so the all are periodic by the same - * period, but replenishment of the budgets of the various domains, at - * periods boundaries, are not synchronous); - * - * - when units run, they consume budget. When they don't run, they don't - * consume budget. If there is no budget left for the domain, no unit of - * that domain can run. If an unit tries to run and finds that there is no - * budget, it blocks. - * At whatever time an unit wants to run, it must check the domain's budget, - * and if there is some, it can use it. - * - * - budget is replenished to the top of the capacity for the domain once - * per period. Even if there was some leftover budget from previous period, - * though, the budget after a replenishment will always be at most equal - * to the total capacify of the domain ('tot_budget'); - * - * - when a budget replenishment occurs, if there are units that had been - * blocked because of lack of budget, they'll be unblocked, and they will - * (potentially) be able to run again. - * - * Finally, some even more implementation related detail: - * - * - budget is stored in a domain-wide pool. Units of the domain that want - * to run go to such pool, and grub some. When they do so, the amount - * they grabbed is _immediately_ removed from the pool. This happens in - * unit_grab_budget(); - * - * - when units stop running, if they've not consumed all the budget they - * took, the leftover is put back in the pool. This happens in - * unit_return_budget(); - * - * - the above means that an unit can find out that there is no budget and - * block, not only if the cap has actually been reached (for this period), - * but also if some other units, in order to run, have grabbed a certain - * quota of budget, no matter whether they've already used it all or not. - * An unit blocking because (any form of) lack of budget is said to be - * "parked", and such blocking happens in park_unit(); - * - * - when an unit stops running, and puts back some budget in the domain pool, - * we need to check whether there is someone which has been parked and that - * can be unparked. This happens in unpark_parked_units(), called from - * csched2_context_saved(); - * - * - of course, unparking happens also as a consequence of the domain's budget - * being replenished by the periodic timer. This also occurs by means of - * calling csched2_context_saved() (but from replenish_domain_budget()); - * - * - parked units of a domain are kept in a (per-domain) list, called - * 'parked_units'). Manipulation of the list and of the domain-wide budget - * pool, must occur only when holding the 'budget_lock'. - */ - -/* - * Locking: - * - * - runqueue lock - * + it is per-runqueue, so: - * * cpus in a runqueue take the runqueue lock, when using - * pcpu_schedule_lock() / unit_schedule_lock() (and friends), - * * a cpu may (try to) take a "remote" runqueue lock, e.g., for - * load balancing; - * + serializes runqueue operations (removing and inserting units); - * + protects runqueue-wide data in csched2_runqueue_data; - * + protects unit parameters in csched2_unit for the unit in the - * runqueue. 
- * - * - Private scheduler lock - * + protects scheduler-wide data in csched2_private, such as: - * * the list of domains active in this scheduler, - * * what cpus and what runqueues are active and in what - * runqueue each cpu is; - * + serializes the operation of changing the weights of domains; - * - * - Budget lock - * + it is per-domain; - * + protects, in domains that have an utilization cap; - * * manipulation of the total budget of the domain (as it is shared - * among all units of the domain), - * * manipulation of the list of units that are blocked waiting for - * some budget to be available. - * - * - Type: - * + runqueue locks are 'regular' spinlocks; - * + the private scheduler lock can be an rwlock. In fact, data - * it protects is modified only during initialization, cpupool - * manipulation and when changing weights, and read in all - * other cases (e.g., during load balancing); - * + budget locks are 'regular' spinlocks. - * - * Ordering: - * + tylock must be used when wanting to take a runqueue lock, - * if we already hold another one; - * + if taking both a runqueue lock and the private scheduler - * lock is, the latter must always be taken for first; - * + if taking both a runqueue lock and a budget lock, the former - * must always be taken for first. - */ - -/* - * Basic constants - */ -/* Default weight: How much a new domain starts with. */ -#define CSCHED2_DEFAULT_WEIGHT 256 -/* - * Min timer: Minimum length a timer will be set, to - * achieve efficiency. - */ -#define CSCHED2_MIN_TIMER MICROSECS(500) -/* - * Amount of credit VMs begin with, and are reset to. - * ATM, set so that highest-weight VMs can only run for 10ms - * before a reset event. - */ -#define CSCHED2_CREDIT_INIT MILLISECS(10) -/* - * Amount of credit the idle units have. It never changes, as idle - * units does not consume credits, and it must be lower than whatever - * amount of credit 'regular' unit would end up with. - */ -#define CSCHED2_IDLE_CREDIT (-(1U<<30)) -/* - * Carryover: How much "extra" credit may be carried over after - * a reset. - */ -#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER -/* - * Stickiness: Cross-L2 migration resistance. Should be less than - * MIN_TIMER. - */ -#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) -/* How much to "compensate" an unit for L2 migration. */ -#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50) -/* How tolerant we should be when peeking at runtime of units on other cpus */ -#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50) -/* Reset: Value below which credit will be reset. */ -#define CSCHED2_CREDIT_RESET 0 -/* Max timer: Maximum time a guest can be run for. */ -#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT -/* Period of the cap replenishment timer. */ -#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) - -/* - * Flags - */ -/* - * CSFLAG_scheduled: Is this unit either running on, or context-switching off, - * a physical cpu? - * + Accessed only with runqueue lock held - * + Set when chosen as next in csched2_schedule(). - * + Cleared after context switch has been saved in csched2_context_saved() - * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should - * set CSFLAG_delayed_runq_add - * + Checked to be false in runq_insert. - */ -#define __CSFLAG_scheduled 1 -#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) -/* - * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done - * being context switched out? 
- * + Set when scheduling out in csched2_schedule() if prev is runnable - * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set - * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and - * clears the bit. - */ -#define __CSFLAG_delayed_runq_add 2 -#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) -/* - * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a - * credit2-initiated runq migrate request; migrate it to the runqueue indicated - * in the svc struct. - */ -#define __CSFLAG_runq_migrate_request 3 -#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) -/* - * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The - * scheduler is invoked to see if we can give the cpu to someone else, and - * get back to the yielding unit in a while. - */ -#define __CSFLAG_unit_yield 4 -#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield) -/* - * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard - * affinity set to one and only 1 cpu (and, hence, can only run there). - */ -#define __CSFLAG_pinned 5 -#define CSFLAG_pinned (1U<<__CSFLAG_pinned) - -static unsigned int __read_mostly opt_migrate_resist = 500; -integer_param("sched_credit2_migrate_resist", opt_migrate_resist); - -/* - * Load tracking and load balancing - * - * Load history of runqueues and units is accounted for by using an - * exponential weighted moving average algorithm. However, instead of using - * fractions,we shift everything to left by the number of bits we want to - * use for representing the fractional part (Q-format). - * - * We may also want to reduce the precision of time accounting, to - * accommodate 'longer windows'. So, if that is the case, we just need to - * shift all time samples to the right. - * - * The details of the formulas used for load tracking are explained close to - * update_runq_load(). Let's just say here that, with full nanosecond time - * granularity, a 30 bits wide 'decaying window' is ~1 second long. - * - * We want to consider the following equations: - * - * avg[0] = load*P - * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W - * - * where W is the length of the window, P the multiplier for transitiong into - * Q-format fixed point arithmetic and load is the instantaneous load of a - * runqueue, which basically is the number of runnable units there are on the - * runqueue (for the meaning of the other terms, look at the doc comment to - * update_runq_load()). - * - * So, again, with full nanosecond granularity, and 1 second window, we have: - * - * W = 2^30 - * P = 2^18 - * - * The maximum possible value for the average load, which we want to store in - * s_time_t type variables (i.e., we have 63 bits available) is load*P. This - * means that, with P 18 bits wide, load can occupy 45 bits. This in turn - * means we can have 2^45 units in each runqueue, before overflow occurs! - * - * However, it can happen that, at step j+1, if: - * - * avg[j] = load*P - * delta = W - * - * then: - * - * avg[j+i] = avg[j] + W*load*P/W - W*load*P/W - * - * So we must be able to deal with W*load*P. This means load can't be higher - * than: - * - * 2^(63 - 30 - 18) = 2^15 = 32768 - * - * So 32768 is the maximum number of units the we can have in a runqueue, - * at any given time, and still not have problems with the load tracking - * calculations... and this is more than fine. - * - * As a matter of fact, since we are using microseconds granularity, we have - * W=2^20. 
So, still with 18 fractional bits and a 1 second long window, there - * may be 2^25 = 33554432 units in a runq before we have to start thinking - * about overflow. - */ - -/* If >0, decreases the granularity of time samples used for load tracking. */ -#define LOADAVG_GRANULARITY_SHIFT (10) -/* Time window during which we still give value to previous load history. */ -#define LOADAVG_WINDOW_SHIFT (30) -/* 18 bits by default (and not less than 4) for decimals. */ -#define LOADAVG_PRECISION_SHIFT (18) -#define LOADAVG_PRECISION_SHIFT_MIN (4) - -/* - * Both the length of the window and the number of fractional bits can be - * decided with boot parameters. - * - * The length of the window is always expressed in nanoseconds. The actual - * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. - */ -static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; -integer_param("credit2_load_window_shift", opt_load_window_shift); -static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; -integer_param("credit2_load_precision_shift", opt_load_precision_shift); - -static int __read_mostly opt_underload_balance_tolerance = 0; -integer_param("credit2_balance_under", opt_underload_balance_tolerance); -static int __read_mostly opt_overload_balance_tolerance = -3; -integer_param("credit2_balance_over", opt_overload_balance_tolerance); -/* - * Domains subject to a cap receive a replenishment of their runtime budget - * once every opt_cap_period interval. Default is 10 ms. The amount of budget - * they receive depends on their cap. For instance, a domain with a 50% cap - * will receive 50% of 10 ms, so 5 ms. - */ -static unsigned int __read_mostly opt_cap_period = 10; /* ms */ -integer_param("credit2_cap_period_ms", opt_cap_period); - -/* - * Runqueue organization. - * - * The various cpus are to be assigned each one to a runqueue, and we - * want that to happen basing on topology. At the moment, it is possible - * to choose to arrange runqueues to be: - * - * - per-cpu: meaning that there will be one runqueue per logical cpu. This - * will happen when if the opt_runqueue parameter is set to 'cpu'. - * - * - per-core: meaning that there will be one runqueue per each physical - * core of the host. This will happen if the opt_runqueue - * parameter is set to 'core'; - * - * - per-socket: meaning that there will be one runqueue per each physical - * socket (AKA package, which often, but not always, also - * matches a NUMA node) of the host; This will happen if - * the opt_runqueue parameter is set to 'socket'; - * - * - per-node: meaning that there will be one runqueue per each physical - * NUMA node of the host. This will happen if the opt_runqueue - * parameter is set to 'node'; - * - * - global: meaning that there will be only one runqueue to which all the - * (logical) processors of the host belong. This will happen if - * the opt_runqueue parameter is set to 'all'. - * - * Depending on the value of opt_runqueue, therefore, cpus that are part of - * either the same physical core, the same physical socket, the same NUMA - * node, or just all of them, will be put together to form runqueues. 
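 *
 * [Editor's addition - usage note, not part of the original file.]
 * The arrangement is selected with the "credit2_runqueue" boot
 * parameter handled below: for instance, booting Xen with
 * "credit2_runqueue=core" gives one runqueue per physical core, while
 * "credit2_runqueue=all" collapses everything into a single, global
 * runqueue. When the parameter is not given, opt_runqueue defaults to
 * per-socket runqueues.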
- */ -#define OPT_RUNQUEUE_CPU 0 -#define OPT_RUNQUEUE_CORE 1 -#define OPT_RUNQUEUE_SOCKET 2 -#define OPT_RUNQUEUE_NODE 3 -#define OPT_RUNQUEUE_ALL 4 -static const char *const opt_runqueue_str[] = { - [OPT_RUNQUEUE_CPU] = "cpu", - [OPT_RUNQUEUE_CORE] = "core", - [OPT_RUNQUEUE_SOCKET] = "socket", - [OPT_RUNQUEUE_NODE] = "node", - [OPT_RUNQUEUE_ALL] = "all" -}; -static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; - -static int __init parse_credit2_runqueue(const char *s) -{ - unsigned int i; - - for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) - { - if ( !strcmp(s, opt_runqueue_str[i]) ) - { - opt_runqueue = i; - return 0; - } - } - - return -EINVAL; -} -custom_param("credit2_runqueue", parse_credit2_runqueue); - -/* - * Per-runqueue data - */ -struct csched2_runqueue_data { - spinlock_t lock; /* Lock for this runqueue */ - - struct list_head runq; /* Ordered list of runnable vms */ - unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */ - int id; /* ID of this runqueue (-1 if invalid) */ - - int load; /* Instantaneous load (num of non-idle units) */ - s_time_t load_last_update; /* Last time average was updated */ - s_time_t avgload; /* Decaying queue load */ - s_time_t b_avgload; /* Decaying queue load modified by balancing */ - - cpumask_t active, /* CPUs enabled for this runqueue */ - smt_idle, /* Fully idle-and-untickled cores (see below) */ - tickled, /* Have been asked to go through schedule */ - idle; /* Currently idle pcpus */ - - struct list_head svc; /* List of all units assigned to the runqueue */ - unsigned int max_weight; /* Max weight of the units in this runqueue */ - unsigned int pick_bias; /* Last picked pcpu. Start from it next time */ -}; - -/* - * System-wide private data - */ -struct csched2_private { - rwlock_t lock; /* Private scheduler lock */ - - unsigned int load_precision_shift; /* Precision of load calculations */ - unsigned int load_window_shift; /* Lenght of load decaying window */ - unsigned int ratelimit_us; /* Rate limiting for this scheduler */ - - cpumask_t active_queues; /* Runqueues with (maybe) active cpus */ - struct csched2_runqueue_data *rqd; /* Data of the various runqueues */ - - cpumask_t initialized; /* CPUs part of this scheduler */ - struct list_head sdom; /* List of domains (for debug key) */ -}; - -/* - * Physical CPU - */ -struct csched2_pcpu { - cpumask_t sibling_mask; /* Siblings in the same runqueue */ - int runq_id; -}; - -/* - * Schedule Unit - */ -struct csched2_unit { - struct csched2_dom *sdom; /* Up-pointer to domain */ - struct sched_unit *unit; /* Up-pointer, to schedule unit */ - struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ - - int credit; /* Current amount of credit */ - unsigned int weight; /* Weight of this unit */ - unsigned int residual; /* Reminder of div(max_weight/weight) */ - unsigned flags; /* Status flags (16 bits would be ok, */ - s_time_t budget; /* Current budget (if domains has cap) */ - /* but clear_bit() does not like that) */ - s_time_t budget_quota; /* Budget to which unit is entitled */ - - s_time_t start_time; /* Time we were scheduled (for credit) */ - - /* Individual contribution to load */ - s_time_t load_last_update; /* Last time average was updated */ - s_time_t avgload; /* Decaying queue load */ - - struct list_head runq_elem; /* On the runqueue (rqd->runq) */ - struct list_head parked_elem; /* On the parked_units list */ - struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ - struct csched2_runqueue_data *migrate_rqd; /* Pre-determined 
migr. target */ - int tickled_cpu; /* Cpu that will pick us (-1 if none) */ -}; - -/* - * Domain - */ -struct csched2_dom { - struct domain *dom; /* Up-pointer to domain */ - - spinlock_t budget_lock; /* Serialized budget calculations */ - s_time_t tot_budget; /* Total amount of budget */ - s_time_t budget; /* Currently available budget */ - - struct timer repl_timer; /* Timer for periodic replenishment of budget */ - s_time_t next_repl; /* Time at which next replenishment occurs */ - struct list_head parked_units; /* List of CPUs waiting for budget */ - - struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */ - uint16_t weight; /* User specified weight */ - uint16_t cap; /* User specified cap */ - uint16_t nr_units; /* Number of units of this domain */ -}; - -/* - * Accessor helpers functions. - */ -static inline struct csched2_private *csched2_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu) -{ - return get_sched_res(cpu)->sched_priv; -} - -static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline struct csched2_dom *csched2_dom(const struct domain *d) -{ - return d->sched_priv; -} - -/* CPU to runq_id macro */ -static inline int c2r(unsigned int cpu) -{ - return csched2_pcpu(cpu)->runq_id; -} - -/* CPU to runqueue struct macro */ -static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops, - unsigned int cpu) -{ - return &csched2_priv(ops)->rqd[c2r(cpu)]; -} - -/* Does the domain of this unit have a cap? */ -static inline bool has_cap(const struct csched2_unit *svc) -{ - return svc->budget != STIME_MAX; -} - -/* - * Hyperthreading (SMT) support. - * - * We use a special per-runq mask (smt_idle) and update it according to the - * following logic: - * - when _all_ the SMT sibling in a core are idle, all their corresponding - * bits are set in the smt_idle mask; - * - when even _just_one_ of the SMT siblings in a core is not idle, all the - * bits correspondings to it and to all its siblings are clear in the - * smt_idle mask. - * - * Once we have such a mask, it is easy to implement a policy that, either: - * - uses fully idle cores first: it is enough to try to schedule the units - * on pcpus from smt_idle mask first. This is what happens if - * sched_smt_power_savings was not set at boot (default), and it maximizes - * true parallelism, and hence performance; - * - uses already busy cores first: it is enough to try to schedule the units - * on pcpus that are idle, but are not in smt_idle. This is what happens if - * sched_smt_power_savings is set at boot, and it allows as more cores as - * possible to stay in low power states, minimizing power consumption. - * - * This logic is entirely implemented in runq_tickle(), and that is enough. - * In fact, in this scheduler, placement of an unit on one of the pcpus of a - * runq, _always_ happens by means of tickling: - * - when an unit wakes up, it calls csched2_unit_wake(), which calls - * runq_tickle(); - * - when a migration is initiated in schedule.c, we call csched2_res_pick(), - * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake(). - * csched2_res_pick() looks for the least loaded runq and return just any - * of its processors. 
Then, csched2_unit_migrate() just moves the unit to - * the chosen runq, and it is again runq_tickle(), called by - * csched2_unit_wake() that actually decides what pcpu to use within the - * chosen runq; - * - when a migration is initiated in sched_credit2.c, by calling migrate() - * directly, that again temporarily use a random pcpu from the new runq, - * and then calls runq_tickle(), by itself. - */ - -/* - * If all the siblings of cpu (including cpu itself) are both idle and - * untickled, set all their bits in mask. - * - * NB that rqd->smt_idle is different than rqd->idle. rqd->idle - * records pcpus that at are merely idle (i.e., at the moment do not - * have an unit running on them). But you have to manually filter out - * which pcpus have been tickled in order to find cores that are not - * going to be busy soon. Filtering out tickled cpus pairwise is a - * lot of extra pain; so for rqd->smt_idle, we explicitly make so that - * the bits of a pcpu are set only if all the threads on its core are - * both idle *and* untickled. - * - * This means changing the mask when either rqd->idle or rqd->tickled - * changes. - */ -static inline -void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, - cpumask_t *mask) -{ - const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; - - if ( cpumask_subset(cpu_siblings, idlers) ) - cpumask_or(mask, mask, cpu_siblings); -} - -/* - * Clear the bits of all the siblings of cpu from mask (if necessary). - */ -static inline -void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) -{ - const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; - - if ( cpumask_subset(cpu_siblings, mask) ) - cpumask_andnot(mask, mask, cpu_siblings); -} - -/* - * In csched2_res_pick(), it may not be possible to actually look at remote - * runqueues (the trylock-s on their spinlocks can fail!). If that happens, - * we pick, in order of decreasing preference: - * 1) svc's current pcpu, if it is part of svc's soft affinity; - * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; - * 3) svc's current pcpu, if it is part of svc's hard affinity; - * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; - * 5) just one valid pcpu from svc's hard affinity - * - * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also - * note that at least 5 is guaranteed to _always_ return at least one pcpu. - */ -static int get_fallback_cpu(struct csched2_unit *svc) -{ - struct sched_unit *unit = svc->unit; - unsigned int bs; - - SCHED_STAT_CRANK(need_fallback_cpu); - - for_each_affinity_balance_step( bs ) - { - int cpu = sched_unit_master(unit); - - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - cpupool_domain_master_cpumask(unit->domain)); - - /* - * This is cases 1 or 3 (depending on bs): if processor is (still) - * in our affinity, go for it, for cache betterness. - */ - if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) - return cpu; - - /* - * This is cases 2 or 4 (depending on bs): v->processor isn't there - * any longer, check if we at least can stay in our current runq. 
- */ - if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), - &svc->rqd->active)) ) - { - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &svc->rqd->active); - return cpumask_first(cpumask_scratch_cpu(cpu)); - } - - /* - * We may well pick any valid pcpu from our soft-affinity, outside - * of our current runqueue, but we decide not to. In fact, changing - * runqueue is slow, affects load distribution, and is a source of - * overhead for the units running on the other runqueue (we need the - * lock). So, better do that as a consequence of a well informed - * decision (or if we really don't have any other chance, as we will, - * at step 5, if we get to there). - * - * Also, being here, looking for a fallback, is an unfortunate and - * infrequent event, while the decision of putting us in the runqueue - * wehere we are was (likely) made taking all the relevant factors - * into account. So let's not disrupt that, just for the sake of - * soft-affinity, and let's wait here to be able to made (hopefully, - * soon), another similar well informed decision. - */ - if ( bs == BALANCE_SOFT_AFFINITY ) - continue; - - /* - * This is cases 5: last stand, just one valid pcpu from our hard - * affinity. It's guaranteed that there is at least one valid cpu, - * and therefore we are sure that we return it, and never really - * exit the loop. - */ - ASSERT(bs == BALANCE_HARD_AFFINITY && - !cpumask_empty(cpumask_scratch_cpu(cpu))); - cpu = cpumask_first(cpumask_scratch_cpu(cpu)); - if ( likely(cpu < nr_cpu_ids) ) - return cpu; - } - ASSERT_UNREACHABLE(); - /* - * We can't be here. But if that somehow happen (in non-debug builds), - * at least return something which both online and in our hard-affinity. - */ - return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit))); -} - -/* - * Time-to-credit, credit-to-time. - * - * We keep track of the "residual" time to make sure that frequent short - * schedules still get accounted for in the end. - * - * FIXME: Do pre-calculated division? - */ -static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time, - struct csched2_unit *svc) -{ - uint64_t val = time * rqd->max_weight + svc->residual; - - svc->residual = do_div(val, svc->weight); - svc->credit -= val; -} - -static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc) -{ - return credit * svc->weight / rqd->max_weight; -} - -/* - * Runqueue related code. 
- */ - -static inline int unit_on_runq(struct csched2_unit *svc) -{ - return !list_empty(&svc->runq_elem); -} - -static inline struct csched2_unit * runq_elem(struct list_head *elem) -{ - return list_entry(elem, struct csched2_unit, runq_elem); -} - -static void activate_runqueue(struct csched2_private *prv, int rqi) -{ - struct csched2_runqueue_data *rqd; - - rqd = prv->rqd + rqi; - - BUG_ON(!cpumask_empty(&rqd->active)); - - rqd->max_weight = 1; - rqd->id = rqi; - INIT_LIST_HEAD(&rqd->svc); - INIT_LIST_HEAD(&rqd->runq); - spin_lock_init(&rqd->lock); - - __cpumask_set_cpu(rqi, &prv->active_queues); -} - -static void deactivate_runqueue(struct csched2_private *prv, int rqi) -{ - struct csched2_runqueue_data *rqd; - - rqd = prv->rqd + rqi; - - BUG_ON(!cpumask_empty(&rqd->active)); - - rqd->id = -1; - - __cpumask_clear_cpu(rqi, &prv->active_queues); -} - -static inline bool same_node(unsigned int cpua, unsigned int cpub) -{ - return cpu_to_node(cpua) == cpu_to_node(cpub); -} - -static inline bool same_socket(unsigned int cpua, unsigned int cpub) -{ - return cpu_to_socket(cpua) == cpu_to_socket(cpub); -} - -static inline bool same_core(unsigned int cpua, unsigned int cpub) -{ - return same_socket(cpua, cpub) && - cpu_to_core(cpua) == cpu_to_core(cpub); -} - -static unsigned int -cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu) -{ - struct csched2_runqueue_data *rqd; - unsigned int rqi; - - for ( rqi = 0; rqi < nr_cpu_ids; rqi++ ) - { - unsigned int peer_cpu; - - /* - * As soon as we come across an uninitialized runqueue, use it. - * In fact, either: - * - we are initializing the first cpu, and we assign it to - * runqueue 0. This is handy, especially if we are dealing - * with the boot cpu (if credit2 is the default scheduler), - * as we would not be able to use cpu_to_socket() and similar - * helpers anyway (they're result of which is not reliable yet); - * - we have gone through all the active runqueues, and have not - * found anyone whose cpus' topology matches the one we are - * dealing with, so activating a new runqueue is what we want. - */ - if ( prv->rqd[rqi].id == -1 ) - break; - - rqd = prv->rqd + rqi; - BUG_ON(cpumask_empty(&rqd->active)); - - peer_cpu = cpumask_first(&rqd->active); - BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID || - cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); - - if (opt_runqueue == OPT_RUNQUEUE_CPU) - continue; - if ( opt_runqueue == OPT_RUNQUEUE_ALL || - (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || - (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || - (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) ) - break; - } - - /* We really expect to be able to assign each cpu to a runqueue. */ - BUG_ON(rqi >= nr_cpu_ids); - - return rqi; -} - -/* Find the domain with the highest weight. 
*/ -static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, - int old_weight) -{ - /* Try to avoid brute-force search: - * - If new_weight is larger, max_weigth <- new_weight - * - If old_weight != max_weight, someone else is still max_weight - * (No action required) - * - If old_weight == max_weight, brute-force search for max weight - */ - if ( new_weight > rqd->max_weight ) - { - rqd->max_weight = new_weight; - SCHED_STAT_CRANK(upd_max_weight_quick); - } - else if ( old_weight == rqd->max_weight ) - { - struct list_head *iter; - int max_weight = 1; - - list_for_each( iter, &rqd->svc ) - { - struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem); - - if ( svc->weight > max_weight ) - max_weight = svc->weight; - } - - rqd->max_weight = max_weight; - SCHED_STAT_CRANK(upd_max_weight_full); - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned rqi:16, max_weight:16; - } d; - d.rqi = rqd->id; - d.max_weight = rqd->max_weight; - __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -/* Add and remove from runqueue assignment (not active run queue) */ -static void -_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd) -{ - - svc->rqd = rqd; - list_add_tail(&svc->rqd_elem, &svc->rqd->svc); - - update_max_weight(svc->rqd, svc->weight, 0); - - /* Expected new load based on adding this unit */ - rqd->b_avgload += svc->avgload; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned rqi:16; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.rqi=rqd->id; - __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, - sizeof(d), - (unsigned char *)&d); - } - -} - -static void -runq_assign(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - - ASSERT(svc->rqd == NULL); - - _runq_assign(svc, c2rqd(ops, sched_unit_master(unit))); -} - -static void -_runq_deassign(struct csched2_unit *svc) -{ - struct csched2_runqueue_data *rqd = svc->rqd; - - ASSERT(!unit_on_runq(svc)); - ASSERT(!(svc->flags & CSFLAG_scheduled)); - - list_del_init(&svc->rqd_elem); - update_max_weight(rqd, 0, svc->weight); - - /* Expected new load based on removing this unit */ - rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); - - svc->rqd = NULL; -} - -static void -runq_deassign(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - _runq_deassign(svc); -} - -/* - * Track the runq load by gathering instantaneous load samples, and using - * exponentially weighted moving average (EWMA) for the 'decaying'. - * - * We consider a window of length W=2^(prv->load_window_shift) nsecs - * (which takes LOADAVG_GRANULARITY_SHIFT into account). - * - * If load is the instantaneous load, the formula for EWMA looks as follows, - * for the i-eth sample: - * - * avg[i] = a*load + (1 - a)*avg[i-1] - * - * where avg[i] is the new value of the average load, avg[i-1] is the value - * of the average load calculated so far, and a is a coefficient less or - * equal to 1. - * - * So, for us, it becomes: - * - * avgload = a*load + (1 - a)*avgload - * - * For determining a, we consider _when_ we are doing the load update, wrt - * the length of the window. 
We define delta as follows: - * - * delta = t - load_last_update - * - * where t is current time (i.e., time at which we are both sampling and - * updating the load average) and load_last_update is the last time we did - * that. - * - * There are two possible situations: - * - * a) delta <= W - * this means that, during the last window of length W, the runeuque load - * was avgload for (W - detla) time, and load for delta time: - * - * |----------- W ---------| - * | | - * | load_last_update t - * -------------------------|---------|--- - * | | | - * \__W - delta__/\_delta__/ - * | | | - * |___avgload___|__load___| - * - * So, what about using delta/W as our smoothing coefficient a. If we do, - * here's what happens: - * - * a = delta / W - * 1 - a = 1 - (delta / W) = (W - delta) / W - * - * Which matches the above description of what happened in the last - * window of length W. - * - * Note that this also means that the weight that we assign to both the - * latest load sample, and to previous history, varies at each update. - * The longer the latest load sample has been in efect, within the last - * window, the higher it weights (and the lesser the previous history - * weights). - * - * This is some sort of extension of plain EWMA to fit even better to our - * use case. - * - * b) delta > W - * this means more than a full window has passed since the last update: - * - * |----------- W ---------| - * | | - * load_last_update t - * ----|------------------------------|--- - * | | - * \_________________delta________/ - * - * Basically, it means the last load sample has been in effect for more - * than W time, and hence we should just use it, and forget everything - * before that. - * - * This can be seen as a 'reset condition', occurring when, for whatever - * reason, load has not been updated for longer than we expected. (It is - * also how avgload is assigned its first value.) - * - * The formula for avgload then becomes: - * - * avgload = (delta/W)*load + (W - delta)*avgload/W - * avgload = delta*load/W + W*avgload/W - delta*avgload/W - * avgload = avgload + delta*load/W - delta*avgload/W - * - * So, final form is: - * - * avgload_0 = load - * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W - * - * As a confirmation, let's look at the extremes, when delta is 0 (i.e., - * what happens if we update the load twice, at the same time instant?): - * - * avgload = avgload + 0*load/W - 0*avgload/W - * avgload = avgload - * - * and when delta is W (i.e., what happens if we update at the last - * possible instant before the window 'expires'?): - * - * avgload = avgload + W*load/W - W*avgload/W - * avgload = avgload + load - avgload - * avgload = load - * - * Which, in both cases, is what we expect. - */ -static void -update_runq_load(const struct scheduler *ops, - struct csched2_runqueue_data *rqd, int change, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - s_time_t delta, load = rqd->load; - unsigned int P, W; - - W = prv->load_window_shift; - P = prv->load_precision_shift; - now >>= LOADAVG_GRANULARITY_SHIFT; - - /* - * To avoid using fractions, we shift to left by load_precision_shift, - * and use the least last load_precision_shift bits as fractional part. 
- * Looking back at the formula we want to use, we now have: - * - * P = 2^(load_precision_shift) - * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) - * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W - * - * And if we are ok storing and using P*avgload, we can rewrite this as: - * - * P*avgload = avgload' - * avgload' = avgload' + delta*P*load/W - delta*avgload'/W - * - * Coupled with, of course: - * - * avgload_0' = P*load - */ - - if ( rqd->load_last_update + (1ULL << W) < now ) - { - rqd->avgload = load << P; - rqd->b_avgload = load << P; - } - else - { - delta = now - rqd->load_last_update; - if ( unlikely(delta < 0) ) - { - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", - __func__, now, rqd->load_last_update); - delta = 0; - } - - /* - * Note that, if we were to enforce (or check) some relationship - * between P and W, we may save one shift. E.g., if we are sure - * that P < W, we could write: - * - * (delta * (load << P)) >> W - * - * as: - * - * (delta * load) >> (W - P) - */ - rqd->avgload = rqd->avgload + - ((delta * (load << P)) >> W) - - ((delta * rqd->avgload) >> W); - rqd->b_avgload = rqd->b_avgload + - ((delta * (load << P)) >> W) - - ((delta * rqd->b_avgload) >> W); - } - rqd->load += change; - rqd->load_last_update = now; - - /* Overflow, capable of making the load look negative, must not occur. */ - ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t rq_avgload, b_avgload; - unsigned rq_load:16, rq_id:8, shift:8; - } d; - d.rq_id = rqd->id; - d.rq_load = rqd->load; - d.rq_avgload = rqd->avgload; - d.b_avgload = rqd->b_avgload; - d.shift = P; - __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static void -update_svc_load(const struct scheduler *ops, - struct csched2_unit *svc, int change, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - s_time_t delta, unit_load; - unsigned int P, W; - - if ( change == -1 ) - unit_load = 1; - else if ( change == 1 ) - unit_load = 0; - else - unit_load = unit_runnable(svc->unit); - - W = prv->load_window_shift; - P = prv->load_precision_shift; - now >>= LOADAVG_GRANULARITY_SHIFT; - - if ( svc->load_last_update + (1ULL << W) < now ) - { - svc->avgload = unit_load << P; - } - else - { - delta = now - svc->load_last_update; - if ( unlikely(delta < 0) ) - { - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", - __func__, now, svc->load_last_update); - delta = 0; - } - - svc->avgload = svc->avgload + - ((delta * (unit_load << P)) >> W) - - ((delta * svc->avgload) >> W); - } - svc->load_last_update = now; - - /* Overflow, capable of making the load look negative, must not occur. 
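As a side note, the fixed-point EWMA update spelled out in the comment above can be condensed into a few lines. The following is only an illustrative, self-contained sketch; struct ewma_state, ewma_update() and the two shift parameters are hypothetical names, not part of the scheduler:

    #include <stdint.h>

    struct ewma_state {
        int64_t avgload;      /* P-scaled average load */
        int64_t last_update;  /* time of the last sample, window-granularity units */
    };

    /*
     * One EWMA step, as derived in the comment above:
     *   avgload' += delta*(load << P)/W - delta*avgload'/W
     * with a full reset when more than one window (W = 2^window_shift)
     * has passed. window_shift and precision_shift are hypothetical inputs.
     */
    static void ewma_update(struct ewma_state *st, int64_t now, unsigned int load,
                            unsigned int window_shift, unsigned int precision_shift)
    {
        if ( st->last_update + (1LL << window_shift) < now )
        {
            /* Reset condition: forget history, keep only the latest sample. */
            st->avgload = (int64_t)load << precision_shift;
        }
        else
        {
            int64_t delta = now - st->last_update;

            st->avgload += ((delta * ((int64_t)load << precision_shift))
                            >> window_shift)
                           - ((delta * st->avgload) >> window_shift);
        }
        st->last_update = now;
    }

With delta == 0 the average is left unchanged, and with delta equal to a full window it collapses to the latest sample, matching the two extreme cases worked out above.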
*/ - ASSERT(svc->avgload >= 0); - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t v_avgload; - unsigned unit:16, dom:16; - unsigned shift; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.v_avgload = svc->avgload; - d.shift = P; - __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static void -update_load(const struct scheduler *ops, - struct csched2_runqueue_data *rqd, - struct csched2_unit *svc, int change, s_time_t now) -{ - trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); - - update_runq_load(ops, rqd, change, now); - if ( svc ) - update_svc_load(ops, svc, change, now); -} - -static void -runq_insert(const struct scheduler *ops, struct csched2_unit *svc) -{ - struct list_head *iter; - unsigned int cpu = sched_unit_master(svc->unit); - struct list_head * runq = &c2rqd(ops, cpu)->runq; - int pos = 0; - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - ASSERT(!unit_on_runq(svc)); - ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit))); - - ASSERT(&svc->rqd->runq == runq); - ASSERT(!is_idle_unit(svc->unit)); - ASSERT(!svc->unit->is_running); - ASSERT(!(svc->flags & CSFLAG_scheduled)); - - list_for_each( iter, runq ) - { - struct csched2_unit * iter_svc = runq_elem(iter); - - if ( svc->credit > iter_svc->credit ) - break; - - pos++; - } - list_add_tail(&svc->runq_elem, iter); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned pos; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.pos = pos; - __trace_var(TRC_CSCHED2_RUNQ_POS, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static inline void runq_remove(struct csched2_unit *svc) -{ - ASSERT(unit_on_runq(svc)); - list_del_init(&svc->runq_elem); -} - -void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t); - -static inline void -tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) -{ - __cpumask_set_cpu(cpu, &rqd->tickled); - smt_idle_mask_clear(cpu, &rqd->smt_idle); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); -} - -/* - * What we want to know is whether svc, which we assume to be running on some - * pcpu, can be interrupted and preempted (which, so far, basically means - * whether or not it already run for more than the ratelimit, to which we - * apply some tolerance). - */ -static inline bool is_preemptable(const struct csched2_unit *svc, - s_time_t now, s_time_t ratelimit) -{ - if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) - return true; - - ASSERT(svc->unit->is_running); - return now - svc->unit->state_entry_time > - ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; -} - -/* - * Score to preempt the target cpu. Return a negative number if the - * credit isn't high enough; if it is, favor a preemption on cpu in - * this order: - * - cpu is in new's soft-affinity, not in cur's soft-affinity - * (2 x CSCHED2_CREDIT_INIT score bonus); - * - cpu is in new's soft-affinity and cur's soft-affinity, or - * cpu is not in new's soft-affinity, nor in cur's soft-affinity - * (1x CSCHED2_CREDIT_INIT score bonus); - * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity - * (no bonus). - * - * Within the same class, the highest difference of credit. 
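For illustration only, the three classes listed above can be read as the following bonus arithmetic; the real computation is tickle_score() just below, and the boolean parameters here merely stand in for the cpumask_test_cpu() checks on the two soft-affinity masks:

    /*
     * Illustrative only: credit_init stands in for CSCHED2_CREDIT_INIT,
     * and score is the (already migrate-resist adjusted) credit difference.
     */
    static long long apply_soft_affinity_bonus(long long score,
                                               long long credit_init,
                                               int cpu_in_new_soft,
                                               int cpu_in_cur_soft)
    {
        if ( score <= 0 )
            return score;             /* not enough credit: no preemption */

        if ( cpu_in_new_soft )
            score += credit_init;     /* good for the unit being woken */
        if ( !cpu_in_cur_soft )
            score += credit_init;     /* does not hurt the running unit */

        return score;
    }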
- */ -static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, - struct csched2_unit *new, unsigned int cpu) -{ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu)); - struct csched2_private *prv = csched2_priv(ops); - s_time_t score; - - /* - * We are dealing with cpus that are marked non-idle (i.e., that are not - * in rqd->idle). However, some of them may be running their idle unit, - * if taking care of tasklets. In that case, we want to leave it alone. - */ - if ( unlikely(is_idle_unit(cur->unit) || - !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) - return -1; - - burn_credits(rqd, cur, now); - - score = new->credit - cur->credit; - if ( sched_unit_master(new->unit) != cpu ) - score -= CSCHED2_MIGRATE_RESIST; - - /* - * If score is positive, it means new has enough credits (i.e., - * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). - * - * Let's compute the bonuses for soft-affinities. - */ - if ( score > 0 ) - { - if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) ) - score += CSCHED2_CREDIT_INIT; - - if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) ) - score += CSCHED2_CREDIT_INIT; - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit, score; - } d; - d.dom = cur->unit->domain->domain_id; - d.unit = cur->unit->unit_id; - d.credit = cur->credit; - d.score = score; - __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - return score; -} - -/* - * Check what processor it is best to 'wake', for picking up an unit that has - * just been put (back) in the runqueue. Logic is as follows: - * 1. if there are idle processors in the runq, wake one of them; - * 2. if there aren't idle processor, check the one were the unit was - * running before to see if we can preempt what's running there now - * (and hence doing just one migration); - * 3. last stand: check all processors and see if the unit is in right - * of preempting any of the other units running on them (this requires - * two migrations, and that's indeed why it is left as the last stand). - * - * Note that when we say 'idle processors' what we really mean is (pretty - * much always) both _idle_ and _not_already_tickled_. In fact, if a - * processor has been tickled, it will run csched2_schedule() shortly, and - * pick up some work, so it would be wrong to consider it idle. - */ -static void -runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now) -{ - int i, ipid = -1; - s_time_t max = 0; - struct sched_unit *unit = new->unit; - unsigned int bs, cpu = sched_unit_master(unit); - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); - cpumask_t mask; - - ASSERT(new->rqd == rqd); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned processor; - int credit; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.processor = cpu; - d.credit = new->credit; - __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* - * Exclusive pinning is when an unit has hard-affinity with only one - * cpu, and there is no other unit that has hard-affinity with that - * same cpu. This is infrequent, but if it happens, is for achieving - * the most possible determinism, and least possible overhead for - * the units in question. 
- * - * Try to identify the vast majority of these situations, and deal - * with them quickly. - */ - if ( unlikely((new->flags & CSFLAG_pinned) && - cpumask_test_cpu(cpu, &rqd->idle) && - !cpumask_test_cpu(cpu, &rqd->tickled)) ) - { - ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); - SCHED_STAT_CRANK(tickled_idle_cpu_excl); - ipid = cpu; - goto tickle; - } - - for_each_affinity_balance_step( bs ) - { - /* Just skip first step, if we don't have a soft affinity */ - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - - /* - * First of all, consider idle cpus, checking if we can just - * re-use the pcpu where we were running before. - * - * If there are cores where all the siblings are idle, consider - * them first, honoring whatever the spreading-vs-consolidation - * SMT policy wants us to do. - */ - if ( unlikely(sched_smt_power_savings) ) - { - cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); - cpumask_and(&mask, &mask, online); - } - else - cpumask_and(&mask, &rqd->smt_idle, online); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - i = cpumask_test_or_cycle(cpu, &mask); - if ( i < nr_cpu_ids ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - ipid = i; - goto tickle; - } - - /* - * If there are no fully idle cores, check all idlers, after - * having filtered out pcpus that have been tickled but haven't - * gone through the scheduler yet. - */ - cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - i = cpumask_test_or_cycle(cpu, &mask); - if ( i < nr_cpu_ids ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - ipid = i; - goto tickle; - } - } - - /* - * Note that, if we are here, it means we have done the hard-affinity - * balancing step of the loop, and hence what we have in cpumask_scratch - * is what we put there for last, i.e., new's unit_hard_affinity & online - * which is exactly what we need for the next part of the function. - */ - - /* - * Otherwise, look for the non-idle (and non-tickled) processors with - * the lowest credit, among the ones new is allowed to run on. Again, - * the cpu were it was running on would be the best candidate. - * - * For deciding which cpu to tickle, we use tickle_score(), which will - * factor in both new's soft-affinity, and the soft-affinity of the - * unit running on each cpu that we consider. 
- */ - cpumask_andnot(&mask, &rqd->active, &rqd->idle); - cpumask_andnot(&mask, &mask, &rqd->tickled); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) - { - s_time_t score = tickle_score(ops, now, new, cpu); - - if ( score > max ) - { - max = score; - ipid = cpu; - - /* If this is in new's soft affinity, just take it */ - if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) ) - { - SCHED_STAT_CRANK(tickled_busy_cpu); - goto tickle; - } - } - } - - for_each_cpu(i, &mask) - { - s_time_t score; - - /* Already looked at this one above */ - ASSERT(i != cpu); - - score = tickle_score(ops, now, new, i); - - if ( score > max ) - { - max = score; - ipid = i; - } - } - - if ( ipid == -1 ) - { - SCHED_STAT_CRANK(tickled_no_cpu); - return; - } - - ASSERT(!is_idle_unit(curr_on_cpu(ipid))); - SCHED_STAT_CRANK(tickled_busy_cpu); - tickle: - BUG_ON(ipid == -1); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, pad:16; - } d; - d.cpu = ipid; d.pad = 0; - __trace_var(TRC_CSCHED2_TICKLE, 1, - sizeof(d), - (unsigned char *)&d); - } - - tickle_cpu(ipid, rqd); - - if ( unlikely(new->tickled_cpu != -1) ) - SCHED_STAT_CRANK(tickled_cpu_overwritten); - new->tickled_cpu = ipid; -} - -/* - * Credit-related code - */ -static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, - struct csched2_unit *snext) -{ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct list_head *iter; - int m; - - /* - * Under normal circumstances, snext->credit should never be less - * than -CSCHED2_MIN_TIMER. However, under some circumstances, an - * unit with low credits may be allowed to run long enough that - * its credits are actually less than -CSCHED2_CREDIT_INIT. - * (Instances have been observed, for example, where an unit with - * 200us of credit was allowed to run for 11ms, giving it -10.8ms - * of credit. Thus it was still negative even after the reset.) - * - * If this is the case for snext, we simply want to keep moving - * everyone up until it is in the black again. This fair because - * none of the other units want to run at the moment. - * - * Rather than looping, however, we just calculate a multiplier, - * avoiding an integer division and multiplication in the common - * case. - */ - m = 1; - if ( snext->credit < -CSCHED2_CREDIT_INIT ) - m += (-snext->credit) / CSCHED2_CREDIT_INIT; - - list_for_each( iter, &rqd->svc ) - { - unsigned int svc_cpu; - struct csched2_unit * svc; - int start_credit; - - svc = list_entry(iter, struct csched2_unit, rqd_elem); - svc_cpu = sched_unit_master(svc->unit); - - ASSERT(!is_idle_unit(svc->unit)); - ASSERT(svc->rqd == rqd); - - /* - * If svc is running, it is our responsibility to make sure, here, - * that the credit it has spent so far get accounted. - */ - if ( svc->unit == curr_on_cpu(svc_cpu) ) - { - burn_credits(rqd, svc, now); - /* - * And, similarly, in case it has run out of budget, as a - * consequence of this round of accounting, we also must inform - * its pCPU that it's time to park it, and pick up someone else. - */ - if ( unlikely(svc->budget <= 0) ) - tickle_cpu(svc_cpu, rqd); - } - - start_credit = svc->credit; - - /* - * Add INIT * m, avoiding integer multiplication in the common case. 
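As a purely illustrative aside, the multiplier described above can be isolated as below (credit_init is a stand-in for CSCHED2_CREDIT_INIT). For example, with credit_init = 10 and snext at -25 credits, m becomes 3, so every unit is topped up by 3*credit_init and snext ends at +5, i.e. back in the black:

    static int reset_multiplier(long long snext_credit, long long credit_init)
    {
        int m = 1;

        /* Only units that overran by more than one full allotment need m > 1. */
        if ( snext_credit < -credit_init )
            m += (-snext_credit) / credit_init;

        return m;
    }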
- */ - if ( likely(m==1) ) - svc->credit += CSCHED2_CREDIT_INIT; - else - svc->credit += m * CSCHED2_CREDIT_INIT; - - /* "Clip" credits to max carryover */ - if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) - svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; - - svc->start_time = now; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit_start, credit_end; - unsigned multiplier; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.credit_start = start_credit; - d.credit_end = svc->credit; - d.multiplier = m; - __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, - sizeof(d), - (unsigned char *)&d); - } - } - - SCHED_STAT_CRANK(credit_reset); - - /* No need to resort runqueue, as everyone's order should be the same. */ -} - -void burn_credits(struct csched2_runqueue_data *rqd, - struct csched2_unit *svc, s_time_t now) -{ - s_time_t delta; - - ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit)))); - - if ( unlikely(is_idle_unit(svc->unit)) ) - { - ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); - return; - } - - delta = now - svc->start_time; - - if ( unlikely(delta <= 0) ) - { - if ( unlikely(delta < 0) ) - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime - " start_time %"PRI_stime"\n", __func__, now, - svc->start_time); - goto out; - } - - SCHED_STAT_CRANK(burn_credits_t2c); - t2c_update(rqd, delta, svc); - - if ( has_cap(svc) ) - svc->budget -= delta; - - svc->start_time = now; - - out: - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit, budget; - int delta; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.credit = svc->credit; - d.budget = has_cap(svc) ? svc->budget : INT_MIN; - d.delta = delta; - __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -/* - * Budget-related code. - */ - -static void park_unit(struct csched2_unit *svc) -{ - struct sched_unit *unit = svc->unit; - - ASSERT(spin_is_locked(&svc->sdom->budget_lock)); - - /* - * It was impossible to find budget for this unit, so it has to be - * "parked". This implies it is not runnable, so we mark it as such in - * its pause_flags. If the unit is currently scheduled (which means we - * are here after being called from within csched_schedule()), flagging - * is enough, as we'll choose someone else, and then context_saved() - * will take care of updating the load properly. - * - * If, OTOH, the unit is sitting in the runqueue (which means we are here - * after being called from within runq_candidate()), we must go all the - * way down to taking it out of there, and updating the load accordingly. - * - * In both cases, we also add it to the list of parked units of the domain. - */ - sched_set_pause_flags(unit, _VPF_parked); - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); - } - list_add(&svc->parked_elem, &svc->sdom->parked_units); -} - -static bool unit_grab_budget(struct csched2_unit *svc) -{ - struct csched2_dom *sdom = svc->sdom; - unsigned int cpu = sched_unit_master(svc->unit); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - if ( svc->budget > 0 ) - return true; - - /* budget_lock nests inside runqueue lock. */ - spin_lock(&sdom->budget_lock); - - /* - * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if - * above!). 
That basically means the unit has overrun a bit --because of - * various reasons-- and we want to take that into account. With the +=, - * we are actually subtracting the amount of budget the unit has - * overconsumed, from the total domain budget. - */ - sdom->budget += svc->budget; - - if ( sdom->budget > 0 ) - { - s_time_t budget; - - /* Get our quota, if there's at least as much budget */ - if ( likely(sdom->budget >= svc->budget_quota) ) - budget = svc->budget_quota; - else - budget = sdom->budget; - - svc->budget = budget; - sdom->budget -= budget; - } - else - { - svc->budget = 0; - park_unit(svc); - } - - spin_unlock(&sdom->budget_lock); - - return svc->budget > 0; -} - -static void -unit_return_budget(struct csched2_unit *svc, struct list_head *parked) -{ - struct csched2_dom *sdom = svc->sdom; - unsigned int cpu = sched_unit_master(svc->unit); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - ASSERT(list_empty(parked)); - - /* budget_lock nests inside runqueue lock. */ - spin_lock(&sdom->budget_lock); - - /* - * The unit is stopping running (e.g., because it's blocking, or it has - * been preempted). If it hasn't consumed all the budget it got when, - * starting to run, put that remaining amount back in the domain's budget - * pool. - */ - sdom->budget += svc->budget; - svc->budget = 0; - - /* - * Making budget available again to the domain means that parked units - * may be unparked and run. They are, if any, in the domain's parked_units - * list, so we want to go through that and unpark them (so they can try - * to get some budget). - * - * Touching the list requires the budget_lock, which we hold. Let's - * therefore put everyone in that list in another, temporary list, which - * then the caller will traverse, unparking the units it finds there. - * - * In fact, we can't do the actual unparking here, because that requires - * taking the runqueue lock of the units being unparked, and we can't - * take any runqueue locks while we hold a budget_lock. - */ - if ( sdom->budget > 0 ) - list_splice_init(&sdom->parked_units, parked); - - spin_unlock(&sdom->budget_lock); -} - -static void -unpark_parked_units(const struct scheduler *ops, struct list_head *units) -{ - struct csched2_unit *svc, *tmp; - spinlock_t *lock; - - list_for_each_entry_safe ( svc, tmp, units, parked_elem ) - { - unsigned long flags; - s_time_t now; - - lock = unit_schedule_lock_irqsave(svc->unit, &flags); - - sched_clear_pause_flags(svc->unit, _VPF_parked); - if ( unlikely(svc->flags & CSFLAG_scheduled) ) - { - /* - * We end here if a budget replenishment arrived between - * csched2_schedule() (and, in particular, after a call to - * unit_grab_budget() that returned false), and - * context_saved(). By setting __CSFLAG_delayed_runq_add, - * we tell context_saved() to put the unit back in the - * runqueue, from where it will compete with the others - * for the newly replenished budget. - */ - ASSERT( svc->rqd != NULL ); - ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd ); - __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); - } - else if ( unit_runnable(svc->unit) ) - { - /* - * The unit should go back to the runqueue, and compete for - * the newly replenished budget, but only if it is actually - * runnable (and was therefore offline only because of the - * lack of budget). 
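For reference, here is a minimal sketch (hypothetical names, not the in-tree interface) of the budget pool bookkeeping that unit_grab_budget() and unit_return_budget() perform above: the overrun carried in svc_budget is charged back to the pool before a fresh quota, capped by what is left, is handed out:

    /*
     * Illustrative only. A non-positive svc_budget is the overrun charged
     * back to the domain pool; the return value is the unit's new budget,
     * with 0 meaning "park me".
     */
    static long long grab_budget(long long *pool, long long svc_budget,
                                 long long quota)
    {
        *pool += svc_budget;            /* charge back any overrun */

        if ( *pool <= 0 )
            return 0;                   /* no budget left in the pool */

        svc_budget = (*pool >= quota) ? quota : *pool;
        *pool -= svc_budget;

        return svc_budget;
    }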
- */ - now = NOW(); - update_load(ops, svc->rqd, svc, 1, now); - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - } - list_del_init(&svc->parked_elem); - - unit_schedule_unlock_irqrestore(lock, flags, svc->unit); - } -} - -static inline void do_replenish(struct csched2_dom *sdom) -{ - sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; - sdom->budget += sdom->tot_budget; -} - -static void replenish_domain_budget(void* data) -{ - struct csched2_dom *sdom = data; - unsigned long flags; - s_time_t now; - LIST_HEAD(parked); - - spin_lock_irqsave(&sdom->budget_lock, flags); - - now = NOW(); - - /* - * Let's do the replenishment. Note, though, that a domain may overrun, - * which means the budget would have gone below 0 (reasons may be system - * overbooking, accounting issues, etc.). It also may happen that we are - * handling the replenishment (much) later than we should (reasons may - * again be overbooking, or issues with timers). - * - * Even in cases of overrun or delay, however, we expect that in 99% of - * cases, doing just one replenishment will be good enough for being able - * to unpark the units that are waiting for some budget. - */ - do_replenish(sdom); - - /* - * And now, the special cases: - * 1) if we are late enough to have skipped (at least) one full period, - * what we must do is doing more replenishments. Note that, however, - * every time we add tot_budget to the budget, we also move next_repl - * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always - * respected. - */ - if ( unlikely(sdom->next_repl <= now) ) - { - do - do_replenish(sdom); - while ( sdom->next_repl <= now ); - } - /* - * 2) if we overrun by more than tot_budget, then budget+tot_budget is - * still < 0, which means that we can't unpark the units. Let's bail, - * and wait for future replenishments. - */ - if ( unlikely(sdom->budget <= 0) ) - { - spin_unlock_irqrestore(&sdom->budget_lock, flags); - goto out; - } - - /* Since we do more replenishments, make sure we didn't overshot. */ - sdom->budget = min(sdom->budget, sdom->tot_budget); - - /* - * As above, let's prepare the temporary list, out of the domain's - * parked_units list, now that we hold the budget_lock. Then, drop such - * lock, and pass the list to the unparking function. - */ - list_splice_init(&sdom->parked_units, &parked); - - spin_unlock_irqrestore(&sdom->budget_lock, flags); - - unpark_parked_units(sdom->dom->cpupool->sched, &parked); - - out: - set_timer(&sdom->repl_timer, sdom->next_repl); -} - -#ifndef NDEBUG -static inline void -csched2_unit_check(struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - struct csched2_dom * const sdom = svc->sdom; - - BUG_ON( svc->unit != unit ); - BUG_ON( sdom != csched2_dom(unit->domain) ); - if ( sdom ) - { - BUG_ON( is_idle_unit(unit) ); - BUG_ON( sdom->dom != unit->domain ); - } - else - { - BUG_ON( !is_idle_unit(unit) ); - } - SCHED_STAT_CRANK(unit_check); -} -#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit)) -#else -#define CSCHED2_UNIT_CHECK(unit) -#endif - -static void * -csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - struct csched2_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct csched2_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->rqd_elem); - INIT_LIST_HEAD(&svc->runq_elem); - - svc->sdom = dd; - svc->unit = unit; - svc->flags = 0U; - - if ( ! 
is_idle_unit(unit) ) - { - ASSERT(svc->sdom != NULL); - svc->credit = CSCHED2_CREDIT_INIT; - svc->weight = svc->sdom->weight; - /* Starting load of 50% */ - svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); - svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; - } - else - { - ASSERT(svc->sdom == NULL); - svc->credit = CSCHED2_IDLE_CREDIT; - svc->weight = 0; - } - svc->tickled_cpu = -1; - - svc->budget = STIME_MAX; - svc->budget_quota = 0; - INIT_LIST_HEAD(&svc->parked_elem); - - SCHED_STAT_CRANK(unit_alloc); - - return svc; -} - -static void -csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - - ASSERT(!is_idle_unit(unit)); - SCHED_STAT_CRANK(unit_sleep); - - if ( curr_on_cpu(sched_unit_master(unit)) == unit ) - { - tickle_cpu(sched_unit_master(unit), svc->rqd); - } - else if ( unit_on_runq(svc) ) - { - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - update_load(ops, svc->rqd, svc, -1, NOW()); - runq_remove(svc); - } - else - __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); -} - -static void -csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - unsigned int cpu = sched_unit_master(unit); - s_time_t now; - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - ASSERT(!is_idle_unit(unit)); - - if ( unlikely(curr_on_cpu(cpu) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - goto out; - } - - if ( unlikely(unit_on_runq(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - goto out; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* If the context hasn't been saved for this unit yet, we can't put it on - * another runqueue. Instead, we set a flag so that it will be put on the runqueue - * after the context has been saved. */ - if ( unlikely(svc->flags & CSFLAG_scheduled) ) - { - __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); - goto out; - } - - /* Add into the new runqueue if necessary */ - if ( svc->rqd == NULL ) - runq_assign(ops, unit); - else - ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd ); - - now = NOW(); - - update_load(ops, svc->rqd, svc, 1, now); - - /* Put the UNIT on the runq */ - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - -out: - return; -} - -static void -csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - - __set_bit(__CSFLAG_unit_yield, &svc->flags); -} - -static void -csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock_irq(unit); - s_time_t now = NOW(); - LIST_HEAD(were_parked); - - BUG_ON( !is_idle_unit(unit) && - svc->rqd != c2rqd(ops, sched_unit_master(unit))); - ASSERT(is_idle_unit(unit) || - svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - /* This unit is now eligible to be put on the runqueue again */ - __clear_bit(__CSFLAG_scheduled, &svc->flags); - - if ( unlikely(has_cap(svc) && svc->budget > 0) ) - unit_return_budget(svc, &were_parked); - - /* If someone wants it on the runqueue, put it there. */ - /* - * NB: We can get rid of CSFLAG_scheduled by checking for - * vc->is_running and unit_on_runq(svc) here. 
However, - * since we're accessing the flags cacheline anyway, - * it seems a bit pointless; especially as we have plenty of - * bits free. - */ - if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) - && likely(unit_runnable(unit)) ) - { - ASSERT(!unit_on_runq(svc)); - - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - } - else if ( !is_idle_unit(unit) ) - update_load(ops, svc->rqd, svc, -1, now); - - unit_schedule_unlock_irq(lock, unit); - - unpark_parked_units(ops, &were_parked); -} - -#define MAX_LOAD (STIME_MAX) -static struct sched_resource * -csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - struct csched2_private *prv = csched2_priv(ops); - int i, min_rqi = -1, min_s_rqi = -1; - unsigned int new_cpu, cpu = sched_unit_master(unit); - struct csched2_unit *svc = csched2_unit(unit); - s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; - bool has_soft; - - ASSERT(!cpumask_empty(&prv->active_queues)); - - SCHED_STAT_CRANK(pick_resource); - - /* Locking: - * - Runqueue lock of vc->processor is already locked - * - Need to grab prv lock to make sure active runqueues don't - * change - * - Need to grab locks for other runqueues while checking - * avgload - * Locking constraint is: - * - Lock prv before runqueue locks - * - Trylock between runqueue locks (no ordering) - * - * Since one of the runqueue locks is already held, we can't - * just grab the prv lock. Instead, we'll have to trylock, and - * do something else reasonable if we fail. - */ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - if ( !read_trylock(&prv->lock) ) - { - /* We may be here because someone requested us to migrate. */ - __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); - new_cpu = get_fallback_cpu(svc); - /* - * Tracing of runq and its load won't be accurate, since we could - * not get the lock, but at least we will output the chosen pcpu. - */ - goto out; - } - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - /* - * First check to see if we're here because someone else suggested a place - * for us to move. - */ - if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) - { - if ( unlikely(svc->migrate_rqd->id < 0) ) - { - printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", - __func__); - } - else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), - &svc->migrate_rqd->active) ) - { - /* - * If we've been asked to move to migrate_rqd, we should just do - * that, which we actually do by returning one cpu from that runq. - * There is no need to take care of soft affinity, as that will - * happen in runq_tickle(). - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &svc->migrate_rqd->active); - new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, - cpumask_scratch_cpu(cpu)); - - svc->migrate_rqd->pick_bias = new_cpu; - goto out_up; - } - /* Fall-through to normal cpu pick */ - } - - /* - * What we want is: - * - if we have soft affinity, the runqueue with the lowest average - * load, among the ones that contain cpus in our soft affinity; this - * represents the best runq on which we would want to run. - * - the runqueue with the lowest average load among the ones that - * contains cpus in our hard affinity; this represent the best runq - * on which we can run. - * - * Find both runqueues in one pass. 
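A condensed, illustrative sketch of that one-pass search follows; rq_load(), rq_has_hard() and rq_has_soft() are hypothetical helpers standing in for the b_avgload reads and the cpumask intersections done by the real code below:

    #include <limits.h>

    /* Hypothetical helpers: per-runqueue load and affinity tests. */
    extern long long rq_load(int rqi);
    extern int rq_has_hard(int rqi);
    extern int rq_has_soft(int rqi);

    static int pick_runqueue(int nr_runqueues, int has_soft)
    {
        long long min_avgload = LLONG_MAX, min_s_avgload = LLONG_MAX;
        int i, min_rqi = -1, min_s_rqi = -1;

        for ( i = 0; i < nr_runqueues; i++ )
        {
            long long load;

            if ( !rq_has_hard(i) )
                continue;                /* the unit cannot run here at all */

            load = rq_load(i);

            if ( load < min_avgload )
            {
                min_avgload = load;      /* best runqueue we *can* use */
                min_rqi = i;
            }
            if ( has_soft && rq_has_soft(i) && load < min_s_avgload )
            {
                min_s_avgload = load;    /* best runqueue we would *like* to use */
                min_s_rqi = i;
            }
        }

        /* Prefer the soft-affinity winner, fall back to the hard-affinity one. */
        return min_s_rqi != -1 ? min_s_rqi : min_rqi;
    }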
- */ - has_soft = has_soft_affinity(unit); - for_each_cpu(i, &prv->active_queues) - { - struct csched2_runqueue_data *rqd; - s_time_t rqd_avgload = MAX_LOAD; - - rqd = prv->rqd + i; - - /* - * If none of the cpus of this runqueue is in svc's hard-affinity, - * skip the runqueue. - * - * Note that, in case svc's hard-affinity has changed, this is the - * first time when we see such change, so it is indeed possible - * that we end up skipping svc's current runqueue. - */ - if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) - continue; - - /* - * If checking a different runqueue, grab the lock, read the avg, - * and then release the lock. - * - * If on our own runqueue, don't grab or release the lock; - * but subtract our own load from the runqueue load to simulate - * impartiality. - */ - if ( rqd == svc->rqd ) - { - rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); - } - else if ( spin_trylock(&rqd->lock) ) - { - rqd_avgload = rqd->b_avgload; - spin_unlock(&rqd->lock); - } - - /* - * if svc has a soft-affinity, and some cpus of rqd are part of it, - * see if we need to update the "soft-affinity minimum". - */ - if ( has_soft && - rqd_avgload < min_s_avgload ) - { - cpumask_t mask; - - cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); - if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) ) - { - min_s_avgload = rqd_avgload; - min_s_rqi = i; - } - } - /* In any case, keep the "hard-affinity minimum" updated too. */ - if ( rqd_avgload < min_avgload ) - { - min_avgload = rqd_avgload; - min_rqi = i; - } - } - - if ( has_soft && min_s_rqi != -1 ) - { - /* - * We have soft affinity, and we have a candidate runq, so go for it. - * - * Note that, to obtain the soft-affinity mask, we "just" put what we - * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is - * ok because: - * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have - * a non-empty intersection (because has_soft is true); - * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask() - * already in cpumask_scratch, we do save a lot doing like this. - * - * It's kind of like open coding affinity_balance_cpumask() but, in - * this specific case, calling that would mean a lot of (unnecessary) - * cpumask operations. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - unit->cpu_soft_affinity); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->rqd[min_s_rqi].active); - } - else if ( min_rqi != -1 ) - { - /* - * Either we don't have soft-affinity, or we do, but we did not find - * any suitable runq. But we did find one when considering hard - * affinity, so go for it. - * - * cpumask_scratch already has unit->cpu_hard_affinity & - * cpupool_domain_master_cpumask() in it, so it's enough that we filter - * with the cpus of the runq. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->rqd[min_rqi].active); - } - else - { - /* - * We didn't find anyone at all (most likely because of spinlock - * contention). 
- */ - new_cpu = get_fallback_cpu(svc); - min_rqi = c2r(new_cpu); - min_avgload = prv->rqd[min_rqi].b_avgload; - goto out_up; - } - - new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias, - cpumask_scratch_cpu(cpu)); - prv->rqd[min_rqi].pick_bias = new_cpu; - BUG_ON(new_cpu >= nr_cpu_ids); - - out_up: - read_unlock(&prv->lock); - out: - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t b_avgload; - unsigned unit:16, dom:16; - unsigned rq_id:16, new_cpu:16; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.rq_id = min_rqi; - d.b_avgload = min_avgload; - d.new_cpu = new_cpu; - __trace_var(TRC_CSCHED2_PICKED_CPU, 1, - sizeof(d), - (unsigned char *)&d); - } - - return get_sched_res(new_cpu); -} - -/* Working state of the load-balancing algorithm */ -typedef struct { - /* NB: Modified by consider() */ - s_time_t load_delta; - struct csched2_unit * best_push_svc, *best_pull_svc; - /* NB: Read by consider() */ - struct csched2_runqueue_data *lrqd; - struct csched2_runqueue_data *orqd; -} balance_state_t; - -static void consider(balance_state_t *st, - struct csched2_unit *push_svc, - struct csched2_unit *pull_svc) -{ - s_time_t l_load, o_load, delta; - - l_load = st->lrqd->b_avgload; - o_load = st->orqd->b_avgload; - if ( push_svc ) - { - /* What happens to the load on both if we push? */ - l_load -= push_svc->avgload; - o_load += push_svc->avgload; - } - if ( pull_svc ) - { - /* What happens to the load on both if we pull? */ - l_load += pull_svc->avgload; - o_load -= pull_svc->avgload; - } - - delta = l_load - o_load; - if ( delta < 0 ) - delta = -delta; - - if ( delta < st->load_delta ) - { - st->load_delta = delta; - st->best_push_svc=push_svc; - st->best_pull_svc=pull_svc; - } -} - - -static void migrate(const struct scheduler *ops, - struct csched2_unit *svc, - struct csched2_runqueue_data *trqd, - s_time_t now) -{ - struct sched_unit *unit = svc->unit; - int cpu = sched_unit_master(unit); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned rqi:16, trqi:16; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.rqi = svc->rqd->id; - d.trqi = trqd->id; - __trace_var(TRC_CSCHED2_MIGRATE, 1, - sizeof(d), - (unsigned char *)&d); - } - - if ( svc->flags & CSFLAG_scheduled ) - { - /* It's running; mark it to migrate. */ - svc->migrate_rqd = trqd; - sched_set_pause_flags(unit, _VPF_migrating); - __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); - SCHED_STAT_CRANK(migrate_requested); - tickle_cpu(cpu, svc->rqd); - } - else - { - int on_runq = 0; - /* It's not running; just move it */ - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(ops, svc->rqd, NULL, -1, now); - on_runq = 1; - } - _runq_deassign(svc); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &trqd->active); - sched_set_res(unit, - get_sched_res(cpumask_cycle(trqd->pick_bias, - cpumask_scratch_cpu(cpu)))); - trqd->pick_bias = sched_unit_master(unit); - ASSERT(sched_unit_master(unit) < nr_cpu_ids); - - _runq_assign(svc, trqd); - if ( on_runq ) - { - update_load(ops, svc->rqd, NULL, 1, now); - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - SCHED_STAT_CRANK(migrate_on_runq); - } - else - SCHED_STAT_CRANK(migrate_no_runq); - } -} - -/* - * It makes sense considering migrating svc to rqd, if: - * - svc is not already flagged to migrate, - * - if svc is allowed to run on at least one of the pcpus of rqd. 
- */ -static bool unit_is_migrateable(struct csched2_unit *svc, - struct csched2_runqueue_data *rqd) -{ - struct sched_unit *unit = svc->unit; - int cpu = sched_unit_master(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - return !(svc->flags & CSFLAG_runq_migrate_request) && - cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); -} - -static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - int i, max_delta_rqi; - struct list_head *push_iter, *pull_iter; - bool inner_load_updated = 0; - - balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; - - /* - * Basic algorithm: Push, pull, or swap. - * - Find the runqueue with the furthest load distance - * - Find a pair that makes the difference the least (where one - * on either side may be empty). - */ - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - st.lrqd = c2rqd(ops, cpu); - - update_runq_load(ops, st.lrqd, 0, now); - -retry: - max_delta_rqi = -1; - if ( !read_trylock(&prv->lock) ) - return; - - st.load_delta = 0; - - for_each_cpu(i, &prv->active_queues) - { - s_time_t delta; - - st.orqd = prv->rqd + i; - - if ( st.orqd == st.lrqd - || !spin_trylock(&st.orqd->lock) ) - continue; - - update_runq_load(ops, st.orqd, 0, now); - - delta = st.lrqd->b_avgload - st.orqd->b_avgload; - if ( delta < 0 ) - delta = -delta; - - if ( delta > st.load_delta ) - { - st.load_delta = delta; - max_delta_rqi = i; - } - - spin_unlock(&st.orqd->lock); - } - - /* Minimize holding the private scheduler lock. */ - read_unlock(&prv->lock); - if ( max_delta_rqi == -1 ) - goto out; - - { - s_time_t load_max; - int cpus_max; - - - load_max = st.lrqd->b_avgload; - if ( st.orqd->b_avgload > load_max ) - load_max = st.orqd->b_avgload; - - cpus_max = st.lrqd->nr_cpus; - i = st.orqd->nr_cpus; - if ( i > cpus_max ) - cpus_max = i; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned lrq_id:16, orq_id:16; - unsigned load_delta; - } d; - d.lrq_id = st.lrqd->id; - d.orq_id = st.orqd->id; - d.load_delta = st.load_delta; - __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* - * If we're under 100% capacaty, only shift if load difference - * is > 1. otherwise, shift if under 12.5% - */ - if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) - { - if ( st.load_delta < (1ULL << (prv->load_precision_shift + - opt_underload_balance_tolerance)) ) - goto out; - } - else - if ( st.load_delta < (1ULL << (prv->load_precision_shift + - opt_overload_balance_tolerance)) ) - goto out; - } - - /* Try to grab the other runqueue lock; if it's been taken in the - * meantime, try the process over again. This can't deadlock - * because if it doesn't get any other rqd locks, it will simply - * give up and return. 
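As an aside, the underload/overload decision made just above boils down to comparing the load delta against a P-scaled threshold; the sketch below uses hypothetical parameter names for the two boot-time tolerances:

    /*
     * Illustrative only. Loads are P-scaled, so 1 << prec_shift represents
     * a load of 1.0; the tolerances are extra shift amounts (e.g. 0 -> 1.0,
     * -3 -> 0.125), and prec_shift + tolerance is assumed non-negative.
     */
    static int worth_balancing(long long load_delta, long long load_max,
                               int cpus_max, int prec_shift,
                               int underload_tol, int overload_tol)
    {
        int tol = (load_max < ((long long)cpus_max << prec_shift))
                  ? underload_tol : overload_tol;

        return load_delta >= (1LL << (prec_shift + tol));
    }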
*/ - st.orqd = prv->rqd + max_delta_rqi; - if ( !spin_trylock(&st.orqd->lock) ) - goto retry; - - /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ - if ( unlikely(st.orqd->id < 0) ) - goto out_up; - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t lb_avgload, ob_avgload; - unsigned lrq_id:16, orq_id:16; - } d; - d.lrq_id = st.lrqd->id; - d.lb_avgload = st.lrqd->b_avgload; - d.orq_id = st.orqd->id; - d.ob_avgload = st.orqd->b_avgload; - __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, - sizeof(d), - (unsigned char *)&d); - } - - SCHED_STAT_CRANK(acct_load_balance); - - /* Look for "swap" which gives the best load average - * FIXME: O(n^2)! */ - - /* Reuse load delta (as we're trying to minimize it) */ - list_for_each( push_iter, &st.lrqd->svc ) - { - struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem); - - update_svc_load(ops, push_svc, 0, now); - - if ( !unit_is_migrateable(push_svc, st.orqd) ) - continue; - - list_for_each( pull_iter, &st.orqd->svc ) - { - struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); - - if ( !inner_load_updated ) - update_svc_load(ops, pull_svc, 0, now); - - if ( !unit_is_migrateable(pull_svc, st.lrqd) ) - continue; - - consider(&st, push_svc, pull_svc); - } - - inner_load_updated = 1; - - /* Consider push only */ - consider(&st, push_svc, NULL); - } - - list_for_each( pull_iter, &st.orqd->svc ) - { - struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); - - if ( !unit_is_migrateable(pull_svc, st.lrqd) ) - continue; - - /* Consider pull only */ - consider(&st, NULL, pull_svc); - } - - /* OK, now we have some candidates; do the moving */ - if ( st.best_push_svc ) - migrate(ops, st.best_push_svc, st.orqd, now); - if ( st.best_pull_svc ) - migrate(ops, st.best_pull_svc, st.lrqd, now); - - out_up: - spin_unlock(&st.orqd->lock); - out: - return; -} - -static void -csched2_unit_migrate( - const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu) -{ - struct domain *d = unit->domain; - struct csched2_unit * const svc = csched2_unit(unit); - struct csched2_runqueue_data *trqd; - s_time_t now = NOW(); - - /* - * Being passed a target pCPU which is outside of our cpupool is only - * valid if we are shutting down (or doing ACPI suspend), and we are - * moving everyone to BSP, no matter whether or not BSP is inside our - * cpupool. - * - * And since there indeed is the chance that it is not part of it, all - * we must do is remove _and_ unassign the unit from any runqueue, as - * well as updating v->processor with the target, so that the suspend - * process can continue. - * - * It will then be during resume that a new, meaningful, value for - * v->processor will be chosen, and during actual domain unpause that - * the unit will be assigned to and added to the proper runqueue. - */ - if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) ) - { - ASSERT(system_state == SYS_STATE_suspend); - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(ops, svc->rqd, NULL, -1, now); - } - _runq_deassign(svc); - sched_set_res(unit, get_sched_res(new_cpu)); - return; - } - - /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */ - ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); - ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity)); - - trqd = c2rqd(ops, new_cpu); - - /* - * Do the actual movement toward new_cpu, and update vc->processor. 
- * If we are changing runqueue, migrate() takes care of everything. - * If we are not changing runqueue, we need to update vc->processor - * here. In fact, if, for instance, we are here because the unit's - * hard affinity changed, we don't want to risk leaving vc->processor - * pointing to a pcpu where we can't run any longer. - */ - if ( trqd != svc->rqd ) - migrate(ops, svc, trqd, now); - else - sched_set_res(unit, get_sched_res(new_cpu)); -} - -static int -csched2_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct csched2_dom * const sdom = csched2_dom(d); - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - struct sched_unit *unit; - int rc = 0; - - /* - * Locking: - * - we must take the private lock for accessing the weights of the - * units of d, and/or the cap; - * - in the putinfo case, we also need the runqueue lock(s), for - * updating the max waight of the runqueue(s). - * If changing the cap, we also need the budget_lock, for updating - * the value of the domain budget pool (and the runqueue lock, - * for adjusting the parameters and rescheduling any unit that is - * running at the time of the change). - */ - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - read_lock_irqsave(&prv->lock, flags); - op->u.credit2.weight = sdom->weight; - op->u.credit2.cap = sdom->cap; - read_unlock_irqrestore(&prv->lock, flags); - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - write_lock_irqsave(&prv->lock, flags); - /* Weight */ - if ( op->u.credit2.weight != 0 ) - { - int old_weight; - - old_weight = sdom->weight; - - sdom->weight = op->u.credit2.weight; - - /* Update weights for units, and max_weight for runqueues on which they reside */ - for_each_sched_unit ( d, unit ) - { - struct csched2_unit *svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock(unit); - - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - svc->weight = sdom->weight; - update_max_weight(svc->rqd, svc->weight, old_weight); - - unit_schedule_unlock(lock, unit); - } - } - /* Cap */ - if ( op->u.credit2.cap != 0 ) - { - struct csched2_unit *svc; - spinlock_t *lock; - - /* Cap is only valid if it's below 100 * nr_of_units */ - if ( op->u.credit2.cap > 100 * sdom->nr_units ) - { - rc = -EINVAL; - write_unlock_irqrestore(&prv->lock, flags); - break; - } - - spin_lock(&sdom->budget_lock); - sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); - sdom->tot_budget /= 100; - spin_unlock(&sdom->budget_lock); - - /* - * When trying to get some budget and run, each unit will grab - * from the pool 1/N (with N = nr of units of the domain) of - * the total budget. Roughly speaking, this means each unit will - * have at least one chance to run during every period. - */ - for_each_sched_unit ( d, unit ) - { - svc = csched2_unit(unit); - lock = unit_schedule_lock(unit); - /* - * Too small quotas would in theory cause a lot of overhead, - * which then won't happen because, in csched2_runtime(), - * CSCHED2_MIN_TIMER is what would be used anyway. - */ - svc->budget_quota = max(sdom->tot_budget / sdom->nr_units, - CSCHED2_MIN_TIMER); - unit_schedule_unlock(lock, unit); - } - - if ( sdom->cap == 0 ) - { - /* - * We give to the domain the budget to which it is entitled, - * and queue its first replenishment event. - * - * Since cap is currently disabled for this domain, we - * know no unit is messing with the domain's budget, and - * the replenishment timer is still off. 
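
Editor's aside (not part of the patch): the cap-to-budget arithmetic above, with numbers. BDGT_REPL_PERIOD and MIN_TIMER below are invented stand-ins for CSCHED2_BDGT_REPL_PERIOD and CSCHED2_MIN_TIMER; the point is that a cap of C% over a replenishment period P yields C/100 * P of budget, split evenly across the domain's units but never below the minimum timer.

#include <stdio.h>
#include <stdint.h>

#define BDGT_REPL_PERIOD  10000000LL   /* 10 ms, in ns; illustrative */
#define MIN_TIMER           500000LL   /* 0.5 ms, in ns; illustrative */

int main(void)
{
    unsigned int cap = 150;            /* "150% of one CPU" for the domain */
    unsigned int nr_units = 4;

    /* Total budget handed to the domain every replenishment period. */
    int64_t tot_budget = BDGT_REPL_PERIOD * cap / 100;

    /* Each unit draws 1/N of it, but never less than the minimum timer. */
    int64_t quota = tot_budget / nr_units;
    if ( quota < MIN_TIMER )
        quota = MIN_TIMER;

    printf("tot_budget = %lld ns, per-unit quota = %lld ns\n",
           (long long)tot_budget, (long long)quota);
    return 0;
}
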
- * For these reasons, it is safe to do the following without - * taking the budget_lock. - */ - sdom->budget = sdom->tot_budget; - sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; - set_timer(&sdom->repl_timer, sdom->next_repl); - - /* - * Now, let's enable budget accounting for all the units. - * For making sure that they will start to honour the domain's - * cap, we set their budget to 0. - * This way, as soon as they will try to run, they will have - * to get some budget. - * - * For the units that are already running, we trigger the - * scheduler on their pCPU. When, as a consequence of this, - * csched2_schedule() will run, it will figure out there is - * no budget, and the unit will try to get some (and be parked, - * if there's none, and we'll switch to someone else). - */ - for_each_sched_unit ( d, unit ) - { - svc = csched2_unit(unit); - lock = unit_schedule_lock(unit); - if ( unit->is_running ) - { - unsigned int cpu = sched_unit_master(unit); - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - - ASSERT(curr_on_cpu(cpu) == unit); - - /* - * We are triggering a reschedule on the unit's - * pCPU. That will run burn_credits() and, since - * the unit is capped now, it would charge all the - * execution time of this last round as budget as - * well. That will make the unit budget go negative, - * potentially by a large amount, and it's unfair. - * - * To avoid that, call burn_credit() here, to do the - * accounting of this current running instance now, - * with budgetting still disabled. This does not - * prevent some small amount of budget being charged - * to the unit (i.e., the amount of time it runs from - * now, to when scheduling happens). The budget will - * also go below 0, but a lot less than how it would - * if we don't do this. - */ - burn_credits(rqd, svc, NOW()); - __cpumask_set_cpu(cpu, &rqd->tickled); - ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - svc->budget = 0; - unit_schedule_unlock(lock, unit); - } - } - - sdom->cap = op->u.credit2.cap; - } - else if ( sdom->cap != 0 ) - { - LIST_HEAD(parked); - - stop_timer(&sdom->repl_timer); - - /* Disable budget accounting for all the units. */ - for_each_sched_unit ( d, unit ) - { - struct csched2_unit *svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock(unit); - - svc->budget = STIME_MAX; - svc->budget_quota = 0; - - unit_schedule_unlock(lock, unit); - } - sdom->cap = 0; - /* - * We are disabling the cap for this domain, which may have - * units waiting for a replenishment, so we unpark them all. - * Note that, since we have already disabled budget accounting - * for all the units of the domain, no currently running unit - * will be added to the parked units list any longer. - */ - spin_lock(&sdom->budget_lock); - list_splice_init(&sdom->parked_units, &parked); - spin_unlock(&sdom->budget_lock); - - unpark_parked_units(ops, &parked); - } - write_unlock_irqrestore(&prv->lock, flags); - break; - default: - rc = -EINVAL; - break; - } - - - return rc; -} - -static void -csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, - const cpumask_t *hard, const cpumask_t *soft) -{ - struct csched2_unit *svc = csched2_unit(unit); - - if ( !hard ) - return; - - /* Are we becoming exclusively pinned? 
*/ - if ( cpumask_weight(hard) == 1 ) - __set_bit(__CSFLAG_pinned, &svc->flags); - else - __clear_bit(__CSFLAG_pinned, &svc->flags); -} - -static int csched2_sys_cntl(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - - switch (sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( params->ratelimit_us && - (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || - params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) - return -EINVAL; - - write_lock_irqsave(&prv->lock, flags); - if ( !prv->ratelimit_us && params->ratelimit_us ) - printk(XENLOG_INFO "Enabling context switch rate limiting\n"); - else if ( prv->ratelimit_us && !params->ratelimit_us ) - printk(XENLOG_INFO "Disabling context switch rate limiting\n"); - prv->ratelimit_us = params->ratelimit_us; - write_unlock_irqrestore(&prv->lock, flags); - - /* FALLTHRU */ - case XEN_SYSCTL_SCHEDOP_getinfo: - params->ratelimit_us = prv->ratelimit_us; - break; - } - - return 0; -} - -static void * -csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - struct csched2_private *prv = csched2_priv(ops); - struct csched2_dom *sdom; - unsigned long flags; - - sdom = xzalloc(struct csched2_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - /* Initialize credit, cap and weight */ - INIT_LIST_HEAD(&sdom->sdom_elem); - sdom->dom = dom; - sdom->weight = CSCHED2_DEFAULT_WEIGHT; - sdom->cap = 0U; - sdom->nr_units = 0; - - init_timer(&sdom->repl_timer, replenish_domain_budget, sdom, - cpumask_any(cpupool_domain_master_cpumask(dom))); - spin_lock_init(&sdom->budget_lock); - INIT_LIST_HEAD(&sdom->parked_units); - - write_lock_irqsave(&prv->lock, flags); - - list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); - - write_unlock_irqrestore(&prv->lock, flags); - - return sdom; -} - -static void -csched2_free_domdata(const struct scheduler *ops, void *data) -{ - struct csched2_dom *sdom = data; - struct csched2_private *prv = csched2_priv(ops); - - if ( sdom ) - { - unsigned long flags; - - kill_timer(&sdom->repl_timer); - - write_lock_irqsave(&prv->lock, flags); - list_del_init(&sdom->sdom_elem); - write_unlock_irqrestore(&prv->lock, flags); - - xfree(sdom); - } -} - -static void -csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - struct csched2_dom * const sdom = svc->sdom; - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - ASSERT(list_empty(&svc->runq_elem)); - - /* csched2_res_pick() expects the pcpu lock to be held */ - lock = unit_schedule_lock_irq(unit); - - sched_set_res(unit, csched2_res_pick(ops, unit)); - - spin_unlock_irq(lock); - - lock = unit_schedule_lock_irq(unit); - - /* Add unit to runqueue of initial processor */ - runq_assign(ops, unit); - - unit_schedule_unlock_irq(lock, unit); - - sdom->nr_units++; - - SCHED_STAT_CRANK(unit_insert); - - CSCHED2_UNIT_CHECK(unit); -} - -static void -csched2_free_udata(const struct scheduler *ops, void *priv) -{ - struct csched2_unit *svc = priv; - - xfree(svc); -} - -static void -csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - ASSERT(list_empty(&svc->runq_elem)); - - SCHED_STAT_CRANK(unit_remove); - - /* Remove from runqueue */ - lock = unit_schedule_lock_irq(unit); - - runq_deassign(ops, unit); - - 
unit_schedule_unlock_irq(lock, unit); - - svc->sdom->nr_units--; -} - -/* How long should we let this unit run for? */ -static s_time_t -csched2_runtime(const struct scheduler *ops, int cpu, - struct csched2_unit *snext, s_time_t now) -{ - s_time_t time, min_time; - int rt_credit; /* Proposed runtime measured in credits */ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct list_head *runq = &rqd->runq; - struct csched2_private *prv = csched2_priv(ops); - - /* - * If we're idle, just stay so. Others (or external events) - * will poke us when necessary. - */ - if ( is_idle_unit(snext->unit) ) - return -1; - - /* General algorithm: - * 1) Run until snext's credit will be 0. - * 2) But if someone is waiting, run until snext's credit is equal - * to his. - * 3) But, if we are capped, never run more than our budget. - * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or - * the ratelimit time. - */ - - /* Calculate mintime */ - min_time = CSCHED2_MIN_TIMER; - if ( prv->ratelimit_us ) - { - s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); - if ( snext->unit->is_running ) - ratelimit_min = snext->unit->state_entry_time + - MICROSECS(prv->ratelimit_us) - now; - if ( ratelimit_min > min_time ) - min_time = ratelimit_min; - } - - /* 1) Run until snext's credit will be 0. */ - rt_credit = snext->credit; - - /* - * 2) If there's someone waiting whose credit is positive, - * run until your credit ~= his. - */ - if ( ! list_empty(runq) ) - { - struct csched2_unit *swait = runq_elem(runq->next); - - if ( ! is_idle_unit(swait->unit) - && swait->credit > 0 ) - { - rt_credit = snext->credit - swait->credit; - } - } - - /* - * The next guy on the runqueue may actually have a higher credit, - * if we've tried to avoid migrating him from a different cpu. - * Setting time=0 will ensure the minimum timeslice is chosen. - * - * FIXME: See if we can eliminate this conversion if we know time - * will be outside (MIN,MAX). Probably requires pre-calculating - * credit values of MIN,MAX per unit, since each unit burns credit - * at a different rate. - */ - if ( rt_credit > 0 ) - time = c2t(rqd, rt_credit, snext); - else - time = 0; - - /* - * 3) But, if capped, never run more than our budget. - */ - if ( has_cap(snext) ) - time = snext->budget < time ? snext->budget : time; - - /* - * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or - * the rate_limit time. - */ - if ( time < min_time ) - { - time = min_time; - SCHED_STAT_CRANK(runtime_min_timer); - } - else if (time > CSCHED2_MAX_TIMER) - { - time = CSCHED2_MAX_TIMER; - SCHED_STAT_CRANK(runtime_max_timer); - } - - return time; -} - -/* - * Find a candidate. - */ -static struct csched2_unit * -runq_candidate(struct csched2_runqueue_data *rqd, - struct csched2_unit *scurr, - int cpu, s_time_t now, - unsigned int *skipped) -{ - struct list_head *iter, *temp; - struct sched_resource *sr = get_sched_res(cpu); - struct csched2_unit *snext = NULL; - struct csched2_private *prv = csched2_priv(sr->scheduler); - bool yield = false, soft_aff_preempt = false; - - *skipped = 0; - - if ( unlikely(is_idle_unit(scurr->unit)) ) - { - snext = scurr; - goto check_runq; - } - - yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags); - - /* - * Return the current unit if it has executed for less than ratelimit. - * Adjuststment for the selected unit's credit and decision - * for how long it will run will be taken in csched2_runtime. - * - * Note that, if scurr is yielding, we don't let rate limiting kick in. 
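
Editor's aside (not part of the patch): the four numbered steps of csched2_runtime() above, reduced to one standalone function. Credits are assumed to have already been converted to time, a negative budget means "not capped", and every constant is invented.

#include <stdio.h>

typedef long long s_time_t;

#define MIN_TIMER    500000LL   /* illustrative */
#define MAX_TIMER  10000000LL   /* illustrative */

static s_time_t runtime(s_time_t my_credit, s_time_t next_waiter_credit,
                        s_time_t budget, s_time_t ratelimit_left)
{
    s_time_t time = my_credit;          /* 1) run until our credit hits 0 */
    s_time_t min_time = MIN_TIMER;

    /* 2) If someone waits with positive credit, run only until we're even. */
    if ( next_waiter_credit > 0 )
        time = my_credit - next_waiter_credit;
    if ( time < 0 )
        time = 0;

    /* 3) If capped, never run past the remaining budget. */
    if ( budget >= 0 && budget < time )
        time = budget;

    /* 4) Clamp between the ratelimit/MIN_TIMER floor and MAX_TIMER. */
    if ( ratelimit_left > min_time )
        min_time = ratelimit_left;
    if ( time < min_time )
        time = min_time;
    else if ( time > MAX_TIMER )
        time = MAX_TIMER;

    return time;
}

int main(void)
{
    printf("%lld\n", runtime(8000000, 3000000, -1, 0));  /* 5000000       */
    printf("%lld\n", runtime(2000000, 3000000, -1, 0));  /* MIN_TIMER     */
    return 0;
}
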
- * In fact, it may be the case that scurr is about to spin, and there's - * no point forcing it to do so until rate limiting expires. - */ - if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) && - (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) ) - { - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned runtime; - } d; - d.dom = scurr->unit->domain->domain_id; - d.unit = scurr->unit->unit_id; - d.runtime = now - scurr->unit->state_entry_time; - __trace_var(TRC_CSCHED2_RATELIMIT, 1, - sizeof(d), - (unsigned char *)&d); - } - return scurr; - } - - /* If scurr has a soft-affinity, let's check whether cpu is part of it */ - if ( has_soft_affinity(scurr->unit) ) - { - affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY, - cpumask_scratch); - if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) - { - cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain); - - /* Ok, is any of the pcpus in scurr soft-affinity idle? */ - cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); - cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); - soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); - } - } - - /* - * If scurr is runnable, and this cpu is in its soft-affinity, default to - * it. We also default to it, even if cpu is not in its soft-affinity, if - * there aren't any idle and not tickled cpu in its soft-affinity. In - * fact, we don't want to risk leaving scurr in the runq and this cpu idle - * only because scurr is running outside of its soft-affinity. - * - * On the other hand, if cpu is not in scurr's soft-affinity, and there - * looks to be better options, go for them. That happens by defaulting to - * idle here, which means scurr will be preempted, put back in runq, and - * one of those idle and not tickled cpus from its soft-affinity will be - * tickled to pick it up. - * - * Finally, if scurr does not have a valid soft-affinity, we also let it - * continue to run here (in fact, soft_aff_preempt will still be false, - * in this case). - * - * Of course, we also default to idle also if scurr is not runnable. - */ - if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt ) - snext = scurr; - else - snext = csched2_unit(sched_idle_unit(cpu)); - - check_runq: - list_for_each_safe( iter, temp, &rqd->runq ) - { - struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* Only consider units that are allowed to run on this processor. */ - if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) ) - { - (*skipped)++; - continue; - } - - /* - * If an unit is meant to be picked up by another processor, and such - * processor has not scheduled yet, leave it in the runqueue for him. - */ - if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && - cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) - { - (*skipped)++; - SCHED_STAT_CRANK(deferred_to_tickled_cpu); - continue; - } - - /* - * If this is on a different processor, don't pull it unless - * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
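
Editor's aside (not part of the patch): the migration-resistance test that follows, in isolation. The constant is invented; the idea is that a unit queued on another CPU is only pulled here if its credit beats the local candidate's by at least that margin, so cache-warm units tend to stay where they are.

#include <stdio.h>
#include <stdbool.h>

#define MIGRATE_RESIST 500          /* invented value, in credits */

static bool worth_pulling(int remote_credit, int local_credit)
{
    return remote_credit >= local_credit + MIGRATE_RESIST;
}

int main(void)
{
    printf("%d\n", worth_pulling(1200, 1000));   /* 0: keep it remote    */
    printf("%d\n", worth_pulling(1600, 1000));   /* 1: gap is big enough */
    return 0;
}
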
- */ - if ( sched_unit_master(svc->unit) != cpu - && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) - { - (*skipped)++; - SCHED_STAT_CRANK(migrate_resisted); - continue; - } - - /* - * If the one in the runqueue has more credit than current (or idle, - * if current is not runnable), or if current is yielding, and also - * if the one in runqueue either is not capped, or is capped but has - * some budget, then choose it. - */ - if ( (yield || svc->credit > snext->credit) && - (!has_cap(svc) || unit_grab_budget(svc)) && - unit_runnable_state(svc->unit) ) - snext = svc; - - /* In any case, if we got this far, break. */ - break; - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned tickled_cpu, skipped; - int credit; - } d; - d.dom = snext->unit->domain->domain_id; - d.unit = snext->unit->unit_id; - d.credit = snext->credit; - d.tickled_cpu = snext->tickled_cpu; - d.skipped = *skipped; - __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, - sizeof(d), - (unsigned char *)&d); - } - - if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) - SCHED_STAT_CRANK(tickled_cpu_overridden); - - /* - * If snext is from a capped domain, it must have budget (or it - * wouldn't have been in the runq). If it is not, it'd be STIME_MAX, - * which still is >= 0. - */ - ASSERT(snext->budget >= 0); - - return snext; -} - -/* - * This function is in the critical path. It is designed to be simple and - * fast for the common case. - */ -static void csched2_schedule( - const struct scheduler *ops, struct sched_unit *currunit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct csched2_runqueue_data *rqd; - struct csched2_unit * const scurr = csched2_unit(currunit); - struct csched2_unit *snext = NULL; - unsigned int skipped_units = 0; - bool tickled; - bool migrated = false; - - SCHED_STAT_CRANK(schedule); - CSCHED2_UNIT_CHECK(currunit); - - BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized)); - - rqd = c2rqd(ops, sched_cpu); - BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active)); - - ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock)); - - BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd); - - /* Clear "tickled" bit now that we've been scheduled */ - tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled); - if ( tickled ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->tickled); - cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); - smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, rq_id:16; - unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; - } d; - d.cpu = cur_cpu; - d.rq_id = c2r(sched_cpu); - d.tasklet = tasklet_work_scheduled; - d.idle = is_idle_unit(currunit); - d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle); - d.tickled = tickled; - __trace_var(TRC_CSCHED2_SCHEDULE, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* Update credits (and budget, if necessary). */ - burn_credits(rqd, scurr, now); - - /* - * Below 0, means that we are capped and we have overrun our budget. - * Let's try to get some more but, if we fail (e.g., because of the - * other running units), we will be parked. - */ - if ( unlikely(scurr->budget <= 0) ) - unit_grab_budget(scurr); - - /* - * Select next runnable local UNIT (ie top of local runq). 
- * - * If the current unit is runnable, and has higher credit than - * the next guy on the queue (or there is noone else), we want to - * run him again. - * - * If there's tasklet work to do, we want to chose the idle unit - * for this processor, and mark the current for delayed runqueue - * add. - * - * If the current unit is runnable, and there's another runnable - * candidate, we want to mark current for delayed runqueue add, - * and remove the next guy from the queue. - * - * If the current unit is not runnable, we want to chose the idle - * unit for this processor. - */ - if ( tasklet_work_scheduled ) - { - __clear_bit(__CSFLAG_unit_yield, &scurr->flags); - trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); - snext = csched2_unit(sched_idle_unit(sched_cpu)); - } - else - snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units); - - /* If switching from a non-idle runnable unit, put it - * back on the runqueue. */ - if ( snext != scurr - && !is_idle_unit(currunit) - && unit_runnable(currunit) ) - __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); - - /* Accounting for non-idle tasks */ - if ( !is_idle_unit(snext->unit) ) - { - /* If switching, remove this from the runqueue and mark it scheduled */ - if ( snext != scurr ) - { - ASSERT(snext->rqd == rqd); - ASSERT(!snext->unit->is_running); - - runq_remove(snext); - __set_bit(__CSFLAG_scheduled, &snext->flags); - } - - /* Clear the idle mask if necessary */ - if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->idle); - smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); - } - - /* - * The reset condition is "has a scheduler epoch come to an end?". - * The way this is enforced is checking whether the unit at the top - * of the runqueue has negative credits. This means the epochs have - * variable length, as in one epoch expores when: - * 1) the unit at the top of the runqueue has executed for - * around 10 ms (with default parameters); - * 2) no other unit with higher credits wants to run. - * - * Here, where we want to check for reset, we need to make sure the - * proper unit is being used. In fact, runqueue_candidate() may have - * not returned the first unit in the runqueue, for various reasons - * (e.g., affinity). Only trigger a reset when it does. - */ - if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) - { - reset_credit(ops, sched_cpu, now, snext); - balance_load(ops, sched_cpu, now); - } - - snext->start_time = now; - snext->tickled_cpu = -1; - - /* Safe because lock for old processor is held */ - if ( sched_unit_master(snext->unit) != sched_cpu ) - { - snext->credit += CSCHED2_MIGRATE_COMPENSATION; - sched_set_res(snext->unit, get_sched_res(sched_cpu)); - SCHED_STAT_CRANK(migrated); - migrated = true; - } - } - else - { - /* - * Update the idle mask if necessary. Note that, if we're scheduling - * idle in order to carry on some tasklet work, we want to play busy! - */ - if ( tasklet_work_scheduled ) - { - if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->idle); - smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); - } - } - else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_set_cpu(sched_cpu, &rqd->idle); - cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); - smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); - } - /* Make sure avgload gets updated periodically even - * if there's no activity */ - update_load(ops, rqd, NULL, 0, now); - } - - /* - * Return task to run next... 
- */ - currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now); - currunit->next_task = snext->unit; - snext->unit->migrated = migrated; - - CSCHED2_UNIT_CHECK(currunit->next_task); -} - -static void -csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc) -{ - printk("[%i.%i] flags=%x cpu=%i", - svc->unit->domain->domain_id, - svc->unit->unit_id, - svc->flags, - sched_unit_master(svc->unit)); - - printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); - - if ( has_cap(svc) ) - printk(" budget=%"PRI_stime"(%"PRI_stime")", - svc->budget, svc->budget_quota); - - printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, - (svc->avgload * 100) >> prv->load_precision_shift); - - printk("\n"); -} - -static inline void -dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct csched2_private *prv = csched2_priv(ops); - struct csched2_unit *svc; - - printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n", - cpu, c2r(cpu), - CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - - /* current UNIT (nothing to say if that's the idle unit) */ - svc = csched2_unit(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - printk("\trun: "); - csched2_dump_unit(prv, svc); - } -} - -static void -csched2_dump(const struct scheduler *ops) -{ - struct list_head *iter_sdom; - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - unsigned int i, j, loop; - - /* - * We need the private scheduler lock as we access global - * scheduler data and (below) the list of active domains. - */ - read_lock_irqsave(&prv->lock, flags); - - printk("Active queues: %d\n" - "\tdefault-weight = %d\n", - cpumask_weight(&prv->active_queues), - CSCHED2_DEFAULT_WEIGHT); - for_each_cpu(i, &prv->active_queues) - { - s_time_t fraction; - - fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift; - - printk("Runqueue %d:\n" - "\tncpus = %u\n" - "\tcpus = %*pbl\n" - "\tmax_weight = %u\n" - "\tpick_bias = %u\n" - "\tinstload = %d\n" - "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", - i, - prv->rqd[i].nr_cpus, - CPUMASK_PR(&prv->rqd[i].active), - prv->rqd[i].max_weight, - prv->rqd[i].pick_bias, - prv->rqd[i].load, - prv->rqd[i].avgload, - fraction); - - printk("\tidlers: %*pb\n" - "\ttickled: %*pb\n" - "\tfully idle cores: %*pb\n", - CPUMASK_PR(&prv->rqd[i].idle), - CPUMASK_PR(&prv->rqd[i].tickled), - CPUMASK_PR(&prv->rqd[i].smt_idle)); - } - - printk("Domain info:\n"); - loop = 0; - list_for_each( iter_sdom, &prv->sdom ) - { - struct csched2_dom *sdom; - struct sched_unit *unit; - - sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); - - printk("\tDomain: %d w %d c %u v %d\n", - sdom->dom->domain_id, - sdom->weight, - sdom->cap, - sdom->nr_units); - - for_each_sched_unit ( sdom->dom, unit ) - { - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock; - - lock = unit_schedule_lock(unit); - - printk("\t%3d: ", ++loop); - csched2_dump_unit(prv, svc); - - unit_schedule_unlock(lock, unit); - } - } - - for_each_cpu(i, &prv->active_queues) - { - struct csched2_runqueue_data *rqd = prv->rqd + i; - struct list_head *iter, *runq = &rqd->runq; - int loop = 0; - - /* We need the lock to scan the runqueue. 
*/ - spin_lock(&rqd->lock); - - printk("Runqueue %d:\n", i); - - for_each_cpu(j, &rqd->active) - dump_pcpu(ops, j); - - printk("RUNQ:\n"); - list_for_each( iter, runq ) - { - struct csched2_unit *svc = runq_elem(iter); - - if ( svc ) - { - printk("\t%3d: ", loop++); - csched2_dump_unit(prv, svc); - } - } - spin_unlock(&rqd->lock); - } - - read_unlock_irqrestore(&prv->lock, flags); -} - -static void * -csched2_alloc_pdata(const struct scheduler *ops, int cpu) -{ - struct csched2_pcpu *spc; - - spc = xzalloc(struct csched2_pcpu); - if ( spc == NULL ) - return ERR_PTR(-ENOMEM); - - /* Not in any runqueue yet */ - spc->runq_id = -1; - - return spc; -} - -/* Returns the ID of the runqueue the cpu is assigned to. */ -static unsigned -init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc, - unsigned int cpu) -{ - struct csched2_runqueue_data *rqd; - unsigned int rcpu; - - ASSERT(rw_is_write_locked(&prv->lock)); - ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); - /* CPU data needs to be allocated, but still uninitialized. */ - ASSERT(spc && spc->runq_id == -1); - - /* Figure out which runqueue to put it in */ - spc->runq_id = cpu_to_runqueue(prv, cpu); - - rqd = prv->rqd + spc->runq_id; - - printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id); - if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) ) - { - printk(XENLOG_INFO " First cpu on runqueue, activating\n"); - activate_runqueue(prv, spc->runq_id); - } - - __cpumask_set_cpu(cpu, &spc->sibling_mask); - - if ( rqd->nr_cpus > 0 ) - for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) ) - if ( cpumask_test_cpu(rcpu, &rqd->active) ) - { - __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); - __cpumask_set_cpu(rcpu, &spc->sibling_mask); - } - - __cpumask_set_cpu(cpu, &rqd->idle); - __cpumask_set_cpu(cpu, &rqd->active); - __cpumask_set_cpu(cpu, &prv->initialized); - __cpumask_set_cpu(cpu, &rqd->smt_idle); - - rqd->nr_cpus++; - ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); - - if ( rqd->nr_cpus == 1 ) - rqd->pick_bias = cpu; - - return spc->runq_id; -} - -static void -csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct csched2_private *prv = csched2_priv(ops); - spinlock_t *old_lock; - unsigned long flags; - unsigned rqi; - - write_lock_irqsave(&prv->lock, flags); - old_lock = pcpu_schedule_lock(cpu); - - rqi = init_pdata(prv, pdata, cpu); - /* Move the scheduler lock to the new runq lock. */ - get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ - spin_unlock(old_lock); - write_unlock_irqrestore(&prv->lock, flags); -} - -/* Change the scheduler of cpu to us (Credit2). */ -static spinlock_t * -csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct csched2_private *prv = csched2_priv(new_ops); - struct csched2_unit *svc = vdata; - unsigned rqi; - - ASSERT(pdata && svc && is_idle_unit(svc->unit)); - - /* - * We own one runqueue lock already (from schedule_cpu_switch()). This - * looks like it violates this scheduler's locking rules, but it does - * not, as what we own is the lock of another scheduler, that hence has - * no particular (ordering) relationship with our private global lock. - * And owning exactly that one (the lock of the old scheduler of this - * cpu) is what is necessary to prevent races. 
- */ - ASSERT(!local_irq_is_enabled()); - write_lock(&prv->lock); - - sched_idle_unit(cpu)->priv = vdata; - - rqi = init_pdata(prv, pdata, cpu); - - /* - * Now that we know what runqueue we'll go in, double check what's said - * above: the lock we already hold is not the one of this runqueue of - * this scheduler, and so it's safe to have taken it /before/ our - * private global lock. - */ - ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock); - - write_unlock(&prv->lock); - - return &prv->rqd[rqi].lock; -} - -static void -csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - unsigned long flags; - struct csched2_private *prv = csched2_priv(ops); - struct csched2_runqueue_data *rqd; - struct csched2_pcpu *spc = pcpu; - unsigned int rcpu; - - write_lock_irqsave(&prv->lock, flags); - - /* - * alloc_pdata is not implemented, so pcpu must be NULL. On the other - * hand, init_pdata must have been called for this pCPU. - */ - /* - * Scheduler specific data for this pCPU must still be there and and be - * valid. In fact, if we are here: - * 1. alloc_pdata must have been called for this cpu, and free_pdata - * must not have been called on it before us, - * 2. init_pdata must have been called on this cpu, and deinit_pdata - * (us!) must not have been called on it already. - */ - ASSERT(spc && spc->runq_id != -1); - ASSERT(cpumask_test_cpu(cpu, &prv->initialized)); - - /* Find the old runqueue and remove this cpu from it */ - rqd = prv->rqd + spc->runq_id; - - /* No need to save IRQs here, they're already disabled */ - spin_lock(&rqd->lock); - - printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id); - - __cpumask_clear_cpu(cpu, &rqd->idle); - __cpumask_clear_cpu(cpu, &rqd->smt_idle); - __cpumask_clear_cpu(cpu, &rqd->active); - - for_each_cpu ( rcpu, &rqd->active ) - __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); - - rqd->nr_cpus--; - ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); - - if ( rqd->nr_cpus == 0 ) - { - printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); - deactivate_runqueue(prv, spc->runq_id); - } - else if ( rqd->pick_bias == cpu ) - rqd->pick_bias = cpumask_first(&rqd->active); - - spc->runq_id = -1; - - spin_unlock(&rqd->lock); - - __cpumask_clear_cpu(cpu, &prv->initialized); - - write_unlock_irqrestore(&prv->lock, flags); - - return; -} - -static void -csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched2_pcpu *spc = pcpu; - - /* - * pcpu either points to a valid struct csched2_pcpu, or is NULL (if - * CPU bringup failed, and we're beeing called from CPU_UP_CANCELLED). - * xfree() does not really mind, but we want to be sure that either - * init_pdata has never been called, or deinit_pdata has been called - * already. 
- */ - ASSERT(!pcpu || spc->runq_id == -1); - ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized)); - - xfree(pcpu); -} - -static int __init -csched2_global_init(void) -{ - if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) - { - printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n", - __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); - opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; - } - - if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) - { - printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n", - __func__, opt_load_window_shift); - opt_load_window_shift = LOADAVG_WINDOW_SHIFT; - } - - if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) - { - printk("WARNING: %s: opt_cap_period %u too small, resetting\n", - __func__, opt_cap_period); - opt_cap_period = 10; /* ms */ - } - - return 0; -} - -static int -csched2_init(struct scheduler *ops) -{ - int i; - struct csched2_private *prv; - - printk("Initializing Credit2 scheduler\n"); - - printk(XENLOG_INFO " load_precision_shift: %d\n" - XENLOG_INFO " load_window_shift: %d\n" - XENLOG_INFO " underload_balance_tolerance: %d\n" - XENLOG_INFO " overload_balance_tolerance: %d\n" - XENLOG_INFO " runqueues arrangement: %s\n" - XENLOG_INFO " cap enforcement granularity: %dms\n", - opt_load_precision_shift, - opt_load_window_shift, - opt_underload_balance_tolerance, - opt_overload_balance_tolerance, - opt_runqueue_str[opt_runqueue], - opt_cap_period); - - printk(XENLOG_INFO "load tracking window length %llu ns\n", - 1ULL << opt_load_window_shift); - - /* - * Basically no CPU information is available at this point; just - * set up basic structures, and a callback when the CPU info is - * available. - */ - - prv = xzalloc(struct csched2_private); - if ( prv == NULL ) - return -ENOMEM; - ops->sched_data = prv; - - rwlock_init(&prv->lock); - INIT_LIST_HEAD(&prv->sdom); - - /* Allocate all runqueues and mark them as un-initialized */ - prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids); - if ( !prv->rqd ) - { - xfree(prv); - return -ENOMEM; - } - for ( i = 0; i < nr_cpu_ids; i++ ) - prv->rqd[i].id = -1; - - /* initialize ratelimit */ - prv->ratelimit_us = sched_ratelimit_us; - - prv->load_precision_shift = opt_load_precision_shift; - prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT; - ASSERT(opt_load_window_shift > 0); - - return 0; -} - -static void -csched2_deinit(struct scheduler *ops) -{ - struct csched2_private *prv; - - prv = csched2_priv(ops); - ops->sched_data = NULL; - if ( prv ) - xfree(prv->rqd); - xfree(prv); -} - -static const struct scheduler sched_credit2_def = { - .name = "SMP Credit Scheduler rev2", - .opt_name = "credit2", - .sched_id = XEN_SCHEDULER_CREDIT2, - .sched_data = NULL, - - .global_init = csched2_global_init, - - .insert_unit = csched2_unit_insert, - .remove_unit = csched2_unit_remove, - - .sleep = csched2_unit_sleep, - .wake = csched2_unit_wake, - .yield = csched2_unit_yield, - - .adjust = csched2_dom_cntl, - .adjust_affinity= csched2_aff_cntl, - .adjust_global = csched2_sys_cntl, - - .pick_resource = csched2_res_pick, - .migrate = csched2_unit_migrate, - .do_schedule = csched2_schedule, - .context_saved = csched2_context_saved, - - .dump_settings = csched2_dump, - .init = csched2_init, - .deinit = csched2_deinit, - .alloc_udata = csched2_alloc_udata, - .free_udata = csched2_free_udata, - .alloc_pdata = csched2_alloc_pdata, - .init_pdata = csched2_init_pdata, - .deinit_pdata = 
csched2_deinit_pdata, - .free_pdata = csched2_free_pdata, - .switch_sched = csched2_switch_sched, - .alloc_domdata = csched2_alloc_domdata, - .free_domdata = csched2_free_domdata, -}; - -REGISTER_SCHEDULER(sched_credit2_def); diff --git a/xen/common/sched_null.c b/xen/common/sched_null.c deleted file mode 100644 index 3f3418c9b1..0000000000 --- a/xen/common/sched_null.c +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * xen/common/sched_null.c - * - * Copyright (c) 2017, Dario Faggioli, Citrix Ltd - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; If not, see . - */ - -/* - * The 'null' scheduler always choose to run, on each pCPU, either nothing - * (i.e., the pCPU stays idle) or always the same unit. - * - * It is aimed at supporting static scenarios, where there always are - * less units than pCPUs (and the units don't need to move among pCPUs - * for any reason) with the least possible overhead. - * - * Typical usecase are embedded applications, but also HPC, especially - * if the scheduler is used inside a cpupool. - */ - -#include -#include -#include -#include - -/* - * null tracing events. Check include/public/trace.h for more details. - */ -#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1) -#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2) -#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3) -#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4) -#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5) -#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6) - -/* - * Locking: - * - Scheduler-lock (a.k.a. runqueue lock): - * + is per-pCPU; - * + serializes assignment and deassignment of units to a pCPU. - * - Private data lock (a.k.a. private scheduler lock): - * + is scheduler-wide; - * + serializes accesses to the list of domains in this scheduler. - * - Waitqueue lock: - * + is scheduler-wide; - * + serialize accesses to the list of units waiting to be assigned - * to pCPUs. - * - * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH, - * waitqueue lock nests inside runqueue lock which nests inside private - * lock. More specifically: - * + if we need both runqueue and private locks, we must acquire the - * private lock for first; - * + if we need both runqueue and waitqueue locks, we must acquire - * the runqueue lock for first; - * + if we need both private and waitqueue locks, we must acquire - * the private lock for first; - * + if we already own a runqueue lock, we must never acquire - * the private lock; - * + if we already own the waitqueue lock, we must never acquire - * the runqueue lock or the private lock. 
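
Editor's aside (not part of the patch): the lock-ordering rules above, restated with plain pthread mutexes, since the pattern is generic. Any path that needs more than one of the three locks takes them in the fixed order private -> runqueue -> waitqueue, so no two paths can ever hold them in opposite orders and deadlock is impossible by construction.

#include <pthread.h>

static pthread_mutex_t private_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t runqueue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t waitq_lock    = PTHREAD_MUTEX_INITIALIZER;

/* Needs the domain list and a runqueue: private first, then runqueue. */
static void path_a(void)
{
    pthread_mutex_lock(&private_lock);
    pthread_mutex_lock(&runqueue_lock);
    /* ... work ... */
    pthread_mutex_unlock(&runqueue_lock);
    pthread_mutex_unlock(&private_lock);
}

/* Needs a runqueue and the waitqueue: runqueue first, then waitqueue. */
static void path_b(void)
{
    pthread_mutex_lock(&runqueue_lock);
    pthread_mutex_lock(&waitq_lock);
    /* ... work ... */
    pthread_mutex_unlock(&waitq_lock);
    pthread_mutex_unlock(&runqueue_lock);
}

int main(void)
{
    path_a();
    path_b();
    return 0;
}
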
- */ - -/* - * System-wide private data - */ -struct null_private { - spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */ - struct list_head ndom; /* Domains of this scheduler */ - struct list_head waitq; /* units not assigned to any pCPU */ - spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */ - cpumask_t cpus_free; /* CPUs without a unit associated to them */ -}; - -/* - * Physical CPU - */ -struct null_pcpu { - struct sched_unit *unit; -}; -DEFINE_PER_CPU(struct null_pcpu, npc); - -/* - * Schedule unit - */ -struct null_unit { - struct list_head waitq_elem; - struct sched_unit *unit; -}; - -/* - * Domain - */ -struct null_dom { - struct list_head ndom_elem; - struct domain *dom; -}; - -/* - * Accessor helpers functions - */ -static inline struct null_private *null_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct null_unit *null_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline bool unit_check_affinity(struct sched_unit *unit, - unsigned int cpu, - unsigned int balance_step) -{ - affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - cpupool_domain_master_cpumask(unit->domain)); - - return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu)); -} - -static int null_init(struct scheduler *ops) -{ - struct null_private *prv; - - printk("Initializing null scheduler\n" - "WARNING: This is experimental software in development.\n" - "Use at your own risk.\n"); - - prv = xzalloc(struct null_private); - if ( prv == NULL ) - return -ENOMEM; - - spin_lock_init(&prv->lock); - spin_lock_init(&prv->waitq_lock); - INIT_LIST_HEAD(&prv->ndom); - INIT_LIST_HEAD(&prv->waitq); - - ops->sched_data = prv; - - return 0; -} - -static void null_deinit(struct scheduler *ops) -{ - xfree(ops->sched_data); - ops->sched_data = NULL; -} - -static void init_pdata(struct null_private *prv, unsigned int cpu) -{ - /* Mark the pCPU as free, and with no unit assigned */ - cpumask_set_cpu(cpu, &prv->cpus_free); - per_cpu(npc, cpu).unit = NULL; -} - -static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct null_private *prv = null_priv(ops); - - /* alloc_pdata is not implemented, so we want this to be NULL. 
*/ - ASSERT(!pdata); - - init_pdata(prv, cpu); -} - -static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct null_private *prv = null_priv(ops); - - /* alloc_pdata not implemented, so this must have stayed NULL */ - ASSERT(!pcpu); - - cpumask_clear_cpu(cpu, &prv->cpus_free); - per_cpu(npc, cpu).unit = NULL; -} - -static void *null_alloc_udata(const struct scheduler *ops, - struct sched_unit *unit, void *dd) -{ - struct null_unit *nvc; - - nvc = xzalloc(struct null_unit); - if ( nvc == NULL ) - return NULL; - - INIT_LIST_HEAD(&nvc->waitq_elem); - nvc->unit = unit; - - SCHED_STAT_CRANK(unit_alloc); - - return nvc; -} - -static void null_free_udata(const struct scheduler *ops, void *priv) -{ - struct null_unit *nvc = priv; - - xfree(nvc); -} - -static void * null_alloc_domdata(const struct scheduler *ops, - struct domain *d) -{ - struct null_private *prv = null_priv(ops); - struct null_dom *ndom; - unsigned long flags; - - ndom = xzalloc(struct null_dom); - if ( ndom == NULL ) - return ERR_PTR(-ENOMEM); - - ndom->dom = d; - - spin_lock_irqsave(&prv->lock, flags); - list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom); - spin_unlock_irqrestore(&prv->lock, flags); - - return ndom; -} - -static void null_free_domdata(const struct scheduler *ops, void *data) -{ - struct null_dom *ndom = data; - struct null_private *prv = null_priv(ops); - - if ( ndom ) - { - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - list_del_init(&ndom->ndom_elem); - spin_unlock_irqrestore(&prv->lock, flags); - - xfree(ndom); - } -} - -/* - * unit to pCPU assignment and placement. This _only_ happens: - * - on insert, - * - on migrate. - * - * Insert occurs when a unit joins this scheduler for the first time - * (e.g., when the domain it's part of is moved to the scheduler's - * cpupool). - * - * Migration may be necessary if a pCPU (with a unit assigned to it) - * is removed from the scheduler's cpupool. - * - * So this is not part of any hot path. - */ -static struct sched_resource * -pick_res(struct null_private *prv, const struct sched_unit *unit) -{ - unsigned int bs; - unsigned int cpu = sched_unit_master(unit), new_cpu; - cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - for_each_affinity_balance_step( bs ) - { - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus); - - /* - * If our processor is free, or we are assigned to it, and it is also - * still valid and part of our affinity, just go for it. - * (Note that we may call unit_check_affinity(), but we deliberately - * don't, so we get to keep in the scratch cpumask what we have just - * put in it.) - */ - if ( likely((per_cpu(npc, cpu).unit == NULL || - per_cpu(npc, cpu).unit == unit) - && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) - { - new_cpu = cpu; - goto out; - } - - /* If not, just go for a free pCPU, within our affinity, if any */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->cpus_free); - new_cpu = cpumask_first(cpumask_scratch_cpu(cpu)); - - if ( likely(new_cpu != nr_cpu_ids) ) - goto out; - } - - /* - * If we didn't find any free pCPU, just pick any valid pcpu, even if - * it has another unit assigned. 
This will happen during shutdown and - * suspend/resume, but it may also happen during "normal operation", if - * all the pCPUs are busy. - * - * In fact, there must always be something sane in v->processor, or - * unit_schedule_lock() and friends won't work. This is not a problem, - * as we will actually assign the unit to the pCPU we return from here, - * only if the pCPU is free. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity); - new_cpu = cpumask_any(cpumask_scratch_cpu(cpu)); - - out: - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t new_cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.new_cpu = new_cpu; - __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d); - } - - return get_sched_res(new_cpu); -} - -static void unit_assign(struct null_private *prv, struct sched_unit *unit, - unsigned int cpu) -{ - ASSERT(is_unit_online(unit)); - - per_cpu(npc, cpu).unit = unit; - sched_set_res(unit, get_sched_res(cpu)); - cpumask_clear_cpu(cpu, &prv->cpus_free); - - dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = cpu; - __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d); - } -} - -/* Returns true if a cpu was tickled */ -static bool unit_deassign(struct null_private *prv, struct sched_unit *unit) -{ - unsigned int bs; - unsigned int cpu = sched_unit_master(unit); - struct null_unit *wvc; - - ASSERT(list_empty(&null_unit(unit)->waitq_elem)); - ASSERT(per_cpu(npc, cpu).unit == unit); - ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free)); - - per_cpu(npc, cpu).unit = NULL; - cpumask_set_cpu(cpu, &prv->cpus_free); - - dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain, - unit->unit_id); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = cpu; - __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d); - } - - spin_lock(&prv->waitq_lock); - - /* - * If unit is assigned to a pCPU, let's see if there is someone waiting, - * suitable to be assigned to it (prioritizing units that have - * soft-affinity with cpu). - */ - for_each_affinity_balance_step( bs ) - { - list_for_each_entry( wvc, &prv->waitq, waitq_elem ) - { - if ( bs == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(wvc->unit) ) - continue; - - if ( unit_check_affinity(wvc->unit, cpu, bs) ) - { - list_del_init(&wvc->waitq_elem); - unit_assign(prv, wvc->unit, cpu); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - spin_unlock(&prv->waitq_lock); - return true; - } - } - } - spin_unlock(&prv->waitq_lock); - - return false; -} - -/* Change the scheduler of cpu to us (null). */ -static spinlock_t *null_switch_sched(struct scheduler *new_ops, - unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct null_private *prv = null_priv(new_ops); - struct null_unit *nvc = vdata; - - ASSERT(nvc && is_idle_unit(nvc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). It actually may or may not be the 'right' - * one for this cpu, but that is ok for preventing races. 
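
Editor's aside (not part of the patch): the three-step fallback of pick_res() above (stay put if possible, else a free CPU within our affinity, else any CPU within our affinity, even if busy), with cpumasks reduced to plain bitmasks. All names are invented and GCC-style builtins are assumed.

#include <stdio.h>
#include <stdint.h>

/* Returns the chosen CPU index; affinity is assumed to be non-empty. */
static int pick_cpu(int cur_cpu, uint64_t affinity, uint64_t free_cpus,
                    int cur_is_ours_or_free)
{
    uint64_t candidates;

    /* 1) Stay where we are, if that CPU is usable and in our affinity. */
    if ( cur_is_ours_or_free && (affinity & (1ULL << cur_cpu)) )
        return cur_cpu;

    /* 2) Otherwise, any free CPU within our affinity. */
    candidates = affinity & free_cpus;
    if ( candidates )
        return __builtin_ctzll(candidates);

    /* 3) Last resort: any CPU in our affinity, even if it has a unit. */
    return __builtin_ctzll(affinity);
}

int main(void)
{
    /* Affinity {1,2,3}, CPUs 2 and 3 free, currently on busy CPU 1. */
    printf("picked CPU %d\n", pick_cpu(1, 0x0e, 0x0c, 0));   /* -> 2 */
    return 0;
}
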
- */ - ASSERT(!local_irq_is_enabled()); - - init_pdata(prv, cpu); - - return &sr->_lock; -} - -static void null_unit_insert(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - unsigned int cpu; - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - - lock = unit_schedule_lock_irq(unit); - - if ( unlikely(!is_unit_online(unit)) ) - { - unit_schedule_unlock_irq(lock, unit); - return; - } - - retry: - sched_set_res(unit, pick_res(prv, unit)); - cpu = sched_unit_master(unit); - - spin_unlock(lock); - - lock = unit_schedule_lock(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - /* If the pCPU is free, we assign unit to it */ - if ( likely(per_cpu(npc, cpu).unit == NULL) ) - { - /* - * Insert is followed by vcpu_wake(), so there's no need to poke - * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that. - */ - unit_assign(prv, unit, cpu); - } - else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) ) - { - /* - * If the pCPU is not free (e.g., because we raced with another - * insert or a migrate), but there are other free pCPUs, we can - * try to pick again. - */ - goto retry; - } - else - { - /* - * If the pCPU is not free, and there aren't any (valid) others, - * we have no alternatives than to go into the waitqueue. - */ - spin_lock(&prv->waitq_lock); - list_add_tail(&nvc->waitq_elem, &prv->waitq); - dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n", - unit->domain, unit->unit_id); - spin_unlock(&prv->waitq_lock); - } - spin_unlock_irq(lock); - - SCHED_STAT_CRANK(unit_insert); -} - -static void null_unit_remove(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - - lock = unit_schedule_lock_irq(unit); - - /* If offline, the unit shouldn't be assigned, nor in the waitqueue */ - if ( unlikely(!is_unit_online(unit)) ) - { - ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit); - ASSERT(list_empty(&nvc->waitq_elem)); - goto out; - } - - /* If unit is in waitqueue, just get it out of there and bail */ - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - - goto out; - } - - unit_deassign(prv, unit); - - out: - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_remove); -} - -static void null_unit_wake(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - unsigned int cpu = sched_unit_master(unit); - - ASSERT(!is_idle_unit(unit)); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - /* Not exactly "on runq", but close enough for reusing the counter */ - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - if ( likely(per_cpu(npc, cpu).unit == unit) ) - { - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - return; - } - - /* - * If a unit is neither on a pCPU nor in the waitqueue, it means it was - * offline, and that it is now coming back being online. 
If we're lucky, - * and its previous resource is free (and affinities match), we can just - * assign the unit to it (we own the proper lock already) and be done. - */ - if ( per_cpu(npc, cpu).unit == NULL && - unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) ) - { - if ( !has_soft_affinity(unit) || - unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) ) - { - unit_assign(prv, unit, cpu); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - return; - } - } - - /* - * If the resource is not free (or affinities do not match) we need - * to assign unit to some other one, but we can't do it here, as: - * - we don't own the proper lock, - * - we can't change v->processor under vcpu_wake()'s feet. - * So we add it to the waitqueue, and tickle all the free CPUs (if any) - * on which unit can run. The first one that schedules will pick it up. - */ - spin_lock(&prv->waitq_lock); - list_add_tail(&nvc->waitq_elem, &prv->waitq); - spin_unlock(&prv->waitq_lock); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->cpus_free); - - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", - unit->domain->domain_id, unit->unit_id); - else - cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ); -} - -static void null_unit_sleep(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - unsigned int cpu = sched_unit_master(unit); - bool tickled = false; - - ASSERT(!is_idle_unit(unit)); - - /* - * Check if the unit is in the process of being offlined. If yes, - * we need to remove it from either its pCPU or the waitqueue. - */ - if ( unlikely(!is_unit_online(unit)) ) - { - struct null_unit *nvc = null_unit(unit); - - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - } - else if ( per_cpu(npc, cpu).unit == unit ) - tickled = unit_deassign(prv, unit); - } - - /* If unit is not assigned to a pCPU, or is not running, no need to bother */ - if ( likely(!tickled && curr_on_cpu(cpu) == unit) ) - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - - SCHED_STAT_CRANK(unit_sleep); -} - -static struct sched_resource * -null_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - ASSERT(!is_idle_unit(unit)); - return pick_res(null_priv(ops), unit); -} - -static void null_unit_migrate(const struct scheduler *ops, - struct sched_unit *unit, unsigned int new_cpu) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - - ASSERT(!is_idle_unit(unit)); - - if ( sched_unit_master(unit) == new_cpu ) - return; - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint16_t cpu, new_cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = sched_unit_master(unit); - d.new_cpu = new_cpu; - __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d); - } - - /* - * If unit is assigned to a pCPU, then such pCPU becomes free, and we - * should look in the waitqueue if anyone else can be assigned to it. 
- */ - if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) ) - { - unit_deassign(prv, unit); - SCHED_STAT_CRANK(migrate_running); - } - else if ( !list_empty(&nvc->waitq_elem) ) - SCHED_STAT_CRANK(migrate_on_runq); - - SCHED_STAT_CRANK(migrated); - - /* - * If a unit is (going) offline, we want it to be neither assigned - * to a pCPU, nor in the waitqueue. - * - * If it was on a cpu, we've removed it from there above. If it is - * in the waitqueue, we remove it from there now. And then we bail. - */ - if ( unlikely(!is_unit_online(unit)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - goto out; - } - - /* - * Let's now consider new_cpu, which is where unit is being sent. It can be - * either free, or have a unit already assigned to it. - * - * In the former case we should assign unit to it, and try to get it to run, - * if possible, according to affinity. - * - * In latter, all we can do is to park unit in the waitqueue. - */ - if ( per_cpu(npc, new_cpu).unit == NULL && - unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) ) - { - /* unit might have been in the waitqueue, so remove it */ - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - - unit_assign(prv, unit, new_cpu); - } - else - { - /* Put unit in the waitqueue, if it wasn't there already */ - spin_lock(&prv->waitq_lock); - if ( list_empty(&nvc->waitq_elem) ) - { - list_add_tail(&nvc->waitq_elem, &prv->waitq); - dprintk(XENLOG_G_WARNING, - "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain, - unit->unit_id); - } - spin_unlock(&prv->waitq_lock); - } - - /* - * Whatever all the above, we always at least override v->processor. - * This is especially important for shutdown or suspend/resume paths, - * when it is important to let our caller (cpu_disable_scheduler()) - * know that the migration did happen, to the best of our possibilities, - * at least. In case of suspend, any temporary inconsistency caused - * by this, will be fixed-up during resume. - */ - out: - sched_set_res(unit, get_sched_res(new_cpu)); -} - -#ifndef NDEBUG -static inline void null_unit_check(struct sched_unit *unit) -{ - struct null_unit * const nvc = null_unit(unit); - struct null_dom * const ndom = unit->domain->sched_priv; - - BUG_ON(nvc->unit != unit); - - if ( ndom ) - BUG_ON(is_idle_unit(unit)); - else - BUG_ON(!is_idle_unit(unit)); - - SCHED_STAT_CRANK(unit_check); -} -#define NULL_UNIT_CHECK(unit) (null_unit_check(unit)) -#else -#define NULL_UNIT_CHECK(unit) -#endif - - -/* - * The most simple scheduling function of all times! We either return: - * - the unit assigned to the pCPU, if there's one and it can run; - * - the idle unit, otherwise. 
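
Editor's aside (not part of the patch): the whole decision just described, as a few lines of standalone C. The null scheduler returns the one unit assigned to the pCPU when it can run, and the idle unit otherwise (including when there is tasklet work to do); all structures here are simplified.

#include <stdio.h>
#include <stdbool.h>

struct unit { const char *name; bool runnable; };

static struct unit idle = { "idle", true };

/* The unit statically assigned to this pCPU, or NULL if the pCPU is free. */
static struct unit *assigned;

static struct unit *null_pick_next(bool tasklet_work)
{
    if ( tasklet_work || assigned == NULL || !assigned->runnable )
        return &idle;
    return assigned;            /* always the same unit, zero search cost */
}

int main(void)
{
    struct unit guest = { "d1v0", true };

    printf("%s\n", null_pick_next(false)->name);   /* idle: pCPU is free  */
    assigned = &guest;
    printf("%s\n", null_pick_next(false)->name);   /* d1v0                */
    printf("%s\n", null_pick_next(true)->name);    /* idle: tasklet work  */
    return 0;
}
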
- */ -static void null_schedule(const struct scheduler *ops, struct sched_unit *prev, - s_time_t now, bool tasklet_work_scheduled) -{ - unsigned int bs; - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct null_private *prv = null_priv(ops); - struct null_unit *wvc; - - SCHED_STAT_CRANK(schedule); - NULL_UNIT_CHECK(current->sched_unit); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t tasklet, cpu; - int16_t unit, dom; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - if ( per_cpu(npc, sched_cpu).unit == NULL ) - { - d.unit = d.dom = -1; - } - else - { - d.unit = per_cpu(npc, sched_cpu).unit->unit_id; - d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id; - } - __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d); - } - - if ( tasklet_work_scheduled ) - { - trace_var(TRC_SNULL_TASKLET, 1, 0, NULL); - prev->next_task = sched_idle_unit(sched_cpu); - } - else - prev->next_task = per_cpu(npc, sched_cpu).unit; - prev->next_time = -1; - - /* - * We may be new in the cpupool, or just coming back online. In which - * case, there may be units in the waitqueue that we can assign to us - * and run. - */ - if ( unlikely(prev->next_task == NULL) ) - { - bool unit_found; - - spin_lock(&prv->waitq_lock); - - if ( list_empty(&prv->waitq) ) - goto unlock; - - /* - * We scan the waitqueue twice, for prioritizing units that have - * soft-affinity with cpu. This may look like something expensive to - * do here in null_schedule(), but it's actually fine, because we do - * it only in cases where a pcpu has no unit associated (e.g., as - * said above, the cpu has just joined a cpupool). - */ - unit_found = false; - for_each_affinity_balance_step( bs ) - { - list_for_each_entry( wvc, &prv->waitq, waitq_elem ) - { - if ( bs == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(wvc->unit) ) - continue; - - if ( unit_check_affinity(wvc->unit, sched_cpu, bs) ) - { - spinlock_t *lock; - - unit_found = true; - - /* - * If the unit in the waitqueue has just come up online, - * we risk racing with vcpu_wake(). To avoid this, sync - * on the spinlock that vcpu_wake() holds, but only with - * trylock, to avoid deadlock). - */ - lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit)); - - /* - * We know the vcpu's lock is not this resource's lock. In - * fact, if it were, since this cpu is free, vcpu_wake() - * would have assigned the unit to here directly. - */ - ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock); - - if ( lock ) { - unit_assign(prv, wvc->unit, sched_cpu); - list_del_init(&wvc->waitq_elem); - prev->next_task = wvc->unit; - spin_unlock(lock); - goto unlock; - } - } - } - } - /* - * If we did find a unit with suitable affinity in the waitqueue, but - * we could not pick it up (due to lock contention), and hence we are - * still free, plan for another try. In fact, we don't want such unit - * to be stuck in the waitqueue, when there are free cpus where it - * could run. 
- */ - if ( unlikely( unit_found && prev->next_task == NULL && - !list_empty(&prv->waitq)) ) - cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ); - unlock: - spin_unlock(&prv->waitq_lock); - - if ( prev->next_task == NULL && - !cpumask_test_cpu(sched_cpu, &prv->cpus_free) ) - cpumask_set_cpu(sched_cpu, &prv->cpus_free); - } - - if ( unlikely(prev->next_task == NULL || - !unit_runnable_state(prev->next_task)) ) - prev->next_task = sched_idle_unit(sched_cpu); - - NULL_UNIT_CHECK(prev->next_task); - - prev->next_task->migrated = false; -} - -static inline void dump_unit(struct null_private *prv, struct null_unit *nvc) -{ - printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id, - nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ? - sched_unit_master(nvc->unit) : -1); -} - -static void null_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc; - spinlock_t *lock; - unsigned long flags; - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}", - cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - if ( per_cpu(npc, cpu).unit != NULL ) - printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain, - per_cpu(npc, cpu).unit->unit_id); - printk("\n"); - - /* current unit (nothing to say if that's the idle unit) */ - nvc = null_unit(curr_on_cpu(cpu)); - if ( nvc && !is_idle_unit(nvc->unit) ) - { - printk("\trun: "); - dump_unit(prv, nvc); - printk("\n"); - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); -} - -static void null_dump(const struct scheduler *ops) -{ - struct null_private *prv = null_priv(ops); - struct list_head *iter; - unsigned long flags; - unsigned int loop; - - spin_lock_irqsave(&prv->lock, flags); - - printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free)); - - printk("Domain info:\n"); - loop = 0; - list_for_each( iter, &prv->ndom ) - { - struct null_dom *ndom; - struct sched_unit *unit; - - ndom = list_entry(iter, struct null_dom, ndom_elem); - - printk("\tDomain: %d\n", ndom->dom->domain_id); - for_each_sched_unit( ndom->dom, unit ) - { - struct null_unit * const nvc = null_unit(unit); - spinlock_t *lock; - - lock = unit_schedule_lock(unit); - - printk("\t%3d: ", ++loop); - dump_unit(prv, nvc); - printk("\n"); - - unit_schedule_unlock(lock, unit); - } - } - - printk("Waitqueue: "); - loop = 0; - spin_lock(&prv->waitq_lock); - list_for_each( iter, &prv->waitq ) - { - struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem); - - if ( loop++ != 0 ) - printk(", "); - if ( loop % 24 == 0 ) - printk("\n\t"); - printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id); - } - printk("\n"); - spin_unlock(&prv->waitq_lock); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static const struct scheduler sched_null_def = { - .name = "null Scheduler", - .opt_name = "null", - .sched_id = XEN_SCHEDULER_NULL, - .sched_data = NULL, - - .init = null_init, - .deinit = null_deinit, - .init_pdata = null_init_pdata, - .switch_sched = null_switch_sched, - .deinit_pdata = null_deinit_pdata, - - .alloc_udata = null_alloc_udata, - .free_udata = null_free_udata, - .alloc_domdata = null_alloc_domdata, - .free_domdata = null_free_domdata, - - .insert_unit = null_unit_insert, - .remove_unit = null_unit_remove, - - .wake = null_unit_wake, - .sleep = null_unit_sleep, - .pick_resource = null_res_pick, - .migrate = null_unit_migrate, - .do_schedule = null_schedule, - - .dump_cpu_state = null_dump_pcpu, - .dump_settings = null_dump, -}; - 
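The null scheduler code above boils down to a strict one-to-one unit-to-pCPU assignment with a waitqueue for units that cannot be placed immediately. A minimal standalone sketch of that placement policy follows; all names in it (assigned, waitq, null_assign, affine) are illustrative and are not part of the code being moved.

    /* Sketch of the null scheduler's placement idea: one unit per pCPU,
     * spill everything else into a waitqueue. Illustrative names only. */
    #include <stdbool.h>
    #include <stddef.h>

    #define NR_CPUS    8
    #define WAITQ_MAX 64

    struct unit;                                /* opaque here                */
    static struct unit *assigned[NR_CPUS];      /* one unit per pCPU, or NULL */
    static struct unit *waitq[WAITQ_MAX];       /* units with no free pCPU    */
    static unsigned int waitq_len;

    /* Returns the pCPU the unit was pinned to, or -1 if it had to wait. */
    static int null_assign(struct unit *u,
                           bool (*affine)(const struct unit *, int cpu))
    {
        for ( int cpu = 0; cpu < NR_CPUS; cpu++ )
            if ( assigned[cpu] == NULL && affine(u, cpu) )
            {
                assigned[cpu] = u;     /* unit now owns this pCPU outright */
                return cpu;
            }

        if ( waitq_len < WAITQ_MAX )
            waitq[waitq_len++] = u;    /* parked until some pCPU frees up  */
        return -1;
    }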
-REGISTER_SCHEDULER(sched_null_def); diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c deleted file mode 100644 index c40a7e4990..0000000000 --- a/xen/common/sched_rt.c +++ /dev/null @@ -1,1571 +0,0 @@ -/***************************************************************************** - * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen - * EDF scheduling is a real-time scheduling algorithm used in embedded field. - * - * by Sisu Xi, 2013, Washington University in Saint Louis - * Meng Xu, 2014-2016, University of Pennsylvania - * - * Conversion toward event driven model by Tianyang Chen - * and Dagaen Golomb, 2016, University of Pennsylvania - * - * based on the code of credit Scheduler - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * TODO: - * - * Migration compensation and resist like credit2 to better use cache; - * Lock Holder Problem, using yield? - * Self switch problem: UNITs of the same domain may preempt each other; - */ - -/* - * Design: - * - * This scheduler follows the Preemptive Global Earliest Deadline First (EDF) - * theory in real-time field. - * At any scheduling point, the UNIT with earlier deadline has higher priority. - * The scheduler always picks highest priority UNIT to run on a feasible PCPU. - * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle or - * has a lower-priority UNIT running on it.) - * - * Each UNIT has a dedicated period, budget and a extratime flag - * The deadline of an UNIT is at the end of each period; - * An UNIT has its budget replenished at the beginning of each period; - * While scheduled, an UNIT burns its budget. - * The UNIT needs to finish its budget before its deadline in each period; - * The UNIT discards its unused budget at the end of each period. - * When an UNIT runs out of budget in a period, if its extratime flag is set, - * the UNIT increases its priority_level by 1 and refills its budget; otherwise, - * it has to wait until next period. - * - * Each UNIT is implemented as a deferable server. - * When an UNIT has a task running on it, its budget is continuously burned; - * When an UNIT has no task but with budget left, its budget is preserved. - * - * Queue scheme: - * A global runqueue and a global depletedqueue for each CPU pool. - * The runqueue holds all runnable UNITs with budget, - * sorted by priority_level and deadline; - * The depletedqueue holds all UNITs without budget, unsorted; - * - * Note: cpumask and cpupool is supported. - */ - -/* - * Locking: - * A global system lock is used to protect the RunQ and DepletedQ. - * The global lock is referenced by sched_res->schedule_lock - * from all physical cpus. - * - * The lock is already grabbed when calling wake/sleep/schedule/ functions - * in schedule.c - * - * The functions involes RunQ and needs to grab locks are: - * unit_insert, unit_remove, context_saved, runq_insert - */ - - -/* - * Default parameters: - * Period and budget in default is 10 and 4 ms, respectively - */ -#define RTDS_DEFAULT_PERIOD (MICROSECS(10000)) -#define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) - -/* - * Max period: max delta of time type, because period is added to the time - * an unit activates, so this must not overflow. - * Min period: 10 us, considering the scheduling overhead (when period is - * too low, scheduling is invoked too frequently, causing high overhead). 
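The design comment above describes each RTDS unit as a deferrable server with a period, a budget, a deadline at the end of the current period, and an extratime mode that bumps priority_level instead of blocking when the budget runs out. A compact sketch of that bookkeeping, using made-up names and plain nanosecond integers rather than the scheduler's real types:

    /* Sketch of the period/budget/deadline bookkeeping described above.
     * Illustrative only; not the in-tree rt_unit handling. */
    #include <stdbool.h>
    #include <stdint.h>

    struct rtds_like {
        int64_t period, budget;          /* static parameters, in ns        */
        int64_t cur_deadline, cur_budget;
        unsigned int priority_level;     /* grows while running extratime   */
        bool extratime;
    };

    /* Start a new period once 'now' has reached the current deadline. */
    static void replenish(struct rtds_like *u, int64_t now)
    {
        while ( u->cur_deadline <= now )     /* catch up, period by period  */
            u->cur_deadline += u->period;
        u->cur_budget = u->budget;           /* full budget again           */
        u->priority_level = 0;               /* back to real-time priority  */
    }

    /* Account 'ran' ns of execution; returns true if the unit depleted. */
    static bool burn(struct rtds_like *u, int64_t ran)
    {
        u->cur_budget -= ran;
        if ( u->cur_budget > 0 )
            return false;
        if ( u->extratime )
        {
            u->priority_level++;             /* demoted, but still runnable */
            u->cur_budget = u->budget;
            return false;
        }
        u->cur_budget = 0;
        return true;                         /* wait for replenishment      */
    }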
- */ -#define RTDS_MAX_PERIOD (STIME_DELTA_MAX) -#define RTDS_MIN_PERIOD (MICROSECS(10)) - -/* - * Min budget: 10 us, considering the scheduling overhead (when budget is - * consumed too fast, scheduling is invoked too frequently, causing - * high overhead). - */ -#define RTDS_MIN_BUDGET (MICROSECS(10)) - -/* - * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding - * the next deadline, performing addition could be faster if the difference - * between cur_deadline and now is small. If the difference is bigger than - * 1024 * period, use multiplication. - */ -#define UPDATE_LIMIT_SHIFT 10 - -/* - * Flags - */ -/* - * RTDS_scheduled: Is this unit either running on, or context-switching off, - * a physical cpu? - * + Accessed only with global lock held. - * + Set when chosen as next in rt_schedule(). - * + Cleared after context switch has been saved in rt_context_saved() - * + Checked in unit_wake to see if we can add to the Runqueue, or if we should - * set RTDS_delayed_runq_add - * + Checked to be false in runq_insert. - */ -#define __RTDS_scheduled 1 -#define RTDS_scheduled (1<<__RTDS_scheduled) -/* - * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ - * once it's done being context switching out? - * + Set when scheduling out in rt_schedule() if prev is runable - * + Set in rt_unit_wake if it finds RTDS_scheduled set - * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ - * and clears the bit. - */ -#define __RTDS_delayed_runq_add 2 -#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) - -/* - * RTDS_depleted: Does this vcp run out of budget? - * This flag is - * + set in burn_budget() if an unit has zero budget left; - * + cleared and checked in the repenishment handler, - * for the units that are being replenished. - */ -#define __RTDS_depleted 3 -#define RTDS_depleted (1<<__RTDS_depleted) - -/* - * RTDS_extratime: Can the unit run in the time that is - * not part of any real-time reservation, and would therefore - * be otherwise left idle? - */ -#define __RTDS_extratime 4 -#define RTDS_extratime (1<<__RTDS_extratime) - -/* - * rt tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. - */ -#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1) -#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2) -#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3) -#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4) -#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5) -#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6) - -static void repl_timer_handler(void *data); - -/* - * System-wide private data, include global RunQueue/DepletedQ - * Global lock is referenced by sched_res->schedule_lock from all - * physical cpus. 
It can be grabbed via unit_schedule_lock_irq() - */ -struct rt_private { - spinlock_t lock; /* the global coarse-grained lock */ - struct list_head sdom; /* list of availalbe domains, used for dump */ - - struct list_head runq; /* ordered list of runnable units */ - struct list_head depletedq; /* unordered list of depleted units */ - - struct timer repl_timer; /* replenishment timer */ - struct list_head replq; /* ordered list of units that need replenishment */ - - cpumask_t tickled; /* cpus been tickled */ -}; - -/* - * Virtual CPU - */ -struct rt_unit { - struct list_head q_elem; /* on the runq/depletedq list */ - struct list_head replq_elem; /* on the replenishment events list */ - - /* UNIT parameters, in nanoseconds */ - s_time_t period; - s_time_t budget; - - /* UNIT current information in nanosecond */ - s_time_t cur_budget; /* current budget */ - s_time_t last_start; /* last start time */ - s_time_t cur_deadline; /* current deadline for EDF */ - - /* Up-pointers */ - struct rt_dom *sdom; - struct sched_unit *unit; - - unsigned priority_level; - - unsigned flags; /* mark __RTDS_scheduled, etc.. */ -}; - -/* - * Domain - */ -struct rt_dom { - struct list_head sdom_elem; /* link list on rt_priv */ - struct domain *dom; /* pointer to upper domain */ -}; - -/* - * Useful inline functions - */ -static inline struct rt_private *rt_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct rt_unit *rt_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline struct list_head *rt_runq(const struct scheduler *ops) -{ - return &rt_priv(ops)->runq; -} - -static inline struct list_head *rt_depletedq(const struct scheduler *ops) -{ - return &rt_priv(ops)->depletedq; -} - -static inline struct list_head *rt_replq(const struct scheduler *ops) -{ - return &rt_priv(ops)->replq; -} - -static inline bool has_extratime(const struct rt_unit *svc) -{ - return svc->flags & RTDS_extratime; -} - -/* - * Helper functions for manipulating the runqueue, the depleted queue, - * and the replenishment events queue. - */ -static int -unit_on_q(const struct rt_unit *svc) -{ - return !list_empty(&svc->q_elem); -} - -static struct rt_unit * -q_elem(struct list_head *elem) -{ - return list_entry(elem, struct rt_unit, q_elem); -} - -static struct rt_unit * -replq_elem(struct list_head *elem) -{ - return list_entry(elem, struct rt_unit, replq_elem); -} - -static int -unit_on_replq(const struct rt_unit *svc) -{ - return !list_empty(&svc->replq_elem); -} - -/* - * If v1 priority >= v2 priority, return value > 0 - * Otherwise, return value < 0 - */ -static s_time_t -compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2) -{ - int prio = v2->priority_level - v1->priority_level; - - if ( prio == 0 ) - return v2->cur_deadline - v1->cur_deadline; - - return prio; -} - -/* - * Debug related code, dump unit/cpu information - */ -static void -rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc) -{ - cpumask_t *cpupool_mask, *mask; - - ASSERT(svc != NULL); - /* idle unit */ - if( svc->sdom == NULL ) - { - printk("\n"); - return; - } - - /* - * We can't just use 'cpumask_scratch' because the dumping can - * happen from a pCPU outside of this scheduler's cpupool, and - * hence it's not right to use its pCPU's scratch mask. - * On the other hand, it is safe to use sched_unit_master(svc->unit)'s - * own scratch space, since we hold the runqueue lock. 
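compare_unit_priority() above encodes the EDF ordering: a lower priority_level always wins, and ties are broken by the earlier current deadline, with the sign of the result telling the caller which unit is more urgent. The same rule in isolation, with invented names:

    /* Sketch of the EDF ordering used by compare_unit_priority() above.
     * Standalone illustration; types and names are not the Xen ones. */
    #include <stdint.h>

    struct edf_key {
        unsigned int priority_level;   /* 0 = real-time, higher = extratime */
        int64_t deadline;
    };

    /* > 0 when a is more urgent than b, < 0 when less urgent, 0 on a tie. */
    static int64_t edf_compare(const struct edf_key *a, const struct edf_key *b)
    {
        if ( a->priority_level != b->priority_level )
            return (int64_t)b->priority_level - (int64_t)a->priority_level;
        return b->deadline - a->deadline;    /* earlier deadline wins       */
    }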
- */ - mask = cpumask_scratch_cpu(sched_unit_master(svc->unit)); - - cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain); - cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity); - printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," - " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n" - " \t\t priority_level=%d has_extratime=%d\n" - " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n", - svc->unit->domain->domain_id, - svc->unit->unit_id, - sched_unit_master(svc->unit), - svc->period, - svc->budget, - svc->cur_budget, - svc->cur_deadline, - svc->last_start, - svc->priority_level, - has_extratime(svc), - unit_on_q(svc), - unit_runnable(svc->unit), - svc->flags, CPUMASK_PR(mask)); -} - -static void -rt_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - printk("CPU[%02d]\n", cpu); - /* current UNIT (nothing to say if that's the idle unit). */ - svc = rt_unit(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - rt_dump_unit(ops, svc); - } - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void -rt_dump(const struct scheduler *ops) -{ - struct list_head *runq, *depletedq, *replq, *iter; - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - struct rt_dom *sdom; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - if ( list_empty(&prv->sdom) ) - goto out; - - runq = rt_runq(ops); - depletedq = rt_depletedq(ops); - replq = rt_replq(ops); - - printk("Global RunQueue info:\n"); - list_for_each ( iter, runq ) - { - svc = q_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Global DepletedQueue info:\n"); - list_for_each ( iter, depletedq ) - { - svc = q_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Global Replenishment Events info:\n"); - list_for_each ( iter, replq ) - { - svc = replq_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Domain info:\n"); - list_for_each ( iter, &prv->sdom ) - { - struct sched_unit *unit; - - sdom = list_entry(iter, struct rt_dom, sdom_elem); - printk("\tdomain: %d\n", sdom->dom->domain_id); - - for_each_sched_unit ( sdom->dom, unit ) - { - svc = rt_unit(unit); - rt_dump_unit(ops, svc); - } - } - - out: - spin_unlock_irqrestore(&prv->lock, flags); -} - -/* - * update deadline and budget when now >= cur_deadline - * it needs to be updated to the deadline of the current period - */ -static void -rt_update_deadline(s_time_t now, struct rt_unit *svc) -{ - ASSERT(now >= svc->cur_deadline); - ASSERT(svc->period != 0); - - if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now ) - { - do - svc->cur_deadline += svc->period; - while ( svc->cur_deadline <= now ); - } - else - { - long count = ((now - svc->cur_deadline) / svc->period) + 1; - svc->cur_deadline += count * svc->period; - } - - /* - * svc may be scheduled to run immediately after it misses deadline - * Then rt_update_deadline is called before rt_schedule, which - * should only deduct the time spent in current period from the budget - */ - svc->last_start = now; - svc->cur_budget = svc->budget; - svc->priority_level = 0; - - /* TRACE */ - { - struct __packed { - unsigned unit:16, dom:16; - unsigned priority_level; - uint64_t cur_deadline, cur_budget; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.priority_level = svc->priority_level; - d.cur_deadline = (uint64_t) svc->cur_deadline; - d.cur_budget = (uint64_t) svc->cur_budget; 
- trace_var(TRC_RTDS_BUDGET_REPLENISH, 1, - sizeof(d), - (unsigned char *) &d); - } - - return; -} - -/* - * Helpers for removing and inserting an unit in a queue - * that is being kept ordered by the units' deadlines (as EDF - * mandates). - * - * For callers' convenience, the unit removing helper returns - * true if the unit removed was the one at the front of the - * queue; similarly, the inserting helper returns true if the - * inserted ended at the front of the queue (i.e., in both - * cases, if the unit with the earliest deadline is what we - * are dealing with). - */ -static inline bool -deadline_queue_remove(struct list_head *queue, struct list_head *elem) -{ - int pos = 0; - - if ( queue->next != elem ) - pos = 1; - - list_del_init(elem); - return !pos; -} - -static inline bool -deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *), - struct rt_unit *svc, struct list_head *elem, - struct list_head *queue) -{ - struct list_head *iter; - int pos = 0; - - list_for_each ( iter, queue ) - { - struct rt_unit * iter_svc = (*qelem)(iter); - if ( compare_unit_priority(svc, iter_svc) > 0 ) - break; - pos++; - } - list_add_tail(elem, iter); - return !pos; -} -#define deadline_runq_insert(...) \ - deadline_queue_insert(&q_elem, ##__VA_ARGS__) -#define deadline_replq_insert(...) \ - deadline_queue_insert(&replq_elem, ##__VA_ARGS__) - -static inline void -q_remove(struct rt_unit *svc) -{ - ASSERT( unit_on_q(svc) ); - list_del_init(&svc->q_elem); -} - -static inline void -replq_remove(const struct scheduler *ops, struct rt_unit *svc) -{ - struct rt_private *prv = rt_priv(ops); - struct list_head *replq = rt_replq(ops); - - ASSERT( unit_on_replq(svc) ); - - if ( deadline_queue_remove(replq, &svc->replq_elem) ) - { - /* - * The replenishment timer needs to be set to fire when a - * replenishment for the unit at the front of the replenishment - * queue is due. If it is such unit that we just removed, we may - * need to reprogram the timer. - */ - if ( !list_empty(replq) ) - { - struct rt_unit *svc_next = replq_elem(replq->next); - set_timer(&prv->repl_timer, svc_next->cur_deadline); - } - else - stop_timer(&prv->repl_timer); - } -} - -/* - * Insert svc with budget in RunQ according to EDF: - * units with smaller deadlines go first. - * Insert svc without budget in DepletedQ unsorted; - */ -static void -runq_insert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct rt_private *prv = rt_priv(ops); - struct list_head *runq = rt_runq(ops); - - ASSERT( spin_is_locked(&prv->lock) ); - ASSERT( !unit_on_q(svc) ); - ASSERT( unit_on_replq(svc) ); - - /* add svc to runq if svc still has budget or its extratime is set */ - if ( svc->cur_budget > 0 || - has_extratime(svc) ) - deadline_runq_insert(svc, &svc->q_elem, runq); - else - list_add(&svc->q_elem, &prv->depletedq); -} - -static void -replq_insert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct list_head *replq = rt_replq(ops); - struct rt_private *prv = rt_priv(ops); - - ASSERT( !unit_on_replq(svc) ); - - /* - * The timer may be re-programmed if svc is inserted - * at the front of the event list. - */ - if ( deadline_replq_insert(svc, &svc->replq_elem, replq) ) - set_timer(&prv->repl_timer, svc->cur_deadline); -} - -/* - * Removes and re-inserts an event to the replenishment queue. - * The aim is to update its position inside the queue, as its - * deadline (and hence its replenishment time) could have - * changed. 
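The deadline_queue_remove()/deadline_queue_insert() helpers above deliberately report whether the front of the queue changed, because that is the only case in which the replenishment timer needs to be reprogrammed. A self-contained sketch of that convention on a singly linked list (names invented for illustration):

    /* Sketch: deadline-ordered insert that tells the caller whether the
     * earliest deadline changed, i.e. whether a one-shot timer must be
     * rearmed. Not the Xen list_head code. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stddef.h>

    struct dl_item {
        int64_t deadline;
        struct dl_item *next;
    };

    static bool dl_insert(struct dl_item **head, struct dl_item *it)
    {
        struct dl_item **pp = head;

        while ( *pp && (*pp)->deadline <= it->deadline )
            pp = &(*pp)->next;             /* keep FIFO order on ties      */
        it->next = *pp;
        *pp = it;
        return pp == head;                 /* new head => rearm the timer  */
    }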
- */ -static void -replq_reinsert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct list_head *replq = rt_replq(ops); - struct rt_unit *rearm_svc = svc; - bool_t rearm = 0; - - ASSERT( unit_on_replq(svc) ); - - /* - * If svc was at the front of the replenishment queue, we certainly - * need to re-program the timer, and we want to use the deadline of - * the unit which is now at the front of the queue (which may still - * be svc or not). - * - * We may also need to re-program, if svc has been put at the front - * of the replenishment queue when being re-inserted. - */ - if ( deadline_queue_remove(replq, &svc->replq_elem) ) - { - deadline_replq_insert(svc, &svc->replq_elem, replq); - rearm_svc = replq_elem(replq->next); - rearm = 1; - } - else - rearm = deadline_replq_insert(svc, &svc->replq_elem, replq); - - if ( rearm ) - set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline); -} - -/* - * Pick a valid resource for the unit vc - * Valid resource of an unit is intesection of unit's affinity - * and available resources - */ -static struct sched_resource * -rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - cpumask_t cpus; - cpumask_t *online; - int cpu; - - online = cpupool_domain_master_cpumask(unit->domain); - cpumask_and(&cpus, online, unit->cpu_hard_affinity); - - cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus) - ? sched_unit_master(unit) - : cpumask_cycle(sched_unit_master(unit), &cpus); - ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); - - return get_sched_res(cpu); -} - -/* - * Init/Free related code - */ -static int -rt_init(struct scheduler *ops) -{ - int rc = -ENOMEM; - struct rt_private *prv = xzalloc(struct rt_private); - - printk("Initializing RTDS scheduler\n" - "WARNING: This is experimental software in development.\n" - "Use at your own risk.\n"); - - if ( prv == NULL ) - goto err; - - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->sdom); - INIT_LIST_HEAD(&prv->runq); - INIT_LIST_HEAD(&prv->depletedq); - INIT_LIST_HEAD(&prv->replq); - - ops->sched_data = prv; - rc = 0; - - err: - if ( rc ) - xfree(prv); - - return rc; -} - -static void -rt_deinit(struct scheduler *ops) -{ - struct rt_private *prv = rt_priv(ops); - - ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid || - prv->repl_timer.status == TIMER_STATUS_killed); - - ops->sched_data = NULL; - xfree(prv); -} - -/* - * Point per_cpu spinlock to the global system lock; - * All cpu have same global system lock - */ -static void -rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct rt_private *prv = rt_priv(ops); - spinlock_t *old_lock; - unsigned long flags; - - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - /* - * TIMER_STATUS_invalid means we are the first cpu that sees the timer - * allocated but not initialized, and so it's up to us to initialize it. - */ - if ( prv->repl_timer.status == TIMER_STATUS_invalid ) - { - init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu); - dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); - } - - /* Move the scheduler lock to our global runqueue lock. */ - get_sched_res(cpu)->schedule_lock = &prv->lock; - - /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */ - spin_unlock_irqrestore(old_lock, flags); -} - -/* Change the scheduler of cpu to us (RTDS). 
*/ -static spinlock_t * -rt_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct rt_private *prv = rt_priv(new_ops); - struct rt_unit *svc = vdata; - - ASSERT(!pdata && svc && is_idle_unit(svc->unit)); - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). It's actually the runqueue lock of - * another scheduler, but that is how things need to be, for - * preventing races. - */ - ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock); - - /* - * If we are the absolute first cpu being switched toward this - * scheduler (in which case we'll see TIMER_STATUS_invalid), or the - * first one that is added back to the cpupool that had all its cpus - * removed (in which case we'll see TIMER_STATUS_killed), it's our - * job to (re)initialize the timer. - */ - if ( prv->repl_timer.status == TIMER_STATUS_invalid || - prv->repl_timer.status == TIMER_STATUS_killed ) - { - init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu); - dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); - } - - sched_idle_unit(cpu)->priv = vdata; - - return &prv->lock; -} - -static void -rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - unsigned long flags; - struct rt_private *prv = rt_priv(ops); - - spin_lock_irqsave(&prv->lock, flags); - - if ( prv->repl_timer.cpu == cpu ) - { - cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid; - unsigned int new_cpu = cpumask_cycle(cpu, online); - - /* - * Make sure the timer run on one of the cpus that are still available - * to this scheduler. If there aren't any left, it means it's the time - * to just kill it. - */ - if ( new_cpu >= nr_cpu_ids ) - { - kill_timer(&prv->repl_timer); - dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu); - } - else - { - migrate_timer(&prv->repl_timer, new_cpu); - } - } - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void * -rt_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - unsigned long flags; - struct rt_dom *sdom; - struct rt_private * prv = rt_priv(ops); - - sdom = xzalloc(struct rt_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&sdom->sdom_elem); - sdom->dom = dom; - - /* spinlock here to insert the dom */ - spin_lock_irqsave(&prv->lock, flags); - list_add_tail(&sdom->sdom_elem, &(prv->sdom)); - spin_unlock_irqrestore(&prv->lock, flags); - - return sdom; -} - -static void -rt_free_domdata(const struct scheduler *ops, void *data) -{ - struct rt_dom *sdom = data; - struct rt_private *prv = rt_priv(ops); - - if ( sdom ) - { - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - list_del_init(&sdom->sdom_elem); - spin_unlock_irqrestore(&prv->lock, flags); - - xfree(sdom); - } -} - -static void * -rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd) -{ - struct rt_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct rt_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->q_elem); - INIT_LIST_HEAD(&svc->replq_elem); - svc->flags = 0U; - svc->sdom = dd; - svc->unit = unit; - svc->last_start = 0; - - __set_bit(__RTDS_extratime, &svc->flags); - svc->priority_level = 0; - svc->period = RTDS_DEFAULT_PERIOD; - if ( !is_idle_unit(unit) ) - svc->budget = RTDS_DEFAULT_BUDGET; - - SCHED_STAT_CRANK(unit_alloc); - - return svc; -} - -static void -rt_free_udata(const struct scheduler *ops, void *priv) -{ - struct rt_unit *svc = priv; - - xfree(svc); -} - -/* - * It is called in 
sched_move_domain() and sched_init_vcpu - * in schedule.c. - * When move a domain to a new cpupool. - * It inserts units of moving domain to the scheduler's RunQ in - * dest. cpupool. - */ -static void -rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit *svc = rt_unit(unit); - s_time_t now; - spinlock_t *lock; - - BUG_ON( is_idle_unit(unit) ); - - /* This is safe because unit isn't yet being scheduled */ - sched_set_res(unit, rt_res_pick(ops, unit)); - - lock = unit_schedule_lock_irq(unit); - - now = NOW(); - if ( now >= svc->cur_deadline ) - rt_update_deadline(now, svc); - - if ( !unit_on_q(svc) && unit_runnable(unit) ) - { - replq_insert(ops, svc); - - if ( !unit->is_running ) - runq_insert(ops, svc); - } - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_insert); -} - -/* - * Remove rt_unit svc from the old scheduler in source cpupool. - */ -static void -rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - struct rt_dom * const sdom = svc->sdom; - spinlock_t *lock; - - SCHED_STAT_CRANK(unit_remove); - - BUG_ON( sdom == NULL ); - - lock = unit_schedule_lock_irq(unit); - if ( unit_on_q(svc) ) - q_remove(svc); - - if ( unit_on_replq(svc) ) - replq_remove(ops,svc); - - unit_schedule_unlock_irq(lock, unit); -} - -/* - * Burn budget in nanosecond granularity - */ -static void -burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now) -{ - s_time_t delta; - - /* don't burn budget for idle UNIT */ - if ( is_idle_unit(svc->unit) ) - return; - - /* burn at nanoseconds level */ - delta = now - svc->last_start; - /* - * delta < 0 only happens in nested virtualization; - * TODO: how should we handle delta < 0 in a better way? - */ - if ( delta < 0 ) - { - printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n", - __func__, delta); - svc->last_start = now; - return; - } - - svc->cur_budget -= delta; - svc->last_start = now; - - if ( svc->cur_budget <= 0 ) - { - if ( has_extratime(svc) ) - { - svc->priority_level++; - svc->cur_budget = svc->budget; - } - else - { - svc->cur_budget = 0; - __set_bit(__RTDS_depleted, &svc->flags); - } - } - - /* TRACE */ - { - struct __packed { - unsigned unit:16, dom:16; - uint64_t cur_budget; - int delta; - unsigned priority_level; - bool has_extratime; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.cur_budget = (uint64_t) svc->cur_budget; - d.delta = delta; - d.priority_level = svc->priority_level; - d.has_extratime = svc->flags & RTDS_extratime; - trace_var(TRC_RTDS_BUDGET_BURN, 1, - sizeof(d), - (unsigned char *) &d); - } -} - -/* - * RunQ is sorted. Pick first one within cpumask. 
If no one, return NULL - * lock is grabbed before calling this function - */ -static struct rt_unit * -runq_pick(const struct scheduler *ops, const cpumask_t *mask) -{ - struct list_head *runq = rt_runq(ops); - struct list_head *iter; - struct rt_unit *svc = NULL; - struct rt_unit *iter_svc = NULL; - cpumask_t cpu_common; - cpumask_t *online; - - list_for_each ( iter, runq ) - { - iter_svc = q_elem(iter); - - /* mask cpu_hard_affinity & cpupool & mask */ - online = cpupool_domain_master_cpumask(iter_svc->unit->domain); - cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity); - cpumask_and(&cpu_common, mask, &cpu_common); - if ( cpumask_empty(&cpu_common) ) - continue; - - ASSERT( iter_svc->cur_budget > 0 ); - - svc = iter_svc; - break; - } - - /* TRACE */ - { - if( svc != NULL ) - { - struct __packed { - unsigned unit:16, dom:16; - uint64_t cur_deadline, cur_budget; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.cur_deadline = (uint64_t) svc->cur_deadline; - d.cur_budget = (uint64_t) svc->cur_budget; - trace_var(TRC_RTDS_RUNQ_PICK, 1, - sizeof(d), - (unsigned char *) &d); - } - } - - return svc; -} - -/* - * schedule function for rt scheduler. - * The lock is already grabbed in schedule.c, no need to lock here - */ -static void -rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, - s_time_t now, bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct rt_private *prv = rt_priv(ops); - struct rt_unit *const scurr = rt_unit(currunit); - struct rt_unit *snext = NULL; - bool migrated = false; - - /* TRACE */ - { - struct __packed { - unsigned cpu:16, tasklet:8, tickled:4, idle:4; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled); - d.idle = is_idle_unit(currunit); - trace_var(TRC_RTDS_SCHEDULE, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* clear ticked bit now that we've been scheduled */ - cpumask_clear_cpu(sched_cpu, &prv->tickled); - - /* burn_budget would return for IDLE UNIT */ - burn_budget(ops, scurr, now); - - if ( tasklet_work_scheduled ) - { - trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - else - { - snext = runq_pick(ops, cpumask_of(sched_cpu)); - - if ( snext == NULL ) - snext = rt_unit(sched_idle_unit(sched_cpu)); - else if ( !unit_runnable_state(snext->unit) ) - { - q_remove(snext); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - - /* if scurr has higher priority and budget, still pick scurr */ - if ( !is_idle_unit(currunit) && - unit_runnable_state(currunit) && - scurr->cur_budget > 0 && - ( is_idle_unit(snext->unit) || - compare_unit_priority(scurr, snext) > 0 ) ) - snext = scurr; - } - - if ( snext != scurr && - !is_idle_unit(currunit) && - unit_runnable(currunit) ) - __set_bit(__RTDS_delayed_runq_add, &scurr->flags); - - snext->last_start = now; - currunit->next_time = -1; /* if an idle unit is picked */ - if ( !is_idle_unit(snext->unit) ) - { - if ( snext != scurr ) - { - q_remove(snext); - __set_bit(__RTDS_scheduled, &snext->flags); - } - if ( sched_unit_master(snext->unit) != sched_cpu ) - { - sched_set_res(snext->unit, get_sched_res(sched_cpu)); - migrated = true; - } - /* Invoke the scheduler next time. 
*/ - currunit->next_time = snext->cur_budget; - } - currunit->next_task = snext->unit; - snext->unit->migrated = migrated; -} - -/* - * Remove UNIT from RunQ - * The lock is already grabbed in schedule.c, no need to lock here - */ -static void -rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - - BUG_ON( is_idle_unit(unit) ); - SCHED_STAT_CRANK(unit_sleep); - - if ( curr_on_cpu(sched_unit_master(unit)) == unit ) - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); - else if ( unit_on_q(svc) ) - { - q_remove(svc); - replq_remove(ops, svc); - } - else if ( svc->flags & RTDS_delayed_runq_add ) - __clear_bit(__RTDS_delayed_runq_add, &svc->flags); -} - -/* - * Pick a cpu where to run an unit, - * possibly kicking out the unit running there - * Called by wake() and context_saved() - * We have a running candidate here, the kick logic is: - * Among all the cpus that are within the cpu affinity - * 1) if there are any idle CPUs, kick one. - For cache benefit, we check new->cpu as first - * 2) now all pcpus are busy; - * among all the running units, pick lowest priority one - * if snext has higher priority, kick it. - * - * TODO: - * 1) what if these two units belongs to the same domain? - * replace an unit belonging to the same domain introduces more overhead - * - * lock is grabbed before calling this function - */ -static void -runq_tickle(const struct scheduler *ops, struct rt_unit *new) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */ - struct rt_unit *iter_svc; - struct sched_unit *iter_unit; - int cpu = 0, cpu_to_tickle = 0; - cpumask_t not_tickled; - cpumask_t *online; - - if ( new == NULL || is_idle_unit(new->unit) ) - return; - - online = cpupool_domain_master_cpumask(new->unit->domain); - cpumask_and(¬_tickled, online, new->unit->cpu_hard_affinity); - cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); - - /* - * 1) If there are any idle CPUs, kick one. - * For cache benefit,we first search new->cpu. - * The same loop also find the one with lowest priority. - */ - cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), ¬_tickled); - while ( cpu!= nr_cpu_ids ) - { - iter_unit = curr_on_cpu(cpu); - if ( is_idle_unit(iter_unit) ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - cpu_to_tickle = cpu; - goto out; - } - iter_svc = rt_unit(iter_unit); - if ( latest_deadline_unit == NULL || - compare_unit_priority(iter_svc, latest_deadline_unit) < 0 ) - latest_deadline_unit = iter_svc; - - cpumask_clear_cpu(cpu, ¬_tickled); - cpu = cpumask_cycle(cpu, ¬_tickled); - } - - /* 2) candicate has higher priority, kick out lowest priority unit */ - if ( latest_deadline_unit != NULL && - compare_unit_priority(latest_deadline_unit, new) < 0 ) - { - SCHED_STAT_CRANK(tickled_busy_cpu); - cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit); - goto out; - } - - /* didn't tickle any cpu */ - SCHED_STAT_CRANK(tickled_no_cpu); - return; - out: - /* TRACE */ - { - struct { - unsigned cpu:16, pad:16; - } d; - d.cpu = cpu_to_tickle; - d.pad = 0; - trace_var(TRC_RTDS_TICKLE, 1, - sizeof(d), - (unsigned char *)&d); - } - - cpumask_set_cpu(cpu_to_tickle, &prv->tickled); - cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); - return; -} - -/* - * Should always wake up runnable unit, put it back to RunQ. - * Check priority to raise interrupt - * The lock is already grabbed in schedule.c, no need to lock here - * TODO: what if these two units belongs to the same domain? 
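runq_tickle() above implements a two-step kick policy: first look for an idle, not-yet-tickled pCPU within the unit's affinity (starting from the unit's own pCPU for cache locality), and only if all of them are busy preempt the pCPU running the least urgent unit, provided the waking unit beats it. The same decision in reduced form, with invented types:

    /* Sketch of the tickle decision described above. 'running_prio' is a
     * stand-in for the EDF urgency of whatever a pCPU currently runs;
     * every name here is illustrative. */
    #include <stdbool.h>
    #include <stdint.h>

    struct pcpu_view {
        bool idle;
        int64_t running_prio;      /* larger == more urgent                */
    };

    /* Returns the pCPU to kick, or -1 if nothing should be preempted. */
    static int pick_cpu_to_tickle(const struct pcpu_view *cpus, int nr_cpus,
                                  int preferred, int64_t new_prio)
    {
        int lowest = -1;

        for ( int i = 0; i < nr_cpus; i++ )
        {
            int cpu = (preferred + i) % nr_cpus;   /* own pCPU first       */

            if ( cpus[cpu].idle )
                return cpu;                        /* idle pCPU: done      */
            if ( lowest < 0 ||
                 cpus[cpu].running_prio < cpus[lowest].running_prio )
                lowest = cpu;
        }

        /* Everybody is busy: preempt the least urgent pCPU, if we beat it. */
        return ( lowest >= 0 && new_prio > cpus[lowest].running_prio )
               ? lowest : -1;
    }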
- */ -static void -rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - s_time_t now; - bool_t missed; - - BUG_ON( is_idle_unit(unit) ); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - - /* on RunQ/DepletedQ, just update info is ok */ - if ( unlikely(unit_on_q(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* - * If a deadline passed while svc was asleep/blocked, we need new - * scheduling parameters (a new deadline and full budget). - */ - now = NOW(); - - missed = ( now >= svc->cur_deadline ); - if ( missed ) - rt_update_deadline(now, svc); - - /* - * If context hasn't been saved for this unit yet, we can't put it on - * the run-queue/depleted-queue. Instead, we set the appropriate flag, - * the unit will be put back on queue after the context has been saved - * (in rt_context_save()). - */ - if ( unlikely(svc->flags & RTDS_scheduled) ) - { - __set_bit(__RTDS_delayed_runq_add, &svc->flags); - /* - * The unit is waking up already, and we didn't even had the time to - * remove its next replenishment event from the replenishment queue - * when it blocked! No big deal. If we did not miss the deadline in - * the meantime, let's just leave it there. If we did, let's remove it - * and queue a new one (to occur at our new deadline). - */ - if ( missed ) - replq_reinsert(ops, svc); - return; - } - - /* Replenishment event got cancelled when we blocked. Add it back. */ - replq_insert(ops, svc); - /* insert svc to runq/depletedq because svc is not in queue now */ - runq_insert(ops, svc); - - runq_tickle(ops, svc); -} - -/* - * scurr has finished context switch, insert it back to the RunQ, - * and then pick the highest priority unit from runq to run - */ -static void -rt_context_saved(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit *svc = rt_unit(unit); - spinlock_t *lock = unit_schedule_lock_irq(unit); - - __clear_bit(__RTDS_scheduled, &svc->flags); - /* not insert idle unit to runq */ - if ( is_idle_unit(unit) ) - goto out; - - if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && - likely(unit_runnable(unit)) ) - { - runq_insert(ops, svc); - runq_tickle(ops, svc); - } - else - replq_remove(ops, svc); - -out: - unit_schedule_unlock_irq(lock, unit); -} - -/* - * set/get each unit info of each domain - */ -static int -rt_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - struct sched_unit *unit; - unsigned long flags; - int rc = 0; - struct xen_domctl_schedparam_vcpu local_sched; - s_time_t period, budget; - uint32_t index = 0; - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - /* Return the default parameters. 
*/ - op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1); - op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1); - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 ) - { - rc = -EINVAL; - break; - } - spin_lock_irqsave(&prv->lock, flags); - for_each_sched_unit ( d, unit ) - { - svc = rt_unit(unit); - svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */ - svc->budget = MICROSECS(op->u.rtds.budget); - } - spin_unlock_irqrestore(&prv->lock, flags); - break; - case XEN_DOMCTL_SCHEDOP_getvcpuinfo: - case XEN_DOMCTL_SCHEDOP_putvcpuinfo: - while ( index < op->u.v.nr_vcpus ) - { - if ( copy_from_guest_offset(&local_sched, - op->u.v.vcpus, index, 1) ) - { - rc = -EFAULT; - break; - } - if ( local_sched.vcpuid >= d->max_vcpus || - d->vcpu[local_sched.vcpuid] == NULL ) - { - rc = -EINVAL; - break; - } - - if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo ) - { - spin_lock_irqsave(&prv->lock, flags); - svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); - local_sched.u.rtds.budget = svc->budget / MICROSECS(1); - local_sched.u.rtds.period = svc->period / MICROSECS(1); - if ( has_extratime(svc) ) - local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra; - else - local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra; - spin_unlock_irqrestore(&prv->lock, flags); - - if ( copy_to_guest_offset(op->u.v.vcpus, index, - &local_sched, 1) ) - { - rc = -EFAULT; - break; - } - } - else - { - period = MICROSECS(local_sched.u.rtds.period); - budget = MICROSECS(local_sched.u.rtds.budget); - if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET || - budget > period || period < RTDS_MIN_PERIOD ) - { - rc = -EINVAL; - break; - } - - spin_lock_irqsave(&prv->lock, flags); - svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); - svc->period = period; - svc->budget = budget; - if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra ) - __set_bit(__RTDS_extratime, &svc->flags); - else - __clear_bit(__RTDS_extratime, &svc->flags); - spin_unlock_irqrestore(&prv->lock, flags); - } - /* Process a most 64 vCPUs without checking for preemptions. */ - if ( (++index > 63) && hypercall_preempt_check() ) - break; - } - if ( !rc ) - /* notify upper caller how many units have been processed. */ - op->u.v.nr_vcpus = index; - break; - } - - return rc; -} - -/* - * The replenishment timer handler picks units - * from the replq and does the actual replenishment. - */ -static void repl_timer_handler(void *data){ - s_time_t now; - struct scheduler *ops = data; - struct rt_private *prv = rt_priv(ops); - struct list_head *replq = rt_replq(ops); - struct list_head *runq = rt_runq(ops); - struct list_head *iter, *tmp; - struct rt_unit *svc; - LIST_HEAD(tmp_replq); - - spin_lock_irq(&prv->lock); - - now = NOW(); - - /* - * Do the replenishment and move replenished units - * to the temporary list to tickle. - * If svc is on run queue, we need to put it at - * the correct place since its deadline changes. - */ - list_for_each_safe ( iter, tmp, replq ) - { - svc = replq_elem(iter); - - if ( now < svc->cur_deadline ) - break; - - list_del(&svc->replq_elem); - rt_update_deadline(now, svc); - list_add(&svc->replq_elem, &tmp_replq); - - if ( unit_on_q(svc) ) - { - q_remove(svc); - runq_insert(ops, svc); - } - } - - /* - * Iterate through the list of updated units. - * If an updated unit is running, tickle the head of the - * runqueue if it has a higher priority. - * If an updated unit was depleted and on the runqueue, tickle it. 
- * Finally, reinsert the units back to replenishement events list. - */ - list_for_each_safe ( iter, tmp, &tmp_replq ) - { - svc = replq_elem(iter); - - if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit && - !list_empty(runq) ) - { - struct rt_unit *next_on_runq = q_elem(runq->next); - - if ( compare_unit_priority(svc, next_on_runq) < 0 ) - runq_tickle(ops, next_on_runq); - } - else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) && - unit_on_q(svc) ) - runq_tickle(ops, svc); - - list_del(&svc->replq_elem); - deadline_replq_insert(svc, &svc->replq_elem, replq); - } - - /* - * If there are units left in the replenishment event list, - * set the next replenishment to happen at the deadline of - * the one in the front. - */ - if ( !list_empty(replq) ) - set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline); - - spin_unlock_irq(&prv->lock); -} - -static const struct scheduler sched_rtds_def = { - .name = "SMP RTDS Scheduler", - .opt_name = "rtds", - .sched_id = XEN_SCHEDULER_RTDS, - .sched_data = NULL, - - .dump_cpu_state = rt_dump_pcpu, - .dump_settings = rt_dump, - .init = rt_init, - .deinit = rt_deinit, - .init_pdata = rt_init_pdata, - .switch_sched = rt_switch_sched, - .deinit_pdata = rt_deinit_pdata, - .alloc_domdata = rt_alloc_domdata, - .free_domdata = rt_free_domdata, - .alloc_udata = rt_alloc_udata, - .free_udata = rt_free_udata, - .insert_unit = rt_unit_insert, - .remove_unit = rt_unit_remove, - - .adjust = rt_dom_cntl, - - .pick_resource = rt_res_pick, - .do_schedule = rt_schedule, - .sleep = rt_unit_sleep, - .wake = rt_unit_wake, - .context_saved = rt_context_saved, -}; - -REGISTER_SCHEDULER(sched_rtds_def); diff --git a/xen/common/schedule.c b/xen/common/schedule.c deleted file mode 100644 index 54a07ff9e8..0000000000 --- a/xen/common/schedule.c +++ /dev/null @@ -1,3144 +0,0 @@ -/**************************************************************************** - * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge - * (C) 2002-2003 University of Cambridge - * (C) 2004 - Mark Williamson - Intel Research Cambridge - **************************************************************************** - * - * File: common/schedule.c - * Author: Rolf Neugebauer & Keir Fraser - * Updated for generic API by Mark Williamson - * - * Description: Generic CPU scheduling code - * implements support functionality for the Xen scheduler API. - * - */ - -#ifndef COMPAT -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_XEN_GUEST -#include -#else -#define pv_shim false -#endif - -/* opt_sched: scheduler - default to configured value */ -static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; -string_param("sched", opt_sched); - -/* if sched_smt_power_savings is set, - * scheduler will give preferrence to partially idle package compared to - * the full idle package, when picking pCPU to schedule vCPU. - */ -bool_t sched_smt_power_savings = 0; -boolean_param("sched_smt_power_savings", sched_smt_power_savings); - -/* Default scheduling rate limit: 1ms - * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined - * */ -int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; -integer_param("sched_ratelimit_us", sched_ratelimit_us); - -/* Number of vcpus per struct sched_unit. 
*/ -bool __read_mostly sched_disable_smt_switching; -cpumask_t sched_res_mask; - -/* Common lock for free cpus. */ -static DEFINE_SPINLOCK(sched_free_cpu_lock); - -/* Various timer handlers. */ -static void s_timer_fn(void *unused); -static void vcpu_periodic_timer_fn(void *data); -static void vcpu_singleshot_timer_fn(void *data); -static void poll_timer_fn(void *data); - -/* This is global for now so that private implementations can reach it */ -DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res); -static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx); -DEFINE_RCU_READ_LOCK(sched_res_rculock); - -/* Scratch space for cpumasks. */ -DEFINE_PER_CPU(cpumask_t, cpumask_scratch); - -/* How many urgent vcpus. */ -DEFINE_PER_CPU(atomic_t, sched_urgent_count); - -extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; -#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) -#define schedulers __start_schedulers_array - -static struct scheduler __read_mostly ops; - -static bool scheduler_active; - -static void sched_set_affinity( - struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft); - -static struct sched_resource * -sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - return unit->res; -} - -static void * -sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - /* Any non-NULL pointer is fine here. */ - return ZERO_BLOCK_PTR; -} - -static void -sched_idle_free_udata(const struct scheduler *ops, void *priv) -{ -} - -static void sched_idle_schedule( - const struct scheduler *ops, struct sched_unit *unit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cpu = smp_processor_id(); - - unit->next_time = -1; - unit->next_task = sched_idle_unit(cpu); -} - -static struct scheduler sched_idle_ops = { - .name = "Idle Scheduler", - .opt_name = "idle", - .sched_data = NULL, - - .pick_resource = sched_idle_res_pick, - .do_schedule = sched_idle_schedule, - - .alloc_udata = sched_idle_alloc_udata, - .free_udata = sched_idle_free_udata, -}; - -static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit, - unsigned int cpu) -{ - unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu); - const struct domain *d = unit->domain; - - return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL; -} - -static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit, - unsigned int cpu) -{ - struct vcpu *v = unit2vcpu_cpu(unit, cpu); - - return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu]; -} - -static inline struct scheduler *dom_scheduler(const struct domain *d) -{ - if ( likely(d->cpupool != NULL) ) - return d->cpupool->sched; - - /* - * If d->cpupool is NULL, this is the idle domain. This is special - * because the idle domain does not really belong to any cpupool, and, - * hence, does not really have a scheduler. - * - * This is (should be!) only called like this for allocating the idle - * vCPUs for the first time, during boot, in which case what we want - * is the default scheduler that has been, choosen at boot. - */ - ASSERT(is_idle_domain(d)); - return &ops; -} - -static inline struct scheduler *unit_scheduler(const struct sched_unit *unit) -{ - struct domain *d = unit->domain; - - if ( likely(d->cpupool != NULL) ) - return d->cpupool->sched; - - /* - * If d->cpupool is NULL, this is a unit of the idle domain. 
And this - * case is special because the idle domain does not really belong to - * a cpupool and, hence, doesn't really have a scheduler). In fact, its - * units (may) run on pCPUs which are in different pools, with different - * schedulers. - * - * What we want, in this case, is the scheduler of the pCPU where this - * particular idle unit is running. And, since unit->res never changes - * for idle units, it is safe to use it, with no locks, to figure that out. - */ - - ASSERT(is_idle_domain(d)); - return unit->res->scheduler; -} - -static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) -{ - return unit_scheduler(v->sched_unit); -} -#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) - -static inline void trace_runstate_change(struct vcpu *v, int new_state) -{ - struct { uint32_t vcpu:16, domain:16; } d; - uint32_t event; - - if ( likely(!tb_init_done) ) - return; - - d.vcpu = v->vcpu_id; - d.domain = v->domain->domain_id; - - event = TRC_SCHED_RUNSTATE_CHANGE; - event |= ( v->runstate.state & 0x3 ) << 8; - event |= ( new_state & 0x3 ) << 4; - - __trace_var(event, 1/*tsc*/, sizeof(d), &d); -} - -static inline void trace_continue_running(struct vcpu *v) -{ - struct { uint32_t vcpu:16, domain:16; } d; - - if ( likely(!tb_init_done) ) - return; - - d.vcpu = v->vcpu_id; - d.domain = v->domain->domain_id; - - __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); -} - -static inline void vcpu_urgent_count_update(struct vcpu *v) -{ - if ( is_idle_vcpu(v) ) - return; - - if ( unlikely(v->is_urgent) ) - { - if ( !(v->pause_flags & VPF_blocked) || - !test_bit(v->vcpu_id, v->domain->poll_mask) ) - { - v->is_urgent = 0; - atomic_dec(&per_cpu(sched_urgent_count, v->processor)); - } - } - else - { - if ( unlikely(v->pause_flags & VPF_blocked) && - unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) - { - v->is_urgent = 1; - atomic_inc(&per_cpu(sched_urgent_count, v->processor)); - } - } -} - -static inline void vcpu_runstate_change( - struct vcpu *v, int new_state, s_time_t new_entry_time) -{ - s_time_t delta; - struct sched_unit *unit = v->sched_unit; - - ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); - if ( v->runstate.state == new_state ) - return; - - vcpu_urgent_count_update(v); - - trace_runstate_change(v, new_state); - - if ( !is_idle_vcpu(v) ) - { - unit->runstate_cnt[v->runstate.state]--; - unit->runstate_cnt[new_state]++; - } - - delta = new_entry_time - v->runstate.state_entry_time; - if ( delta > 0 ) - { - v->runstate.time[v->runstate.state] += delta; - v->runstate.state_entry_time = new_entry_time; - } - - v->runstate.state = new_state; -} - -void sched_guest_idle(void (*idle) (void), unsigned int cpu) -{ - /* - * Another vcpu of the unit is active in guest context while this one is - * idle. In case of a scheduling event we don't want to have high latencies - * due to a cpu needing to wake up from deep C state for joining the - * rendezvous, so avoid those deep C states by incrementing the urgent - * count of the cpu. - */ - atomic_inc(&per_cpu(sched_urgent_count, cpu)); - idle(); - atomic_dec(&per_cpu(sched_urgent_count, cpu)); -} - -void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) -{ - spinlock_t *lock; - s_time_t delta; - - rcu_read_lock(&sched_res_rculock); - - lock = likely(v == current) ? 
NULL : unit_schedule_lock_irq(v->sched_unit); - memcpy(runstate, &v->runstate, sizeof(*runstate)); - delta = NOW() - runstate->state_entry_time; - if ( delta > 0 ) - runstate->time[runstate->state] += delta; - - if ( unlikely(lock != NULL) ) - unit_schedule_unlock_irq(lock, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); -} - -uint64_t get_cpu_idle_time(unsigned int cpu) -{ - struct vcpu_runstate_info state = { 0 }; - struct vcpu *v = idle_vcpu[cpu]; - - if ( cpu_online(cpu) && v ) - vcpu_runstate_get(v, &state); - - return state.time[RUNSTATE_running]; -} - -/* - * If locks are different, take the one with the lower address first. - * This avoids dead- or live-locks when this code is running on both - * cpus at the same time. - */ -static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, - unsigned long *flags) -{ - if ( lock1 == lock2 ) - { - spin_lock_irqsave(lock1, *flags); - } - else if ( lock1 < lock2 ) - { - spin_lock_irqsave(lock1, *flags); - spin_lock(lock2); - } - else - { - spin_lock_irqsave(lock2, *flags); - spin_lock(lock1); - } -} - -static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, - unsigned long flags) -{ - if ( lock1 != lock2 ) - spin_unlock(lock2); - spin_unlock_irqrestore(lock1, flags); -} - -static void sched_free_unit_mem(struct sched_unit *unit) -{ - struct sched_unit *prev_unit; - struct domain *d = unit->domain; - - if ( d->sched_unit_list == unit ) - d->sched_unit_list = unit->next_in_list; - else - { - for_each_sched_unit ( d, prev_unit ) - { - if ( prev_unit->next_in_list == unit ) - { - prev_unit->next_in_list = unit->next_in_list; - break; - } - } - } - - free_cpumask_var(unit->cpu_hard_affinity); - free_cpumask_var(unit->cpu_hard_affinity_saved); - free_cpumask_var(unit->cpu_soft_affinity); - - xfree(unit); -} - -static void sched_free_unit(struct sched_unit *unit, struct vcpu *v) -{ - struct vcpu *vunit; - unsigned int cnt = 0; - - /* Don't count to be released vcpu, might be not in vcpu list yet. */ - for_each_sched_unit_vcpu ( unit, vunit ) - if ( vunit != v ) - cnt++; - - v->sched_unit = NULL; - unit->runstate_cnt[v->runstate.state]--; - - if ( unit->vcpu_list == v ) - unit->vcpu_list = v->next_in_list; - - if ( !cnt ) - sched_free_unit_mem(unit); -} - -static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v) -{ - v->sched_unit = unit; - - /* All but idle vcpus are allocated with sequential vcpu_id. */ - if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id ) - { - unit->vcpu_list = v; - /* - * unit_id is always the same as lowest vcpu_id of unit. - * This is used for stopping for_each_sched_unit_vcpu() loop and in - * order to support cpupools with different granularities. 
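sched_spin_lock_double() above avoids dead- and live-locks between two runqueue locks by always taking the lock with the lower address first, and taking the lock only once when both pointers are the same. The rule shown standalone, with plain pthread mutexes purely for illustration:

    /* Sketch of the address-ordered double locking used above. pthread
     * mutexes stand in for Xen's spinlocks; illustration only. */
    #include <pthread.h>

    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);        /* same lock: take it only once */
        else if ( a < b )
        {
            pthread_mutex_lock(a);        /* lower address always first   */
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a != b )
            pthread_mutex_unlock(b);
        pthread_mutex_unlock(a);
    }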
- */ - unit->unit_id = v->vcpu_id; - } - unit->runstate_cnt[v->runstate.state]++; -} - -static struct sched_unit *sched_alloc_unit_mem(void) -{ - struct sched_unit *unit; - - unit = xzalloc(struct sched_unit); - if ( !unit ) - return NULL; - - if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || - !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || - !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) - { - sched_free_unit_mem(unit); - unit = NULL; - } - - return unit; -} - -static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) -{ - struct sched_unit **prev_unit; - - unit->domain = d; - - for ( prev_unit = &d->sched_unit_list; *prev_unit; - prev_unit = &(*prev_unit)->next_in_list ) - if ( (*prev_unit)->next_in_list && - (*prev_unit)->next_in_list->unit_id > unit->unit_id ) - break; - - unit->next_in_list = *prev_unit; - *prev_unit = unit; -} - -static struct sched_unit *sched_alloc_unit(struct vcpu *v) -{ - struct sched_unit *unit; - struct domain *d = v->domain; - unsigned int gran = cpupool_get_granularity(d->cpupool); - - for_each_sched_unit ( d, unit ) - if ( unit->unit_id / gran == v->vcpu_id / gran ) - break; - - if ( unit ) - { - sched_unit_add_vcpu(unit, v); - return unit; - } - - if ( (unit = sched_alloc_unit_mem()) == NULL ) - return NULL; - - sched_unit_add_vcpu(unit, v); - sched_domain_insert_unit(unit, d); - - return unit; -} - -static unsigned int sched_select_initial_cpu(const struct vcpu *v) -{ - const struct domain *d = v->domain; - nodeid_t node; - spinlock_t *lock; - unsigned long flags; - unsigned int cpu_ret, cpu = smp_processor_id(); - cpumask_t *cpus = cpumask_scratch_cpu(cpu); - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - cpumask_clear(cpus); - for_each_node_mask ( node, d->node_affinity ) - cpumask_or(cpus, cpus, &node_to_cpumask(node)); - cpumask_and(cpus, cpus, d->cpupool->cpu_valid); - if ( cpumask_empty(cpus) ) - cpumask_copy(cpus, d->cpupool->cpu_valid); - - if ( v->vcpu_id == 0 ) - cpu_ret = cpumask_first(cpus); - else - { - /* We can rely on previous vcpu being available. */ - ASSERT(!is_idle_domain(d)); - - cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); - - return cpu_ret; -} - -int sched_init_vcpu(struct vcpu *v) -{ - struct domain *d = v->domain; - struct sched_unit *unit; - unsigned int processor; - - if ( (unit = sched_alloc_unit(v)) == NULL ) - return 1; - - if ( is_idle_domain(d) ) - processor = v->vcpu_id; - else - processor = sched_select_initial_cpu(v); - - /* Initialise the per-vcpu timers. */ - spin_lock_init(&v->periodic_timer_lock); - init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); - init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); - init_timer(&v->poll_timer, poll_timer_fn, v, processor); - - /* If this is not the first vcpu of the unit we are done. */ - if ( unit->priv != NULL ) - { - v->processor = processor; - return 0; - } - - rcu_read_lock(&sched_res_rculock); - - /* The first vcpu of an unit can be set via sched_set_res(). */ - sched_set_res(unit, get_sched_res(processor)); - - unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); - if ( unit->priv == NULL ) - { - sched_free_unit(unit, v); - rcu_read_unlock(&sched_res_rculock); - return 1; - } - - /* - * Initialize affinity settings. The idler, and potentially - * domain-0 VCPUs, are pinned onto their respective physical CPUs. 
- */ - if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) - sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); - else - sched_set_affinity(unit, &cpumask_all, &cpumask_all); - - /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ - if ( is_idle_domain(d) ) - { - get_sched_res(v->processor)->curr = unit; - get_sched_res(v->processor)->sched_unit_idle = unit; - v->is_running = 1; - unit->is_running = true; - unit->state_entry_time = NOW(); - } - else - { - sched_insert_unit(dom_scheduler(d), unit); - } - - rcu_read_unlock(&sched_res_rculock); - - return 0; -} - -static void vcpu_move_irqs(struct vcpu *v) -{ - arch_move_irqs(v); - evtchn_move_pirqs(v); -} - -static void sched_move_irqs(const struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - vcpu_move_irqs(v); -} - -int sched_move_domain(struct domain *d, struct cpupool *c) -{ - struct vcpu *v; - struct sched_unit *unit; - unsigned int new_p, unit_idx; - void **unit_priv; - void *domdata; - void *unitdata; - struct scheduler *old_ops; - void *old_domdata; - unsigned int gran = cpupool_get_granularity(c); - int ret = 0; - - for_each_vcpu ( d, v ) - { - if ( v->affinity_broken ) - return -EBUSY; - } - - rcu_read_lock(&sched_res_rculock); - - domdata = sched_alloc_domdata(c->sched, d); - if ( IS_ERR(domdata) ) - { - ret = PTR_ERR(domdata); - goto out; - } - - unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran)); - if ( unit_priv == NULL ) - { - sched_free_domdata(c->sched, domdata); - ret = -ENOMEM; - goto out; - } - - unit_idx = 0; - for_each_sched_unit ( d, unit ) - { - unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata); - if ( unit_priv[unit_idx] == NULL ) - { - for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ ) - sched_free_udata(c->sched, unit_priv[unit_idx]); - xfree(unit_priv); - sched_free_domdata(c->sched, domdata); - ret = -ENOMEM; - goto out; - } - unit_idx++; - } - - domain_pause(d); - - old_ops = dom_scheduler(d); - old_domdata = d->sched_priv; - - for_each_sched_unit ( d, unit ) - { - sched_remove_unit(old_ops, unit); - } - - d->cpupool = c; - d->sched_priv = domdata; - - new_p = cpumask_first(c->cpu_valid); - unit_idx = 0; - for_each_sched_unit ( d, unit ) - { - spinlock_t *lock; - unsigned int unit_p = new_p; - - unitdata = unit->priv; - - for_each_sched_unit_vcpu ( unit, v ) - { - migrate_timer(&v->periodic_timer, new_p); - migrate_timer(&v->singleshot_timer, new_p); - migrate_timer(&v->poll_timer, new_p); - new_p = cpumask_cycle(new_p, c->cpu_valid); - } - - lock = unit_schedule_lock_irq(unit); - - sched_set_affinity(unit, &cpumask_all, &cpumask_all); - - sched_set_res(unit, get_sched_res(unit_p)); - /* - * With v->processor modified we must not - * - make any further changes assuming we hold the scheduler lock, - * - use unit_schedule_unlock_irq(). 
- */ - spin_unlock_irq(lock); - - unit->priv = unit_priv[unit_idx]; - if ( !d->is_dying ) - sched_move_irqs(unit); - - sched_insert_unit(c->sched, unit); - - sched_free_udata(old_ops, unitdata); - - unit_idx++; - } - - domain_update_node_affinity(d); - - domain_unpause(d); - - sched_free_domdata(old_ops, old_domdata); - - xfree(unit_priv); - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -void sched_destroy_vcpu(struct vcpu *v) -{ - struct sched_unit *unit = v->sched_unit; - - kill_timer(&v->periodic_timer); - kill_timer(&v->singleshot_timer); - kill_timer(&v->poll_timer); - if ( test_and_clear_bool(v->is_urgent) ) - atomic_dec(&per_cpu(sched_urgent_count, v->processor)); - /* - * Vcpus are being destroyed top-down. So being the first vcpu of an unit - * is the same as being the only one. - */ - if ( unit->vcpu_list == v ) - { - rcu_read_lock(&sched_res_rculock); - - sched_remove_unit(vcpu_scheduler(v), unit); - sched_free_udata(vcpu_scheduler(v), unit->priv); - sched_free_unit(unit, v); - - rcu_read_unlock(&sched_res_rculock); - } -} - -int sched_init_domain(struct domain *d, int poolid) -{ - void *sdom; - int ret; - - ASSERT(d->cpupool == NULL); - ASSERT(d->domain_id < DOMID_FIRST_RESERVED); - - if ( (ret = cpupool_add_domain(d, poolid)) ) - return ret; - - SCHED_STAT_CRANK(dom_init); - TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); - - rcu_read_lock(&sched_res_rculock); - - sdom = sched_alloc_domdata(dom_scheduler(d), d); - - rcu_read_unlock(&sched_res_rculock); - - if ( IS_ERR(sdom) ) - return PTR_ERR(sdom); - - d->sched_priv = sdom; - - return 0; -} - -void sched_destroy_domain(struct domain *d) -{ - ASSERT(d->domain_id < DOMID_FIRST_RESERVED); - - if ( d->cpupool ) - { - SCHED_STAT_CRANK(dom_destroy); - TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); - - rcu_read_lock(&sched_res_rculock); - - sched_free_domdata(dom_scheduler(d), d->sched_priv); - d->sched_priv = NULL; - - rcu_read_unlock(&sched_res_rculock); - - cpupool_rm_domain(d); - } -} - -static void vcpu_sleep_nosync_locked(struct vcpu *v) -{ - struct sched_unit *unit = v->sched_unit; - - ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); - - if ( likely(!vcpu_runnable(v)) ) - { - if ( v->runstate.state == RUNSTATE_runnable ) - vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - - /* Only put unit to sleep in case all vcpus are not runnable. 
*/ - if ( likely(!unit_runnable(unit)) ) - sched_sleep(unit_scheduler(unit), unit); - else if ( unit_running(unit) > 1 && v->is_running && - !v->force_context_switch ) - { - v->force_context_switch = true; - cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); - } - } -} - -void vcpu_sleep_nosync(struct vcpu *v) -{ - unsigned long flags; - spinlock_t *lock; - - TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); - - vcpu_sleep_nosync_locked(v); - - unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); -} - -void vcpu_sleep_sync(struct vcpu *v) -{ - vcpu_sleep_nosync(v); - - while ( !vcpu_runnable(v) && v->is_running ) - cpu_relax(); - - sync_vcpu_execstate(v); -} - -void vcpu_wake(struct vcpu *v) -{ - unsigned long flags; - spinlock_t *lock; - struct sched_unit *unit = v->sched_unit; - - TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irqsave(unit, &flags); - - if ( likely(vcpu_runnable(v)) ) - { - if ( v->runstate.state >= RUNSTATE_blocked ) - vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); - /* - * Call sched_wake() unconditionally, even if unit is running already. - * We might have not been de-scheduled after vcpu_sleep_nosync_locked() - * and are now to be woken up again. - */ - sched_wake(unit_scheduler(unit), unit); - if ( unit->is_running && !v->is_running && !v->force_context_switch ) - { - v->force_context_switch = true; - cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); - } - } - else if ( !(v->pause_flags & VPF_blocked) ) - { - if ( v->runstate.state == RUNSTATE_blocked ) - vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - } - - unit_schedule_unlock_irqrestore(lock, flags, unit); - - rcu_read_unlock(&sched_res_rculock); -} - -void vcpu_unblock(struct vcpu *v) -{ - if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) - return; - - /* Polling period ends when a VCPU is unblocked. */ - if ( unlikely(v->poll_evtchn != 0) ) - { - v->poll_evtchn = 0; - /* - * We *must* re-clear _VPF_blocked to avoid racing other wakeups of - * this VCPU (and it then going back to sleep on poll_mask). - * Test-and-clear is idiomatic and ensures clear_bit not reordered. - */ - if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) - clear_bit(_VPF_blocked, &v->pause_flags); - } - - vcpu_wake(v); -} - -/* - * Do the actual movement of an unit from old to new CPU. Locks for *both* - * CPUs needs to have been taken already when calling this! - */ -static void sched_unit_move_locked(struct sched_unit *unit, - unsigned int new_cpu) -{ - unsigned int old_cpu = unit->res->master_cpu; - struct vcpu *v; - - rcu_read_lock(&sched_res_rculock); - - /* - * Transfer urgency status to new CPU before switching CPUs, as - * once the switch occurs, v->is_urgent is no longer protected by - * the per-CPU scheduler lock we are holding. - */ - for_each_sched_unit_vcpu ( unit, v ) - { - if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) - { - atomic_inc(&per_cpu(sched_urgent_count, new_cpu)); - atomic_dec(&per_cpu(sched_urgent_count, old_cpu)); - } - } - - /* - * Actual CPU switch to new CPU. This is safe because the lock - * pointer can't change while the current lock is held. 
- */ - sched_migrate(unit_scheduler(unit), unit, new_cpu); - - rcu_read_unlock(&sched_res_rculock); -} - -/* - * Initiating migration - * - * In order to migrate, we need the unit in question to have stopped - * running and have called sched_sleep() (to take it off any - * runqueues, for instance); and if it is currently running, it needs - * to be scheduled out. Finally, we need to hold the scheduling locks - * for both the processor we're migrating from, and the processor - * we're migrating to. - * - * In order to avoid deadlock while satisfying the final requirement, - * we must release any scheduling lock we hold, then try to grab both - * locks we want, then double-check to make sure that what we started - * to do hasn't been changed in the mean time. - * - * These steps are encapsulated in the following two functions; they - * should be called like this: - * - * lock = unit_schedule_lock_irq(unit); - * sched_unit_migrate_start(unit); - * unit_schedule_unlock_irq(lock, unit) - * sched_unit_migrate_finish(unit); - * - * sched_unit_migrate_finish() will do the work now if it can, or simply - * return if it can't (because unit is still running); in that case - * sched_unit_migrate_finish() will be called by unit_context_saved(). - */ -static void sched_unit_migrate_start(struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - { - set_bit(_VPF_migrating, &v->pause_flags); - vcpu_sleep_nosync_locked(v); - } -} - -static void sched_unit_migrate_finish(struct sched_unit *unit) -{ - unsigned long flags; - unsigned int old_cpu, new_cpu; - spinlock_t *old_lock, *new_lock; - bool_t pick_called = 0; - struct vcpu *v; - - /* - * If the unit is currently running, this will be handled by - * unit_context_saved(); and in any case, if the bit is cleared, then - * someone else has already done the work so we don't need to. - */ - if ( unit->is_running ) - return; - for_each_sched_unit_vcpu ( unit, v ) - if ( !test_bit(_VPF_migrating, &v->pause_flags) ) - return; - - old_cpu = new_cpu = unit->res->master_cpu; - for ( ; ; ) - { - /* - * We need another iteration if the pre-calculated lock addresses - * are not correct any longer after evaluating old and new cpu holding - * the locks. - */ - old_lock = get_sched_res(old_cpu)->schedule_lock; - new_lock = get_sched_res(new_cpu)->schedule_lock; - - sched_spin_lock_double(old_lock, new_lock, &flags); - - old_cpu = unit->res->master_cpu; - if ( old_lock == get_sched_res(old_cpu)->schedule_lock ) - { - /* - * If we selected a CPU on the previosu iteration, check if it - * remains suitable for running this vCPU. - */ - if ( pick_called && - (new_lock == get_sched_res(new_cpu)->schedule_lock) && - cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) && - cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) - break; - - /* Select a new CPU. */ - new_cpu = sched_pick_resource(unit_scheduler(unit), - unit)->master_cpu; - if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) && - cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) - break; - pick_called = 1; - } - else - { - /* - * We do not hold the scheduler lock appropriate for this vCPU. - * Thus we cannot select a new CPU on this iteration. Try again. - */ - pick_called = 0; - } - - sched_spin_unlock_double(old_lock, new_lock, flags); - } - - /* - * NB. Check of v->running happens /after/ setting migration flag - * because they both happen in (different) spinlock regions, and those - * regions are strictly serialised. 
- */ - if ( unit->is_running ) - { - sched_spin_unlock_double(old_lock, new_lock, flags); - return; - } - for_each_sched_unit_vcpu ( unit, v ) - { - if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) - { - sched_spin_unlock_double(old_lock, new_lock, flags); - return; - } - } - - sched_unit_move_locked(unit, new_cpu); - - sched_spin_unlock_double(old_lock, new_lock, flags); - - if ( old_cpu != new_cpu ) - { - /* Vcpus are moved to other pcpus, commit their states to memory. */ - for_each_sched_unit_vcpu ( unit, v ) - sync_vcpu_execstate(v); - sched_move_irqs(unit); - } - - /* Wake on new CPU. */ - for_each_sched_unit_vcpu ( unit, v ) - vcpu_wake(v); -} - -static bool sched_check_affinity_broken(const struct sched_unit *unit) -{ - const struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - if ( v->affinity_broken ) - return true; - - return false; -} - -static void sched_reset_affinity_broken(struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - v->affinity_broken = false; -} - -void restore_vcpu_affinity(struct domain *d) -{ - unsigned int cpu = smp_processor_id(); - struct sched_unit *unit; - - ASSERT(system_state == SYS_STATE_resume); - - rcu_read_lock(&sched_res_rculock); - - for_each_sched_unit ( d, unit ) - { - spinlock_t *lock; - unsigned int old_cpu = sched_unit_master(unit); - struct sched_resource *res; - - ASSERT(!unit_runnable(unit)); - - /* - * Re-assign the initial processor as after resume we have no - * guarantee the old processor has come back to life again. - * - * Therefore, here, before actually unpausing the domains, we should - * set v->processor of each of their vCPUs to something that will - * make sense for the scheduler of the cpupool in which they are in. - */ - lock = unit_schedule_lock_irq(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - { - if ( sched_check_affinity_broken(unit) ) - { - sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); - sched_reset_affinity_broken(unit); - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - } - - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - { - /* Affinity settings of one vcpu are for the complete unit. */ - printk(XENLOG_DEBUG "Breaking affinity for %pv\n", - unit->vcpu_list); - sched_set_affinity(unit, &cpumask_all, NULL); - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - } - } - - res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); - sched_set_res(unit, res); - - spin_unlock_irq(lock); - - /* v->processor might have changed, so reacquire the lock. */ - lock = unit_schedule_lock_irq(unit); - res = sched_pick_resource(unit_scheduler(unit), unit); - sched_set_res(unit, res); - spin_unlock_irq(lock); - - if ( old_cpu != sched_unit_master(unit) ) - sched_move_irqs(unit); - } - - rcu_read_unlock(&sched_res_rculock); - - domain_update_node_affinity(d); -} - -/* - * This function is used by cpu_hotplug code via cpu notifier chain - * and from cpupools to switch schedulers on a cpu. - * Caller must get domlist_read_lock. 
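- * A typical caller therefore looks like sched_rm_cpu() further down:
- *
- *     rcu_read_lock(&domlist_read_lock);
- *     rc = cpu_disable_scheduler(cpu);
- *     rcu_read_unlock(&domlist_read_lock);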
- */ -int cpu_disable_scheduler(unsigned int cpu) -{ - struct domain *d; - struct cpupool *c; - cpumask_t online_affinity; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - c = get_sched_res(cpu)->cpupool; - if ( c == NULL ) - goto out; - - for_each_domain_in_cpupool ( d, c ) - { - struct sched_unit *unit; - - for_each_sched_unit ( d, unit ) - { - unsigned long flags; - spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags); - - cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid); - if ( cpumask_empty(&online_affinity) && - cpumask_test_cpu(cpu, unit->cpu_hard_affinity) ) - { - if ( sched_check_affinity_broken(unit) ) - { - /* The unit is temporarily pinned, can't move it. */ - unit_schedule_unlock_irqrestore(lock, flags, unit); - ret = -EADDRINUSE; - break; - } - - printk(XENLOG_DEBUG "Breaking affinity for %pv\n", - unit->vcpu_list); - - sched_set_affinity(unit, &cpumask_all, NULL); - } - - if ( unit->res != get_sched_res(cpu) ) - { - /* The unit is not on this cpu, so we can move on. */ - unit_schedule_unlock_irqrestore(lock, flags, unit); - continue; - } - - /* If it is on this cpu, we must send it away. - * We are doing some cpupool manipulations: - * * we want to call the scheduler, and let it re-evaluation - * the placement of the vcpu, taking into account the new - * cpupool configuration; - * * the scheduler will always find a suitable solution, or - * things would have failed before getting in here. - */ - sched_unit_migrate_start(unit); - unit_schedule_unlock_irqrestore(lock, flags, unit); - sched_unit_migrate_finish(unit); - - /* - * The only caveat, in this case, is that if a vcpu active in - * the hypervisor isn't migratable. In this case, the caller - * should try again after releasing and reaquiring all locks. - */ - if ( unit->res == get_sched_res(cpu) ) - ret = -EAGAIN; - } - } - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -static int cpu_disable_scheduler_check(unsigned int cpu) -{ - struct domain *d; - struct vcpu *v; - struct cpupool *c; - - c = get_sched_res(cpu)->cpupool; - if ( c == NULL ) - return 0; - - for_each_domain_in_cpupool ( d, c ) - for_each_vcpu ( d, v ) - if ( v->affinity_broken ) - return -EADDRINUSE; - - return 0; -} - -/* - * In general, this must be called with the scheduler lock held, because the - * adjust_affinity hook may want to modify the vCPU state. However, when the - * vCPU is being initialized (either for dom0 or domU) there is no risk of - * races, and it's fine to not take the look (we're talking about - * sched_setup_dom0_vcpus() an sched_init_vcpu()). 
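- * The locked pattern, as used e.g. by vcpu_set_affinity(), is roughly:
- *
- *     lock = unit_schedule_lock_irq(unit);
- *     sched_set_affinity(unit, hard, soft);
- *     unit_schedule_unlock_irq(lock, unit);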
- */ -static void sched_set_affinity( - struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) -{ - rcu_read_lock(&sched_res_rculock); - sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft); - rcu_read_unlock(&sched_res_rculock); - - if ( hard ) - cpumask_copy(unit->cpu_hard_affinity, hard); - if ( soft ) - cpumask_copy(unit->cpu_soft_affinity, soft); - - unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity, - unit->cpu_soft_affinity) && - cpumask_intersects(unit->cpu_soft_affinity, - unit->cpu_hard_affinity); -} - -static int vcpu_set_affinity( - struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which) -{ - struct sched_unit *unit = v->sched_unit; - spinlock_t *lock; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(unit); - - if ( v->affinity_broken ) - ret = -EBUSY; - else - { - /* - * Tell the scheduler we changes something about affinity, - * and ask to re-evaluate vcpu placement. - */ - if ( which == unit->cpu_hard_affinity ) - { - sched_set_affinity(unit, affinity, NULL); - } - else - { - ASSERT(which == unit->cpu_soft_affinity); - sched_set_affinity(unit, NULL, affinity); - } - sched_unit_migrate_start(unit); - } - - unit_schedule_unlock_irq(lock, unit); - - domain_update_node_affinity(v->domain); - - sched_unit_migrate_finish(unit); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity) -{ - cpumask_t online_affinity; - cpumask_t *online; - - online = VCPU2ONLINE(v); - cpumask_and(&online_affinity, affinity, online); - if ( cpumask_empty(&online_affinity) ) - return -EINVAL; - - return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity); -} - -int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity) -{ - return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity); -} - -/* Block the currently-executing domain until a pertinent event occurs. */ -void vcpu_block(void) -{ - struct vcpu *v = current; - - set_bit(_VPF_blocked, &v->pause_flags); - - arch_vcpu_block(v); - - /* Check for events /after/ blocking: avoids wakeup waiting race. */ - if ( local_events_need_delivery() ) - { - clear_bit(_VPF_blocked, &v->pause_flags); - } - else - { - TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - } -} - -static void vcpu_block_enable_events(void) -{ - local_event_delivery_enable(); - vcpu_block(); -} - -static long do_poll(struct sched_poll *sched_poll) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - evtchn_port_t port = 0; - long rc; - unsigned int i; - - /* Fairly arbitrary limit. */ - if ( sched_poll->nr_ports > 128 ) - return -EINVAL; - - if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) - return -EFAULT; - - set_bit(_VPF_blocked, &v->pause_flags); - v->poll_evtchn = -1; - set_bit(v->vcpu_id, d->poll_mask); - - arch_vcpu_block(v); - -#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ - /* Check for events /after/ setting flags: avoids wakeup waiting race. */ - smp_mb(); - - /* - * Someone may have seen we are blocked but not that we are polling, or - * vice versa. We are certainly being woken, so clean up and bail. Beyond - * this point others can be guaranteed to clean up for us if they wake us. 
- */ - rc = 0; - if ( (v->poll_evtchn == 0) || - !test_bit(_VPF_blocked, &v->pause_flags) || - !test_bit(v->vcpu_id, d->poll_mask) ) - goto out; -#endif - - rc = 0; - if ( local_events_need_delivery() ) - goto out; - - for ( i = 0; i < sched_poll->nr_ports; i++ ) - { - rc = -EFAULT; - if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) - goto out; - - rc = -EINVAL; - if ( port >= d->max_evtchns ) - goto out; - - rc = 0; - if ( evtchn_port_is_pending(d, port) ) - goto out; - } - - if ( sched_poll->nr_ports == 1 ) - v->poll_evtchn = port; - - if ( sched_poll->timeout != 0 ) - set_timer(&v->poll_timer, sched_poll->timeout); - - TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - - return 0; - - out: - v->poll_evtchn = 0; - clear_bit(v->vcpu_id, d->poll_mask); - clear_bit(_VPF_blocked, &v->pause_flags); - return rc; -} - -/* Voluntarily yield the processor for this allocation. */ -long vcpu_yield(void) -{ - struct vcpu * v=current; - spinlock_t *lock; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(v->sched_unit); - sched_yield(vcpu_scheduler(v), v->sched_unit); - unit_schedule_unlock_irq(lock, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); - - SCHED_STAT_CRANK(vcpu_yield); - - TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - return 0; -} - -static void domain_watchdog_timeout(void *data) -{ - struct domain *d = data; - - if ( d->is_shutting_down || d->is_dying ) - return; - - printk("Watchdog timer fired for domain %u\n", d->domain_id); - domain_shutdown(d, SHUTDOWN_watchdog); -} - -static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) -{ - if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) - return -EINVAL; - - spin_lock(&d->watchdog_lock); - - if ( id == 0 ) - { - for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) - { - if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) - continue; - set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); - break; - } - spin_unlock(&d->watchdog_lock); - return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1; - } - - id -= 1; - if ( !test_bit(id, &d->watchdog_inuse_map) ) - { - spin_unlock(&d->watchdog_lock); - return -EINVAL; - } - - if ( timeout == 0 ) - { - stop_timer(&d->watchdog_timer[id]); - clear_bit(id, &d->watchdog_inuse_map); - } - else - { - set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); - } - - spin_unlock(&d->watchdog_lock); - return 0; -} - -void watchdog_domain_init(struct domain *d) -{ - unsigned int i; - - spin_lock_init(&d->watchdog_lock); - - d->watchdog_inuse_map = 0; - - for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) - init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); -} - -void watchdog_domain_destroy(struct domain *d) -{ - unsigned int i; - - for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) - kill_timer(&d->watchdog_timer[i]); -} - -/* - * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if - * cpu is NR_CPUS). - * Temporary pinning can be done due to two reasons, which may be nested: - * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case - * of a conflict (e.g. in case cpupool doesn't include requested CPU, or - * another conflicting temporary pinning is already in effect. - * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the - * CPU it is just running on. Can't fail if used properly. 
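- * Pinning and restoring therefore come in pairs, roughly:
- *
- *     vcpu_temporary_affinity(v, smp_processor_id(), VCPU_AFFINITY_WAIT);
- *     ...
- *     vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_WAIT);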
- */ -int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason) -{ - struct sched_unit *unit = v->sched_unit; - spinlock_t *lock; - int ret = -EINVAL; - bool migrate; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(unit); - - if ( cpu == NR_CPUS ) - { - if ( v->affinity_broken & reason ) - { - ret = 0; - v->affinity_broken &= ~reason; - } - if ( !ret && !sched_check_affinity_broken(unit) ) - sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); - } - else if ( cpu < nr_cpu_ids ) - { - if ( (v->affinity_broken & reason) || - (sched_check_affinity_broken(unit) && v->processor != cpu) ) - ret = -EBUSY; - else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) ) - { - if ( !sched_check_affinity_broken(unit) ) - { - cpumask_copy(unit->cpu_hard_affinity_saved, - unit->cpu_hard_affinity); - sched_set_affinity(unit, cpumask_of(cpu), NULL); - } - v->affinity_broken |= reason; - ret = 0; - } - } - - migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity); - if ( migrate ) - sched_unit_migrate_start(unit); - - unit_schedule_unlock_irq(lock, unit); - - if ( migrate ) - sched_unit_migrate_finish(unit); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -typedef long ret_t; - -#endif /* !COMPAT */ - -ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) -{ - ret_t ret = 0; - - switch ( cmd ) - { - case SCHEDOP_yield: - { - ret = vcpu_yield(); - break; - } - - case SCHEDOP_block: - { - vcpu_block_enable_events(); - break; - } - - case SCHEDOP_shutdown: - { - struct sched_shutdown sched_shutdown; - - ret = -EFAULT; - if ( copy_from_guest(&sched_shutdown, arg, 1) ) - break; - - TRACE_3D(TRC_SCHED_SHUTDOWN, - current->domain->domain_id, current->vcpu_id, - sched_shutdown.reason); - ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); - - break; - } - - case SCHEDOP_shutdown_code: - { - struct sched_shutdown sched_shutdown; - struct domain *d = current->domain; - - ret = -EFAULT; - if ( copy_from_guest(&sched_shutdown, arg, 1) ) - break; - - TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, - d->domain_id, current->vcpu_id, sched_shutdown.reason); - - spin_lock(&d->shutdown_lock); - if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) - d->shutdown_code = (u8)sched_shutdown.reason; - spin_unlock(&d->shutdown_lock); - - ret = 0; - break; - } - - case SCHEDOP_poll: - { - struct sched_poll sched_poll; - - ret = -EFAULT; - if ( copy_from_guest(&sched_poll, arg, 1) ) - break; - - ret = do_poll(&sched_poll); - - break; - } - - case SCHEDOP_remote_shutdown: - { - struct domain *d; - struct sched_remote_shutdown sched_remote_shutdown; - - ret = -EFAULT; - if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) - break; - - ret = -ESRCH; - d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); - if ( d == NULL ) - break; - - ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); - if ( likely(!ret) ) - domain_shutdown(d, sched_remote_shutdown.reason); - - rcu_unlock_domain(d); - - break; - } - - case SCHEDOP_watchdog: - { - struct sched_watchdog sched_watchdog; - - ret = -EFAULT; - if ( copy_from_guest(&sched_watchdog, arg, 1) ) - break; - - ret = domain_watchdog( - current->domain, sched_watchdog.id, sched_watchdog.timeout); - break; - } - - case SCHEDOP_pin_override: - { - struct sched_pin_override sched_pin_override; - unsigned int cpu; - - ret = -EPERM; - if ( !is_hardware_domain(current->domain) ) - break; - - ret = -EFAULT; - if ( copy_from_guest(&sched_pin_override, arg, 1) ) - break; - - ret = -EINVAL; - if ( 
sched_pin_override.pcpu >= NR_CPUS ) - break; - - cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu; - ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE); - - break; - } - - default: - ret = -ENOSYS; - } - - return ret; -} - -#ifndef COMPAT - -/* Per-vcpu oneshot-timer hypercall. */ -long do_set_timer_op(s_time_t timeout) -{ - struct vcpu *v = current; - s_time_t offset = timeout - NOW(); - - if ( timeout == 0 ) - { - stop_timer(&v->singleshot_timer); - } - else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ - unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) - { - /* - * Linux workaround: occasionally we will see timeouts a long way in - * the future due to wrapping in Linux's jiffy time handling. We check - * for timeouts wrapped negative, and for positive timeouts more than - * about 13 days in the future (2^50ns). The correct fix is to trigger - * an interrupt immediately (since Linux in fact has pending work to - * do in this situation). However, older guests also set a long timeout - * when they have *no* pending timers at all: setting an immediate - * timeout in this case can burn a lot of CPU. We therefore go for a - * reasonable middleground of triggering a timer event in 100ms. - */ - gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", - timeout); - set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); - } - else - { - migrate_timer(&v->singleshot_timer, smp_processor_id()); - set_timer(&v->singleshot_timer, timeout); - } - - return 0; -} - -/* sched_id - fetch ID of current scheduler */ -int sched_id(void) -{ - return ops.sched_id; -} - -/* Adjust scheduling parameter for a given domain. */ -long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) -{ - long ret; - - ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); - if ( ret ) - return ret; - - if ( op->sched_id != dom_scheduler(d)->sched_id ) - return -EINVAL; - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_putinfo: - case XEN_DOMCTL_SCHEDOP_getinfo: - case XEN_DOMCTL_SCHEDOP_putvcpuinfo: - case XEN_DOMCTL_SCHEDOP_getvcpuinfo: - break; - default: - return -EINVAL; - } - - /* NB: the pluggable scheduler code needs to take care - * of locking by itself. */ - rcu_read_lock(&sched_res_rculock); - - if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 ) - TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -long sched_adjust_global(struct xen_sysctl_scheduler_op *op) -{ - struct cpupool *pool; - int rc; - - rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); - if ( rc ) - return rc; - - if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && - (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) - return -EINVAL; - - pool = cpupool_get_by_id(op->cpupool_id); - if ( pool == NULL ) - return -ESRCH; - - rcu_read_lock(&sched_res_rculock); - - rc = ((op->sched_id == pool->sched->sched_id) - ? 
sched_adjust_cpupool(pool->sched, op) : -EINVAL); - - rcu_read_unlock(&sched_res_rculock); - - cpupool_put(pool); - - return rc; -} - -static void vcpu_periodic_timer_work_locked(struct vcpu *v) -{ - s_time_t now; - s_time_t periodic_next_event; - - now = NOW(); - periodic_next_event = v->periodic_last_event + v->periodic_period; - - if ( now >= periodic_next_event ) - { - send_timer_event(v); - v->periodic_last_event = now; - periodic_next_event = now + v->periodic_period; - } - - migrate_timer(&v->periodic_timer, v->processor); - set_timer(&v->periodic_timer, periodic_next_event); -} - -static void vcpu_periodic_timer_work(struct vcpu *v) -{ - if ( v->periodic_period == 0 ) - return; - - spin_lock(&v->periodic_timer_lock); - if ( v->periodic_period ) - vcpu_periodic_timer_work_locked(v); - spin_unlock(&v->periodic_timer_lock); -} - -/* - * Set the periodic timer of a vcpu. - */ -void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value) -{ - spin_lock(&v->periodic_timer_lock); - - stop_timer(&v->periodic_timer); - - v->periodic_period = value; - if ( value ) - vcpu_periodic_timer_work_locked(v); - - spin_unlock(&v->periodic_timer_lock); -} - -static void sched_switch_units(struct sched_resource *sr, - struct sched_unit *next, struct sched_unit *prev, - s_time_t now) -{ - unsigned int cpu; - - ASSERT(unit_running(prev)); - - if ( prev != next ) - { - sr->curr = next; - sr->prev = prev; - - TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, - prev->unit_id, now - prev->state_entry_time); - TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, - next->unit_id, - (next->vcpu_list->runstate.state == RUNSTATE_runnable) ? - (now - next->state_entry_time) : 0, prev->next_time); - TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id, - next->domain->domain_id, next->unit_id); - - ASSERT(!unit_running(next)); - - /* - * NB. Don't add any trace records from here until the actual context - * switch, else lost_records resume will not work properly. - */ - - ASSERT(!next->is_running); - next->is_running = true; - next->state_entry_time = now; - - if ( is_idle_unit(prev) ) - { - prev->runstate_cnt[RUNSTATE_running] = 0; - prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity; - } - if ( is_idle_unit(next) ) - { - next->runstate_cnt[RUNSTATE_running] = sr->granularity; - next->runstate_cnt[RUNSTATE_runnable] = 0; - } - } - - for_each_cpu ( cpu, sr->cpus ) - { - struct vcpu *vprev = get_cpu_current(cpu); - struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu); - - if ( vprev != vnext || vprev->runstate.state != vnext->new_state ) - { - vcpu_runstate_change(vprev, - ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : - (vcpu_runnable(vprev) ? 
RUNSTATE_runnable : RUNSTATE_offline)), - now); - vcpu_runstate_change(vnext, vnext->new_state, now); - } - - vnext->is_running = 1; - - if ( is_idle_vcpu(vnext) ) - vnext->sched_unit = next; - } -} - -static bool sched_tasklet_check_cpu(unsigned int cpu) -{ - unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu); - - switch ( *tasklet_work ) - { - case TASKLET_enqueued: - set_bit(_TASKLET_scheduled, tasklet_work); - /* fallthrough */ - case TASKLET_enqueued|TASKLET_scheduled: - return true; - break; - case TASKLET_scheduled: - clear_bit(_TASKLET_scheduled, tasklet_work); - /* fallthrough */ - case 0: - /* return false; */ - break; - default: - BUG(); - } - - return false; -} - -static bool sched_tasklet_check(unsigned int cpu) -{ - bool tasklet_work_scheduled = false; - const cpumask_t *mask = get_sched_res(cpu)->cpus; - unsigned int cpu_iter; - - for_each_cpu ( cpu_iter, mask ) - if ( sched_tasklet_check_cpu(cpu_iter) ) - tasklet_work_scheduled = true; - - return tasklet_work_scheduled; -} - -static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now, - unsigned int cpu) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct scheduler *sched = sr->scheduler; - struct sched_unit *next; - - /* get policy-specific decision on scheduling... */ - sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu)); - - next = prev->next_task; - - if ( prev->next_time >= 0 ) /* -ve means no limit */ - set_timer(&sr->s_timer, now + prev->next_time); - - sched_switch_units(sr, next, prev, now); - - return next; -} - -static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext) -{ - /* Clear running flag /after/ writing context to memory. */ - smp_wmb(); - - if ( vprev != vnext ) - vprev->is_running = 0; -} - -static void unit_context_saved(struct sched_resource *sr) -{ - struct sched_unit *unit = sr->prev; - - if ( !unit ) - return; - - unit->is_running = false; - unit->state_entry_time = NOW(); - sr->prev = NULL; - - /* Check for migration request /after/ clearing running flag. */ - smp_mb(); - - sched_context_saved(unit_scheduler(unit), unit); - - /* Idle never migrates and idle vcpus might belong to other units. */ - if ( !is_idle_unit(unit) ) - sched_unit_migrate_finish(unit); -} - -/* - * Rendezvous on end of context switch. - * As no lock is protecting this rendezvous function we need to use atomic - * access functions on the counter. - * The counter will be 0 in case no rendezvous is needed. For the rendezvous - * case it is initialised to the number of cpus to rendezvous plus 1. Each - * member entering decrements the counter. The last one will decrement it to - * 1 and perform the final needed action in that case (call of - * unit_context_saved()), and then set the counter to zero. The other members - * will wait until the counter becomes zero until they proceed. - */ -void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext) -{ - struct sched_unit *next = vnext->sched_unit; - struct sched_resource *sr; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(smp_processor_id()); - - if ( atomic_read(&next->rendezvous_out_cnt) ) - { - int cnt = atomic_dec_return(&next->rendezvous_out_cnt); - - vcpu_context_saved(vprev, vnext); - - /* Call unit_context_saved() before releasing other waiters. 
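-      * E.g. for two cpus the counter starts at 3: the first cpu through
-      * here decrements it to 2 and spins, the second decrements it to 1,
-      * calls unit_context_saved() and then zeroes the counter, releasing
-      * the first.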
*/ - if ( cnt == 1 ) - { - unit_context_saved(sr); - atomic_set(&next->rendezvous_out_cnt, 0); - } - else - while ( atomic_read(&next->rendezvous_out_cnt) ) - cpu_relax(); - } - else - { - vcpu_context_saved(vprev, vnext); - if ( sr->granularity == 1 ) - unit_context_saved(sr); - } - - if ( is_idle_vcpu(vprev) && vprev != vnext ) - vprev->sched_unit = sr->sched_unit_idle; - - rcu_read_unlock(&sched_res_rculock); -} - -static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext, - bool reset_idle_unit, s_time_t now) -{ - if ( unlikely(vprev == vnext) ) - { - TRACE_4D(TRC_SCHED_SWITCH_INFCONT, - vnext->domain->domain_id, vnext->sched_unit->unit_id, - now - vprev->runstate.state_entry_time, - vprev->sched_unit->next_time); - sched_context_switched(vprev, vnext); - - /* - * We are switching from a non-idle to an idle unit. - * A vcpu of the idle unit might have been running before due to - * the guest vcpu being blocked. We must adjust the unit of the idle - * vcpu which might have been set to the guest's one. - */ - if ( reset_idle_unit ) - vnext->sched_unit = - get_sched_res(smp_processor_id())->sched_unit_idle; - - rcu_read_unlock(&sched_res_rculock); - - trace_continue_running(vnext); - return continue_running(vprev); - } - - SCHED_STAT_CRANK(sched_ctx); - - stop_timer(&vprev->periodic_timer); - - if ( vnext->sched_unit->migrated ) - vcpu_move_irqs(vnext); - - vcpu_periodic_timer_work(vnext); - - rcu_read_unlock(&sched_res_rculock); - - context_switch(vprev, vnext); -} - -/* - * Force a context switch of a single vcpu of an unit. - * Might be called either if a vcpu of an already running unit is woken up - * or if a vcpu of a running unit is put asleep with other vcpus of the same - * unit still running. - * Returns either NULL if v is already in the correct state or the vcpu to - * run next. - */ -static struct vcpu *sched_force_context_switch(struct vcpu *vprev, - struct vcpu *v, - unsigned int cpu, s_time_t now) -{ - v->force_context_switch = false; - - if ( vcpu_runnable(v) == v->is_running ) - return NULL; - - if ( vcpu_runnable(v) ) - { - if ( is_idle_vcpu(vprev) ) - { - vcpu_runstate_change(vprev, RUNSTATE_runnable, now); - vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; - } - vcpu_runstate_change(v, RUNSTATE_running, now); - } - else - { - /* Make sure not to switch last vcpu of an unit away. */ - if ( unit_running(v->sched_unit) == 1 ) - return NULL; - - v->new_state = vcpu_runstate_blocked(v); - vcpu_runstate_change(v, v->new_state, now); - v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu); - if ( v != vprev ) - { - if ( is_idle_vcpu(vprev) ) - { - vcpu_runstate_change(vprev, RUNSTATE_runnable, now); - vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; - } - else - { - v->sched_unit = vprev->sched_unit; - vcpu_runstate_change(v, RUNSTATE_running, now); - } - } - } - - /* This vcpu will be switched to. */ - v->is_running = true; - - /* Make sure not to loose another slave call. */ - raise_softirq(SCHED_SLAVE_SOFTIRQ); - - return v; -} - -/* - * Rendezvous before taking a scheduling decision. - * Called with schedule lock held, so all accesses to the rendezvous counter - * can be normal ones (no atomic accesses needed). - * The counter is initialized to the number of cpus to rendezvous initially. - * Each cpu entering will decrement the counter. In case the counter becomes - * zero do_schedule() is called and the rendezvous counter for leaving - * context_switch() is set. 
All other members will wait until the counter is - * becoming zero, dropping the schedule lock in between. - */ -static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev, - spinlock_t **lock, int cpu, - s_time_t now) -{ - struct sched_unit *next; - struct vcpu *v; - unsigned int gran = get_sched_res(cpu)->granularity; - - if ( !--prev->rendezvous_in_cnt ) - { - next = do_schedule(prev, now, cpu); - atomic_set(&next->rendezvous_out_cnt, gran + 1); - return next; - } - - v = unit2vcpu_cpu(prev, cpu); - while ( prev->rendezvous_in_cnt ) - { - if ( v && v->force_context_switch ) - { - struct vcpu *vprev = current; - - v = sched_force_context_switch(vprev, v, cpu, now); - - if ( v ) - { - /* We'll come back another time, so adjust rendezvous_in_cnt. */ - prev->rendezvous_in_cnt++; - atomic_set(&prev->rendezvous_out_cnt, 0); - - pcpu_schedule_unlock_irq(*lock, cpu); - - sched_context_switch(vprev, v, false, now); - - return NULL; /* ARM only. */ - } - - v = unit2vcpu_cpu(prev, cpu); - } - /* - * Coming from idle might need to do tasklet work. - * In order to avoid deadlocks we can't do that here, but have to - * continue the idle loop. - * Undo the rendezvous_in_cnt decrement and schedule another call of - * sched_slave(). - */ - if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) ) - { - struct vcpu *vprev = current; - - prev->rendezvous_in_cnt++; - atomic_set(&prev->rendezvous_out_cnt, 0); - - pcpu_schedule_unlock_irq(*lock, cpu); - - raise_softirq(SCHED_SLAVE_SOFTIRQ); - sched_context_switch(vprev, vprev, false, now); - - return NULL; /* ARM only. */ - } - - pcpu_schedule_unlock_irq(*lock, cpu); - - cpu_relax(); - - *lock = pcpu_schedule_lock_irq(cpu); - - if ( unlikely(!scheduler_active) ) - { - ASSERT(is_idle_unit(prev)); - atomic_set(&prev->next_task->rendezvous_out_cnt, 0); - prev->rendezvous_in_cnt = 0; - } - } - - return prev->next_task; -} - -static void sched_slave(void) -{ - struct vcpu *v, *vprev = current; - struct sched_unit *prev = vprev->sched_unit, *next; - s_time_t now; - spinlock_t *lock; - bool do_softirq = false; - unsigned int cpu = smp_processor_id(); - - ASSERT_NOT_IN_ATOMIC(); - - rcu_read_lock(&sched_res_rculock); - - lock = pcpu_schedule_lock_irq(cpu); - - now = NOW(); - - v = unit2vcpu_cpu(prev, cpu); - if ( v && v->force_context_switch ) - { - v = sched_force_context_switch(vprev, v, cpu, now); - - if ( v ) - { - pcpu_schedule_unlock_irq(lock, cpu); - - sched_context_switch(vprev, v, false, now); - - return; - } - - do_softirq = true; - } - - if ( !prev->rendezvous_in_cnt ) - { - pcpu_schedule_unlock_irq(lock, cpu); - - rcu_read_unlock(&sched_res_rculock); - - /* Check for failed forced context switch. */ - if ( do_softirq ) - raise_softirq(SCHEDULE_SOFTIRQ); - - return; - } - - stop_timer(&get_sched_res(cpu)->s_timer); - - next = sched_wait_rendezvous_in(prev, &lock, cpu, now); - if ( !next ) - return; - - pcpu_schedule_unlock_irq(lock, cpu); - - sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), - is_idle_unit(next) && !is_idle_unit(prev), now); -} - -/* - * The main function - * - deschedule the current domain (scheduler independent). - * - pick a new domain (scheduler dependent). 
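- * Roughly, each invocation on a cpu does:
- *
- *     lock = pcpu_schedule_lock_irq(cpu);
- *     stop_timer(&sr->s_timer);
- *     next = do_schedule(prev, now, cpu);   (or via sched_wait_rendezvous_in())
- *     pcpu_schedule_unlock_irq(lock, cpu);
- *     sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), ..., now);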
- */ -static void schedule(void) -{ - struct vcpu *vnext, *vprev = current; - struct sched_unit *prev = vprev->sched_unit, *next = NULL; - s_time_t now; - struct sched_resource *sr; - spinlock_t *lock; - int cpu = smp_processor_id(); - unsigned int gran; - - ASSERT_NOT_IN_ATOMIC(); - - SCHED_STAT_CRANK(sched_run); - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - gran = sr->granularity; - - lock = pcpu_schedule_lock_irq(cpu); - - if ( prev->rendezvous_in_cnt ) - { - /* - * We have a race: sched_slave() should be called, so raise a softirq - * in order to re-enter schedule() later and call sched_slave() now. - */ - pcpu_schedule_unlock_irq(lock, cpu); - - rcu_read_unlock(&sched_res_rculock); - - raise_softirq(SCHEDULE_SOFTIRQ); - return sched_slave(); - } - - stop_timer(&sr->s_timer); - - now = NOW(); - - if ( gran > 1 ) - { - cpumask_t mask; - - prev->rendezvous_in_cnt = gran; - cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu)); - cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ); - next = sched_wait_rendezvous_in(prev, &lock, cpu, now); - if ( !next ) - return; - } - else - { - prev->rendezvous_in_cnt = 0; - next = do_schedule(prev, now, cpu); - atomic_set(&next->rendezvous_out_cnt, 0); - } - - pcpu_schedule_unlock_irq(lock, cpu); - - vnext = sched_unit2vcpu_cpu(next, cpu); - sched_context_switch(vprev, vnext, - !is_idle_unit(prev) && is_idle_unit(next), now); -} - -/* The scheduler timer: force a run through the scheduler */ -static void s_timer_fn(void *unused) -{ - raise_softirq(SCHEDULE_SOFTIRQ); - SCHED_STAT_CRANK(sched_irq); -} - -/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ -static void vcpu_periodic_timer_fn(void *data) -{ - struct vcpu *v = data; - vcpu_periodic_timer_work(v); -} - -/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ -static void vcpu_singleshot_timer_fn(void *data) -{ - struct vcpu *v = data; - send_timer_event(v); -} - -/* SCHEDOP_poll timeout callback. */ -static void poll_timer_fn(void *data) -{ - struct vcpu *v = data; - - if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) - vcpu_unblock(v); -} - -static struct sched_resource *sched_alloc_res(void) -{ - struct sched_resource *sr; - - sr = xzalloc(struct sched_resource); - if ( sr == NULL ) - return NULL; - if ( !zalloc_cpumask_var(&sr->cpus) ) - { - xfree(sr); - return NULL; - } - return sr; -} - -static int cpu_schedule_up(unsigned int cpu) -{ - struct sched_resource *sr; - - sr = sched_alloc_res(); - if ( sr == NULL ) - return -ENOMEM; - - sr->master_cpu = cpu; - cpumask_copy(sr->cpus, cpumask_of(cpu)); - set_sched_res(cpu, sr); - - sr->scheduler = &sched_idle_ops; - spin_lock_init(&sr->_lock); - sr->schedule_lock = &sched_free_cpu_lock; - init_timer(&sr->s_timer, s_timer_fn, NULL, cpu); - atomic_set(&per_cpu(sched_urgent_count, cpu), 0); - - /* We start with cpu granularity. */ - sr->granularity = 1; - - cpumask_set_cpu(cpu, &sched_res_mask); - - /* Boot CPU is dealt with later in scheduler_init(). */ - if ( cpu == 0 ) - return 0; - - if ( idle_vcpu[cpu] == NULL ) - vcpu_create(idle_vcpu[0]->domain, cpu); - else - idle_vcpu[cpu]->sched_unit->res = sr; - - if ( idle_vcpu[cpu] == NULL ) - return -ENOMEM; - - idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; - - /* - * No need to allocate any scheduler data, as cpus coming online are - * free initially and the idle scheduler doesn't need any data areas - * allocated. 
- */ - - sr->curr = idle_vcpu[cpu]->sched_unit; - sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit; - - sr->sched_priv = NULL; - - return 0; -} - -static void sched_res_free(struct rcu_head *head) -{ - struct sched_resource *sr = container_of(head, struct sched_resource, rcu); - - free_cpumask_var(sr->cpus); - if ( sr->sched_unit_idle ) - sched_free_unit_mem(sr->sched_unit_idle); - xfree(sr); -} - -static void cpu_schedule_down(unsigned int cpu) -{ - struct sched_resource *sr; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - - kill_timer(&sr->s_timer); - - cpumask_clear_cpu(cpu, &sched_res_mask); - set_sched_res(cpu, NULL); - - /* Keep idle unit. */ - sr->sched_unit_idle = NULL; - call_rcu(&sr->rcu, sched_res_free); - - rcu_read_unlock(&sched_res_rculock); -} - -void sched_rm_cpu(unsigned int cpu) -{ - int rc; - - rcu_read_lock(&domlist_read_lock); - rc = cpu_disable_scheduler(cpu); - BUG_ON(rc); - rcu_read_unlock(&domlist_read_lock); - cpu_schedule_down(cpu); -} - -static int cpu_schedule_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - /* - * All scheduler related suspend/resume handling needed is done in - * cpupool.c. - */ - if ( system_state > SYS_STATE_active ) - return NOTIFY_DONE; - - rcu_read_lock(&sched_res_rculock); - - /* - * From the scheduler perspective, bringing up a pCPU requires - * allocating and initializing the per-pCPU scheduler specific data, - * as well as "registering" this pCPU to the scheduler (which may - * involve modifying some scheduler wide data structures). - * As new pCPUs always start as "free" cpus with the minimal idle - * scheduler being in charge, we don't need any of that. - * - * On the other hand, at teardown, we need to reverse what has been done - * during initialization, and then free the per-pCPU specific data. A - * pCPU brought down is not forced through "free" cpus, so here we need to - * use the appropriate hooks. - * - * This happens by calling the deinit_pdata and free_pdata hooks, in this - * order. If no per-pCPU memory was allocated, there is no need to - * provide an implementation of free_pdata. deinit_pdata may, however, - * be necessary/useful in this case too (e.g., it can undo something done - * on scheduler wide data structure during init_pdata). Both deinit_pdata - * and free_pdata are called during CPU_DEAD. - * - * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED. - */ - switch ( action ) - { - case CPU_UP_PREPARE: - rc = cpu_schedule_up(cpu); - break; - case CPU_DOWN_PREPARE: - rcu_read_lock(&domlist_read_lock); - rc = cpu_disable_scheduler_check(cpu); - rcu_read_unlock(&domlist_read_lock); - break; - case CPU_DEAD: - sched_rm_cpu(cpu); - break; - case CPU_UP_CANCELED: - cpu_schedule_down(cpu); - break; - default: - break; - } - - rcu_read_unlock(&sched_res_rculock); - - return !rc ? 
NOTIFY_DONE : notifier_from_errno(rc); -} - -static struct notifier_block cpu_schedule_nfb = { - .notifier_call = cpu_schedule_callback -}; - -const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu) -{ - const cpumask_t *mask; - - switch ( opt ) - { - case SCHED_GRAN_cpu: - mask = cpumask_of(cpu); - break; - case SCHED_GRAN_core: - mask = per_cpu(cpu_sibling_mask, cpu); - break; - case SCHED_GRAN_socket: - mask = per_cpu(cpu_core_mask, cpu); - break; - default: - ASSERT_UNREACHABLE(); - return NULL; - } - - return mask; -} - -static void schedule_dummy(void) -{ - sched_tasklet_check_cpu(smp_processor_id()); -} - -void scheduler_disable(void) -{ - scheduler_active = false; - open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy); - open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy); -} - -void scheduler_enable(void) -{ - open_softirq(SCHEDULE_SOFTIRQ, schedule); - open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave); - scheduler_active = true; -} - -/* Initialise the data structures. */ -void __init scheduler_init(void) -{ - struct domain *idle_domain; - int i; - - scheduler_enable(); - - for ( i = 0; i < NUM_SCHEDULERS; i++) - { -#define sched_test_func(f) \ - if ( !schedulers[i]->f ) \ - { \ - printk("scheduler %s misses .%s, dropped\n", \ - schedulers[i]->opt_name, #f); \ - schedulers[i] = NULL; \ - } - - sched_test_func(init); - sched_test_func(deinit); - sched_test_func(pick_resource); - sched_test_func(alloc_udata); - sched_test_func(free_udata); - sched_test_func(switch_sched); - sched_test_func(do_schedule); - -#undef sched_test_func - - if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) - { - printk("scheduler %s failed initialization, dropped\n", - schedulers[i]->opt_name); - schedulers[i] = NULL; - } - - if ( schedulers[i] && !ops.name && - !strcmp(schedulers[i]->opt_name, opt_sched) ) - ops = *schedulers[i]; - } - - if ( !ops.name ) - { - printk("Could not find scheduler: %s\n", opt_sched); - for ( i = 0; i < NUM_SCHEDULERS; i++ ) - if ( schedulers[i] && - !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) ) - { - ops = *schedulers[i]; - break; - } - BUG_ON(!ops.name); - printk("Using '%s' (%s)\n", ops.name, ops.opt_name); - } - - if ( cpu_schedule_up(0) ) - BUG(); - register_cpu_notifier(&cpu_schedule_nfb); - - printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - if ( sched_init(&ops) ) - panic("scheduler returned error on init\n"); - - if ( sched_ratelimit_us && - (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX - || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) - { - printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" - " Resetting to default %u\n", - XEN_SYSCTL_SCHED_RATELIMIT_MIN, - XEN_SYSCTL_SCHED_RATELIMIT_MAX, - SCHED_DEFAULT_RATELIMIT_US); - sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; - } - - idle_domain = domain_create(DOMID_IDLE, NULL, false); - BUG_ON(IS_ERR(idle_domain)); - BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu)); - idle_domain->vcpu = idle_vcpu; - idle_domain->max_vcpus = nr_cpu_ids; - if ( vcpu_create(idle_domain, 0) == NULL ) - BUG(); - - rcu_read_lock(&sched_res_rculock); - - get_sched_res(0)->curr = idle_vcpu[0]->sched_unit; - get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit; - - rcu_read_unlock(&sched_res_rculock); -} - -/* - * Move a pCPU from free cpus (running the idle scheduler) to a cpupool - * using any "real" scheduler. - * The cpu is still marked as "free" and not yet valid for its cpupool. 
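- * Simplified, the switch below boils down to:
- *
- *     ppriv = sched_alloc_pdata(new_ops, cpu);
- *     vpriv = sched_alloc_udata(new_ops, idle->sched_unit, idle->domain->sched_priv);
- *     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
- *     new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
- *     sr->schedule_lock = new_lock;
- *     spin_unlock_irqrestore(old_lock, flags);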
- */ -int schedule_cpu_add(unsigned int cpu, struct cpupool *c) -{ - struct vcpu *idle; - void *ppriv, *vpriv; - struct scheduler *new_ops = c->sched; - struct sched_resource *sr; - spinlock_t *old_lock, *new_lock; - unsigned long flags; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid)); - ASSERT(get_sched_res(cpu)->cpupool == NULL); - - /* - * To setup the cpu for the new scheduler we need: - * - a valid instance of per-CPU scheduler specific data, as it is - * allocated by sched_alloc_pdata(). Note that we do not want to - * initialize it yet (i.e., we are not calling sched_init_pdata()). - * That will be done by the target scheduler, in sched_switch_sched(), - * in proper ordering and with locking. - * - a valid instance of per-vCPU scheduler specific data, for the idle - * vCPU of cpu. That is what the target scheduler will use for the - * sched_priv field of the per-vCPU info of the idle domain. - */ - idle = idle_vcpu[cpu]; - ppriv = sched_alloc_pdata(new_ops, cpu); - if ( IS_ERR(ppriv) ) - { - ret = PTR_ERR(ppriv); - goto out; - } - - vpriv = sched_alloc_udata(new_ops, idle->sched_unit, - idle->domain->sched_priv); - if ( vpriv == NULL ) - { - sched_free_pdata(new_ops, ppriv, cpu); - ret = -ENOMEM; - goto out; - } - - /* - * The actual switch, including the rerouting of the scheduler lock to - * whatever new_ops prefers, needs to happen in one critical section, - * protected by old_ops' lock, or races are possible. - * It is, in fact, the lock of the idle scheduler that we are taking. - * But that is ok as anyone trying to schedule on this cpu will spin until - * when we release that lock (bottom of this function). When he'll get the - * lock --thanks to the loop inside *_schedule_lock() functions-- he'll - * notice that the lock itself changed, and retry acquiring the new one - * (which will be the correct, remapped one, at that point). - */ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - if ( cpupool_get_granularity(c) > 1 ) - { - const cpumask_t *mask; - unsigned int cpu_iter, idx = 0; - struct sched_unit *old_unit, *master_unit; - struct sched_resource *sr_old; - - /* - * We need to merge multiple idle_vcpu units and sched_resource structs - * into one. As the free cpus all share the same lock we are fine doing - * that now. The worst which could happen would be someone waiting for - * the lock, thus dereferencing sched_res->schedule_lock. This is the - * reason we are freeing struct sched_res via call_rcu() to avoid the - * lock pointer suddenly disappearing. - */ - mask = sched_get_opt_cpumask(c->gran, cpu); - master_unit = idle_vcpu[cpu]->sched_unit; - - for_each_cpu ( cpu_iter, mask ) - { - if ( idx ) - cpumask_clear_cpu(cpu_iter, &sched_res_mask); - - per_cpu(sched_res_idx, cpu_iter) = idx++; - - if ( cpu == cpu_iter ) - continue; - - old_unit = idle_vcpu[cpu_iter]->sched_unit; - sr_old = get_sched_res(cpu_iter); - kill_timer(&sr_old->s_timer); - idle_vcpu[cpu_iter]->sched_unit = master_unit; - master_unit->runstate_cnt[RUNSTATE_running]++; - set_sched_res(cpu_iter, sr); - cpumask_set_cpu(cpu_iter, sr->cpus); - - call_rcu(&sr_old->rcu, sched_res_free); - } - } - - new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv); - - sr->scheduler = new_ops; - sr->sched_priv = ppriv; - - /* - * Reroute the lock to the per pCPU lock as /last/ thing. 
In fact, - * if it is free (and it can be) we want that anyone that manages - * taking it, finds all the initializations we've done above in place. - */ - smp_wmb(); - sr->schedule_lock = new_lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */ - spin_unlock_irqrestore(old_lock, flags); - - sr->granularity = cpupool_get_granularity(c); - sr->cpupool = c; - /* The cpu is added to a pool, trigger it to go pick up some work */ - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -/* - * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops - * (the idle scheduler). - * The cpu is already marked as "free" and not valid any longer for its - * cpupool. - */ -int schedule_cpu_rm(unsigned int cpu) -{ - void *ppriv_old, *vpriv_old; - struct sched_resource *sr, **sr_new = NULL; - struct sched_unit *unit; - struct scheduler *old_ops; - spinlock_t *old_lock; - unsigned long flags; - int idx, ret = -ENOMEM; - unsigned int cpu_iter; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - old_ops = sr->scheduler; - - if ( sr->granularity > 1 ) - { - sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); - if ( !sr_new ) - goto out; - for ( idx = 0; idx < sr->granularity - 1; idx++ ) - { - sr_new[idx] = sched_alloc_res(); - if ( sr_new[idx] ) - { - sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); - if ( !sr_new[idx]->sched_unit_idle ) - { - sched_res_free(&sr_new[idx]->rcu); - sr_new[idx] = NULL; - } - } - if ( !sr_new[idx] ) - { - for ( idx--; idx >= 0; idx-- ) - sched_res_free(&sr_new[idx]->rcu); - goto out; - } - sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; - sr_new[idx]->scheduler = &sched_idle_ops; - sr_new[idx]->granularity = 1; - - /* We want the lock not to change when replacing the resource. */ - sr_new[idx]->schedule_lock = sr->schedule_lock; - } - } - - ret = 0; - ASSERT(sr->cpupool != NULL); - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); - - /* See comment in schedule_cpu_add() regarding lock switching. */ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - vpriv_old = idle_vcpu[cpu]->sched_unit->priv; - ppriv_old = sr->sched_priv; - - idx = 0; - for_each_cpu ( cpu_iter, sr->cpus ) - { - per_cpu(sched_res_idx, cpu_iter) = 0; - if ( cpu_iter == cpu ) - { - idle_vcpu[cpu_iter]->sched_unit->priv = NULL; - } - else - { - /* Initialize unit. */ - unit = sr_new[idx]->sched_unit_idle; - unit->res = sr_new[idx]; - unit->is_running = true; - sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); - sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); - - /* Adjust cpu masks of resources (old and new). */ - cpumask_clear_cpu(cpu_iter, sr->cpus); - cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); - - /* Init timer. */ - init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); - - /* Last resource initializations and insert resource pointer. */ - sr_new[idx]->master_cpu = cpu_iter; - set_sched_res(cpu_iter, sr_new[idx]); - - /* Last action: set the new lock pointer. */ - smp_mb(); - sr_new[idx]->schedule_lock = &sched_free_cpu_lock; - - idx++; - } - } - sr->scheduler = &sched_idle_ops; - sr->sched_priv = NULL; - - smp_mb(); - sr->schedule_lock = &sched_free_cpu_lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! 
*/ - spin_unlock_irqrestore(old_lock, flags); - - sched_deinit_pdata(old_ops, ppriv_old, cpu); - - sched_free_udata(old_ops, vpriv_old); - sched_free_pdata(old_ops, ppriv_old, cpu); - - sr->granularity = 1; - sr->cpupool = NULL; - -out: - rcu_read_unlock(&sched_res_rculock); - xfree(sr_new); - - return ret; -} - -struct scheduler *scheduler_get_default(void) -{ - return &ops; -} - -struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) -{ - int i; - struct scheduler *sched; - - for ( i = 0; i < NUM_SCHEDULERS; i++ ) - if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) - goto found; - *perr = -ENOENT; - return NULL; - - found: - *perr = -ENOMEM; - if ( (sched = xmalloc(struct scheduler)) == NULL ) - return NULL; - memcpy(sched, schedulers[i], sizeof(*sched)); - if ( (*perr = sched_init(sched)) != 0 ) - { - xfree(sched); - sched = NULL; - } - - return sched; -} - -void scheduler_free(struct scheduler *sched) -{ - BUG_ON(sched == &ops); - sched_deinit(sched); - xfree(sched); -} - -void schedule_dump(struct cpupool *c) -{ - unsigned int i; - struct scheduler *sched; - cpumask_t *cpus; - - /* Locking, if necessary, must be handled withing each scheduler */ - - rcu_read_lock(&sched_res_rculock); - - if ( c != NULL ) - { - sched = c->sched; - cpus = c->cpu_valid; - printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); - sched_dump_settings(sched); - } - else - { - sched = &ops; - cpus = &cpupool_free_cpus; - } - - if ( sched->dump_cpu_state != NULL ) - { - printk("CPUs info:\n"); - for_each_cpu (i, cpus) - sched_dump_cpu_state(sched, i); - } - - rcu_read_unlock(&sched_res_rculock); -} - -void sched_tick_suspend(void) -{ - rcu_idle_enter(smp_processor_id()); - rcu_idle_timer_start(); -} - -void sched_tick_resume(void) -{ - rcu_idle_timer_stop(); - rcu_idle_exit(smp_processor_id()); -} - -void wait(void) -{ - schedule(); -} - -#ifdef CONFIG_X86 -void __init sched_setup_dom0_vcpus(struct domain *d) -{ - unsigned int i; - struct sched_unit *unit; - - for ( i = 1; i < d->max_vcpus; i++ ) - vcpu_create(d, i); - - /* - * PV-shim: vcpus are pinned 1:1. - * Initially only 1 cpu is online, others will be dealt with when - * onlining them. This avoids pinning a vcpu to a not yet online cpu here. - */ - if ( pv_shim ) - sched_set_affinity(d->vcpu[0]->sched_unit, - cpumask_of(0), cpumask_of(0)); - else - { - for_each_sched_unit ( d, unit ) - { - if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed ) - sched_set_affinity(unit, &dom0_cpus, NULL); - sched_set_affinity(unit, NULL, &dom0_cpus); - } - } - - domain_update_node_affinity(d); -} -#endif - -#ifdef CONFIG_COMPAT -#include "compat/schedule.c" -#endif - -#endif /* !COMPAT */ - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */