From: Juergen Gross Date: Wed, 22 Jan 2020 14:06:43 +0000 (+0100) Subject: xen/sched: move schedulers and cpupool coding to dedicated directory X-Git-Tag: archive/raspbian/4.14.0+80-gd101b417b7-1+rpi1^2~63^2~854 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=6cb4b01c033b7abc3e7175501330dfb01fb09da5;p=xen.git xen/sched: move schedulers and cpupool coding to dedicated directory Move sched*c and cpupool.c to a new directory common/sched. Signed-off-by: Juergen Gross Reviewed-by: Dario Faggioli --- diff --git a/MAINTAINERS b/MAINTAINERS index a91080cde5..dadcfb63d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -174,7 +174,7 @@ M: Josh Whitehead M: Stewart Hildebrand S: Supported L: xen-devel@dornerworks.com -F: xen/common/sched_arinc653.c +F: xen/common/sched/arinc653.c F: tools/libxc/xc_arinc653.c ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE @@ -218,7 +218,7 @@ CPU POOLS M: Juergen Gross M: Dario Faggioli S: Supported -F: xen/common/cpupool.c +F: xen/common/sched/cpupool.c DEVICE TREE M: Stefano Stabellini @@ -384,13 +384,13 @@ RTDS SCHEDULER M: Dario Faggioli M: Meng Xu S: Supported -F: xen/common/sched_rt.c +F: xen/common/sched/rt.c SCHEDULING M: George Dunlap M: Dario Faggioli S: Supported -F: xen/common/sched* +F: xen/common/sched/ SEABIOS UPSTREAM M: Wei Liu diff --git a/xen/common/Kconfig b/xen/common/Kconfig index b3d161d057..9d6d09eb37 100644 --- a/xen/common/Kconfig +++ b/xen/common/Kconfig @@ -275,71 +275,7 @@ config ARGO If unsure, say N. -menu "Schedulers" - visible if EXPERT = "y" - -config SCHED_CREDIT - bool "Credit scheduler support" - default y - ---help--- - The traditional credit scheduler is a general purpose scheduler. - -config SCHED_CREDIT2 - bool "Credit2 scheduler support" - default y - ---help--- - The credit2 scheduler is a general purpose scheduler that is - optimized for lower latency and higher VM density. - -config SCHED_RTDS - bool "RTDS scheduler support (EXPERIMENTAL)" - default y - ---help--- - The RTDS scheduler is a soft and firm real-time scheduler for - multicore, targeted for embedded, automotive, graphics and gaming - in the cloud, and general low-latency workloads. - -config SCHED_ARINC653 - bool "ARINC653 scheduler support (EXPERIMENTAL)" - default DEBUG - ---help--- - The ARINC653 scheduler is a hard real-time scheduler for single - cores, targeted for avionics, drones, and medical devices. - -config SCHED_NULL - bool "Null scheduler support (EXPERIMENTAL)" - default y - ---help--- - The null scheduler is a static, zero overhead scheduler, - for when there always are less vCPUs than pCPUs, typically - in embedded or HPC scenarios. - -choice - prompt "Default Scheduler?" 
- default SCHED_CREDIT2_DEFAULT - - config SCHED_CREDIT_DEFAULT - bool "Credit Scheduler" if SCHED_CREDIT - config SCHED_CREDIT2_DEFAULT - bool "Credit2 Scheduler" if SCHED_CREDIT2 - config SCHED_RTDS_DEFAULT - bool "RT Scheduler" if SCHED_RTDS - config SCHED_ARINC653_DEFAULT - bool "ARINC653 Scheduler" if SCHED_ARINC653 - config SCHED_NULL_DEFAULT - bool "Null Scheduler" if SCHED_NULL -endchoice - -config SCHED_DEFAULT - string - default "credit" if SCHED_CREDIT_DEFAULT - default "credit2" if SCHED_CREDIT2_DEFAULT - default "rtds" if SCHED_RTDS_DEFAULT - default "arinc653" if SCHED_ARINC653_DEFAULT - default "null" if SCHED_NULL_DEFAULT - default "credit2" - -endmenu +source "common/sched/Kconfig" config CRYPTO bool diff --git a/xen/common/Makefile b/xen/common/Makefile index 62b34e69e9..2abb8250b0 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -3,7 +3,6 @@ obj-y += bitmap.o obj-y += bsearch.o obj-$(CONFIG_CORE_PARKING) += core_parking.o obj-y += cpu.o -obj-y += cpupool.o obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o obj-y += domctl.o @@ -38,12 +37,6 @@ obj-y += radix-tree.o obj-y += rbtree.o obj-y += rcupdate.o obj-y += rwlock.o -obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o -obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o -obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o -obj-$(CONFIG_SCHED_RTDS) += sched_rt.o -obj-$(CONFIG_SCHED_NULL) += sched_null.o -obj-y += schedule.o obj-y += shutdown.o obj-y += softirq.o obj-y += sort.o @@ -74,6 +67,7 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o kernel.o memory.o multicall extra-y := symbols-dummy.o subdir-$(CONFIG_COVERAGE) += coverage +subdir-y += sched subdir-$(CONFIG_UBSAN) += ubsan subdir-$(CONFIG_NEEDS_LIBELF) += libelf diff --git a/xen/common/compat/schedule.c b/xen/common/compat/schedule.c deleted file mode 100644 index 8b6e6f107d..0000000000 --- a/xen/common/compat/schedule.c +++ /dev/null @@ -1,55 +0,0 @@ -/**************************************************************************** - * schedule.c - * - */ - -#include - -#define COMPAT -#define ret_t int - -#define do_sched_op compat_sched_op - -#define xen_sched_pin_override sched_pin_override -CHECK_sched_pin_override; -#undef xen_sched_pin_override - -#define xen_sched_shutdown sched_shutdown -CHECK_sched_shutdown; -#undef xen_sched_shutdown - -#define xen_sched_remote_shutdown sched_remote_shutdown -CHECK_sched_remote_shutdown; -#undef xen_sched_remote_shutdown - -static int compat_poll(struct compat_sched_poll *compat) -{ - struct sched_poll native; - -#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ - guest_from_compat_handle((_d_)->ports, (_s_)->ports) - XLAT_sched_poll(&native, compat); -#undef XLAT_sched_poll_HNDL_ports - - return do_poll(&native); -} - -#define do_poll compat_poll -#define sched_poll compat_sched_poll - -#include "../schedule.c" - -int compat_set_timer_op(u32 lo, s32 hi) -{ - return do_set_timer_op(((s64)hi << 32) | lo); -} - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c deleted file mode 100644 index d66b541a94..0000000000 --- a/xen/common/cpupool.c +++ /dev/null @@ -1,979 +0,0 @@ -/****************************************************************************** - * cpupool.c - * - * Generic cpupool-handling functions. - * - * Cpupools are a feature to have configurable scheduling domains. 
Each - * cpupool runs an own scheduler on a dedicated set of physical cpus. - * A domain is bound to one cpupool at any time, but it can be moved to - * another cpupool. - * - * (C) 2009, Juergen Gross, Fujitsu Technology Solutions - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define for_each_cpupool(ptr) \ - for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) - -struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ -cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ - -static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ - -static int cpupool_moving_cpu = -1; -static struct cpupool *cpupool_cpu_moving = NULL; -static cpumask_t cpupool_locked_cpus; - -static DEFINE_SPINLOCK(cpupool_lock); - -static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu; -static unsigned int __read_mostly sched_granularity = 1; - -#ifdef CONFIG_HAS_SCHED_GRANULARITY -static int __init sched_select_granularity(const char *str) -{ - if ( strcmp("cpu", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_cpu; - else if ( strcmp("core", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_core; - else if ( strcmp("socket", str) == 0 ) - opt_sched_granularity = SCHED_GRAN_socket; - else - return -EINVAL; - - return 0; -} -custom_param("sched-gran", sched_select_granularity); -#endif - -static unsigned int __init cpupool_check_granularity(void) -{ - unsigned int cpu; - unsigned int siblings, gran = 0; - - if ( opt_sched_granularity == SCHED_GRAN_cpu ) - return 1; - - for_each_online_cpu ( cpu ) - { - siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity, - cpu)); - if ( gran == 0 ) - gran = siblings; - else if ( gran != siblings ) - return 0; - } - - sched_disable_smt_switching = true; - - return gran; -} - -/* Setup data for selected scheduler granularity. */ -static void __init cpupool_gran_init(void) -{ - unsigned int gran = 0; - const char *fallback = NULL; - - while ( gran == 0 ) - { - gran = cpupool_check_granularity(); - - if ( gran == 0 ) - { - switch ( opt_sched_granularity ) - { - case SCHED_GRAN_core: - opt_sched_granularity = SCHED_GRAN_cpu; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=cpu.\n"; - break; - case SCHED_GRAN_socket: - opt_sched_granularity = SCHED_GRAN_core; - fallback = "Asymmetric cpu configuration.\n" - "Falling back to sched-gran=core.\n"; - break; - default: - ASSERT_UNREACHABLE(); - break; - } - } - } - - if ( fallback ) - warning_add(fallback); - - sched_granularity = gran; -} - -unsigned int cpupool_get_granularity(const struct cpupool *c) -{ - return c ? sched_granularity : 1; -} - -static void free_cpupool_struct(struct cpupool *c) -{ - if ( c ) - { - free_cpumask_var(c->res_valid); - free_cpumask_var(c->cpu_valid); - } - xfree(c); -} - -static struct cpupool *alloc_cpupool_struct(void) -{ - struct cpupool *c = xzalloc(struct cpupool); - - if ( !c ) - return NULL; - - if ( !zalloc_cpumask_var(&c->cpu_valid) || - !zalloc_cpumask_var(&c->res_valid) ) - { - free_cpupool_struct(c); - c = NULL; - } - - return c; -} - -/* - * find a cpupool by it's id. to be called with cpupool lock held - * if exact is not specified, the first cpupool with an id larger or equal to - * the searched id is returned - * returns NULL if not found. 
- */ -static struct cpupool *__cpupool_find_by_id(int id, int exact) -{ - struct cpupool **q; - - ASSERT(spin_is_locked(&cpupool_lock)); - - for_each_cpupool(q) - if ( (*q)->cpupool_id >= id ) - break; - - return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL; -} - -static struct cpupool *cpupool_find_by_id(int poolid) -{ - return __cpupool_find_by_id(poolid, 1); -} - -static struct cpupool *__cpupool_get_by_id(int poolid, int exact) -{ - struct cpupool *c; - spin_lock(&cpupool_lock); - c = __cpupool_find_by_id(poolid, exact); - if ( c != NULL ) - atomic_inc(&c->refcnt); - spin_unlock(&cpupool_lock); - return c; -} - -struct cpupool *cpupool_get_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 1); -} - -static struct cpupool *cpupool_get_next_by_id(int poolid) -{ - return __cpupool_get_by_id(poolid, 0); -} - -void cpupool_put(struct cpupool *pool) -{ - if ( !atomic_dec_and_test(&pool->refcnt) ) - return; - scheduler_free(pool->sched); - free_cpupool_struct(pool); -} - -/* - * create a new cpupool with specified poolid and scheduler - * returns pointer to new cpupool structure if okay, NULL else - * possible failures: - * - no memory - * - poolid already used - * - unknown scheduler - */ -static struct cpupool *cpupool_create( - int poolid, unsigned int sched_id, int *perr) -{ - struct cpupool *c; - struct cpupool **q; - int last = 0; - - *perr = -ENOMEM; - if ( (c = alloc_cpupool_struct()) == NULL ) - return NULL; - - /* One reference for caller, one reference for cpupool_destroy(). */ - atomic_set(&c->refcnt, 2); - - debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); - - spin_lock(&cpupool_lock); - - for_each_cpupool(q) - { - last = (*q)->cpupool_id; - if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) - break; - } - if ( *q != NULL ) - { - if ( (*q)->cpupool_id == poolid ) - { - *perr = -EEXIST; - goto err; - } - c->next = *q; - } - - c->cpupool_id = (poolid == CPUPOOLID_NONE) ? 
(last + 1) : poolid; - if ( poolid == 0 ) - { - c->sched = scheduler_get_default(); - } - else - { - c->sched = scheduler_alloc(sched_id, perr); - if ( c->sched == NULL ) - goto err; - } - c->gran = opt_sched_granularity; - - *q = c; - - spin_unlock(&cpupool_lock); - - debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n", - c->cpupool_id, c->sched->name, c->sched->opt_name); - - *perr = 0; - return c; - - err: - spin_unlock(&cpupool_lock); - free_cpupool_struct(c); - return NULL; -} -/* - * destroys the given cpupool - * returns 0 on success, 1 else - * possible failures: - * - pool still in use - * - cpus still assigned to pool - * - pool not in list - */ -static int cpupool_destroy(struct cpupool *c) -{ - struct cpupool **q; - - spin_lock(&cpupool_lock); - for_each_cpupool(q) - if ( *q == c ) - break; - if ( *q != c ) - { - spin_unlock(&cpupool_lock); - return -ENOENT; - } - if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) - { - spin_unlock(&cpupool_lock); - return -EBUSY; - } - *q = c->next; - spin_unlock(&cpupool_lock); - - cpupool_put(c); - - debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id); - return 0; -} - -/* - * Move domain to another cpupool - */ -static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c) -{ - int ret; - - if ( unlikely(d->cpupool == c) ) - return 0; - - d->cpupool->n_dom--; - ret = sched_move_domain(d, c); - if ( ret ) - d->cpupool->n_dom++; - else - c->n_dom++; - - return ret; -} -int cpupool_move_domain(struct domain *d, struct cpupool *c) -{ - int ret; - - spin_lock(&cpupool_lock); - - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * assign a specific cpu to a cpupool - * cpupool_lock must be held - */ -static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - cpus = sched_get_opt_cpumask(c->gran, cpu); - - if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) - return -EADDRNOTAVAIL; - ret = schedule_cpu_add(cpumask_first(cpus), c); - if ( ret ) - return ret; - - rcu_read_lock(&sched_res_rculock); - - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - if (cpupool_moving_cpu == cpu) - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - cpumask_or(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&sched_res_rculock); - - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return 0; -} - -static int cpupool_unassign_cpu_finish(struct cpupool *c) -{ - int cpu = cpupool_moving_cpu; - const cpumask_t *cpus; - struct domain *d; - int ret; - - if ( c != cpupool_cpu_moving ) - return -EADDRNOTAVAIL; - - /* - * We need this for scanning the domain list, both in - * cpu_disable_scheduler(), and at the bottom of this function. - */ - rcu_read_lock(&domlist_read_lock); - ret = cpu_disable_scheduler(cpu); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - - /* - * cpu_disable_scheduler() returning an error doesn't require resetting - * cpupool_free_cpus' cpu bit. All error cases should be of temporary - * nature and tools will retry the operation. 
Even if the number of - * retries may be limited, the in-between state can easily be repaired - * by adding the cpu to the cpupool again. - */ - if ( !ret ) - { - ret = schedule_cpu_rm(cpu); - if ( ret ) - cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); - else - { - cpupool_moving_cpu = -1; - cpupool_put(cpupool_cpu_moving); - cpupool_cpu_moving = NULL; - } - } - rcu_read_unlock(&sched_res_rculock); - - for_each_domain_in_cpupool(d, c) - { - domain_update_node_affinity(d); - } - rcu_read_unlock(&domlist_read_lock); - - return ret; -} - -static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu) -{ - int ret; - struct domain *d; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - ret = -EADDRNOTAVAIL; - if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid)) - && (cpu != cpupool_moving_cpu) ) - goto out; - - ret = 0; - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - - if ( (c->n_dom > 0) && - (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) && - (cpu != cpupool_moving_cpu) ) - { - rcu_read_lock(&domlist_read_lock); - for_each_domain_in_cpupool(d, c) - { - if ( !d->is_dying && system_state == SYS_STATE_active ) - { - ret = -EBUSY; - break; - } - ret = cpupool_move_domain_locked(d, cpupool0); - if ( ret ) - break; - } - rcu_read_unlock(&domlist_read_lock); - if ( ret ) - goto out; - } - cpupool_moving_cpu = cpu; - atomic_inc(&c->refcnt); - cpupool_cpu_moving = c; - cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus); - cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); - - rcu_read_unlock(&domlist_read_lock); -out: - spin_unlock(&cpupool_lock); - - return ret; -} - -static long cpupool_unassign_cpu_helper(void *info) -{ - struct cpupool *c = info; - long ret; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); - spin_lock(&cpupool_lock); - - ret = cpupool_unassign_cpu_finish(c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); - - return ret; -} - -/* - * unassign a specific cpu from a cpupool - * we must be sure not to run on the cpu to be unassigned! to achieve this - * the main functionality is performed via continue_hypercall_on_cpu on a - * specific cpu. - * if the cpu to be removed is the last one of the cpupool no active domain - * must be bound to the cpupool. dying domains are moved to cpupool0 as they - * might be zombies. 
- * possible failures: - * - last cpu and still active domains in cpupool - * - cpu just being unplugged - */ -static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) -{ - int work_cpu; - int ret; - unsigned int master_cpu; - - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", - c->cpupool_id, cpu); - - master_cpu = sched_get_resource_cpu(cpu); - ret = cpupool_unassign_cpu_start(c, master_cpu); - if ( ret ) - { - debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", - c->cpupool_id, cpu, ret); - return ret; - } - - work_cpu = sched_get_resource_cpu(smp_processor_id()); - if ( work_cpu == master_cpu ) - { - work_cpu = cpumask_first(cpupool0->cpu_valid); - if ( work_cpu == master_cpu ) - work_cpu = cpumask_last(cpupool0->cpu_valid); - } - return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); -} - -/* - * add a new domain to a cpupool - * possible failures: - * - pool does not exist - * - no cpu assigned to pool - */ -int cpupool_add_domain(struct domain *d, int poolid) -{ - struct cpupool *c; - int rc; - int n_dom = 0; - - if ( poolid == CPUPOOLID_NONE ) - return 0; - spin_lock(&cpupool_lock); - c = cpupool_find_by_id(poolid); - if ( c == NULL ) - rc = -ESRCH; - else if ( !cpumask_weight(c->cpu_valid) ) - rc = -ENODEV; - else - { - c->n_dom++; - n_dom = c->n_dom; - d->cpupool = c; - rc = 0; - } - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", - d->domain_id, poolid, n_dom, rc); - return rc; -} - -/* - * remove a domain from a cpupool - */ -void cpupool_rm_domain(struct domain *d) -{ - int cpupool_id; - int n_dom; - - if ( d->cpupool == NULL ) - return; - spin_lock(&cpupool_lock); - cpupool_id = d->cpupool->cpupool_id; - d->cpupool->n_dom--; - n_dom = d->cpupool->n_dom; - d->cpupool = NULL; - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", - d->domain_id, cpupool_id, n_dom); - return; -} - -/* - * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0, - * as they must have been in there when unplugged. - */ -static int cpupool_cpu_add(unsigned int cpu) -{ - int ret = 0; - const cpumask_t *cpus; - - spin_lock(&cpupool_lock); - cpumask_clear_cpu(cpu, &cpupool_locked_cpus); - cpumask_set_cpu(cpu, &cpupool_free_cpus); - - /* - * If we are not resuming, we are hot-plugging cpu, and in which case - * we add it to pool0, as it certainly was there when hot-unplagged - * (or unplugging would have failed) and that is the default behavior - * anyway. - */ - rcu_read_lock(&sched_res_rculock); - get_sched_res(cpu)->cpupool = NULL; - - cpus = sched_get_opt_cpumask(cpupool0->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - ret = cpupool_assign_cpu_locked(cpupool0, cpu); - - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - return ret; -} - -/* - * This function is called in stop_machine context, so we can be sure no - * non-idle vcpu is active on the system. - */ -static void cpupool_cpu_remove(unsigned int cpu) -{ - int ret; - - ASSERT(is_idle_vcpu(current)); - - if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) - { - ret = cpupool_unassign_cpu_finish(cpupool0); - BUG_ON(ret); - } - cpumask_clear_cpu(cpu, &cpupool_free_cpus); -} - -/* - * Called before a CPU is being removed from the system. - * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved - * to free cpus actually before removing them). - * The CPU is locked, to forbid adding it again to another cpupool. 
- */ -static int cpupool_cpu_remove_prologue(unsigned int cpu) -{ - int ret = 0; - cpumask_t *cpus; - unsigned int master_cpu; - - spin_lock(&cpupool_lock); - - rcu_read_lock(&sched_res_rculock); - cpus = get_sched_res(cpu)->cpus; - master_cpu = sched_get_resource_cpu(cpu); - if ( cpumask_intersects(cpus, &cpupool_locked_cpus) ) - ret = -EBUSY; - else - cpumask_set_cpu(cpu, &cpupool_locked_cpus); - rcu_read_unlock(&sched_res_rculock); - - spin_unlock(&cpupool_lock); - - if ( ret ) - return ret; - - if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) ) - { - /* Cpupool0 is populated only after all cpus are up. */ - ASSERT(system_state == SYS_STATE_active); - - ret = cpupool_unassign_cpu_start(cpupool0, master_cpu); - } - else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) ) - ret = -ENODEV; - - return ret; -} - -/* - * Called during resume for all cpus which didn't come up again. The cpu must - * be removed from the cpupool it is assigned to. In case a cpupool will be - * left without cpu we move all domains of that cpupool to cpupool0. - * As we are called with all domains still frozen there is no need to take the - * cpupool lock here. - */ -static void cpupool_cpu_remove_forced(unsigned int cpu) -{ - struct cpupool **c; - int ret; - unsigned int master_cpu = sched_get_resource_cpu(cpu); - - for_each_cpupool ( c ) - { - if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) ) - { - ret = cpupool_unassign_cpu_start(*c, master_cpu); - BUG_ON(ret); - ret = cpupool_unassign_cpu_finish(*c); - BUG_ON(ret); - } - } - - cpumask_clear_cpu(cpu, &cpupool_free_cpus); - - rcu_read_lock(&sched_res_rculock); - sched_rm_cpu(cpu); - rcu_read_unlock(&sched_res_rculock); -} - -/* - * do cpupool related sysctl operations - */ -int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) -{ - int ret; - struct cpupool *c; - - switch ( op->op ) - { - - case XEN_SYSCTL_CPUPOOL_OP_CREATE: - { - int poolid; - - poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? 
- CPUPOOLID_NONE: op->cpupool_id; - c = cpupool_create(poolid, op->sched_id, &ret); - if ( c != NULL ) - { - op->cpupool_id = c->cpupool_id; - cpupool_put(c); - } - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_DESTROY: - { - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - ret = cpupool_destroy(c); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_INFO: - { - c = cpupool_get_next_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - op->cpupool_id = c->cpupool_id; - op->sched_id = c->sched->sched_id; - op->n_dom = c->n_dom; - ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: - { - unsigned cpu; - const cpumask_t *cpus; - - cpu = op->cpu; - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", - op->cpupool_id, cpu); - - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - goto addcpu_out; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - { - for_each_cpu ( cpu, &cpupool_free_cpus ) - { - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( cpumask_subset(cpus, &cpupool_free_cpus) ) - break; - } - ret = -ENODEV; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - } - ret = -EINVAL; - if ( cpu >= nr_cpu_ids ) - goto addcpu_out; - ret = -ENODEV; - cpus = sched_get_opt_cpumask(c->gran, cpu); - if ( !cpumask_subset(cpus, &cpupool_free_cpus) || - cpumask_intersects(cpus, &cpupool_locked_cpus) ) - goto addcpu_out; - ret = cpupool_assign_cpu_locked(c, cpu); - - addcpu_out: - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", - op->cpupool_id, cpu, ret); - - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_RMCPU: - { - unsigned cpu; - - c = cpupool_get_by_id(op->cpupool_id); - ret = -ENOENT; - if ( c == NULL ) - break; - cpu = op->cpu; - if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) - cpu = cpumask_last(c->cpu_valid); - ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL; - cpupool_put(c); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: - { - struct domain *d; - - ret = rcu_lock_remote_domain_by_id(op->domid, &d); - if ( ret ) - break; - if ( d->cpupool == NULL ) - { - ret = -EINVAL; - rcu_unlock_domain(d); - break; - } - if ( op->cpupool_id == d->cpupool->cpupool_id ) - { - ret = 0; - rcu_unlock_domain(d); - break; - } - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n", - d->domain_id, op->cpupool_id); - ret = -ENOENT; - spin_lock(&cpupool_lock); - - c = cpupool_find_by_id(op->cpupool_id); - if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) - ret = cpupool_move_domain_locked(d, c); - - spin_unlock(&cpupool_lock); - debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", - d->domain_id, op->cpupool_id, ret); - rcu_unlock_domain(d); - } - break; - - case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: - { - ret = cpumask_to_xenctl_bitmap( - &op->cpumap, &cpupool_free_cpus); - } - break; - - default: - ret = -ENOSYS; - break; - } - - return ret; -} - -void dump_runq(unsigned char key) -{ - unsigned long flags; - s_time_t now = NOW(); - struct cpupool **c; - - spin_lock(&cpupool_lock); - local_irq_save(flags); - - printk("sched_smt_power_savings: %s\n", - sched_smt_power_savings? 
"enabled":"disabled"); - printk("NOW=%"PRI_stime"\n", now); - - printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map)); - if ( !cpumask_empty(&cpupool_free_cpus) ) - { - printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus)); - schedule_dump(NULL); - } - - for_each_cpupool(c) - { - printk("Cpupool %d:\n", (*c)->cpupool_id); - printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid)); - schedule_dump(*c); - } - - local_irq_restore(flags); - spin_unlock(&cpupool_lock); -} - -static int cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - switch ( action ) - { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_add(cpu); - break; - case CPU_DOWN_PREPARE: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - rc = cpupool_cpu_remove_prologue(cpu); - break; - case CPU_DYING: - /* Suspend/Resume don't change assignments of cpus to cpupools. */ - if ( system_state <= SYS_STATE_active ) - cpupool_cpu_remove(cpu); - break; - case CPU_RESUME_FAILED: - cpupool_cpu_remove_forced(cpu); - break; - default: - break; - } - - return !rc ? NOTIFY_DONE : notifier_from_errno(rc); -} - -static struct notifier_block cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __init cpupool_init(void) -{ - unsigned int cpu; - int err; - - cpupool_gran_init(); - - cpupool0 = cpupool_create(0, 0, &err); - BUG_ON(cpupool0 == NULL); - cpupool_put(cpupool0); - register_cpu_notifier(&cpu_nfb); - - spin_lock(&cpupool_lock); - - cpumask_copy(&cpupool_free_cpus, &cpu_online_map); - - for_each_cpu ( cpu, &cpupool_free_cpus ) - cpupool_assign_cpu_locked(cpupool0, cpu); - - spin_unlock(&cpupool_lock); - - return 0; -} -__initcall(cpupool_init); - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/sched/Kconfig b/xen/common/sched/Kconfig new file mode 100644 index 0000000000..883ac87cab --- /dev/null +++ b/xen/common/sched/Kconfig @@ -0,0 +1,65 @@ +menu "Schedulers" + visible if EXPERT = "y" + +config SCHED_CREDIT + bool "Credit scheduler support" + default y + ---help--- + The traditional credit scheduler is a general purpose scheduler. + +config SCHED_CREDIT2 + bool "Credit2 scheduler support" + default y + ---help--- + The credit2 scheduler is a general purpose scheduler that is + optimized for lower latency and higher VM density. + +config SCHED_RTDS + bool "RTDS scheduler support (EXPERIMENTAL)" + default y + ---help--- + The RTDS scheduler is a soft and firm real-time scheduler for + multicore, targeted for embedded, automotive, graphics and gaming + in the cloud, and general low-latency workloads. + +config SCHED_ARINC653 + bool "ARINC653 scheduler support (EXPERIMENTAL)" + default DEBUG + ---help--- + The ARINC653 scheduler is a hard real-time scheduler for single + cores, targeted for avionics, drones, and medical devices. + +config SCHED_NULL + bool "Null scheduler support (EXPERIMENTAL)" + default y + ---help--- + The null scheduler is a static, zero overhead scheduler, + for when there always are less vCPUs than pCPUs, typically + in embedded or HPC scenarios. + +choice + prompt "Default Scheduler?" 
+ default SCHED_CREDIT2_DEFAULT + + config SCHED_CREDIT_DEFAULT + bool "Credit Scheduler" if SCHED_CREDIT + config SCHED_CREDIT2_DEFAULT + bool "Credit2 Scheduler" if SCHED_CREDIT2 + config SCHED_RTDS_DEFAULT + bool "RT Scheduler" if SCHED_RTDS + config SCHED_ARINC653_DEFAULT + bool "ARINC653 Scheduler" if SCHED_ARINC653 + config SCHED_NULL_DEFAULT + bool "Null Scheduler" if SCHED_NULL +endchoice + +config SCHED_DEFAULT + string + default "credit" if SCHED_CREDIT_DEFAULT + default "credit2" if SCHED_CREDIT2_DEFAULT + default "rtds" if SCHED_RTDS_DEFAULT + default "arinc653" if SCHED_ARINC653_DEFAULT + default "null" if SCHED_NULL_DEFAULT + default "credit2" + +endmenu diff --git a/xen/common/sched/Makefile b/xen/common/sched/Makefile new file mode 100644 index 0000000000..3537f2a68d --- /dev/null +++ b/xen/common/sched/Makefile @@ -0,0 +1,7 @@ +obj-y += cpupool.o +obj-$(CONFIG_SCHED_ARINC653) += arinc653.o +obj-$(CONFIG_SCHED_CREDIT) += credit.o +obj-$(CONFIG_SCHED_CREDIT2) += credit2.o +obj-$(CONFIG_SCHED_RTDS) += rt.o +obj-$(CONFIG_SCHED_NULL) += null.o +obj-y += core.o diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c new file mode 100644 index 0000000000..565575c326 --- /dev/null +++ b/xen/common/sched/arinc653.c @@ -0,0 +1,739 @@ +/****************************************************************************** + * sched_arinc653.c + * + * An ARINC653-compatible scheduling algorithm for use in Xen. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2010, DornerWorks, Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************************************************** + * Private Macros * + **************************************************************************/ + +/** + * Default timeslice for domain 0. 
+ */ +#define DEFAULT_TIMESLICE MILLISECS(10) + +/** + * Retrieve the idle UNIT for a given physical CPU + */ +#define IDLETASK(cpu) (sched_idle_unit(cpu)) + +/** + * Return a pointer to the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit) + */ +#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv) + +/** + * Return the global scheduler private data given the scheduler ops pointer + */ +#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) + +/************************************************************************** + * Private Type Definitions * + **************************************************************************/ + +/** + * The arinc653_unit_t structure holds ARINC 653-scheduler-specific + * information for all non-idle UNITs + */ +typedef struct arinc653_unit_s +{ + /* unit points to Xen's struct sched_unit so we can get to it from an + * arinc653_unit_t pointer. */ + struct sched_unit * unit; + /* awake holds whether the UNIT has been woken with vcpu_wake() */ + bool_t awake; + /* list holds the linked list information for the list this UNIT + * is stored in */ + struct list_head list; +} arinc653_unit_t; + +/** + * The sched_entry_t structure holds a single entry of the + * ARINC 653 schedule. + */ +typedef struct sched_entry_s +{ + /* dom_handle holds the handle ("UUID") for the domain that this + * schedule entry refers to. */ + xen_domain_handle_t dom_handle; + /* unit_id holds the UNIT number for the UNIT that this schedule + * entry refers to. */ + int unit_id; + /* runtime holds the number of nanoseconds that the UNIT for this + * schedule entry should be allowed to run per major frame. */ + s_time_t runtime; + /* unit holds a pointer to the Xen sched_unit structure */ + struct sched_unit * unit; +} sched_entry_t; + +/** + * This structure defines data that is global to an instance of the scheduler + */ +typedef struct a653sched_priv_s +{ + /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ + spinlock_t lock; + + /** + * This array holds the active ARINC 653 schedule. + * + * When the system tries to start a new UNIT, this schedule is scanned + * to look for a matching (handle, UNIT #) pair. If both the handle (UUID) + * and UNIT number match, then the UNIT is allowed to run. Its run time + * (per major frame) is given in the third entry of the schedule. + */ + sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; + + /** + * This variable holds the number of entries that are valid in + * the arinc653_schedule table. + * + * This is not necessarily the same as the number of domains in the + * schedule. A domain could be listed multiple times within the schedule, + * or a domain with multiple UNITs could have a different + * schedule entry for each UNIT. + */ + unsigned int num_schedule_entries; + + /** + * the major frame time for the ARINC 653 schedule. + */ + s_time_t major_frame; + + /** + * the time that the next major frame starts + */ + s_time_t next_major_frame; + + /** + * pointers to all Xen UNIT structures for iterating through + */ + struct list_head unit_list; +} a653sched_priv_t; + +/************************************************************************** + * Helper functions * + **************************************************************************/ + +/** + * This function compares two domain handles. + * + * @param h1 Pointer to handle 1 + * @param h2 Pointer to handle 2 + * + * @return
+ *         <0: handle 1 is less than handle 2
+ *          0: handle 1 is equal to handle 2
+ *         >0: handle 1 is greater than handle 2
+ */ +static int dom_handle_cmp(const xen_domain_handle_t h1, + const xen_domain_handle_t h2) +{ + return memcmp(h1, h2, sizeof(xen_domain_handle_t)); +} + +/** + * This function searches the unit list to find a UNIT that matches + * the domain handle and UNIT ID specified. + * + * @param ops Pointer to this instance of the scheduler structure + * @param handle Pointer to handler + * @param unit_id UNIT ID + * + * @return
+ *         Pointer to the matching UNIT if one is found
+ *         NULL otherwise
+ */ +static struct sched_unit *find_unit( + const struct scheduler *ops, + xen_domain_handle_t handle, + int unit_id) +{ + arinc653_unit_t *aunit; + + /* loop through the unit_list looking for the specified UNIT */ + list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list ) + if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0) + && (unit_id == aunit->unit->unit_id) ) + return aunit->unit; + + return NULL; +} + +/** + * This function updates the pointer to the Xen UNIT structure for each entry + * in the ARINC 653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * @return + */ +static void update_schedule_units(const struct scheduler *ops) +{ + unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; + + for ( i = 0; i < n_entries; i++ ) + SCHED_PRIV(ops)->schedule[i].unit = + find_unit(ops, + SCHED_PRIV(ops)->schedule[i].dom_handle, + SCHED_PRIV(ops)->schedule[i].unit_id); +} + +/** + * This function is called by the adjust_global scheduler hook to put + * in place a new ARINC653 schedule. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +arinc653_sched_set( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + s_time_t total_runtime = 0; + unsigned int i; + unsigned long flags; + int rc = -EINVAL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* Check for valid major frame and number of schedule entries. */ + if ( (schedule->major_frame <= 0) + || (schedule->num_sched_entries < 1) + || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) + goto fail; + + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + /* Check for a valid run time. */ + if ( schedule->sched_entries[i].runtime <= 0 ) + goto fail; + + /* Add this entry's run time to total run time. */ + total_runtime += schedule->sched_entries[i].runtime; + } + + /* + * Error if the major frame is not large enough to run all entries as + * indicated by comparing the total run time to the major frame length. + */ + if ( total_runtime > schedule->major_frame ) + goto fail; + + /* Copy the new schedule into place. */ + sched_priv->num_schedule_entries = schedule->num_sched_entries; + sched_priv->major_frame = schedule->major_frame; + for ( i = 0; i < schedule->num_sched_entries; i++ ) + { + memcpy(sched_priv->schedule[i].dom_handle, + schedule->sched_entries[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + sched_priv->schedule[i].unit_id = + schedule->sched_entries[i].vcpu_id; + sched_priv->schedule[i].runtime = + schedule->sched_entries[i].runtime; + } + update_schedule_units(ops); + + /* + * The newly-installed schedule takes effect immediately. We do not even + * wait for the current major frame to expire. + * + * Signal a new major frame to begin. The next major frame is set up by + * the do_schedule callback function when it is next invoked. + */ + sched_priv->next_major_frame = NOW(); + + rc = 0; + + fail: + spin_unlock_irqrestore(&sched_priv->lock, flags); + return rc; +} + +/** + * This function is called by the adjust_global scheduler hook to read the + * current ARINC 653 schedule + * + * @param ops Pointer to this instance of the scheduler structure + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +arinc653_sched_get( + const struct scheduler *ops, + struct xen_sysctl_arinc653_schedule *schedule) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + unsigned int i; + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + schedule->num_sched_entries = sched_priv->num_schedule_entries; + schedule->major_frame = sched_priv->major_frame; + for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) + { + memcpy(schedule->sched_entries[i].dom_handle, + sched_priv->schedule[i].dom_handle, + sizeof(sched_priv->schedule[i].dom_handle)); + schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id; + schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; + } + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return 0; +} + +/************************************************************************** + * Scheduler callback functions * + **************************************************************************/ + +/** + * This function performs initialization for an instance of the scheduler. + * + * @param ops Pointer to this instance of the scheduler structure + * + * @return
+ *         0 = success
+ *         !0 = error
+ */ +static int +a653sched_init(struct scheduler *ops) +{ + a653sched_priv_t *prv; + + prv = xzalloc(a653sched_priv_t); + if ( prv == NULL ) + return -ENOMEM; + + ops->sched_data = prv; + + prv->next_major_frame = 0; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->unit_list); + + return 0; +} + +/** + * This function performs deinitialization for an instance of the scheduler + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_deinit(struct scheduler *ops) +{ + xfree(SCHED_PRIV(ops)); + ops->sched_data = NULL; +} + +/** + * This function allocates scheduler-specific data for a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Pointer to the allocated data + */ +static void * +a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *svc; + unsigned int entry; + unsigned long flags; + + /* + * Allocate memory for the ARINC 653-specific scheduler data information + * associated with the given UNIT (unit). + */ + svc = xmalloc(arinc653_unit_t); + if ( svc == NULL ) + return NULL; + + spin_lock_irqsave(&sched_priv->lock, flags); + + /* + * Add every one of dom0's units to the schedule, as long as there are + * slots available. + */ + if ( unit->domain->domain_id == 0 ) + { + entry = sched_priv->num_schedule_entries; + + if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) + { + sched_priv->schedule[entry].dom_handle[0] = '\0'; + sched_priv->schedule[entry].unit_id = unit->unit_id; + sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; + sched_priv->schedule[entry].unit = unit; + + sched_priv->major_frame += DEFAULT_TIMESLICE; + ++sched_priv->num_schedule_entries; + } + } + + /* + * Initialize our ARINC 653 scheduler-specific information for the UNIT. + * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it + * will call the vcpu_wake scheduler callback function and our scheduler + * will mark the UNIT awake. + */ + svc->unit = unit; + svc->awake = 0; + if ( !is_idle_unit(unit) ) + list_add(&svc->list, &SCHED_PRIV(ops)->unit_list); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + return svc; +} + +/** + * This function frees scheduler-specific UNIT data + * + * @param ops Pointer to this instance of the scheduler structure + */ +static void +a653sched_free_udata(const struct scheduler *ops, void *priv) +{ + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + arinc653_unit_t *av = priv; + unsigned long flags; + + if (av == NULL) + return; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( !is_idle_unit(av->unit) ) + list_del(&av->list); + + xfree(av); + update_schedule_units(ops); + + spin_unlock_irqrestore(&sched_priv->lock, flags); +} + +/** + * Xen scheduler callback function to sleep a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 0; + + /* + * If the UNIT being put to sleep is the same one that is currently + * running, raise a softirq to invoke the scheduler to switch domains. 
+ */ + if ( get_sched_res(sched_unit_master(unit))->curr == unit ) + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to wake up a UNIT + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + */ +static void +a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + if ( AUNIT(unit) != NULL ) + AUNIT(unit)->awake = 1; + + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); +} + +/** + * Xen scheduler callback function to select a UNIT to run. + * This is the main scheduler routine. + * + * @param ops Pointer to this instance of the scheduler structure + * @param now Current time + */ +static void +a653sched_do_schedule( + const struct scheduler *ops, + struct sched_unit *prev, + s_time_t now, + bool tasklet_work_scheduled) +{ + struct sched_unit *new_task = NULL; + static unsigned int sched_index = 0; + static s_time_t next_switch_time; + a653sched_priv_t *sched_priv = SCHED_PRIV(ops); + const unsigned int cpu = sched_get_resource_cpu(smp_processor_id()); + unsigned long flags; + + spin_lock_irqsave(&sched_priv->lock, flags); + + if ( sched_priv->num_schedule_entries < 1 ) + sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; + else if ( now >= sched_priv->next_major_frame ) + { + /* time to enter a new major frame + * the first time this function is called, this will be true */ + /* start with the first domain in the schedule */ + sched_index = 0; + sched_priv->next_major_frame = now + sched_priv->major_frame; + next_switch_time = now + sched_priv->schedule[0].runtime; + } + else + { + while ( (now >= next_switch_time) + && (sched_index < sched_priv->num_schedule_entries) ) + { + /* time to switch to the next domain in this major frame */ + sched_index++; + next_switch_time += sched_priv->schedule[sched_index].runtime; + } + } + + /* + * If we exhausted the domains in the schedule and still have time left + * in the major frame then switch next at the next major frame. + */ + if ( sched_index >= sched_priv->num_schedule_entries ) + next_switch_time = sched_priv->next_major_frame; + + /* + * If there are more domains to run in the current major frame, set + * new_task equal to the address of next domain's sched_unit structure. + * Otherwise, set new_task equal to the address of the idle task's + * sched_unit structure. + */ + new_task = (sched_index < sched_priv->num_schedule_entries) + ? sched_priv->schedule[sched_index].unit + : IDLETASK(cpu); + + /* Check to see if the new task can be run (awake & runnable). */ + if ( !((new_task != NULL) + && (AUNIT(new_task) != NULL) + && AUNIT(new_task)->awake + && unit_runnable_state(new_task)) ) + new_task = IDLETASK(cpu); + BUG_ON(new_task == NULL); + + /* + * Check to make sure we did not miss a major frame. + * This is a good test for robust partitioning. + */ + BUG_ON(now >= sched_priv->next_major_frame); + + spin_unlock_irqrestore(&sched_priv->lock, flags); + + /* Tasklet work (which runs in idle UNIT context) overrides all else. */ + if ( tasklet_work_scheduled ) + new_task = IDLETASK(cpu); + + /* Running this task would result in a migration */ + if ( !is_idle_unit(new_task) + && (sched_unit_master(new_task) != cpu) ) + new_task = IDLETASK(cpu); + + /* + * Return the amount of time the next domain has to run and the address + * of the selected task's UNIT structure. 
+ */ + prev->next_time = next_switch_time - now; + prev->next_task = new_task; + new_task->migrated = false; + + BUG_ON(prev->next_time <= 0); +} + +/** + * Xen scheduler callback function to select a resource for the UNIT to run on + * + * @param ops Pointer to this instance of the scheduler structure + * @param unit Pointer to struct sched_unit + * + * @return Scheduler resource to run on + */ +static struct sched_resource * +a653sched_pick_resource(const struct scheduler *ops, + const struct sched_unit *unit) +{ + cpumask_t *online; + unsigned int cpu; + + /* + * If present, prefer unit's current processor, else + * just find the first valid unit. + */ + online = cpupool_domain_master_cpumask(unit->domain); + + cpu = cpumask_first(online); + + if ( cpumask_test_cpu(sched_unit_master(unit), online) + || (cpu >= nr_cpu_ids) ) + cpu = sched_unit_master(unit); + + return get_sched_res(cpu); +} + +/** + * Xen scheduler callback to change the scheduler of a cpu + * + * @param new_ops Pointer to this instance of the scheduler structure + * @param cpu The cpu that is changing scheduler + * @param pdata scheduler specific PCPU data (we don't have any) + * @param vdata scheduler specific UNIT data of the idle unit + */ +static spinlock_t * +a653_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + arinc653_unit_t *svc = vdata; + + ASSERT(!pdata && svc && is_idle_unit(svc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + return &sr->_lock; +} + +/** + * Xen scheduler callback function to perform a global (not domain-specific) + * adjustment. It is used by the ARINC 653 scheduler to put in place a new + * ARINC 653 schedule or to retrieve the schedule currently in place. + * + * @param ops Pointer to this instance of the scheduler structure + * @param sc Pointer to the scheduler operation specified by Domain 0 + */ +static int +a653sched_adjust_global(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + struct xen_sysctl_arinc653_schedule local_sched; + int rc = -EINVAL; + + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) + { + rc = -EFAULT; + break; + } + + rc = arinc653_sched_set(ops, &local_sched); + break; + case XEN_SYSCTL_SCHEDOP_getinfo: + memset(&local_sched, -1, sizeof(local_sched)); + rc = arinc653_sched_get(ops, &local_sched); + if ( rc ) + break; + + if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) + rc = -EFAULT; + break; + } + + return rc; +} + +/** + * This structure defines our scheduler for Xen. + * The entries tell Xen where to find our scheduler-specific + * callback functions. + * The symbol must be visible to the rest of Xen at link time. 
+ */ +static const struct scheduler sched_arinc653_def = { + .name = "ARINC 653 Scheduler", + .opt_name = "arinc653", + .sched_id = XEN_SCHEDULER_ARINC653, + .sched_data = NULL, + + .init = a653sched_init, + .deinit = a653sched_deinit, + + .free_udata = a653sched_free_udata, + .alloc_udata = a653sched_alloc_udata, + + .insert_unit = NULL, + .remove_unit = NULL, + + .sleep = a653sched_unit_sleep, + .wake = a653sched_unit_wake, + .yield = NULL, + .context_saved = NULL, + + .do_schedule = a653sched_do_schedule, + + .pick_resource = a653sched_pick_resource, + + .switch_sched = a653_switch_sched, + + .adjust = NULL, + .adjust_global = a653sched_adjust_global, + + .dump_settings = NULL, + .dump_cpu_state = NULL, +}; + +REGISTER_SCHEDULER(sched_arinc653_def); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c new file mode 100644 index 0000000000..040b4caca2 --- /dev/null +++ b/xen/common/sched/compat.c @@ -0,0 +1,55 @@ +/**************************************************************************** + * schedule.c + * + */ + +#include + +#define COMPAT +#define ret_t int + +#define do_sched_op compat_sched_op + +#define xen_sched_pin_override sched_pin_override +CHECK_sched_pin_override; +#undef xen_sched_pin_override + +#define xen_sched_shutdown sched_shutdown +CHECK_sched_shutdown; +#undef xen_sched_shutdown + +#define xen_sched_remote_shutdown sched_remote_shutdown +CHECK_sched_remote_shutdown; +#undef xen_sched_remote_shutdown + +static int compat_poll(struct compat_sched_poll *compat) +{ + struct sched_poll native; + +#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ + guest_from_compat_handle((_d_)->ports, (_s_)->ports) + XLAT_sched_poll(&native, compat); +#undef XLAT_sched_poll_HNDL_ports + + return do_poll(&native); +} + +#define do_poll compat_poll +#define sched_poll compat_sched_poll + +#include "core.c" + +int compat_set_timer_op(u32 lo, s32 hi) +{ + return do_set_timer_op(((s64)hi << 32) | lo); +} + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c new file mode 100644 index 0000000000..4d8eb4c617 --- /dev/null +++ b/xen/common/sched/core.c @@ -0,0 +1,3144 @@ +/**************************************************************************** + * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge + * (C) 2002-2003 University of Cambridge + * (C) 2004 - Mark Williamson - Intel Research Cambridge + **************************************************************************** + * + * File: common/schedule.c + * Author: Rolf Neugebauer & Keir Fraser + * Updated for generic API by Mark Williamson + * + * Description: Generic CPU scheduling code + * implements support functionality for the Xen scheduler API. 
+ * + */ + +#ifndef COMPAT +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_XEN_GUEST +#include +#else +#define pv_shim false +#endif + +/* opt_sched: scheduler - default to configured value */ +static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; +string_param("sched", opt_sched); + +/* if sched_smt_power_savings is set, + * scheduler will give preferrence to partially idle package compared to + * the full idle package, when picking pCPU to schedule vCPU. + */ +bool_t sched_smt_power_savings = 0; +boolean_param("sched_smt_power_savings", sched_smt_power_savings); + +/* Default scheduling rate limit: 1ms + * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined + * */ +int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; +integer_param("sched_ratelimit_us", sched_ratelimit_us); + +/* Number of vcpus per struct sched_unit. */ +bool __read_mostly sched_disable_smt_switching; +cpumask_t sched_res_mask; + +/* Common lock for free cpus. */ +static DEFINE_SPINLOCK(sched_free_cpu_lock); + +/* Various timer handlers. */ +static void s_timer_fn(void *unused); +static void vcpu_periodic_timer_fn(void *data); +static void vcpu_singleshot_timer_fn(void *data); +static void poll_timer_fn(void *data); + +/* This is global for now so that private implementations can reach it */ +DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res); +static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx); +DEFINE_RCU_READ_LOCK(sched_res_rculock); + +/* Scratch space for cpumasks. */ +DEFINE_PER_CPU(cpumask_t, cpumask_scratch); + +/* How many urgent vcpus. */ +DEFINE_PER_CPU(atomic_t, sched_urgent_count); + +extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; +#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) +#define schedulers __start_schedulers_array + +static struct scheduler __read_mostly ops; + +static bool scheduler_active; + +static void sched_set_affinity( + struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft); + +static struct sched_resource * +sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + return unit->res; +} + +static void * +sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + /* Any non-NULL pointer is fine here. */ + return ZERO_BLOCK_PTR; +} + +static void +sched_idle_free_udata(const struct scheduler *ops, void *priv) +{ +} + +static void sched_idle_schedule( + const struct scheduler *ops, struct sched_unit *unit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cpu = smp_processor_id(); + + unit->next_time = -1; + unit->next_task = sched_idle_unit(cpu); +} + +static struct scheduler sched_idle_ops = { + .name = "Idle Scheduler", + .opt_name = "idle", + .sched_data = NULL, + + .pick_resource = sched_idle_res_pick, + .do_schedule = sched_idle_schedule, + + .alloc_udata = sched_idle_alloc_udata, + .free_udata = sched_idle_free_udata, +}; + +static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu); + const struct domain *d = unit->domain; + + return (idx < d->max_vcpus) ? 
d->vcpu[idx] : NULL; +} + +static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit, + unsigned int cpu) +{ + struct vcpu *v = unit2vcpu_cpu(unit, cpu); + + return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu]; +} + +static inline struct scheduler *dom_scheduler(const struct domain *d) +{ + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is the idle domain. This is special + * because the idle domain does not really belong to any cpupool, and, + * hence, does not really have a scheduler. + * + * This is (should be!) only called like this for allocating the idle + * vCPUs for the first time, during boot, in which case what we want + * is the default scheduler that has been, choosen at boot. + */ + ASSERT(is_idle_domain(d)); + return &ops; +} + +static inline struct scheduler *unit_scheduler(const struct sched_unit *unit) +{ + struct domain *d = unit->domain; + + if ( likely(d->cpupool != NULL) ) + return d->cpupool->sched; + + /* + * If d->cpupool is NULL, this is a unit of the idle domain. And this + * case is special because the idle domain does not really belong to + * a cpupool and, hence, doesn't really have a scheduler). In fact, its + * units (may) run on pCPUs which are in different pools, with different + * schedulers. + * + * What we want, in this case, is the scheduler of the pCPU where this + * particular idle unit is running. And, since unit->res never changes + * for idle units, it is safe to use it, with no locks, to figure that out. + */ + + ASSERT(is_idle_domain(d)); + return unit->res->scheduler; +} + +static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) +{ + return unit_scheduler(v->sched_unit); +} +#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) + +static inline void trace_runstate_change(struct vcpu *v, int new_state) +{ + struct { uint32_t vcpu:16, domain:16; } d; + uint32_t event; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + event = TRC_SCHED_RUNSTATE_CHANGE; + event |= ( v->runstate.state & 0x3 ) << 8; + event |= ( new_state & 0x3 ) << 4; + + __trace_var(event, 1/*tsc*/, sizeof(d), &d); +} + +static inline void trace_continue_running(struct vcpu *v) +{ + struct { uint32_t vcpu:16, domain:16; } d; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); +} + +static inline void vcpu_urgent_count_update(struct vcpu *v) +{ + if ( is_idle_vcpu(v) ) + return; + + if ( unlikely(v->is_urgent) ) + { + if ( !(v->pause_flags & VPF_blocked) || + !test_bit(v->vcpu_id, v->domain->poll_mask) ) + { + v->is_urgent = 0; + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + } + } + else + { + if ( unlikely(v->pause_flags & VPF_blocked) && + unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) + { + v->is_urgent = 1; + atomic_inc(&per_cpu(sched_urgent_count, v->processor)); + } + } +} + +static inline void vcpu_runstate_change( + struct vcpu *v, int new_state, s_time_t new_entry_time) +{ + s_time_t delta; + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + if ( v->runstate.state == new_state ) + return; + + vcpu_urgent_count_update(v); + + trace_runstate_change(v, new_state); + + if ( !is_idle_vcpu(v) ) + { + unit->runstate_cnt[v->runstate.state]--; + unit->runstate_cnt[new_state]++; + } + + delta = new_entry_time - 
v->runstate.state_entry_time; + if ( delta > 0 ) + { + v->runstate.time[v->runstate.state] += delta; + v->runstate.state_entry_time = new_entry_time; + } + + v->runstate.state = new_state; +} + +void sched_guest_idle(void (*idle) (void), unsigned int cpu) +{ + /* + * Another vcpu of the unit is active in guest context while this one is + * idle. In case of a scheduling event we don't want to have high latencies + * due to a cpu needing to wake up from deep C state for joining the + * rendezvous, so avoid those deep C states by incrementing the urgent + * count of the cpu. + */ + atomic_inc(&per_cpu(sched_urgent_count, cpu)); + idle(); + atomic_dec(&per_cpu(sched_urgent_count, cpu)); +} + +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) +{ + spinlock_t *lock; + s_time_t delta; + + rcu_read_lock(&sched_res_rculock); + + lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit); + memcpy(runstate, &v->runstate, sizeof(*runstate)); + delta = NOW() - runstate->state_entry_time; + if ( delta > 0 ) + runstate->time[runstate->state] += delta; + + if ( unlikely(lock != NULL) ) + unit_schedule_unlock_irq(lock, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +uint64_t get_cpu_idle_time(unsigned int cpu) +{ + struct vcpu_runstate_info state = { 0 }; + struct vcpu *v = idle_vcpu[cpu]; + + if ( cpu_online(cpu) && v ) + vcpu_runstate_get(v, &state); + + return state.time[RUNSTATE_running]; +} + +/* + * If locks are different, take the one with the lower address first. + * This avoids dead- or live-locks when this code is running on both + * cpus at the same time. + */ +static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long *flags) +{ + if ( lock1 == lock2 ) + { + spin_lock_irqsave(lock1, *flags); + } + else if ( lock1 < lock2 ) + { + spin_lock_irqsave(lock1, *flags); + spin_lock(lock2); + } + else + { + spin_lock_irqsave(lock2, *flags); + spin_lock(lock1); + } +} + +static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, + unsigned long flags) +{ + if ( lock1 != lock2 ) + spin_unlock(lock2); + spin_unlock_irqrestore(lock1, flags); +} + +static void sched_free_unit_mem(struct sched_unit *unit) +{ + struct sched_unit *prev_unit; + struct domain *d = unit->domain; + + if ( d->sched_unit_list == unit ) + d->sched_unit_list = unit->next_in_list; + else + { + for_each_sched_unit ( d, prev_unit ) + { + if ( prev_unit->next_in_list == unit ) + { + prev_unit->next_in_list = unit->next_in_list; + break; + } + } + } + + free_cpumask_var(unit->cpu_hard_affinity); + free_cpumask_var(unit->cpu_hard_affinity_saved); + free_cpumask_var(unit->cpu_soft_affinity); + + xfree(unit); +} + +static void sched_free_unit(struct sched_unit *unit, struct vcpu *v) +{ + struct vcpu *vunit; + unsigned int cnt = 0; + + /* Don't count to be released vcpu, might be not in vcpu list yet. */ + for_each_sched_unit_vcpu ( unit, vunit ) + if ( vunit != v ) + cnt++; + + v->sched_unit = NULL; + unit->runstate_cnt[v->runstate.state]--; + + if ( unit->vcpu_list == v ) + unit->vcpu_list = v->next_in_list; + + if ( !cnt ) + sched_free_unit_mem(unit); +} + +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v) +{ + v->sched_unit = unit; + + /* All but idle vcpus are allocated with sequential vcpu_id. */ + if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id ) + { + unit->vcpu_list = v; + /* + * unit_id is always the same as lowest vcpu_id of unit. 
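/*
 * A self-contained sketch of the "lower address first" lock ordering used by
 * sched_spin_lock_double() above, here with POSIX mutexes.  Whatever order
 * callers name the two locks in, both acquire them in the same global order,
 * so an ABBA deadlock cannot occur.  Helper names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a == b )
        pthread_mutex_lock(a);
    else if ( a < b )
    {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    }
    else
    {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if ( a != b )
        pthread_mutex_unlock(b);
    pthread_mutex_unlock(a);
}

int main(void)
{
    pthread_mutex_t l1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t l2 = PTHREAD_MUTEX_INITIALIZER;

    lock_double(&l2, &l1);      /* caller order does not matter */
    puts("both locks held");
    unlock_double(&l2, &l1);
    return 0;
}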
+ * This is used for stopping for_each_sched_unit_vcpu() loop and in + * order to support cpupools with different granularities. + */ + unit->unit_id = v->vcpu_id; + } + unit->runstate_cnt[v->runstate.state]++; +} + +static struct sched_unit *sched_alloc_unit_mem(void) +{ + struct sched_unit *unit; + + unit = xzalloc(struct sched_unit); + if ( !unit ) + return NULL; + + if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || + !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || + !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) + { + sched_free_unit_mem(unit); + unit = NULL; + } + + return unit; +} + +static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) +{ + struct sched_unit **prev_unit; + + unit->domain = d; + + for ( prev_unit = &d->sched_unit_list; *prev_unit; + prev_unit = &(*prev_unit)->next_in_list ) + if ( (*prev_unit)->next_in_list && + (*prev_unit)->next_in_list->unit_id > unit->unit_id ) + break; + + unit->next_in_list = *prev_unit; + *prev_unit = unit; +} + +static struct sched_unit *sched_alloc_unit(struct vcpu *v) +{ + struct sched_unit *unit; + struct domain *d = v->domain; + unsigned int gran = cpupool_get_granularity(d->cpupool); + + for_each_sched_unit ( d, unit ) + if ( unit->unit_id / gran == v->vcpu_id / gran ) + break; + + if ( unit ) + { + sched_unit_add_vcpu(unit, v); + return unit; + } + + if ( (unit = sched_alloc_unit_mem()) == NULL ) + return NULL; + + sched_unit_add_vcpu(unit, v); + sched_domain_insert_unit(unit, d); + + return unit; +} + +static unsigned int sched_select_initial_cpu(const struct vcpu *v) +{ + const struct domain *d = v->domain; + nodeid_t node; + spinlock_t *lock; + unsigned long flags; + unsigned int cpu_ret, cpu = smp_processor_id(); + cpumask_t *cpus = cpumask_scratch_cpu(cpu); + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + cpumask_clear(cpus); + for_each_node_mask ( node, d->node_affinity ) + cpumask_or(cpus, cpus, &node_to_cpumask(node)); + cpumask_and(cpus, cpus, d->cpupool->cpu_valid); + if ( cpumask_empty(cpus) ) + cpumask_copy(cpus, d->cpupool->cpu_valid); + + if ( v->vcpu_id == 0 ) + cpu_ret = cpumask_first(cpus); + else + { + /* We can rely on previous vcpu being available. */ + ASSERT(!is_idle_domain(d)); + + cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); + + return cpu_ret; +} + +int sched_init_vcpu(struct vcpu *v) +{ + struct domain *d = v->domain; + struct sched_unit *unit; + unsigned int processor; + + if ( (unit = sched_alloc_unit(v)) == NULL ) + return 1; + + if ( is_idle_domain(d) ) + processor = v->vcpu_id; + else + processor = sched_select_initial_cpu(v); + + /* Initialise the per-vcpu timers. */ + spin_lock_init(&v->periodic_timer_lock); + init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); + init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); + init_timer(&v->poll_timer, poll_timer_fn, v, processor); + + /* If this is not the first vcpu of the unit we are done. */ + if ( unit->priv != NULL ) + { + v->processor = processor; + return 0; + } + + rcu_read_lock(&sched_res_rculock); + + /* The first vcpu of an unit can be set via sched_set_res(). */ + sched_set_res(unit, get_sched_res(processor)); + + unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); + if ( unit->priv == NULL ) + { + sched_free_unit(unit, v); + rcu_read_unlock(&sched_res_rculock); + return 1; + } + + /* + * Initialize affinity settings. 
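/*
 * A small sketch of how sched_alloc_unit() above groups vCPUs into units for
 * a given cpupool granularity: two vCPUs share a unit iff their ids divided by
 * the granularity are equal, and the unit id is the lowest vcpu_id in the
 * group.  Purely illustrative arithmetic, no Xen structures involved.
 */
#include <stdio.h>

static unsigned int unit_id_of(unsigned int vcpu_id, unsigned int gran)
{
    return (vcpu_id / gran) * gran;   /* lowest vcpu_id of the unit */
}

int main(void)
{
    const unsigned int gran = 2;      /* e.g. core scheduling on 2-thread SMT */

    for ( unsigned int v = 0; v < 6; v++ )
        printf("vcpu %u -> unit %u\n", v, unit_id_of(v, gran));
    /* Prints: 0->0, 1->0, 2->2, 3->2, 4->4, 5->4 */
    return 0;
}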
The idler, and potentially + * domain-0 VCPUs, are pinned onto their respective physical CPUs. + */ + if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) + sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); + else + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ + if ( is_idle_domain(d) ) + { + get_sched_res(v->processor)->curr = unit; + get_sched_res(v->processor)->sched_unit_idle = unit; + v->is_running = 1; + unit->is_running = true; + unit->state_entry_time = NOW(); + } + else + { + sched_insert_unit(dom_scheduler(d), unit); + } + + rcu_read_unlock(&sched_res_rculock); + + return 0; +} + +static void vcpu_move_irqs(struct vcpu *v) +{ + arch_move_irqs(v); + evtchn_move_pirqs(v); +} + +static void sched_move_irqs(const struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + vcpu_move_irqs(v); +} + +int sched_move_domain(struct domain *d, struct cpupool *c) +{ + struct vcpu *v; + struct sched_unit *unit; + unsigned int new_p, unit_idx; + void **unit_priv; + void *domdata; + void *unitdata; + struct scheduler *old_ops; + void *old_domdata; + unsigned int gran = cpupool_get_granularity(c); + int ret = 0; + + for_each_vcpu ( d, v ) + { + if ( v->affinity_broken ) + return -EBUSY; + } + + rcu_read_lock(&sched_res_rculock); + + domdata = sched_alloc_domdata(c->sched, d); + if ( IS_ERR(domdata) ) + { + ret = PTR_ERR(domdata); + goto out; + } + + unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran)); + if ( unit_priv == NULL ) + { + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata); + if ( unit_priv[unit_idx] == NULL ) + { + for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ ) + sched_free_udata(c->sched, unit_priv[unit_idx]); + xfree(unit_priv); + sched_free_domdata(c->sched, domdata); + ret = -ENOMEM; + goto out; + } + unit_idx++; + } + + domain_pause(d); + + old_ops = dom_scheduler(d); + old_domdata = d->sched_priv; + + for_each_sched_unit ( d, unit ) + { + sched_remove_unit(old_ops, unit); + } + + d->cpupool = c; + d->sched_priv = domdata; + + new_p = cpumask_first(c->cpu_valid); + unit_idx = 0; + for_each_sched_unit ( d, unit ) + { + spinlock_t *lock; + unsigned int unit_p = new_p; + + unitdata = unit->priv; + + for_each_sched_unit_vcpu ( unit, v ) + { + migrate_timer(&v->periodic_timer, new_p); + migrate_timer(&v->singleshot_timer, new_p); + migrate_timer(&v->poll_timer, new_p); + new_p = cpumask_cycle(new_p, c->cpu_valid); + } + + lock = unit_schedule_lock_irq(unit); + + sched_set_affinity(unit, &cpumask_all, &cpumask_all); + + sched_set_res(unit, get_sched_res(unit_p)); + /* + * With v->processor modified we must not + * - make any further changes assuming we hold the scheduler lock, + * - use unit_schedule_unlock_irq(). 
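/*
 * A sketch of the "allocate everything first, commit only if nothing failed"
 * pattern that sched_move_domain() above follows: all new per-unit data is
 * prepared up front, any allocation failure rolls everything back, and the old
 * data is swapped out and freed only once no further error is possible.  The
 * names below are illustrative only.
 */
#include <stdlib.h>

struct item { void *priv; };

static int move_items(struct item *items, size_t n, size_t new_priv_size)
{
    void **new_priv = calloc(n, sizeof(*new_priv));
    size_t i;

    if ( !new_priv )
        return -1;

    /* Phase 1: allocate all replacements; roll back completely on failure. */
    for ( i = 0; i < n; i++ )
    {
        new_priv[i] = calloc(1, new_priv_size);
        if ( !new_priv[i] )
        {
            while ( i-- )
                free(new_priv[i]);
            free(new_priv);
            return -1;
        }
    }

    /* Phase 2: commit - swap pointers and release the old data. */
    for ( i = 0; i < n; i++ )
    {
        free(items[i].priv);
        items[i].priv = new_priv[i];
    }
    free(new_priv);
    return 0;
}

int main(void)
{
    struct item items[4] = { { NULL } };

    return move_items(items, 4, 32) ? 1 : 0;
}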
+ */ + spin_unlock_irq(lock); + + unit->priv = unit_priv[unit_idx]; + if ( !d->is_dying ) + sched_move_irqs(unit); + + sched_insert_unit(c->sched, unit); + + sched_free_udata(old_ops, unitdata); + + unit_idx++; + } + + domain_update_node_affinity(d); + + domain_unpause(d); + + sched_free_domdata(old_ops, old_domdata); + + xfree(unit_priv); + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +void sched_destroy_vcpu(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + kill_timer(&v->periodic_timer); + kill_timer(&v->singleshot_timer); + kill_timer(&v->poll_timer); + if ( test_and_clear_bool(v->is_urgent) ) + atomic_dec(&per_cpu(sched_urgent_count, v->processor)); + /* + * Vcpus are being destroyed top-down. So being the first vcpu of an unit + * is the same as being the only one. + */ + if ( unit->vcpu_list == v ) + { + rcu_read_lock(&sched_res_rculock); + + sched_remove_unit(vcpu_scheduler(v), unit); + sched_free_udata(vcpu_scheduler(v), unit->priv); + sched_free_unit(unit, v); + + rcu_read_unlock(&sched_res_rculock); + } +} + +int sched_init_domain(struct domain *d, int poolid) +{ + void *sdom; + int ret; + + ASSERT(d->cpupool == NULL); + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( (ret = cpupool_add_domain(d, poolid)) ) + return ret; + + SCHED_STAT_CRANK(dom_init); + TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sdom = sched_alloc_domdata(dom_scheduler(d), d); + + rcu_read_unlock(&sched_res_rculock); + + if ( IS_ERR(sdom) ) + return PTR_ERR(sdom); + + d->sched_priv = sdom; + + return 0; +} + +void sched_destroy_domain(struct domain *d) +{ + ASSERT(d->domain_id < DOMID_FIRST_RESERVED); + + if ( d->cpupool ) + { + SCHED_STAT_CRANK(dom_destroy); + TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); + + rcu_read_lock(&sched_res_rculock); + + sched_free_domdata(dom_scheduler(d), d->sched_priv); + d->sched_priv = NULL; + + rcu_read_unlock(&sched_res_rculock); + + cpupool_rm_domain(d); + } +} + +static void vcpu_sleep_nosync_locked(struct vcpu *v) +{ + struct sched_unit *unit = v->sched_unit; + + ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); + + if ( likely(!vcpu_runnable(v)) ) + { + if ( v->runstate.state == RUNSTATE_runnable ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + + /* Only put unit to sleep in case all vcpus are not runnable. 
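/*
 * A sketch of the per-unit bookkeeping behind "only put the unit to sleep once
 * no vCPU of it is runnable" in vcpu_sleep_nosync_locked() above: keeping a
 * counter of runnable vCPUs per unit makes that check O(1) instead of a walk
 * over all vCPUs.  Types and names below are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_unit {
    unsigned int nr_vcpus;
    unsigned int nr_runnable;   /* cf. unit->runstate_cnt[] in the real code */
};

static void toy_vcpu_set_runnable(struct toy_unit *u, bool was, bool now)
{
    if ( was == now )
        return;
    if ( now )
        u->nr_runnable++;
    else
        u->nr_runnable--;
}

static bool toy_unit_should_sleep(const struct toy_unit *u)
{
    return u->nr_runnable == 0;
}

int main(void)
{
    struct toy_unit u = { .nr_vcpus = 2, .nr_runnable = 2 };

    toy_vcpu_set_runnable(&u, true, false);   /* one vCPU blocks */
    printf("sleep unit? %s\n", toy_unit_should_sleep(&u) ? "yes" : "no");
    toy_vcpu_set_runnable(&u, true, false);   /* the other one blocks too */
    printf("sleep unit? %s\n", toy_unit_should_sleep(&u) ? "yes" : "no");
    return 0;
}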
*/ + if ( likely(!unit_runnable(unit)) ) + sched_sleep(unit_scheduler(unit), unit); + else if ( unit_running(unit) > 1 && v->is_running && + !v->force_context_switch ) + { + v->force_context_switch = true; + cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); + } + } +} + +void vcpu_sleep_nosync(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + + TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); + + vcpu_sleep_nosync_locked(v); + + unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); +} + +void vcpu_sleep_sync(struct vcpu *v) +{ + vcpu_sleep_nosync(v); + + while ( !vcpu_runnable(v) && v->is_running ) + cpu_relax(); + + sync_vcpu_execstate(v); +} + +void vcpu_wake(struct vcpu *v) +{ + unsigned long flags; + spinlock_t *lock; + struct sched_unit *unit = v->sched_unit; + + TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irqsave(unit, &flags); + + if ( likely(vcpu_runnable(v)) ) + { + if ( v->runstate.state >= RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); + /* + * Call sched_wake() unconditionally, even if unit is running already. + * We might have not been de-scheduled after vcpu_sleep_nosync_locked() + * and are now to be woken up again. + */ + sched_wake(unit_scheduler(unit), unit); + if ( unit->is_running && !v->is_running && !v->force_context_switch ) + { + v->force_context_switch = true; + cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); + } + } + else if ( !(v->pause_flags & VPF_blocked) ) + { + if ( v->runstate.state == RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + } + + unit_schedule_unlock_irqrestore(lock, flags, unit); + + rcu_read_unlock(&sched_res_rculock); +} + +void vcpu_unblock(struct vcpu *v) +{ + if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) + return; + + /* Polling period ends when a VCPU is unblocked. */ + if ( unlikely(v->poll_evtchn != 0) ) + { + v->poll_evtchn = 0; + /* + * We *must* re-clear _VPF_blocked to avoid racing other wakeups of + * this VCPU (and it then going back to sleep on poll_mask). + * Test-and-clear is idiomatic and ensures clear_bit not reordered. + */ + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + clear_bit(_VPF_blocked, &v->pause_flags); + } + + vcpu_wake(v); +} + +/* + * Do the actual movement of an unit from old to new CPU. Locks for *both* + * CPUs needs to have been taken already when calling this! + */ +static void sched_unit_move_locked(struct sched_unit *unit, + unsigned int new_cpu) +{ + unsigned int old_cpu = unit->res->master_cpu; + struct vcpu *v; + + rcu_read_lock(&sched_res_rculock); + + /* + * Transfer urgency status to new CPU before switching CPUs, as + * once the switch occurs, v->is_urgent is no longer protected by + * the per-CPU scheduler lock we are holding. + */ + for_each_sched_unit_vcpu ( unit, v ) + { + if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) + { + atomic_inc(&per_cpu(sched_urgent_count, new_cpu)); + atomic_dec(&per_cpu(sched_urgent_count, old_cpu)); + } + } + + /* + * Actual CPU switch to new CPU. This is safe because the lock + * pointer can't change while the current lock is held. 
+ */ + sched_migrate(unit_scheduler(unit), unit, new_cpu); + + rcu_read_unlock(&sched_res_rculock); +} + +/* + * Initiating migration + * + * In order to migrate, we need the unit in question to have stopped + * running and have called sched_sleep() (to take it off any + * runqueues, for instance); and if it is currently running, it needs + * to be scheduled out. Finally, we need to hold the scheduling locks + * for both the processor we're migrating from, and the processor + * we're migrating to. + * + * In order to avoid deadlock while satisfying the final requirement, + * we must release any scheduling lock we hold, then try to grab both + * locks we want, then double-check to make sure that what we started + * to do hasn't been changed in the mean time. + * + * These steps are encapsulated in the following two functions; they + * should be called like this: + * + * lock = unit_schedule_lock_irq(unit); + * sched_unit_migrate_start(unit); + * unit_schedule_unlock_irq(lock, unit) + * sched_unit_migrate_finish(unit); + * + * sched_unit_migrate_finish() will do the work now if it can, or simply + * return if it can't (because unit is still running); in that case + * sched_unit_migrate_finish() will be called by unit_context_saved(). + */ +static void sched_unit_migrate_start(struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + { + set_bit(_VPF_migrating, &v->pause_flags); + vcpu_sleep_nosync_locked(v); + } +} + +static void sched_unit_migrate_finish(struct sched_unit *unit) +{ + unsigned long flags; + unsigned int old_cpu, new_cpu; + spinlock_t *old_lock, *new_lock; + bool_t pick_called = 0; + struct vcpu *v; + + /* + * If the unit is currently running, this will be handled by + * unit_context_saved(); and in any case, if the bit is cleared, then + * someone else has already done the work so we don't need to. + */ + if ( unit->is_running ) + return; + for_each_sched_unit_vcpu ( unit, v ) + if ( !test_bit(_VPF_migrating, &v->pause_flags) ) + return; + + old_cpu = new_cpu = unit->res->master_cpu; + for ( ; ; ) + { + /* + * We need another iteration if the pre-calculated lock addresses + * are not correct any longer after evaluating old and new cpu holding + * the locks. + */ + old_lock = get_sched_res(old_cpu)->schedule_lock; + new_lock = get_sched_res(new_cpu)->schedule_lock; + + sched_spin_lock_double(old_lock, new_lock, &flags); + + old_cpu = unit->res->master_cpu; + if ( old_lock == get_sched_res(old_cpu)->schedule_lock ) + { + /* + * If we selected a CPU on the previosu iteration, check if it + * remains suitable for running this vCPU. + */ + if ( pick_called && + (new_lock == get_sched_res(new_cpu)->schedule_lock) && + cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) && + cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) + break; + + /* Select a new CPU. */ + new_cpu = sched_pick_resource(unit_scheduler(unit), + unit)->master_cpu; + if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) && + cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) + break; + pick_called = 1; + } + else + { + /* + * We do not hold the scheduler lock appropriate for this vCPU. + * Thus we cannot select a new CPU on this iteration. Try again. + */ + pick_called = 0; + } + + sched_spin_unlock_double(old_lock, new_lock, flags); + } + + /* + * NB. Check of v->running happens /after/ setting migration flag + * because they both happen in (different) spinlock regions, and those + * regions are strictly serialised. 
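/*
 * A simplified sketch of the optimistic "take a lock, then re-check it is
 * still the right one" idea that the retry loop of sched_unit_migrate_finish()
 * above applies to both runqueue locks: the lock protecting an object can be
 * repointed while we wait for it, so after acquiring we validate and retry.
 * Only one lock is shown here and the names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

struct guarded {
    pthread_mutex_t *lock;   /* may be repointed by whoever currently holds it */
    int value;
};

static pthread_mutex_t *lock_guarded(struct guarded *g)
{
    for ( ; ; )
    {
        pthread_mutex_t *candidate = g->lock;

        pthread_mutex_lock(candidate);
        if ( candidate == g->lock )       /* still the right lock? */
            return candidate;
        pthread_mutex_unlock(candidate);  /* raced with a move: retry */
    }
}

int main(void)
{
    pthread_mutex_t l = PTHREAD_MUTEX_INITIALIZER;
    struct guarded g = { .lock = &l, .value = 42 };
    pthread_mutex_t *held = lock_guarded(&g);

    printf("value = %d\n", g.value);
    pthread_mutex_unlock(held);
    return 0;
}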
+ */ + if ( unit->is_running ) + { + sched_spin_unlock_double(old_lock, new_lock, flags); + return; + } + for_each_sched_unit_vcpu ( unit, v ) + { + if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) + { + sched_spin_unlock_double(old_lock, new_lock, flags); + return; + } + } + + sched_unit_move_locked(unit, new_cpu); + + sched_spin_unlock_double(old_lock, new_lock, flags); + + if ( old_cpu != new_cpu ) + { + /* Vcpus are moved to other pcpus, commit their states to memory. */ + for_each_sched_unit_vcpu ( unit, v ) + sync_vcpu_execstate(v); + sched_move_irqs(unit); + } + + /* Wake on new CPU. */ + for_each_sched_unit_vcpu ( unit, v ) + vcpu_wake(v); +} + +static bool sched_check_affinity_broken(const struct sched_unit *unit) +{ + const struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + if ( v->affinity_broken ) + return true; + + return false; +} + +static void sched_reset_affinity_broken(struct sched_unit *unit) +{ + struct vcpu *v; + + for_each_sched_unit_vcpu ( unit, v ) + v->affinity_broken = false; +} + +void restore_vcpu_affinity(struct domain *d) +{ + unsigned int cpu = smp_processor_id(); + struct sched_unit *unit; + + ASSERT(system_state == SYS_STATE_resume); + + rcu_read_lock(&sched_res_rculock); + + for_each_sched_unit ( d, unit ) + { + spinlock_t *lock; + unsigned int old_cpu = sched_unit_master(unit); + struct sched_resource *res; + + ASSERT(!unit_runnable(unit)); + + /* + * Re-assign the initial processor as after resume we have no + * guarantee the old processor has come back to life again. + * + * Therefore, here, before actually unpausing the domains, we should + * set v->processor of each of their vCPUs to something that will + * make sense for the scheduler of the cpupool in which they are in. + */ + lock = unit_schedule_lock_irq(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + { + if ( sched_check_affinity_broken(unit) ) + { + sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); + sched_reset_affinity_broken(unit); + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + } + + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + { + /* Affinity settings of one vcpu are for the complete unit. */ + printk(XENLOG_DEBUG "Breaking affinity for %pv\n", + unit->vcpu_list); + sched_set_affinity(unit, &cpumask_all, NULL); + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(d)); + } + } + + res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); + sched_set_res(unit, res); + + spin_unlock_irq(lock); + + /* v->processor might have changed, so reacquire the lock. */ + lock = unit_schedule_lock_irq(unit); + res = sched_pick_resource(unit_scheduler(unit), unit); + sched_set_res(unit, res); + spin_unlock_irq(lock); + + if ( old_cpu != sched_unit_master(unit) ) + sched_move_irqs(unit); + } + + rcu_read_unlock(&sched_res_rculock); + + domain_update_node_affinity(d); +} + +/* + * This function is used by cpu_hotplug code via cpu notifier chain + * and from cpupools to switch schedulers on a cpu. + * Caller must get domlist_read_lock. 
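/*
 * A bitmask sketch of the affinity fallback cascade in restore_vcpu_affinity()
 * above: try the hard affinity intersected with the pool's online CPUs, then
 * the saved hard affinity, and as a last resort break affinity and allow any
 * CPU of the pool.  A 64-bit word stands in for cpumask_t; illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t pick_cpus(uint64_t hard, uint64_t saved_hard, uint64_t pool)
{
    if ( hard & pool )
        return hard & pool;
    if ( saved_hard & pool )        /* temporary pinning broke the affinity */
        return saved_hard & pool;
    return pool;                    /* "Breaking affinity": anywhere in pool */
}

int main(void)
{
    uint64_t pool = 0x0f;           /* CPUs 0-3 online in the pool */

    printf("%#llx\n", (unsigned long long)pick_cpus(0x30, 0x03, pool)); /* 0x3 */
    printf("%#llx\n", (unsigned long long)pick_cpus(0x30, 0x30, pool)); /* 0xf */
    return 0;
}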
+ */ +int cpu_disable_scheduler(unsigned int cpu) +{ + struct domain *d; + struct cpupool *c; + cpumask_t online_affinity; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + c = get_sched_res(cpu)->cpupool; + if ( c == NULL ) + goto out; + + for_each_domain_in_cpupool ( d, c ) + { + struct sched_unit *unit; + + for_each_sched_unit ( d, unit ) + { + unsigned long flags; + spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags); + + cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid); + if ( cpumask_empty(&online_affinity) && + cpumask_test_cpu(cpu, unit->cpu_hard_affinity) ) + { + if ( sched_check_affinity_broken(unit) ) + { + /* The unit is temporarily pinned, can't move it. */ + unit_schedule_unlock_irqrestore(lock, flags, unit); + ret = -EADDRINUSE; + break; + } + + printk(XENLOG_DEBUG "Breaking affinity for %pv\n", + unit->vcpu_list); + + sched_set_affinity(unit, &cpumask_all, NULL); + } + + if ( unit->res != get_sched_res(cpu) ) + { + /* The unit is not on this cpu, so we can move on. */ + unit_schedule_unlock_irqrestore(lock, flags, unit); + continue; + } + + /* If it is on this cpu, we must send it away. + * We are doing some cpupool manipulations: + * * we want to call the scheduler, and let it re-evaluation + * the placement of the vcpu, taking into account the new + * cpupool configuration; + * * the scheduler will always find a suitable solution, or + * things would have failed before getting in here. + */ + sched_unit_migrate_start(unit); + unit_schedule_unlock_irqrestore(lock, flags, unit); + sched_unit_migrate_finish(unit); + + /* + * The only caveat, in this case, is that if a vcpu active in + * the hypervisor isn't migratable. In this case, the caller + * should try again after releasing and reaquiring all locks. + */ + if ( unit->res == get_sched_res(cpu) ) + ret = -EAGAIN; + } + } + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +static int cpu_disable_scheduler_check(unsigned int cpu) +{ + struct domain *d; + struct vcpu *v; + struct cpupool *c; + + c = get_sched_res(cpu)->cpupool; + if ( c == NULL ) + return 0; + + for_each_domain_in_cpupool ( d, c ) + for_each_vcpu ( d, v ) + if ( v->affinity_broken ) + return -EADDRINUSE; + + return 0; +} + +/* + * In general, this must be called with the scheduler lock held, because the + * adjust_affinity hook may want to modify the vCPU state. However, when the + * vCPU is being initialized (either for dom0 or domU) there is no risk of + * races, and it's fine to not take the look (we're talking about + * sched_setup_dom0_vcpus() an sched_init_vcpu()). 
+ */ +static void sched_set_affinity( + struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) +{ + rcu_read_lock(&sched_res_rculock); + sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft); + rcu_read_unlock(&sched_res_rculock); + + if ( hard ) + cpumask_copy(unit->cpu_hard_affinity, hard); + if ( soft ) + cpumask_copy(unit->cpu_soft_affinity, soft); + + unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity, + unit->cpu_soft_affinity) && + cpumask_intersects(unit->cpu_soft_affinity, + unit->cpu_hard_affinity); +} + +static int vcpu_set_affinity( + struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which) +{ + struct sched_unit *unit = v->sched_unit; + spinlock_t *lock; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(unit); + + if ( v->affinity_broken ) + ret = -EBUSY; + else + { + /* + * Tell the scheduler we changes something about affinity, + * and ask to re-evaluate vcpu placement. + */ + if ( which == unit->cpu_hard_affinity ) + { + sched_set_affinity(unit, affinity, NULL); + } + else + { + ASSERT(which == unit->cpu_soft_affinity); + sched_set_affinity(unit, NULL, affinity); + } + sched_unit_migrate_start(unit); + } + + unit_schedule_unlock_irq(lock, unit); + + domain_update_node_affinity(v->domain); + + sched_unit_migrate_finish(unit); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity) +{ + cpumask_t online_affinity; + cpumask_t *online; + + online = VCPU2ONLINE(v); + cpumask_and(&online_affinity, affinity, online); + if ( cpumask_empty(&online_affinity) ) + return -EINVAL; + + return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity); +} + +int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity) +{ + return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity); +} + +/* Block the currently-executing domain until a pertinent event occurs. */ +void vcpu_block(void) +{ + struct vcpu *v = current; + + set_bit(_VPF_blocked, &v->pause_flags); + + arch_vcpu_block(v); + + /* Check for events /after/ blocking: avoids wakeup waiting race. */ + if ( local_events_need_delivery() ) + { + clear_bit(_VPF_blocked, &v->pause_flags); + } + else + { + TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + } +} + +static void vcpu_block_enable_events(void) +{ + local_event_delivery_enable(); + vcpu_block(); +} + +static long do_poll(struct sched_poll *sched_poll) +{ + struct vcpu *v = current; + struct domain *d = v->domain; + evtchn_port_t port = 0; + long rc; + unsigned int i; + + /* Fairly arbitrary limit. */ + if ( sched_poll->nr_ports > 128 ) + return -EINVAL; + + if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) + return -EFAULT; + + set_bit(_VPF_blocked, &v->pause_flags); + v->poll_evtchn = -1; + set_bit(v->vcpu_id, d->poll_mask); + + arch_vcpu_block(v); + +#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ + /* Check for events /after/ setting flags: avoids wakeup waiting race. */ + smp_mb(); + + /* + * Someone may have seen we are blocked but not that we are polling, or + * vice versa. We are certainly being woken, so clean up and bail. Beyond + * this point others can be guaranteed to clean up for us if they wake us. 
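/*
 * A sketch of the soft_aff_effective computation in sched_set_affinity()
 * above: the soft affinity only influences placement when the hard mask is
 * not already contained in it and the two masks actually intersect.  64-bit
 * words stand in for cpumasks; illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool soft_aff_effective(uint64_t hard, uint64_t soft)
{
    bool hard_subset_of_soft = (hard & ~soft) == 0;

    return !hard_subset_of_soft && (hard & soft) != 0;
}

int main(void)
{
    printf("%d\n", soft_aff_effective(0x0f, 0x03)); /* 1: prefers CPUs 0-1  */
    printf("%d\n", soft_aff_effective(0x03, 0x0f)); /* 0: soft adds nothing */
    printf("%d\n", soft_aff_effective(0x0c, 0x03)); /* 0: no overlap at all */
    return 0;
}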
+ */ + rc = 0; + if ( (v->poll_evtchn == 0) || + !test_bit(_VPF_blocked, &v->pause_flags) || + !test_bit(v->vcpu_id, d->poll_mask) ) + goto out; +#endif + + rc = 0; + if ( local_events_need_delivery() ) + goto out; + + for ( i = 0; i < sched_poll->nr_ports; i++ ) + { + rc = -EFAULT; + if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) + goto out; + + rc = -EINVAL; + if ( port >= d->max_evtchns ) + goto out; + + rc = 0; + if ( evtchn_port_is_pending(d, port) ) + goto out; + } + + if ( sched_poll->nr_ports == 1 ) + v->poll_evtchn = port; + + if ( sched_poll->timeout != 0 ) + set_timer(&v->poll_timer, sched_poll->timeout); + + TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + + return 0; + + out: + v->poll_evtchn = 0; + clear_bit(v->vcpu_id, d->poll_mask); + clear_bit(_VPF_blocked, &v->pause_flags); + return rc; +} + +/* Voluntarily yield the processor for this allocation. */ +long vcpu_yield(void) +{ + struct vcpu * v=current; + spinlock_t *lock; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(v->sched_unit); + sched_yield(vcpu_scheduler(v), v->sched_unit); + unit_schedule_unlock_irq(lock, v->sched_unit); + + rcu_read_unlock(&sched_res_rculock); + + SCHED_STAT_CRANK(vcpu_yield); + + TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); + raise_softirq(SCHEDULE_SOFTIRQ); + return 0; +} + +static void domain_watchdog_timeout(void *data) +{ + struct domain *d = data; + + if ( d->is_shutting_down || d->is_dying ) + return; + + printk("Watchdog timer fired for domain %u\n", d->domain_id); + domain_shutdown(d, SHUTDOWN_watchdog); +} + +static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) +{ + if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) + return -EINVAL; + + spin_lock(&d->watchdog_lock); + + if ( id == 0 ) + { + for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) + { + if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) + continue; + set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); + break; + } + spin_unlock(&d->watchdog_lock); + return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1; + } + + id -= 1; + if ( !test_bit(id, &d->watchdog_inuse_map) ) + { + spin_unlock(&d->watchdog_lock); + return -EINVAL; + } + + if ( timeout == 0 ) + { + stop_timer(&d->watchdog_timer[id]); + clear_bit(id, &d->watchdog_inuse_map); + } + else + { + set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); + } + + spin_unlock(&d->watchdog_lock); + return 0; +} + +void watchdog_domain_init(struct domain *d) +{ + unsigned int i; + + spin_lock_init(&d->watchdog_lock); + + d->watchdog_inuse_map = 0; + + for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) + init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); +} + +void watchdog_domain_destroy(struct domain *d) +{ + unsigned int i; + + for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) + kill_timer(&d->watchdog_timer[i]); +} + +/* + * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if + * cpu is NR_CPUS). + * Temporary pinning can be done due to two reasons, which may be nested: + * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case + * of a conflict (e.g. in case cpupool doesn't include requested CPU, or + * another conflicting temporary pinning is already in effect. + * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the + * CPU it is just running on. Can't fail if used properly. 
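/*
 * A sketch of the slot convention used by domain_watchdog() above: id 0 means
 * "allocate the first free watchdog slot and return its handle (slot + 1)",
 * while a non-zero id re-arms that slot or, with timeout 0, releases it.  A
 * plain bitmap replaces the in-use map; names below are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_SLOTS 2

static uint32_t inuse_map;          /* bit i set => slot i allocated */
static uint32_t timeouts[NR_SLOTS]; /* stand-in for the actual timers */

static int toy_watchdog(uint32_t id, uint32_t timeout)
{
    if ( id > NR_SLOTS )
        return -1;                              /* cf. -EINVAL */

    if ( id == 0 )
    {
        for ( uint32_t slot = 0; slot < NR_SLOTS; slot++ )
            if ( !(inuse_map & (1u << slot)) )
            {
                inuse_map |= 1u << slot;        /* cf. test_and_set_bit() */
                timeouts[slot] = timeout;       /* cf. set_timer() */
                return (int)slot + 1;           /* handle given to the guest */
            }
        return -2;                              /* cf. -ENOSPC: all slots busy */
    }

    id -= 1;
    if ( !(inuse_map & (1u << id)) )
        return -1;                              /* unknown handle */

    if ( timeout == 0 )
        inuse_map &= ~(1u << id);               /* stop and release the slot */
    else
        timeouts[id] = timeout;                 /* re-arm */
    return 0;
}

int main(void)
{
    int h = toy_watchdog(0, 30);                /* allocate -> handle 1 */

    printf("handle %d\n", h);
    toy_watchdog((uint32_t)h, 30);              /* kick the watchdog */
    toy_watchdog((uint32_t)h, 0);               /* release it again */
    return 0;
}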
+ */ +int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason) +{ + struct sched_unit *unit = v->sched_unit; + spinlock_t *lock; + int ret = -EINVAL; + bool migrate; + + rcu_read_lock(&sched_res_rculock); + + lock = unit_schedule_lock_irq(unit); + + if ( cpu == NR_CPUS ) + { + if ( v->affinity_broken & reason ) + { + ret = 0; + v->affinity_broken &= ~reason; + } + if ( !ret && !sched_check_affinity_broken(unit) ) + sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); + } + else if ( cpu < nr_cpu_ids ) + { + if ( (v->affinity_broken & reason) || + (sched_check_affinity_broken(unit) && v->processor != cpu) ) + ret = -EBUSY; + else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) ) + { + if ( !sched_check_affinity_broken(unit) ) + { + cpumask_copy(unit->cpu_hard_affinity_saved, + unit->cpu_hard_affinity); + sched_set_affinity(unit, cpumask_of(cpu), NULL); + } + v->affinity_broken |= reason; + ret = 0; + } + } + + migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity); + if ( migrate ) + sched_unit_migrate_start(unit); + + unit_schedule_unlock_irq(lock, unit); + + if ( migrate ) + sched_unit_migrate_finish(unit); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +typedef long ret_t; + +#endif /* !COMPAT */ + +ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) +{ + ret_t ret = 0; + + switch ( cmd ) + { + case SCHEDOP_yield: + { + ret = vcpu_yield(); + break; + } + + case SCHEDOP_block: + { + vcpu_block_enable_events(); + break; + } + + case SCHEDOP_shutdown: + { + struct sched_shutdown sched_shutdown; + + ret = -EFAULT; + if ( copy_from_guest(&sched_shutdown, arg, 1) ) + break; + + TRACE_3D(TRC_SCHED_SHUTDOWN, + current->domain->domain_id, current->vcpu_id, + sched_shutdown.reason); + ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); + + break; + } + + case SCHEDOP_shutdown_code: + { + struct sched_shutdown sched_shutdown; + struct domain *d = current->domain; + + ret = -EFAULT; + if ( copy_from_guest(&sched_shutdown, arg, 1) ) + break; + + TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, + d->domain_id, current->vcpu_id, sched_shutdown.reason); + + spin_lock(&d->shutdown_lock); + if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) + d->shutdown_code = (u8)sched_shutdown.reason; + spin_unlock(&d->shutdown_lock); + + ret = 0; + break; + } + + case SCHEDOP_poll: + { + struct sched_poll sched_poll; + + ret = -EFAULT; + if ( copy_from_guest(&sched_poll, arg, 1) ) + break; + + ret = do_poll(&sched_poll); + + break; + } + + case SCHEDOP_remote_shutdown: + { + struct domain *d; + struct sched_remote_shutdown sched_remote_shutdown; + + ret = -EFAULT; + if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) + break; + + ret = -ESRCH; + d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); + if ( d == NULL ) + break; + + ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); + if ( likely(!ret) ) + domain_shutdown(d, sched_remote_shutdown.reason); + + rcu_unlock_domain(d); + + break; + } + + case SCHEDOP_watchdog: + { + struct sched_watchdog sched_watchdog; + + ret = -EFAULT; + if ( copy_from_guest(&sched_watchdog, arg, 1) ) + break; + + ret = domain_watchdog( + current->domain, sched_watchdog.id, sched_watchdog.timeout); + break; + } + + case SCHEDOP_pin_override: + { + struct sched_pin_override sched_pin_override; + unsigned int cpu; + + ret = -EPERM; + if ( !is_hardware_domain(current->domain) ) + break; + + ret = -EFAULT; + if ( copy_from_guest(&sched_pin_override, arg, 1) ) + break; + + ret = -EINVAL; + if ( 
sched_pin_override.pcpu >= NR_CPUS ) + break; + + cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu; + ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE); + + break; + } + + default: + ret = -ENOSYS; + } + + return ret; +} + +#ifndef COMPAT + +/* Per-vcpu oneshot-timer hypercall. */ +long do_set_timer_op(s_time_t timeout) +{ + struct vcpu *v = current; + s_time_t offset = timeout - NOW(); + + if ( timeout == 0 ) + { + stop_timer(&v->singleshot_timer); + } + else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ + unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) + { + /* + * Linux workaround: occasionally we will see timeouts a long way in + * the future due to wrapping in Linux's jiffy time handling. We check + * for timeouts wrapped negative, and for positive timeouts more than + * about 13 days in the future (2^50ns). The correct fix is to trigger + * an interrupt immediately (since Linux in fact has pending work to + * do in this situation). However, older guests also set a long timeout + * when they have *no* pending timers at all: setting an immediate + * timeout in this case can burn a lot of CPU. We therefore go for a + * reasonable middleground of triggering a timer event in 100ms. + */ + gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", + timeout); + set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); + } + else + { + migrate_timer(&v->singleshot_timer, smp_processor_id()); + set_timer(&v->singleshot_timer, timeout); + } + + return 0; +} + +/* sched_id - fetch ID of current scheduler */ +int sched_id(void) +{ + return ops.sched_id; +} + +/* Adjust scheduling parameter for a given domain. */ +long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) +{ + long ret; + + ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); + if ( ret ) + return ret; + + if ( op->sched_id != dom_scheduler(d)->sched_id ) + return -EINVAL; + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_putinfo: + case XEN_DOMCTL_SCHEDOP_getinfo: + case XEN_DOMCTL_SCHEDOP_putvcpuinfo: + case XEN_DOMCTL_SCHEDOP_getvcpuinfo: + break; + default: + return -EINVAL; + } + + /* NB: the pluggable scheduler code needs to take care + * of locking by itself. */ + rcu_read_lock(&sched_res_rculock); + + if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 ) + TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); + + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +long sched_adjust_global(struct xen_sysctl_scheduler_op *op) +{ + struct cpupool *pool; + int rc; + + rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); + if ( rc ) + return rc; + + if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && + (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) + return -EINVAL; + + pool = cpupool_get_by_id(op->cpupool_id); + if ( pool == NULL ) + return -ESRCH; + + rcu_read_lock(&sched_res_rculock); + + rc = ((op->sched_id == pool->sched->sched_id) + ? 
sched_adjust_cpupool(pool->sched, op) : -EINVAL); + + rcu_read_unlock(&sched_res_rculock); + + cpupool_put(pool); + + return rc; +} + +static void vcpu_periodic_timer_work_locked(struct vcpu *v) +{ + s_time_t now; + s_time_t periodic_next_event; + + now = NOW(); + periodic_next_event = v->periodic_last_event + v->periodic_period; + + if ( now >= periodic_next_event ) + { + send_timer_event(v); + v->periodic_last_event = now; + periodic_next_event = now + v->periodic_period; + } + + migrate_timer(&v->periodic_timer, v->processor); + set_timer(&v->periodic_timer, periodic_next_event); +} + +static void vcpu_periodic_timer_work(struct vcpu *v) +{ + if ( v->periodic_period == 0 ) + return; + + spin_lock(&v->periodic_timer_lock); + if ( v->periodic_period ) + vcpu_periodic_timer_work_locked(v); + spin_unlock(&v->periodic_timer_lock); +} + +/* + * Set the periodic timer of a vcpu. + */ +void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value) +{ + spin_lock(&v->periodic_timer_lock); + + stop_timer(&v->periodic_timer); + + v->periodic_period = value; + if ( value ) + vcpu_periodic_timer_work_locked(v); + + spin_unlock(&v->periodic_timer_lock); +} + +static void sched_switch_units(struct sched_resource *sr, + struct sched_unit *next, struct sched_unit *prev, + s_time_t now) +{ + unsigned int cpu; + + ASSERT(unit_running(prev)); + + if ( prev != next ) + { + sr->curr = next; + sr->prev = prev; + + TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, + prev->unit_id, now - prev->state_entry_time); + TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, + next->unit_id, + (next->vcpu_list->runstate.state == RUNSTATE_runnable) ? + (now - next->state_entry_time) : 0, prev->next_time); + TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id, + next->domain->domain_id, next->unit_id); + + ASSERT(!unit_running(next)); + + /* + * NB. Don't add any trace records from here until the actual context + * switch, else lost_records resume will not work properly. + */ + + ASSERT(!next->is_running); + next->is_running = true; + next->state_entry_time = now; + + if ( is_idle_unit(prev) ) + { + prev->runstate_cnt[RUNSTATE_running] = 0; + prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity; + } + if ( is_idle_unit(next) ) + { + next->runstate_cnt[RUNSTATE_running] = sr->granularity; + next->runstate_cnt[RUNSTATE_runnable] = 0; + } + } + + for_each_cpu ( cpu, sr->cpus ) + { + struct vcpu *vprev = get_cpu_current(cpu); + struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu); + + if ( vprev != vnext || vprev->runstate.state != vnext->new_state ) + { + vcpu_runstate_change(vprev, + ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : + (vcpu_runnable(vprev) ? 
RUNSTATE_runnable : RUNSTATE_offline)), + now); + vcpu_runstate_change(vnext, vnext->new_state, now); + } + + vnext->is_running = 1; + + if ( is_idle_vcpu(vnext) ) + vnext->sched_unit = next; + } +} + +static bool sched_tasklet_check_cpu(unsigned int cpu) +{ + unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu); + + switch ( *tasklet_work ) + { + case TASKLET_enqueued: + set_bit(_TASKLET_scheduled, tasklet_work); + /* fallthrough */ + case TASKLET_enqueued|TASKLET_scheduled: + return true; + break; + case TASKLET_scheduled: + clear_bit(_TASKLET_scheduled, tasklet_work); + /* fallthrough */ + case 0: + /* return false; */ + break; + default: + BUG(); + } + + return false; +} + +static bool sched_tasklet_check(unsigned int cpu) +{ + bool tasklet_work_scheduled = false; + const cpumask_t *mask = get_sched_res(cpu)->cpus; + unsigned int cpu_iter; + + for_each_cpu ( cpu_iter, mask ) + if ( sched_tasklet_check_cpu(cpu_iter) ) + tasklet_work_scheduled = true; + + return tasklet_work_scheduled; +} + +static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now, + unsigned int cpu) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct scheduler *sched = sr->scheduler; + struct sched_unit *next; + + /* get policy-specific decision on scheduling... */ + sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu)); + + next = prev->next_task; + + if ( prev->next_time >= 0 ) /* -ve means no limit */ + set_timer(&sr->s_timer, now + prev->next_time); + + sched_switch_units(sr, next, prev, now); + + return next; +} + +static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext) +{ + /* Clear running flag /after/ writing context to memory. */ + smp_wmb(); + + if ( vprev != vnext ) + vprev->is_running = 0; +} + +static void unit_context_saved(struct sched_resource *sr) +{ + struct sched_unit *unit = sr->prev; + + if ( !unit ) + return; + + unit->is_running = false; + unit->state_entry_time = NOW(); + sr->prev = NULL; + + /* Check for migration request /after/ clearing running flag. */ + smp_mb(); + + sched_context_saved(unit_scheduler(unit), unit); + + /* Idle never migrates and idle vcpus might belong to other units. */ + if ( !is_idle_unit(unit) ) + sched_unit_migrate_finish(unit); +} + +/* + * Rendezvous on end of context switch. + * As no lock is protecting this rendezvous function we need to use atomic + * access functions on the counter. + * The counter will be 0 in case no rendezvous is needed. For the rendezvous + * case it is initialised to the number of cpus to rendezvous plus 1. Each + * member entering decrements the counter. The last one will decrement it to + * 1 and perform the final needed action in that case (call of + * unit_context_saved()), and then set the counter to zero. The other members + * will wait until the counter becomes zero until they proceed. + */ +void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext) +{ + struct sched_unit *next = vnext->sched_unit; + struct sched_resource *sr; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(smp_processor_id()); + + if ( atomic_read(&next->rendezvous_out_cnt) ) + { + int cnt = atomic_dec_return(&next->rendezvous_out_cnt); + + vcpu_context_saved(vprev, vnext); + + /* Call unit_context_saved() before releasing other waiters. 
*/ + if ( cnt == 1 ) + { + unit_context_saved(sr); + atomic_set(&next->rendezvous_out_cnt, 0); + } + else + while ( atomic_read(&next->rendezvous_out_cnt) ) + cpu_relax(); + } + else + { + vcpu_context_saved(vprev, vnext); + if ( sr->granularity == 1 ) + unit_context_saved(sr); + } + + if ( is_idle_vcpu(vprev) && vprev != vnext ) + vprev->sched_unit = sr->sched_unit_idle; + + rcu_read_unlock(&sched_res_rculock); +} + +static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext, + bool reset_idle_unit, s_time_t now) +{ + if ( unlikely(vprev == vnext) ) + { + TRACE_4D(TRC_SCHED_SWITCH_INFCONT, + vnext->domain->domain_id, vnext->sched_unit->unit_id, + now - vprev->runstate.state_entry_time, + vprev->sched_unit->next_time); + sched_context_switched(vprev, vnext); + + /* + * We are switching from a non-idle to an idle unit. + * A vcpu of the idle unit might have been running before due to + * the guest vcpu being blocked. We must adjust the unit of the idle + * vcpu which might have been set to the guest's one. + */ + if ( reset_idle_unit ) + vnext->sched_unit = + get_sched_res(smp_processor_id())->sched_unit_idle; + + rcu_read_unlock(&sched_res_rculock); + + trace_continue_running(vnext); + return continue_running(vprev); + } + + SCHED_STAT_CRANK(sched_ctx); + + stop_timer(&vprev->periodic_timer); + + if ( vnext->sched_unit->migrated ) + vcpu_move_irqs(vnext); + + vcpu_periodic_timer_work(vnext); + + rcu_read_unlock(&sched_res_rculock); + + context_switch(vprev, vnext); +} + +/* + * Force a context switch of a single vcpu of an unit. + * Might be called either if a vcpu of an already running unit is woken up + * or if a vcpu of a running unit is put asleep with other vcpus of the same + * unit still running. + * Returns either NULL if v is already in the correct state or the vcpu to + * run next. + */ +static struct vcpu *sched_force_context_switch(struct vcpu *vprev, + struct vcpu *v, + unsigned int cpu, s_time_t now) +{ + v->force_context_switch = false; + + if ( vcpu_runnable(v) == v->is_running ) + return NULL; + + if ( vcpu_runnable(v) ) + { + if ( is_idle_vcpu(vprev) ) + { + vcpu_runstate_change(vprev, RUNSTATE_runnable, now); + vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; + } + vcpu_runstate_change(v, RUNSTATE_running, now); + } + else + { + /* Make sure not to switch last vcpu of an unit away. */ + if ( unit_running(v->sched_unit) == 1 ) + return NULL; + + v->new_state = vcpu_runstate_blocked(v); + vcpu_runstate_change(v, v->new_state, now); + v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu); + if ( v != vprev ) + { + if ( is_idle_vcpu(vprev) ) + { + vcpu_runstate_change(vprev, RUNSTATE_runnable, now); + vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; + } + else + { + v->sched_unit = vprev->sched_unit; + vcpu_runstate_change(v, RUNSTATE_running, now); + } + } + } + + /* This vcpu will be switched to. */ + v->is_running = true; + + /* Make sure not to loose another slave call. */ + raise_softirq(SCHED_SLAVE_SOFTIRQ); + + return v; +} + +/* + * Rendezvous before taking a scheduling decision. + * Called with schedule lock held, so all accesses to the rendezvous counter + * can be normal ones (no atomic accesses needed). + * The counter is initialized to the number of cpus to rendezvous initially. + * Each cpu entering will decrement the counter. In case the counter becomes + * zero do_schedule() is called and the rendezvous counter for leaving + * context_switch() is set. 
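/*
 * A user-space sketch of the rendezvous-out scheme described above, using C11
 * atomics and POSIX threads: the counter starts at the number of participants
 * plus one, every participant decrements it, and the one that brings it down
 * to 1 performs the shared clean-up before releasing the others by writing 0.
 * Thread count and the "clean-up" below are illustrative only.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_THREADS 4

static atomic_int rendezvous_out = NR_THREADS + 1;
static int cleanup_done;                  /* written by exactly one thread */

static void *participant(void *arg)
{
    (void)arg;

    if ( atomic_fetch_sub(&rendezvous_out, 1) - 1 == 1 )
    {
        cleanup_done = 1;                 /* the common work, done exactly once */
        atomic_store(&rendezvous_out, 0); /* release the waiters */
    }
    else
        while ( atomic_load(&rendezvous_out) != 0 )
            ;                             /* cpu_relax() in the hypervisor */

    return NULL;
}

int main(void)
{
    pthread_t t[NR_THREADS];

    for ( int i = 0; i < NR_THREADS; i++ )
        pthread_create(&t[i], NULL, participant, NULL);
    for ( int i = 0; i < NR_THREADS; i++ )
        pthread_join(t[i], NULL);

    printf("clean-up done: %d, counter: %d\n",
           cleanup_done, atomic_load(&rendezvous_out));
    return 0;
}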
All other members will wait until the counter is + * becoming zero, dropping the schedule lock in between. + */ +static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev, + spinlock_t **lock, int cpu, + s_time_t now) +{ + struct sched_unit *next; + struct vcpu *v; + unsigned int gran = get_sched_res(cpu)->granularity; + + if ( !--prev->rendezvous_in_cnt ) + { + next = do_schedule(prev, now, cpu); + atomic_set(&next->rendezvous_out_cnt, gran + 1); + return next; + } + + v = unit2vcpu_cpu(prev, cpu); + while ( prev->rendezvous_in_cnt ) + { + if ( v && v->force_context_switch ) + { + struct vcpu *vprev = current; + + v = sched_force_context_switch(vprev, v, cpu, now); + + if ( v ) + { + /* We'll come back another time, so adjust rendezvous_in_cnt. */ + prev->rendezvous_in_cnt++; + atomic_set(&prev->rendezvous_out_cnt, 0); + + pcpu_schedule_unlock_irq(*lock, cpu); + + sched_context_switch(vprev, v, false, now); + + return NULL; /* ARM only. */ + } + + v = unit2vcpu_cpu(prev, cpu); + } + /* + * Coming from idle might need to do tasklet work. + * In order to avoid deadlocks we can't do that here, but have to + * continue the idle loop. + * Undo the rendezvous_in_cnt decrement and schedule another call of + * sched_slave(). + */ + if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) ) + { + struct vcpu *vprev = current; + + prev->rendezvous_in_cnt++; + atomic_set(&prev->rendezvous_out_cnt, 0); + + pcpu_schedule_unlock_irq(*lock, cpu); + + raise_softirq(SCHED_SLAVE_SOFTIRQ); + sched_context_switch(vprev, vprev, false, now); + + return NULL; /* ARM only. */ + } + + pcpu_schedule_unlock_irq(*lock, cpu); + + cpu_relax(); + + *lock = pcpu_schedule_lock_irq(cpu); + + if ( unlikely(!scheduler_active) ) + { + ASSERT(is_idle_unit(prev)); + atomic_set(&prev->next_task->rendezvous_out_cnt, 0); + prev->rendezvous_in_cnt = 0; + } + } + + return prev->next_task; +} + +static void sched_slave(void) +{ + struct vcpu *v, *vprev = current; + struct sched_unit *prev = vprev->sched_unit, *next; + s_time_t now; + spinlock_t *lock; + bool do_softirq = false; + unsigned int cpu = smp_processor_id(); + + ASSERT_NOT_IN_ATOMIC(); + + rcu_read_lock(&sched_res_rculock); + + lock = pcpu_schedule_lock_irq(cpu); + + now = NOW(); + + v = unit2vcpu_cpu(prev, cpu); + if ( v && v->force_context_switch ) + { + v = sched_force_context_switch(vprev, v, cpu, now); + + if ( v ) + { + pcpu_schedule_unlock_irq(lock, cpu); + + sched_context_switch(vprev, v, false, now); + + return; + } + + do_softirq = true; + } + + if ( !prev->rendezvous_in_cnt ) + { + pcpu_schedule_unlock_irq(lock, cpu); + + rcu_read_unlock(&sched_res_rculock); + + /* Check for failed forced context switch. */ + if ( do_softirq ) + raise_softirq(SCHEDULE_SOFTIRQ); + + return; + } + + stop_timer(&get_sched_res(cpu)->s_timer); + + next = sched_wait_rendezvous_in(prev, &lock, cpu, now); + if ( !next ) + return; + + pcpu_schedule_unlock_irq(lock, cpu); + + sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), + is_idle_unit(next) && !is_idle_unit(prev), now); +} + +/* + * The main function + * - deschedule the current domain (scheduler independent). + * - pick a new domain (scheduler dependent). 
+ */ +static void schedule(void) +{ + struct vcpu *vnext, *vprev = current; + struct sched_unit *prev = vprev->sched_unit, *next = NULL; + s_time_t now; + struct sched_resource *sr; + spinlock_t *lock; + int cpu = smp_processor_id(); + unsigned int gran; + + ASSERT_NOT_IN_ATOMIC(); + + SCHED_STAT_CRANK(sched_run); + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + gran = sr->granularity; + + lock = pcpu_schedule_lock_irq(cpu); + + if ( prev->rendezvous_in_cnt ) + { + /* + * We have a race: sched_slave() should be called, so raise a softirq + * in order to re-enter schedule() later and call sched_slave() now. + */ + pcpu_schedule_unlock_irq(lock, cpu); + + rcu_read_unlock(&sched_res_rculock); + + raise_softirq(SCHEDULE_SOFTIRQ); + return sched_slave(); + } + + stop_timer(&sr->s_timer); + + now = NOW(); + + if ( gran > 1 ) + { + cpumask_t mask; + + prev->rendezvous_in_cnt = gran; + cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu)); + cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ); + next = sched_wait_rendezvous_in(prev, &lock, cpu, now); + if ( !next ) + return; + } + else + { + prev->rendezvous_in_cnt = 0; + next = do_schedule(prev, now, cpu); + atomic_set(&next->rendezvous_out_cnt, 0); + } + + pcpu_schedule_unlock_irq(lock, cpu); + + vnext = sched_unit2vcpu_cpu(next, cpu); + sched_context_switch(vprev, vnext, + !is_idle_unit(prev) && is_idle_unit(next), now); +} + +/* The scheduler timer: force a run through the scheduler */ +static void s_timer_fn(void *unused) +{ + raise_softirq(SCHEDULE_SOFTIRQ); + SCHED_STAT_CRANK(sched_irq); +} + +/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ +static void vcpu_periodic_timer_fn(void *data) +{ + struct vcpu *v = data; + vcpu_periodic_timer_work(v); +} + +/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ +static void vcpu_singleshot_timer_fn(void *data) +{ + struct vcpu *v = data; + send_timer_event(v); +} + +/* SCHEDOP_poll timeout callback. */ +static void poll_timer_fn(void *data) +{ + struct vcpu *v = data; + + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + vcpu_unblock(v); +} + +static struct sched_resource *sched_alloc_res(void) +{ + struct sched_resource *sr; + + sr = xzalloc(struct sched_resource); + if ( sr == NULL ) + return NULL; + if ( !zalloc_cpumask_var(&sr->cpus) ) + { + xfree(sr); + return NULL; + } + return sr; +} + +static int cpu_schedule_up(unsigned int cpu) +{ + struct sched_resource *sr; + + sr = sched_alloc_res(); + if ( sr == NULL ) + return -ENOMEM; + + sr->master_cpu = cpu; + cpumask_copy(sr->cpus, cpumask_of(cpu)); + set_sched_res(cpu, sr); + + sr->scheduler = &sched_idle_ops; + spin_lock_init(&sr->_lock); + sr->schedule_lock = &sched_free_cpu_lock; + init_timer(&sr->s_timer, s_timer_fn, NULL, cpu); + atomic_set(&per_cpu(sched_urgent_count, cpu), 0); + + /* We start with cpu granularity. */ + sr->granularity = 1; + + cpumask_set_cpu(cpu, &sched_res_mask); + + /* Boot CPU is dealt with later in scheduler_init(). */ + if ( cpu == 0 ) + return 0; + + if ( idle_vcpu[cpu] == NULL ) + vcpu_create(idle_vcpu[0]->domain, cpu); + else + idle_vcpu[cpu]->sched_unit->res = sr; + + if ( idle_vcpu[cpu] == NULL ) + return -ENOMEM; + + idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; + + /* + * No need to allocate any scheduler data, as cpus coming online are + * free initially and the idle scheduler doesn't need any data areas + * allocated. 
+ */ + + sr->curr = idle_vcpu[cpu]->sched_unit; + sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit; + + sr->sched_priv = NULL; + + return 0; +} + +static void sched_res_free(struct rcu_head *head) +{ + struct sched_resource *sr = container_of(head, struct sched_resource, rcu); + + free_cpumask_var(sr->cpus); + if ( sr->sched_unit_idle ) + sched_free_unit_mem(sr->sched_unit_idle); + xfree(sr); +} + +static void cpu_schedule_down(unsigned int cpu) +{ + struct sched_resource *sr; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + + kill_timer(&sr->s_timer); + + cpumask_clear_cpu(cpu, &sched_res_mask); + set_sched_res(cpu, NULL); + + /* Keep idle unit. */ + sr->sched_unit_idle = NULL; + call_rcu(&sr->rcu, sched_res_free); + + rcu_read_unlock(&sched_res_rculock); +} + +void sched_rm_cpu(unsigned int cpu) +{ + int rc; + + rcu_read_lock(&domlist_read_lock); + rc = cpu_disable_scheduler(cpu); + BUG_ON(rc); + rcu_read_unlock(&domlist_read_lock); + cpu_schedule_down(cpu); +} + +static int cpu_schedule_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + /* + * All scheduler related suspend/resume handling needed is done in + * cpupool.c. + */ + if ( system_state > SYS_STATE_active ) + return NOTIFY_DONE; + + rcu_read_lock(&sched_res_rculock); + + /* + * From the scheduler perspective, bringing up a pCPU requires + * allocating and initializing the per-pCPU scheduler specific data, + * as well as "registering" this pCPU to the scheduler (which may + * involve modifying some scheduler wide data structures). + * As new pCPUs always start as "free" cpus with the minimal idle + * scheduler being in charge, we don't need any of that. + * + * On the other hand, at teardown, we need to reverse what has been done + * during initialization, and then free the per-pCPU specific data. A + * pCPU brought down is not forced through "free" cpus, so here we need to + * use the appropriate hooks. + * + * This happens by calling the deinit_pdata and free_pdata hooks, in this + * order. If no per-pCPU memory was allocated, there is no need to + * provide an implementation of free_pdata. deinit_pdata may, however, + * be necessary/useful in this case too (e.g., it can undo something done + * on scheduler wide data structure during init_pdata). Both deinit_pdata + * and free_pdata are called during CPU_DEAD. + * + * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED. + */ + switch ( action ) + { + case CPU_UP_PREPARE: + rc = cpu_schedule_up(cpu); + break; + case CPU_DOWN_PREPARE: + rcu_read_lock(&domlist_read_lock); + rc = cpu_disable_scheduler_check(cpu); + rcu_read_unlock(&domlist_read_lock); + break; + case CPU_DEAD: + sched_rm_cpu(cpu); + break; + case CPU_UP_CANCELED: + cpu_schedule_down(cpu); + break; + default: + break; + } + + rcu_read_unlock(&sched_res_rculock); + + return !rc ? 
NOTIFY_DONE : notifier_from_errno(rc); +} + +static struct notifier_block cpu_schedule_nfb = { + .notifier_call = cpu_schedule_callback +}; + +const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu) +{ + const cpumask_t *mask; + + switch ( opt ) + { + case SCHED_GRAN_cpu: + mask = cpumask_of(cpu); + break; + case SCHED_GRAN_core: + mask = per_cpu(cpu_sibling_mask, cpu); + break; + case SCHED_GRAN_socket: + mask = per_cpu(cpu_core_mask, cpu); + break; + default: + ASSERT_UNREACHABLE(); + return NULL; + } + + return mask; +} + +static void schedule_dummy(void) +{ + sched_tasklet_check_cpu(smp_processor_id()); +} + +void scheduler_disable(void) +{ + scheduler_active = false; + open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy); + open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy); +} + +void scheduler_enable(void) +{ + open_softirq(SCHEDULE_SOFTIRQ, schedule); + open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave); + scheduler_active = true; +} + +/* Initialise the data structures. */ +void __init scheduler_init(void) +{ + struct domain *idle_domain; + int i; + + scheduler_enable(); + + for ( i = 0; i < NUM_SCHEDULERS; i++) + { +#define sched_test_func(f) \ + if ( !schedulers[i]->f ) \ + { \ + printk("scheduler %s misses .%s, dropped\n", \ + schedulers[i]->opt_name, #f); \ + schedulers[i] = NULL; \ + } + + sched_test_func(init); + sched_test_func(deinit); + sched_test_func(pick_resource); + sched_test_func(alloc_udata); + sched_test_func(free_udata); + sched_test_func(switch_sched); + sched_test_func(do_schedule); + +#undef sched_test_func + + if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) + { + printk("scheduler %s failed initialization, dropped\n", + schedulers[i]->opt_name); + schedulers[i] = NULL; + } + + if ( schedulers[i] && !ops.name && + !strcmp(schedulers[i]->opt_name, opt_sched) ) + ops = *schedulers[i]; + } + + if ( !ops.name ) + { + printk("Could not find scheduler: %s\n", opt_sched); + for ( i = 0; i < NUM_SCHEDULERS; i++ ) + if ( schedulers[i] && + !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) ) + { + ops = *schedulers[i]; + break; + } + BUG_ON(!ops.name); + printk("Using '%s' (%s)\n", ops.name, ops.opt_name); + } + + if ( cpu_schedule_up(0) ) + BUG(); + register_cpu_notifier(&cpu_schedule_nfb); + + printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); + if ( sched_init(&ops) ) + panic("scheduler returned error on init\n"); + + if ( sched_ratelimit_us && + (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX + || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) + { + printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" + " Resetting to default %u\n", + XEN_SYSCTL_SCHED_RATELIMIT_MIN, + XEN_SYSCTL_SCHED_RATELIMIT_MAX, + SCHED_DEFAULT_RATELIMIT_US); + sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; + } + + idle_domain = domain_create(DOMID_IDLE, NULL, false); + BUG_ON(IS_ERR(idle_domain)); + BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu)); + idle_domain->vcpu = idle_vcpu; + idle_domain->max_vcpus = nr_cpu_ids; + if ( vcpu_create(idle_domain, 0) == NULL ) + BUG(); + + rcu_read_lock(&sched_res_rculock); + + get_sched_res(0)->curr = idle_vcpu[0]->sched_unit; + get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit; + + rcu_read_unlock(&sched_res_rculock); +} + +/* + * Move a pCPU from free cpus (running the idle scheduler) to a cpupool + * using any "real" scheduler. + * The cpu is still marked as "free" and not yet valid for its cpupool. 
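+ *
+ * In rough outline the switch is:
+ *   1. allocate the new scheduler's per-cpu data and the idle unit's data
+ *      (sched_alloc_pdata() / sched_alloc_udata());
+ *   2. take the current (free cpu) scheduler lock and, for granularities
+ *      above one, merge the sibling cpus' idle units and sched_resources;
+ *   3. let the new scheduler take over via sched_switch_sched() and only
+ *      afterwards reroute sr->schedule_lock to the lock it returned;
+ *   4. drop the old lock and kick the cpu with SCHEDULE_SOFTIRQ so it
+ *      picks up work in its new pool.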
+ */ +int schedule_cpu_add(unsigned int cpu, struct cpupool *c) +{ + struct vcpu *idle; + void *ppriv, *vpriv; + struct scheduler *new_ops = c->sched; + struct sched_resource *sr; + spinlock_t *old_lock, *new_lock; + unsigned long flags; + int ret = 0; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid)); + ASSERT(get_sched_res(cpu)->cpupool == NULL); + + /* + * To setup the cpu for the new scheduler we need: + * - a valid instance of per-CPU scheduler specific data, as it is + * allocated by sched_alloc_pdata(). Note that we do not want to + * initialize it yet (i.e., we are not calling sched_init_pdata()). + * That will be done by the target scheduler, in sched_switch_sched(), + * in proper ordering and with locking. + * - a valid instance of per-vCPU scheduler specific data, for the idle + * vCPU of cpu. That is what the target scheduler will use for the + * sched_priv field of the per-vCPU info of the idle domain. + */ + idle = idle_vcpu[cpu]; + ppriv = sched_alloc_pdata(new_ops, cpu); + if ( IS_ERR(ppriv) ) + { + ret = PTR_ERR(ppriv); + goto out; + } + + vpriv = sched_alloc_udata(new_ops, idle->sched_unit, + idle->domain->sched_priv); + if ( vpriv == NULL ) + { + sched_free_pdata(new_ops, ppriv, cpu); + ret = -ENOMEM; + goto out; + } + + /* + * The actual switch, including the rerouting of the scheduler lock to + * whatever new_ops prefers, needs to happen in one critical section, + * protected by old_ops' lock, or races are possible. + * It is, in fact, the lock of the idle scheduler that we are taking. + * But that is ok as anyone trying to schedule on this cpu will spin until + * when we release that lock (bottom of this function). When he'll get the + * lock --thanks to the loop inside *_schedule_lock() functions-- he'll + * notice that the lock itself changed, and retry acquiring the new one + * (which will be the correct, remapped one, at that point). + */ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + if ( cpupool_get_granularity(c) > 1 ) + { + const cpumask_t *mask; + unsigned int cpu_iter, idx = 0; + struct sched_unit *old_unit, *master_unit; + struct sched_resource *sr_old; + + /* + * We need to merge multiple idle_vcpu units and sched_resource structs + * into one. As the free cpus all share the same lock we are fine doing + * that now. The worst which could happen would be someone waiting for + * the lock, thus dereferencing sched_res->schedule_lock. This is the + * reason we are freeing struct sched_res via call_rcu() to avoid the + * lock pointer suddenly disappearing. + */ + mask = sched_get_opt_cpumask(c->gran, cpu); + master_unit = idle_vcpu[cpu]->sched_unit; + + for_each_cpu ( cpu_iter, mask ) + { + if ( idx ) + cpumask_clear_cpu(cpu_iter, &sched_res_mask); + + per_cpu(sched_res_idx, cpu_iter) = idx++; + + if ( cpu == cpu_iter ) + continue; + + old_unit = idle_vcpu[cpu_iter]->sched_unit; + sr_old = get_sched_res(cpu_iter); + kill_timer(&sr_old->s_timer); + idle_vcpu[cpu_iter]->sched_unit = master_unit; + master_unit->runstate_cnt[RUNSTATE_running]++; + set_sched_res(cpu_iter, sr); + cpumask_set_cpu(cpu_iter, sr->cpus); + + call_rcu(&sr_old->rcu, sched_res_free); + } + } + + new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv); + + sr->scheduler = new_ops; + sr->sched_priv = ppriv; + + /* + * Reroute the lock to the per pCPU lock as /last/ thing. 
In fact, + * if it is free (and it can be) we want that anyone that manages + * taking it, finds all the initializations we've done above in place. + */ + smp_wmb(); + sr->schedule_lock = new_lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */ + spin_unlock_irqrestore(old_lock, flags); + + sr->granularity = cpupool_get_granularity(c); + sr->cpupool = c; + /* The cpu is added to a pool, trigger it to go pick up some work */ + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + +out: + rcu_read_unlock(&sched_res_rculock); + + return ret; +} + +/* + * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops + * (the idle scheduler). + * The cpu is already marked as "free" and not valid any longer for its + * cpupool. + */ +int schedule_cpu_rm(unsigned int cpu) +{ + void *ppriv_old, *vpriv_old; + struct sched_resource *sr, **sr_new = NULL; + struct sched_unit *unit; + struct scheduler *old_ops; + spinlock_t *old_lock; + unsigned long flags; + int idx, ret = -ENOMEM; + unsigned int cpu_iter; + + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); + old_ops = sr->scheduler; + + if ( sr->granularity > 1 ) + { + sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); + if ( !sr_new ) + goto out; + for ( idx = 0; idx < sr->granularity - 1; idx++ ) + { + sr_new[idx] = sched_alloc_res(); + if ( sr_new[idx] ) + { + sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); + if ( !sr_new[idx]->sched_unit_idle ) + { + sched_res_free(&sr_new[idx]->rcu); + sr_new[idx] = NULL; + } + } + if ( !sr_new[idx] ) + { + for ( idx--; idx >= 0; idx-- ) + sched_res_free(&sr_new[idx]->rcu); + goto out; + } + sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; + sr_new[idx]->scheduler = &sched_idle_ops; + sr_new[idx]->granularity = 1; + + /* We want the lock not to change when replacing the resource. */ + sr_new[idx]->schedule_lock = sr->schedule_lock; + } + } + + ret = 0; + ASSERT(sr->cpupool != NULL); + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); + + /* See comment in schedule_cpu_add() regarding lock switching. */ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + vpriv_old = idle_vcpu[cpu]->sched_unit->priv; + ppriv_old = sr->sched_priv; + + idx = 0; + for_each_cpu ( cpu_iter, sr->cpus ) + { + per_cpu(sched_res_idx, cpu_iter) = 0; + if ( cpu_iter == cpu ) + { + idle_vcpu[cpu_iter]->sched_unit->priv = NULL; + } + else + { + /* Initialize unit. */ + unit = sr_new[idx]->sched_unit_idle; + unit->res = sr_new[idx]; + unit->is_running = true; + sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); + sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); + + /* Adjust cpu masks of resources (old and new). */ + cpumask_clear_cpu(cpu_iter, sr->cpus); + cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); + + /* Init timer. */ + init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); + + /* Last resource initializations and insert resource pointer. */ + sr_new[idx]->master_cpu = cpu_iter; + set_sched_res(cpu_iter, sr_new[idx]); + + /* Last action: set the new lock pointer. */ + smp_mb(); + sr_new[idx]->schedule_lock = &sched_free_cpu_lock; + + idx++; + } + } + sr->scheduler = &sched_idle_ops; + sr->sched_priv = NULL; + + smp_mb(); + sr->schedule_lock = &sched_free_cpu_lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! 
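+     * The lock pointer was just redirected to sched_free_cpu_lock above, so
+     * the release has to go through the old_lock value saved earlier.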
*/ + spin_unlock_irqrestore(old_lock, flags); + + sched_deinit_pdata(old_ops, ppriv_old, cpu); + + sched_free_udata(old_ops, vpriv_old); + sched_free_pdata(old_ops, ppriv_old, cpu); + + sr->granularity = 1; + sr->cpupool = NULL; + +out: + rcu_read_unlock(&sched_res_rculock); + xfree(sr_new); + + return ret; +} + +struct scheduler *scheduler_get_default(void) +{ + return &ops; +} + +struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) +{ + int i; + struct scheduler *sched; + + for ( i = 0; i < NUM_SCHEDULERS; i++ ) + if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) + goto found; + *perr = -ENOENT; + return NULL; + + found: + *perr = -ENOMEM; + if ( (sched = xmalloc(struct scheduler)) == NULL ) + return NULL; + memcpy(sched, schedulers[i], sizeof(*sched)); + if ( (*perr = sched_init(sched)) != 0 ) + { + xfree(sched); + sched = NULL; + } + + return sched; +} + +void scheduler_free(struct scheduler *sched) +{ + BUG_ON(sched == &ops); + sched_deinit(sched); + xfree(sched); +} + +void schedule_dump(struct cpupool *c) +{ + unsigned int i; + struct scheduler *sched; + cpumask_t *cpus; + + /* Locking, if necessary, must be handled withing each scheduler */ + + rcu_read_lock(&sched_res_rculock); + + if ( c != NULL ) + { + sched = c->sched; + cpus = c->cpu_valid; + printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); + sched_dump_settings(sched); + } + else + { + sched = &ops; + cpus = &cpupool_free_cpus; + } + + if ( sched->dump_cpu_state != NULL ) + { + printk("CPUs info:\n"); + for_each_cpu (i, cpus) + sched_dump_cpu_state(sched, i); + } + + rcu_read_unlock(&sched_res_rculock); +} + +void sched_tick_suspend(void) +{ + rcu_idle_enter(smp_processor_id()); + rcu_idle_timer_start(); +} + +void sched_tick_resume(void) +{ + rcu_idle_timer_stop(); + rcu_idle_exit(smp_processor_id()); +} + +void wait(void) +{ + schedule(); +} + +#ifdef CONFIG_X86 +void __init sched_setup_dom0_vcpus(struct domain *d) +{ + unsigned int i; + struct sched_unit *unit; + + for ( i = 1; i < d->max_vcpus; i++ ) + vcpu_create(d, i); + + /* + * PV-shim: vcpus are pinned 1:1. + * Initially only 1 cpu is online, others will be dealt with when + * onlining them. This avoids pinning a vcpu to a not yet online cpu here. + */ + if ( pv_shim ) + sched_set_affinity(d->vcpu[0]->sched_unit, + cpumask_of(0), cpumask_of(0)); + else + { + for_each_sched_unit ( d, unit ) + { + if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed ) + sched_set_affinity(unit, &dom0_cpus, NULL); + sched_set_affinity(unit, NULL, &dom0_cpus); + } + } + + domain_update_node_affinity(d); +} +#endif + +#ifdef CONFIG_COMPAT +#include "compat.c" +#endif + +#endif /* !COMPAT */ + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c new file mode 100644 index 0000000000..d66b541a94 --- /dev/null +++ b/xen/common/sched/cpupool.c @@ -0,0 +1,979 @@ +/****************************************************************************** + * cpupool.c + * + * Generic cpupool-handling functions. + * + * Cpupools are a feature to have configurable scheduling domains. Each + * cpupool runs an own scheduler on a dedicated set of physical cpus. + * A domain is bound to one cpupool at any time, but it can be moved to + * another cpupool. 
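+ * At boot all online cpus end up in Pool-0 (see cpupool_init() at the
+ * bottom of this file); cpus and domains are moved between pools via the
+ * XEN_SYSCTL_CPUPOOL_OP_* operations handled in cpupool_do_sysctl(),
+ * typically driven by the toolstack (e.g. xl's cpupool commands).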
+ * + * (C) 2009, Juergen Gross, Fujitsu Technology Solutions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define for_each_cpupool(ptr) \ + for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) + +struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ +cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ + +static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ + +static int cpupool_moving_cpu = -1; +static struct cpupool *cpupool_cpu_moving = NULL; +static cpumask_t cpupool_locked_cpus; + +static DEFINE_SPINLOCK(cpupool_lock); + +static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu; +static unsigned int __read_mostly sched_granularity = 1; + +#ifdef CONFIG_HAS_SCHED_GRANULARITY +static int __init sched_select_granularity(const char *str) +{ + if ( strcmp("cpu", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_cpu; + else if ( strcmp("core", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_core; + else if ( strcmp("socket", str) == 0 ) + opt_sched_granularity = SCHED_GRAN_socket; + else + return -EINVAL; + + return 0; +} +custom_param("sched-gran", sched_select_granularity); +#endif + +static unsigned int __init cpupool_check_granularity(void) +{ + unsigned int cpu; + unsigned int siblings, gran = 0; + + if ( opt_sched_granularity == SCHED_GRAN_cpu ) + return 1; + + for_each_online_cpu ( cpu ) + { + siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity, + cpu)); + if ( gran == 0 ) + gran = siblings; + else if ( gran != siblings ) + return 0; + } + + sched_disable_smt_switching = true; + + return gran; +} + +/* Setup data for selected scheduler granularity. */ +static void __init cpupool_gran_init(void) +{ + unsigned int gran = 0; + const char *fallback = NULL; + + while ( gran == 0 ) + { + gran = cpupool_check_granularity(); + + if ( gran == 0 ) + { + switch ( opt_sched_granularity ) + { + case SCHED_GRAN_core: + opt_sched_granularity = SCHED_GRAN_cpu; + fallback = "Asymmetric cpu configuration.\n" + "Falling back to sched-gran=cpu.\n"; + break; + case SCHED_GRAN_socket: + opt_sched_granularity = SCHED_GRAN_core; + fallback = "Asymmetric cpu configuration.\n" + "Falling back to sched-gran=core.\n"; + break; + default: + ASSERT_UNREACHABLE(); + break; + } + } + } + + if ( fallback ) + warning_add(fallback); + + sched_granularity = gran; +} + +unsigned int cpupool_get_granularity(const struct cpupool *c) +{ + return c ? sched_granularity : 1; +} + +static void free_cpupool_struct(struct cpupool *c) +{ + if ( c ) + { + free_cpumask_var(c->res_valid); + free_cpumask_var(c->cpu_valid); + } + xfree(c); +} + +static struct cpupool *alloc_cpupool_struct(void) +{ + struct cpupool *c = xzalloc(struct cpupool); + + if ( !c ) + return NULL; + + if ( !zalloc_cpumask_var(&c->cpu_valid) || + !zalloc_cpumask_var(&c->res_valid) ) + { + free_cpupool_struct(c); + c = NULL; + } + + return c; +} + +/* + * find a cpupool by it's id. to be called with cpupool lock held + * if exact is not specified, the first cpupool with an id larger or equal to + * the searched id is returned + * returns NULL if not found. + */ +static struct cpupool *__cpupool_find_by_id(int id, int exact) +{ + struct cpupool **q; + + ASSERT(spin_is_locked(&cpupool_lock)); + + for_each_cpupool(q) + if ( (*q)->cpupool_id >= id ) + break; + + return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? 
*q : NULL; +} + +static struct cpupool *cpupool_find_by_id(int poolid) +{ + return __cpupool_find_by_id(poolid, 1); +} + +static struct cpupool *__cpupool_get_by_id(int poolid, int exact) +{ + struct cpupool *c; + spin_lock(&cpupool_lock); + c = __cpupool_find_by_id(poolid, exact); + if ( c != NULL ) + atomic_inc(&c->refcnt); + spin_unlock(&cpupool_lock); + return c; +} + +struct cpupool *cpupool_get_by_id(int poolid) +{ + return __cpupool_get_by_id(poolid, 1); +} + +static struct cpupool *cpupool_get_next_by_id(int poolid) +{ + return __cpupool_get_by_id(poolid, 0); +} + +void cpupool_put(struct cpupool *pool) +{ + if ( !atomic_dec_and_test(&pool->refcnt) ) + return; + scheduler_free(pool->sched); + free_cpupool_struct(pool); +} + +/* + * create a new cpupool with specified poolid and scheduler + * returns pointer to new cpupool structure if okay, NULL else + * possible failures: + * - no memory + * - poolid already used + * - unknown scheduler + */ +static struct cpupool *cpupool_create( + int poolid, unsigned int sched_id, int *perr) +{ + struct cpupool *c; + struct cpupool **q; + int last = 0; + + *perr = -ENOMEM; + if ( (c = alloc_cpupool_struct()) == NULL ) + return NULL; + + /* One reference for caller, one reference for cpupool_destroy(). */ + atomic_set(&c->refcnt, 2); + + debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); + + spin_lock(&cpupool_lock); + + for_each_cpupool(q) + { + last = (*q)->cpupool_id; + if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) + break; + } + if ( *q != NULL ) + { + if ( (*q)->cpupool_id == poolid ) + { + *perr = -EEXIST; + goto err; + } + c->next = *q; + } + + c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid; + if ( poolid == 0 ) + { + c->sched = scheduler_get_default(); + } + else + { + c->sched = scheduler_alloc(sched_id, perr); + if ( c->sched == NULL ) + goto err; + } + c->gran = opt_sched_granularity; + + *q = c; + + spin_unlock(&cpupool_lock); + + debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n", + c->cpupool_id, c->sched->name, c->sched->opt_name); + + *perr = 0; + return c; + + err: + spin_unlock(&cpupool_lock); + free_cpupool_struct(c); + return NULL; +} +/* + * destroys the given cpupool + * returns 0 on success, 1 else + * possible failures: + * - pool still in use + * - cpus still assigned to pool + * - pool not in list + */ +static int cpupool_destroy(struct cpupool *c) +{ + struct cpupool **q; + + spin_lock(&cpupool_lock); + for_each_cpupool(q) + if ( *q == c ) + break; + if ( *q != c ) + { + spin_unlock(&cpupool_lock); + return -ENOENT; + } + if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) + { + spin_unlock(&cpupool_lock); + return -EBUSY; + } + *q = c->next; + spin_unlock(&cpupool_lock); + + cpupool_put(c); + + debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id); + return 0; +} + +/* + * Move domain to another cpupool + */ +static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c) +{ + int ret; + + if ( unlikely(d->cpupool == c) ) + return 0; + + d->cpupool->n_dom--; + ret = sched_move_domain(d, c); + if ( ret ) + d->cpupool->n_dom++; + else + c->n_dom++; + + return ret; +} +int cpupool_move_domain(struct domain *d, struct cpupool *c) +{ + int ret; + + spin_lock(&cpupool_lock); + + ret = cpupool_move_domain_locked(d, c); + + spin_unlock(&cpupool_lock); + + return ret; +} + +/* + * assign a specific cpu to a cpupool + * cpupool_lock must be held + */ +static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) +{ + int ret; + 
struct domain *d; + const cpumask_t *cpus; + + cpus = sched_get_opt_cpumask(c->gran, cpu); + + if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) + return -EADDRNOTAVAIL; + ret = schedule_cpu_add(cpumask_first(cpus), c); + if ( ret ) + return ret; + + rcu_read_lock(&sched_res_rculock); + + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + if (cpupool_moving_cpu == cpu) + { + cpupool_moving_cpu = -1; + cpupool_put(cpupool_cpu_moving); + cpupool_cpu_moving = NULL; + } + cpumask_or(c->cpu_valid, c->cpu_valid, cpus); + cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); + + rcu_read_unlock(&sched_res_rculock); + + rcu_read_lock(&domlist_read_lock); + for_each_domain_in_cpupool(d, c) + { + domain_update_node_affinity(d); + } + rcu_read_unlock(&domlist_read_lock); + + return 0; +} + +static int cpupool_unassign_cpu_finish(struct cpupool *c) +{ + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; + struct domain *d; + int ret; + + if ( c != cpupool_cpu_moving ) + return -EADDRNOTAVAIL; + + /* + * We need this for scanning the domain list, both in + * cpu_disable_scheduler(), and at the bottom of this function. + */ + rcu_read_lock(&domlist_read_lock); + ret = cpu_disable_scheduler(cpu); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + + /* + * cpu_disable_scheduler() returning an error doesn't require resetting + * cpupool_free_cpus' cpu bit. All error cases should be of temporary + * nature and tools will retry the operation. Even if the number of + * retries may be limited, the in-between state can easily be repaired + * by adding the cpu to the cpupool again. + */ + if ( !ret ) + { + ret = schedule_cpu_rm(cpu); + if ( ret ) + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + else + { + cpupool_moving_cpu = -1; + cpupool_put(cpupool_cpu_moving); + cpupool_cpu_moving = NULL; + } + } + rcu_read_unlock(&sched_res_rculock); + + for_each_domain_in_cpupool(d, c) + { + domain_update_node_affinity(d); + } + rcu_read_unlock(&domlist_read_lock); + + return ret; +} + +static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu) +{ + int ret; + struct domain *d; + const cpumask_t *cpus; + + spin_lock(&cpupool_lock); + ret = -EADDRNOTAVAIL; + if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid)) + && (cpu != cpupool_moving_cpu) ) + goto out; + + ret = 0; + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + + if ( (c->n_dom > 0) && + (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) && + (cpu != cpupool_moving_cpu) ) + { + rcu_read_lock(&domlist_read_lock); + for_each_domain_in_cpupool(d, c) + { + if ( !d->is_dying && system_state == SYS_STATE_active ) + { + ret = -EBUSY; + break; + } + ret = cpupool_move_domain_locked(d, cpupool0); + if ( ret ) + break; + } + rcu_read_unlock(&domlist_read_lock); + if ( ret ) + goto out; + } + cpupool_moving_cpu = cpu; + atomic_inc(&c->refcnt); + cpupool_cpu_moving = c; + cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus); + cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask); + + rcu_read_unlock(&domlist_read_lock); +out: + spin_unlock(&cpupool_lock); + + return ret; +} + +static long cpupool_unassign_cpu_helper(void *info) +{ + struct cpupool *c = info; + long ret; + + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); + spin_lock(&cpupool_lock); + + ret = cpupool_unassign_cpu_finish(c); + + 
spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); + + return ret; +} + +/* + * unassign a specific cpu from a cpupool + * we must be sure not to run on the cpu to be unassigned! to achieve this + * the main functionality is performed via continue_hypercall_on_cpu on a + * specific cpu. + * if the cpu to be removed is the last one of the cpupool no active domain + * must be bound to the cpupool. dying domains are moved to cpupool0 as they + * might be zombies. + * possible failures: + * - last cpu and still active domains in cpupool + * - cpu just being unplugged + */ +static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) +{ + int work_cpu; + int ret; + unsigned int master_cpu; + + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", + c->cpupool_id, cpu); + + master_cpu = sched_get_resource_cpu(cpu); + ret = cpupool_unassign_cpu_start(c, master_cpu); + if ( ret ) + { + debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", + c->cpupool_id, cpu, ret); + return ret; + } + + work_cpu = sched_get_resource_cpu(smp_processor_id()); + if ( work_cpu == master_cpu ) + { + work_cpu = cpumask_first(cpupool0->cpu_valid); + if ( work_cpu == master_cpu ) + work_cpu = cpumask_last(cpupool0->cpu_valid); + } + return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); +} + +/* + * add a new domain to a cpupool + * possible failures: + * - pool does not exist + * - no cpu assigned to pool + */ +int cpupool_add_domain(struct domain *d, int poolid) +{ + struct cpupool *c; + int rc; + int n_dom = 0; + + if ( poolid == CPUPOOLID_NONE ) + return 0; + spin_lock(&cpupool_lock); + c = cpupool_find_by_id(poolid); + if ( c == NULL ) + rc = -ESRCH; + else if ( !cpumask_weight(c->cpu_valid) ) + rc = -ENODEV; + else + { + c->n_dom++; + n_dom = c->n_dom; + d->cpupool = c; + rc = 0; + } + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", + d->domain_id, poolid, n_dom, rc); + return rc; +} + +/* + * remove a domain from a cpupool + */ +void cpupool_rm_domain(struct domain *d) +{ + int cpupool_id; + int n_dom; + + if ( d->cpupool == NULL ) + return; + spin_lock(&cpupool_lock); + cpupool_id = d->cpupool->cpupool_id; + d->cpupool->n_dom--; + n_dom = d->cpupool->n_dom; + d->cpupool = NULL; + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", + d->domain_id, cpupool_id, n_dom); + return; +} + +/* + * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0, + * as they must have been in there when unplugged. + */ +static int cpupool_cpu_add(unsigned int cpu) +{ + int ret = 0; + const cpumask_t *cpus; + + spin_lock(&cpupool_lock); + cpumask_clear_cpu(cpu, &cpupool_locked_cpus); + cpumask_set_cpu(cpu, &cpupool_free_cpus); + + /* + * If we are not resuming, we are hot-plugging cpu, and in which case + * we add it to pool0, as it certainly was there when hot-unplagged + * (or unplugging would have failed) and that is the default behavior + * anyway. + */ + rcu_read_lock(&sched_res_rculock); + get_sched_res(cpu)->cpupool = NULL; + + cpus = sched_get_opt_cpumask(cpupool0->gran, cpu); + if ( cpumask_subset(cpus, &cpupool_free_cpus) ) + ret = cpupool_assign_cpu_locked(cpupool0, cpu); + + rcu_read_unlock(&sched_res_rculock); + + spin_unlock(&cpupool_lock); + + return ret; +} + +/* + * This function is called in stop_machine context, so we can be sure no + * non-idle vcpu is active on the system. 
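+ * It is the CPU_DYING counterpart of cpupool_cpu_remove_prologue() below
+ * (which runs at CPU_DOWN_PREPARE); see cpu_callback() at the end of this
+ * file for how the two are wired up.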
+ */ +static void cpupool_cpu_remove(unsigned int cpu) +{ + int ret; + + ASSERT(is_idle_vcpu(current)); + + if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) + { + ret = cpupool_unassign_cpu_finish(cpupool0); + BUG_ON(ret); + } + cpumask_clear_cpu(cpu, &cpupool_free_cpus); +} + +/* + * Called before a CPU is being removed from the system. + * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved + * to free cpus actually before removing them). + * The CPU is locked, to forbid adding it again to another cpupool. + */ +static int cpupool_cpu_remove_prologue(unsigned int cpu) +{ + int ret = 0; + cpumask_t *cpus; + unsigned int master_cpu; + + spin_lock(&cpupool_lock); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; + master_cpu = sched_get_resource_cpu(cpu); + if ( cpumask_intersects(cpus, &cpupool_locked_cpus) ) + ret = -EBUSY; + else + cpumask_set_cpu(cpu, &cpupool_locked_cpus); + rcu_read_unlock(&sched_res_rculock); + + spin_unlock(&cpupool_lock); + + if ( ret ) + return ret; + + if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) ) + { + /* Cpupool0 is populated only after all cpus are up. */ + ASSERT(system_state == SYS_STATE_active); + + ret = cpupool_unassign_cpu_start(cpupool0, master_cpu); + } + else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) ) + ret = -ENODEV; + + return ret; +} + +/* + * Called during resume for all cpus which didn't come up again. The cpu must + * be removed from the cpupool it is assigned to. In case a cpupool will be + * left without cpu we move all domains of that cpupool to cpupool0. + * As we are called with all domains still frozen there is no need to take the + * cpupool lock here. + */ +static void cpupool_cpu_remove_forced(unsigned int cpu) +{ + struct cpupool **c; + int ret; + unsigned int master_cpu = sched_get_resource_cpu(cpu); + + for_each_cpupool ( c ) + { + if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) ) + { + ret = cpupool_unassign_cpu_start(*c, master_cpu); + BUG_ON(ret); + ret = cpupool_unassign_cpu_finish(*c); + BUG_ON(ret); + } + } + + cpumask_clear_cpu(cpu, &cpupool_free_cpus); + + rcu_read_lock(&sched_res_rculock); + sched_rm_cpu(cpu); + rcu_read_unlock(&sched_res_rculock); +} + +/* + * do cpupool related sysctl operations + */ +int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) +{ + int ret; + struct cpupool *c; + + switch ( op->op ) + { + + case XEN_SYSCTL_CPUPOOL_OP_CREATE: + { + int poolid; + + poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? 
+ CPUPOOLID_NONE: op->cpupool_id; + c = cpupool_create(poolid, op->sched_id, &ret); + if ( c != NULL ) + { + op->cpupool_id = c->cpupool_id; + cpupool_put(c); + } + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_DESTROY: + { + c = cpupool_get_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + ret = cpupool_destroy(c); + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_INFO: + { + c = cpupool_get_next_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + op->cpupool_id = c->cpupool_id; + op->sched_id = c->sched->sched_id; + op->n_dom = c->n_dom; + ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: + { + unsigned cpu; + const cpumask_t *cpus; + + cpu = op->cpu; + debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", + op->cpupool_id, cpu); + + spin_lock(&cpupool_lock); + + c = cpupool_find_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + goto addcpu_out; + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + { + for_each_cpu ( cpu, &cpupool_free_cpus ) + { + cpus = sched_get_opt_cpumask(c->gran, cpu); + if ( cpumask_subset(cpus, &cpupool_free_cpus) ) + break; + } + ret = -ENODEV; + if ( cpu >= nr_cpu_ids ) + goto addcpu_out; + } + ret = -EINVAL; + if ( cpu >= nr_cpu_ids ) + goto addcpu_out; + ret = -ENODEV; + cpus = sched_get_opt_cpumask(c->gran, cpu); + if ( !cpumask_subset(cpus, &cpupool_free_cpus) || + cpumask_intersects(cpus, &cpupool_locked_cpus) ) + goto addcpu_out; + ret = cpupool_assign_cpu_locked(c, cpu); + + addcpu_out: + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", + op->cpupool_id, cpu, ret); + + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_RMCPU: + { + unsigned cpu; + + c = cpupool_get_by_id(op->cpupool_id); + ret = -ENOENT; + if ( c == NULL ) + break; + cpu = op->cpu; + if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) + cpu = cpumask_last(c->cpu_valid); + ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL; + cpupool_put(c); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: + { + struct domain *d; + + ret = rcu_lock_remote_domain_by_id(op->domid, &d); + if ( ret ) + break; + if ( d->cpupool == NULL ) + { + ret = -EINVAL; + rcu_unlock_domain(d); + break; + } + if ( op->cpupool_id == d->cpupool->cpupool_id ) + { + ret = 0; + rcu_unlock_domain(d); + break; + } + debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n", + d->domain_id, op->cpupool_id); + ret = -ENOENT; + spin_lock(&cpupool_lock); + + c = cpupool_find_by_id(op->cpupool_id); + if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) + ret = cpupool_move_domain_locked(d, c); + + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", + d->domain_id, op->cpupool_id, ret); + rcu_unlock_domain(d); + } + break; + + case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: + { + ret = cpumask_to_xenctl_bitmap( + &op->cpumap, &cpupool_free_cpus); + } + break; + + default: + ret = -ENOSYS; + break; + } + + return ret; +} + +void dump_runq(unsigned char key) +{ + unsigned long flags; + s_time_t now = NOW(); + struct cpupool **c; + + spin_lock(&cpupool_lock); + local_irq_save(flags); + + printk("sched_smt_power_savings: %s\n", + sched_smt_power_savings? 
"enabled":"disabled"); + printk("NOW=%"PRI_stime"\n", now); + + printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map)); + if ( !cpumask_empty(&cpupool_free_cpus) ) + { + printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus)); + schedule_dump(NULL); + } + + for_each_cpupool(c) + { + printk("Cpupool %d:\n", (*c)->cpupool_id); + printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid)); + schedule_dump(*c); + } + + local_irq_restore(flags); + spin_unlock(&cpupool_lock); +} + +static int cpu_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + switch ( action ) + { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_add(cpu); + break; + case CPU_DOWN_PREPARE: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_remove_prologue(cpu); + break; + case CPU_DYING: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) + cpupool_cpu_remove(cpu); + break; + case CPU_RESUME_FAILED: + cpupool_cpu_remove_forced(cpu); + break; + default: + break; + } + + return !rc ? NOTIFY_DONE : notifier_from_errno(rc); +} + +static struct notifier_block cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init cpupool_init(void) +{ + unsigned int cpu; + int err; + + cpupool_gran_init(); + + cpupool0 = cpupool_create(0, 0, &err); + BUG_ON(cpupool0 == NULL); + cpupool_put(cpupool0); + register_cpu_notifier(&cpu_nfb); + + spin_lock(&cpupool_lock); + + cpumask_copy(&cpupool_free_cpus, &cpu_online_map); + + for_each_cpu ( cpu, &cpupool_free_cpus ) + cpupool_assign_cpu_locked(cpupool0, cpu); + + spin_unlock(&cpupool_lock); + + return 0; +} +__initcall(cpupool_init); + +/* + * Local variables: + * mode: C + * c-file-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c new file mode 100644 index 0000000000..aa41a3301b --- /dev/null +++ b/xen/common/sched/credit.c @@ -0,0 +1,2284 @@ +/**************************************************************************** + * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. + **************************************************************************** + * + * File: common/csched_credit.c + * Author: Emmanuel Ackaouy + * + * Description: Credit-based SMP CPU scheduler + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Locking: + * - Scheduler-lock (a.k.a. runqueue lock): + * + is per-runqueue, and there is one runqueue per-cpu; + * + serializes all runqueue manipulation operations; + * - Private data lock (a.k.a. private scheduler lock): + * + serializes accesses to the scheduler global state (weight, + * credit, balance_credit, etc); + * + serializes updates to the domains' scheduling parameters. + * + * Ordering is "private lock always comes first": + * + if we need both locks, we must acquire the private + * scheduler lock for first; + * + if we already own a runqueue lock, we must never acquire + * the private scheduler lock. + */ + +/* + * Basic constants + */ +#define CSCHED_DEFAULT_WEIGHT 256 +#define CSCHED_TICKS_PER_TSLICE 3 +/* Default timeslice: 30ms */ +#define CSCHED_DEFAULT_TSLICE_MS 30 +#define CSCHED_CREDITS_PER_MSEC 10 +/* Never set a timer shorter than this value. 
*/ +#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN + + +/* + * Priorities + */ +#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ +#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ +#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ +#define CSCHED_PRI_IDLE -64 /* idle */ + + +/* + * Flags + * + * Note that svc->flags (where these flags live) is protected by an + * inconsistent set of locks. Therefore atomic-safe bit operations must + * be used for accessing it. + */ +#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */ +#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */ +#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */ +#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */ + + +/* + * Useful macros + */ +#define CSCHED_PRIV(_ops) \ + ((struct csched_private *)((_ops)->sched_data)) +#define CSCHED_PCPU(_c) \ + ((struct csched_pcpu *)get_sched_res(_c)->sched_priv) +#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv) +#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) +#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) + + +/* + * CSCHED_STATS + * + * Manage very basic per-unit counters and stats. + * + * Useful for debugging live systems. The stats are displayed + * with runq dumps ('r' on the Xen console). + */ +#ifdef SCHED_STATS + +#define CSCHED_STATS + +#define SCHED_UNIT_STATS_RESET(_V) \ + do \ + { \ + memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ + } while ( 0 ) + +#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++) + +#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) + +#else /* !SCHED_STATS */ + +#undef CSCHED_STATS + +#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 ) +#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 ) +#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 ) + +#endif /* SCHED_STATS */ + + +/* + * Credit tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. 
+ */ +#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) +#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) +#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) +#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4) +#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) +#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) +#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) +#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) +#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) +#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) +#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) + +/* + * Boot parameters + */ +static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; +integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); + +/* + * Physical CPU + */ +struct csched_pcpu { + struct list_head runq; + uint32_t runq_sort_last; + + unsigned int idle_bias; + unsigned int nr_runnable; + + unsigned int tick; + struct timer ticker; +}; + +/* + * Virtual UNIT + */ +struct csched_unit { + struct list_head runq_elem; + struct list_head active_unit_elem; + + /* Up-pointers */ + struct csched_dom *sdom; + struct sched_unit *unit; + + s_time_t start_time; /* When we were scheduled (used for credit) */ + unsigned flags; + int pri; + + atomic_t credit; + unsigned int residual; + + s_time_t last_sched_time; + +#ifdef CSCHED_STATS + struct { + int credit_last; + uint32_t credit_incr; + uint32_t state_active; + uint32_t state_idle; + uint32_t migrate_q; + uint32_t migrate_r; + uint32_t kicked_away; + } stats; +#endif +}; + +/* + * Domain + */ +struct csched_dom { + struct list_head active_unit; + struct list_head active_sdom_elem; + struct domain *dom; + uint16_t active_unit_count; + uint16_t weight; + uint16_t cap; +}; + +/* + * System-wide private data + */ +struct csched_private { + /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ + spinlock_t lock; + + cpumask_var_t idlers; + cpumask_var_t cpus; + uint32_t *balance_bias; + uint32_t runq_sort; + uint32_t ncpus; + + /* Period of master and tick in milliseconds */ + unsigned int tick_period_us, ticks_per_tslice; + s_time_t ratelimit, tslice, unit_migr_delay; + + struct list_head active_sdom; + uint32_t weight; + uint32_t credit; + int credit_balance; + unsigned int credits_per_tslice; + + unsigned int master; + struct timer master_ticker; +}; + +static void csched_tick(void *_cpu); +static void csched_acct(void *dummy); + +static inline int +__unit_on_runq(struct csched_unit *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static inline struct csched_unit * +__runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched_unit, runq_elem); +} + +/* Is the first element of cpu's runq (if any) cpu's idle unit? */ +static inline bool_t is_runq_idle(unsigned int cpu) +{ + /* + * We're peeking at cpu's runq, we must hold the proper lock. 
+ */ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + return list_empty(RUNQ(cpu)) || + is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit); +} + +static inline void +inc_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + CSCHED_PCPU(cpu)->nr_runnable++; + +} + +static inline void +dec_nr_runnable(unsigned int cpu) +{ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); + CSCHED_PCPU(cpu)->nr_runnable--; +} + +static inline void +__runq_insert(struct csched_unit *svc) +{ + unsigned int cpu = sched_unit_master(svc->unit); + const struct list_head * const runq = RUNQ(cpu); + struct list_head *iter; + + BUG_ON( __unit_on_runq(svc) ); + + list_for_each( iter, runq ) + { + const struct csched_unit * const iter_svc = __runq_elem(iter); + if ( svc->pri > iter_svc->pri ) + break; + } + + /* If the unit yielded, try to put it behind one lower-priority + * runnable unit if we can. The next runq_sort will bring it forward + * within 30ms if the queue too long. */ + if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags) + && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) + { + iter=iter->next; + + /* Some sanity checks */ + BUG_ON(iter == runq); + } + + list_add_tail(&svc->runq_elem, iter); +} + +static inline void +runq_insert(struct csched_unit *svc) +{ + __runq_insert(svc); + inc_nr_runnable(sched_unit_master(svc->unit)); +} + +static inline void +__runq_remove(struct csched_unit *svc) +{ + BUG_ON( !__unit_on_runq(svc) ); + list_del_init(&svc->runq_elem); +} + +static inline void +runq_remove(struct csched_unit *svc) +{ + dec_nr_runnable(sched_unit_master(svc->unit)); + __runq_remove(svc); +} + +static void burn_credits(struct csched_unit *svc, s_time_t now) +{ + s_time_t delta; + uint64_t val; + unsigned int credits; + + /* Assert svc is current */ + ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) ); + + if ( (delta = now - svc->start_time) <= 0 ) + return; + + val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; + svc->residual = do_div(val, MILLISECS(1)); + credits = val; + ASSERT(credits == val); /* make sure we haven't truncated val */ + atomic_sub(credits, &svc->credit); + svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; +} + +static bool_t __read_mostly opt_tickle_one_idle = 1; +boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); + +DEFINE_PER_CPU(unsigned int, last_tickle_cpu); + +static inline void __runq_tickle(struct csched_unit *new) +{ + unsigned int cpu = sched_unit_master(new->unit); + struct sched_resource *sr = get_sched_res(cpu); + struct sched_unit *unit = new->unit; + struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu)); + struct csched_private *prv = CSCHED_PRIV(sr->scheduler); + cpumask_t mask, idle_mask, *online; + int balance_step, idlers_empty; + + ASSERT(cur); + cpumask_clear(&mask); + + online = cpupool_domain_master_cpumask(new->sdom->dom); + cpumask_and(&idle_mask, prv->idlers, online); + idlers_empty = cpumask_empty(&idle_mask); + + /* + * Exclusive pinning is when a unit has hard-affinity with only one + * cpu, and there is no other unit that has hard-affinity with that + * same cpu. This is infrequent, but if it happens, is for achieving + * the most possible determinism, and least possible overhead for + * the units in question. + * + * Try to identify the vast majority of these situations, and deal + * with them quickly. 
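+     * Concretely: if the waking unit is flagged as pinned and its single
+     * allowed cpu is currently idle, only that cpu gets tickled and the
+     * affinity balancing below is skipped entirely.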
+ */ + if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) && + cpumask_test_cpu(cpu, &idle_mask)) ) + { + ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); + SCHED_STAT_CRANK(tickled_idle_cpu_excl); + __cpumask_set_cpu(cpu, &mask); + goto tickle; + } + + /* + * If the pcpu is idle, or there are no idlers and the new + * unit is a higher priority than the old unit, run it here. + * + * If there are idle cpus, first try to find one suitable to run + * new, so we can avoid preempting cur. If we cannot find a + * suitable idler on which to run new, run it here, but try to + * find a suitable idler on which to run cur instead. + */ + if ( cur->pri == CSCHED_PRI_IDLE + || (idlers_empty && new->pri > cur->pri) ) + { + if ( cur->pri != CSCHED_PRI_IDLE ) + SCHED_STAT_CRANK(tickled_busy_cpu); + else + SCHED_STAT_CRANK(tickled_idle_cpu); + __cpumask_set_cpu(cpu, &mask); + } + else if ( !idlers_empty ) + { + /* + * Soft and hard affinity balancing loop. For units without + * a useful soft affinity, consider hard affinity only. + */ + for_each_affinity_balance_step( balance_step ) + { + int new_idlers_empty; + + if ( balance_step == BALANCE_SOFT_AFFINITY + && !has_soft_affinity(unit) ) + continue; + + /* Are there idlers suitable for new (for this balance step)? */ + affinity_balance_cpumask(unit, balance_step, + cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), + cpumask_scratch_cpu(cpu), &idle_mask); + new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); + + /* + * Let's not be too harsh! If there aren't idlers suitable + * for new in its soft affinity mask, make sure we check its + * hard affinity as well, before taking final decisions. + */ + if ( new_idlers_empty + && balance_step == BALANCE_SOFT_AFFINITY ) + continue; + + /* + * If there are no suitable idlers for new, and it's higher + * priority than cur, check whether we can migrate cur away. + * We have to do it indirectly, via _VPF_migrating (instead + * of just tickling any idler suitable for cur) because cur + * is running. + * + * If there are suitable idlers for new, no matter priorities, + * leave cur alone (as it is running and is, likely, cache-hot) + * and wake some of them (which is waking up and so is, likely, + * cache cold anyway). + */ + if ( new_idlers_empty && new->pri > cur->pri ) + { + if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) ) + { + SCHED_UNIT_STAT_CRANK(cur, kicked_away); + SCHED_UNIT_STAT_CRANK(cur, migrate_r); + SCHED_STAT_CRANK(migrate_kicked_away); + sched_set_pause_flags_atomic(cur->unit, _VPF_migrating); + } + /* Tickle cpu anyway, to let new preempt cur. */ + SCHED_STAT_CRANK(tickled_busy_cpu); + __cpumask_set_cpu(cpu, &mask); + } + else if ( !new_idlers_empty ) + { + /* Which of the idlers suitable for new shall we wake up? */ + SCHED_STAT_CRANK(tickled_idle_cpu); + if ( opt_tickle_one_idle ) + { + this_cpu(last_tickle_cpu) = + cpumask_cycle(this_cpu(last_tickle_cpu), + cpumask_scratch_cpu(cpu)); + __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + } + else + cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); + } + + /* Did we find anyone? */ + if ( !cpumask_empty(&mask) ) + break; + } + } + + tickle: + if ( !cpumask_empty(&mask) ) + { + if ( unlikely(tb_init_done) ) + { + /* Avoid TRACE_*: saves checking !tb_init_done each step */ + for_each_cpu(cpu, &mask) + __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); + } + + /* + * Mark the designated CPUs as busy and send them all the scheduler + * interrupt. 
We need the for_each_cpu for dealing with the + * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and + * can't use cpumask_andnot(), because prv->idlers needs atomic access. + * + * In the default (and most common) case, when opt_rickle_one_idle is + * true, the loop does only one step, and only one bit is cleared. + */ + for_each_cpu(cpu, &mask) + cpumask_clear_cpu(cpu, prv->idlers); + cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); + } + else + SCHED_STAT_CRANK(tickled_no_cpu); +} + +static void +csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + + /* + * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're + * beeing called from CPU_UP_CANCELLED, because bringing up a pCPU failed + * very early. xfree() does not really mind, but we want to be sure that, + * when we get here, either init_pdata has never been called, or + * deinit_pdata has been called already. + */ + ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); + + xfree(pcpu); +} + +static void +csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc = pcpu; + unsigned int node = cpu_to_node(cpu); + unsigned long flags; + + /* + * Scheduler specific data for this pCPU must still be there and and be + * valid. In fact, if we are here: + * 1. alloc_pdata must have been called for this cpu, and free_pdata + * must not have been called on it before us, + * 2. init_pdata must have been called on this cpu, and deinit_pdata + * (us!) must not have been called on it already. + */ + ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); + + spin_lock_irqsave(&prv->lock, flags); + + prv->credit -= prv->credits_per_tslice; + prv->ncpus--; + cpumask_clear_cpu(cpu, prv->idlers); + cpumask_clear_cpu(cpu, prv->cpus); + if ( (prv->master == cpu) && (prv->ncpus > 0) ) + { + prv->master = cpumask_first(prv->cpus); + migrate_timer(&prv->master_ticker, prv->master); + } + if ( prv->balance_bias[node] == cpu ) + { + cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); + if ( !cpumask_empty(cpumask_scratch) ) + prv->balance_bias[node] = cpumask_first(cpumask_scratch); + } + kill_timer(&spc->ticker); + if ( prv->ncpus == 0 ) + kill_timer(&prv->master_ticker); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void * +csched_alloc_pdata(const struct scheduler *ops, int cpu) +{ + struct csched_pcpu *spc; + + /* Allocate per-PCPU info */ + spc = xzalloc(struct csched_pcpu); + if ( spc == NULL ) + return ERR_PTR(-ENOMEM); + + return spc; +} + +static void +init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) +{ + ASSERT(spin_is_locked(&prv->lock)); + /* cpu data needs to be allocated, but STILL uninitialized. 
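+     * i.e. it still is the zero-filled memory from the xzalloc() in
+     * csched_alloc_pdata(), which is what the ASSERT below verifies for
+     * the runqueue list head.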
*/ + ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); + + /* Initialize/update system-wide config */ + prv->credit += prv->credits_per_tslice; + prv->ncpus++; + cpumask_set_cpu(cpu, prv->cpus); + if ( prv->ncpus == 1 ) + { + prv->master = cpu; + init_timer(&prv->master_ticker, csched_acct, prv, cpu); + set_timer(&prv->master_ticker, NOW() + prv->tslice); + } + + cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); + if ( cpumask_weight(cpumask_scratch) == 1 ) + prv->balance_bias[cpu_to_node(cpu)] = cpu; + + init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); + set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); + + INIT_LIST_HEAD(&spc->runq); + spc->runq_sort_last = prv->runq_sort; + spc->idle_bias = nr_cpu_ids - 1; + + /* Start off idling... */ + BUG_ON(!is_idle_unit(curr_on_cpu(cpu))); + cpumask_set_cpu(cpu, prv->idlers); + spc->nr_runnable = 0; +} + +static void +csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + unsigned long flags; + struct csched_private *prv = CSCHED_PRIV(ops); + + spin_lock_irqsave(&prv->lock, flags); + init_pdata(prv, pdata, cpu); + spin_unlock_irqrestore(&prv->lock, flags); +} + +/* Change the scheduler of cpu to us (Credit). */ +static spinlock_t * +csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct csched_private *prv = CSCHED_PRIV(new_ops); + struct csched_unit *svc = vdata; + + ASSERT(svc && is_idle_unit(svc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It actually may or may not be the 'right' + * one for this cpu, but that is ok for preventing races. + */ + ASSERT(!local_irq_is_enabled()); + spin_lock(&prv->lock); + init_pdata(prv, pdata, cpu); + spin_unlock(&prv->lock); + + return &sr->_lock; +} + +#ifndef NDEBUG +static inline void +__csched_unit_check(struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + struct csched_dom * const sdom = svc->sdom; + + BUG_ON( svc->unit != unit ); + BUG_ON( sdom != CSCHED_DOM(unit->domain) ); + if ( sdom ) + { + BUG_ON( is_idle_unit(unit) ); + BUG_ON( sdom->dom != unit->domain ); + } + else + { + BUG_ON( !is_idle_unit(unit) ); + } + + SCHED_STAT_CRANK(unit_check); +} +#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit)) +#else +#define CSCHED_UNIT_CHECK(unit) +#endif + +/* + * Delay, in microseconds, between migrations of a UNIT between PCPUs. + * This prevents rapid fluttering of a UNIT between CPUs, and reduces the + * implicit overheads such as cache-warming. 1ms (1000) has been measured + * as a good value. + */ +static unsigned int vcpu_migration_delay_us; +integer_param("vcpu_migration_delay", vcpu_migration_delay_us); + +static inline bool +__csched_vcpu_is_cache_hot(const struct csched_private *prv, + const struct csched_unit *svc) +{ + bool hot = prv->unit_migr_delay && + (NOW() - svc->last_sched_time) < prv->unit_migr_delay; + + if ( hot ) + SCHED_STAT_CRANK(unit_hot); + + return hot; +} + +static inline int +__csched_unit_is_migrateable(const struct csched_private *prv, + struct sched_unit *unit, + int dest_cpu, cpumask_t *mask) +{ + const struct csched_unit *svc = CSCHED_UNIT(unit); + /* + * Don't pick up work that's hot on peer PCPU, or that can't (or + * would prefer not to) run on cpu. 
+ * + * The caller is supposed to have already checked that unit is also + * not running. + */ + ASSERT(!unit->is_running); + + return !__csched_vcpu_is_cache_hot(prv, svc) && + cpumask_test_cpu(dest_cpu, mask); +} + +static int +_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit, + bool_t commit) +{ + int cpu = sched_unit_master(unit); + /* We must always use cpu's scratch space */ + cpumask_t *cpus = cpumask_scratch_cpu(cpu); + cpumask_t idlers; + cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); + struct csched_pcpu *spc = NULL; + int balance_step; + + for_each_affinity_balance_step( balance_step ) + { + affinity_balance_cpumask(unit, balance_step, cpus); + cpumask_and(cpus, online, cpus); + /* + * We want to pick up a pcpu among the ones that are online and + * can accommodate vc. As far as hard affinity is concerned, there + * always will be at least one of these pcpus in the scratch cpumask, + * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below + * are ok. + * + * On the other hand, when considering soft affinity, it is possible + * that the mask is empty (for instance, if the domain has been put + * in a cpupool that does not contain any of the pcpus in its soft + * affinity), which would result in the ASSERT()-s inside cpumask_*() + * operations triggering (in debug builds). + * + * Therefore, if that is the case, we just skip the soft affinity + * balancing step all together. + */ + if ( balance_step == BALANCE_SOFT_AFFINITY && + (!has_soft_affinity(unit) || cpumask_empty(cpus)) ) + continue; + + /* If present, prefer vc's current processor */ + cpu = cpumask_test_cpu(sched_unit_master(unit), cpus) + ? sched_unit_master(unit) + : cpumask_cycle(sched_unit_master(unit), cpus); + ASSERT(cpumask_test_cpu(cpu, cpus)); + + /* + * Try to find an idle processor within the above constraints. + * + * In multi-core and multi-threaded CPUs, not all idle execution + * vehicles are equal! + * + * We give preference to the idle execution vehicle with the most + * idling neighbours in its grouping. This distributes work across + * distinct cores first and guarantees we don't do something stupid + * like run two UNITs on co-hyperthreads while there are idle cores + * or sockets. + * + * Notice that, when computing the "idleness" of cpu, we may want to + * discount unit. That is, iff unit is the currently running and the + * only runnable unit on cpu, we add cpu to the idlers. + */ + cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); + if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) ) + __cpumask_set_cpu(cpu, &idlers); + cpumask_and(cpus, &idlers, cpus); + + /* + * It is important that cpu points to an idle processor, if a suitable + * one exists (and we can use cpus to check and, possibly, choose a new + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and + * cpu points to a busy thread with an idle sibling, both the threads + * will be considered the same, from the "idleness" calculation point + * of view", preventing unit from being moved to the thread that is + * actually idle. + * + * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so + * we check for it first. 
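+ * As a concrete case: if cpu is a busy thread whose idle sibling is in
+ * cpus, cpu itself is not in cpus, so the cpumask_cycle() below hops onto
+ * an idle cpu (possibly that very sibling) instead of staying put.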
+ */ + if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) ) + cpu = cpumask_cycle(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); + + while ( !cpumask_empty(cpus) ) + { + cpumask_t cpu_idlers; + cpumask_t nxt_idlers; + int nxt, weight_cpu, weight_nxt; + int migrate_factor; + + nxt = cpumask_cycle(cpu, cpus); + + if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + { + /* We're on the same socket, so check the busy-ness of threads. + * Migrate if # of idlers is less at all */ + ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 1; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, + cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, + nxt)); + } + else + { + /* We're on different sockets, so check the busy-ness of cores. + * Migrate only if the other core is twice as idle */ + ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 2; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + } + + weight_cpu = cpumask_weight(&cpu_idlers); + weight_nxt = cpumask_weight(&nxt_idlers); + /* smt_power_savings: consolidate work rather than spreading it */ + if ( sched_smt_power_savings ? + weight_cpu > weight_nxt : + weight_cpu * migrate_factor < weight_nxt ) + { + cpumask_and(&nxt_idlers, &nxt_idlers, cpus); + spc = CSCHED_PCPU(nxt); + cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); + cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu)); + } + else + { + cpumask_andnot(cpus, cpus, &nxt_idlers); + } + } + + /* Stop if cpu is idle */ + if ( cpumask_test_cpu(cpu, &idlers) ) + break; + } + + if ( commit && spc ) + spc->idle_bias = cpu; + + TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id, + cpu); + + return cpu; +} + +static struct sched_resource * +csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + struct csched_unit *svc = CSCHED_UNIT(unit); + + /* + * We have been called by vcpu_migrate() (in schedule.c), as part + * of the process of seeing if vc can be migrated to another pcpu. + * We make a note about this in svc->flags so that later, in + * csched_unit_wake() (still called from vcpu_migrate()) we won't + * get boosted, which we don't deserve as we are "only" migrating. 
+ */ + set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); + return get_sched_res(_csched_cpu_pick(ops, unit, 1)); +} + +static inline void +__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + if ( list_empty(&svc->active_unit_elem) ) + { + SCHED_UNIT_STAT_CRANK(svc, state_active); + SCHED_STAT_CRANK(acct_unit_active); + + sdom->active_unit_count++; + list_add(&svc->active_unit_elem, &sdom->active_unit); + /* Make weight per-unit */ + prv->weight += sdom->weight; + if ( list_empty(&sdom->active_sdom_elem) ) + { + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + } + } + + TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, + svc->unit->unit_id, sdom->active_unit_count); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static inline void +__csched_unit_acct_stop_locked(struct csched_private *prv, + struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + + BUG_ON( list_empty(&svc->active_unit_elem) ); + + SCHED_UNIT_STAT_CRANK(svc, state_idle); + SCHED_STAT_CRANK(acct_unit_idle); + + BUG_ON( prv->weight < sdom->weight ); + sdom->active_unit_count--; + list_del_init(&svc->active_unit_elem); + prv->weight -= sdom->weight; + if ( list_empty(&sdom->active_unit) ) + { + list_del_init(&sdom->active_sdom_elem); + } + + TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, + svc->unit->unit_id, sdom->active_unit_count); +} + +static void +csched_unit_acct(struct csched_private *prv, unsigned int cpu) +{ + struct sched_unit *currunit = current->sched_unit; + struct csched_unit * const svc = CSCHED_UNIT(currunit); + struct sched_resource *sr = get_sched_res(cpu); + const struct scheduler *ops = sr->scheduler; + + ASSERT( sched_unit_master(currunit) == cpu ); + ASSERT( svc->sdom != NULL ); + ASSERT( !is_idle_unit(svc->unit) ); + + /* + * If this UNIT's priority was boosted when it last awoke, reset it. + * If the UNIT is found here, then it's consuming a non-negligeable + * amount of CPU resources and should no longer be boosted. + */ + if ( svc->pri == CSCHED_PRI_TS_BOOST ) + { + svc->pri = CSCHED_PRI_TS_UNDER; + TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, + svc->unit->unit_id); + } + + /* + * Update credits + */ + burn_credits(svc, NOW()); + + /* + * Put this UNIT and domain back on the active list if it was + * idling. + */ + if ( list_empty(&svc->active_unit_elem) ) + { + __csched_unit_acct_start(prv, svc); + } + else + { + unsigned int new_cpu; + unsigned long flags; + spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags); + + /* + * If it's been active a while, check if we'd be better off + * migrating it to run elsewhere (see multi-core and multi-thread + * support in csched_res_pick()). + */ + new_cpu = _csched_cpu_pick(ops, currunit, 0); + + unit_schedule_unlock_irqrestore(lock, flags, currunit); + + if ( new_cpu != cpu ) + { + SCHED_UNIT_STAT_CRANK(svc, migrate_r); + SCHED_STAT_CRANK(migrate_running); + sched_set_pause_flags_atomic(currunit, _VPF_migrating); + /* + * As we are about to tickle cpu, we should clear its bit in + * idlers. But, if we are here, it means there is someone running + * on it, and hence the bit must be zero already. 
+ */ + ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + } +} + +static void * +csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + struct csched_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct csched_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->runq_elem); + INIT_LIST_HEAD(&svc->active_unit_elem); + svc->sdom = dd; + svc->unit = unit; + svc->pri = is_idle_unit(unit) ? + CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; + SCHED_UNIT_STATS_RESET(svc); + SCHED_STAT_CRANK(unit_alloc); + return svc; +} + +static void +csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit *svc = unit->priv; + spinlock_t *lock; + + BUG_ON( is_idle_unit(unit) ); + + /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */ + lock = unit_schedule_lock_irq(unit); + + sched_set_res(unit, csched_res_pick(ops, unit)); + + spin_unlock_irq(lock); + + lock = unit_schedule_lock_irq(unit); + + if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running ) + runq_insert(svc); + + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_insert); +} + +static void +csched_free_udata(const struct scheduler *ops, void *priv) +{ + struct csched_unit *svc = priv; + + BUG_ON( !list_empty(&svc->runq_elem) ); + + xfree(svc); +} + +static void +csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_unit * const svc = CSCHED_UNIT(unit); + struct csched_dom * const sdom = svc->sdom; + + SCHED_STAT_CRANK(unit_remove); + + ASSERT(!__unit_on_runq(svc)); + + if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + SCHED_STAT_CRANK(unit_unpark); + sched_unit_unpause(svc->unit); + } + + spin_lock_irq(&prv->lock); + + if ( !list_empty(&svc->active_unit_elem) ) + __csched_unit_acct_stop_locked(prv, svc); + + spin_unlock_irq(&prv->lock); + + BUG_ON( sdom == NULL ); +} + +static void +csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + unsigned int cpu = sched_unit_master(unit); + struct sched_resource *sr = get_sched_res(cpu); + + SCHED_STAT_CRANK(unit_sleep); + + BUG_ON( is_idle_unit(unit) ); + + if ( curr_on_cpu(cpu) == unit ) + { + /* + * We are about to tickle cpu, so we should clear its bit in idlers. + * But, we are here because unit is going to sleep while running on cpu, + * so the bit must be zero already. + */ + ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + else if ( __unit_on_runq(svc) ) + runq_remove(svc); +} + +static void +csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + bool_t migrating; + + BUG_ON( is_idle_unit(unit) ); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + if ( unlikely(__unit_on_runq(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* + * We temporarily boost the priority of awaking UNITs! + * + * If this UNIT consumes a non negligible amount of CPU, it + * will eventually find itself in the credit accounting code + * path where its priority will be reset to normal. 
+ * + * If on the other hand the UNIT consumes little CPU and is + * blocking and awoken a lot (doing I/O for example), its + * priority will remain boosted, optimizing it's wake-to-run + * latencies. + * + * This allows wake-to-run latency sensitive UNITs to preempt + * more CPU resource intensive UNITs without impacting overall + * system fairness. + * + * There are two cases, when we don't want to boost: + * - UNITs that are waking up after a migration, rather than + * after having block; + * - UNITs of capped domains unpausing after earning credits + * they had overspent. + */ + migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); + + if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && + !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id, + unit->unit_id); + SCHED_STAT_CRANK(unit_boost); + svc->pri = CSCHED_PRI_TS_BOOST; + } + + /* Put the UNIT on the runq and tickle CPUs */ + runq_insert(svc); + __runq_tickle(svc); +} + +static void +csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched_unit * const svc = CSCHED_UNIT(unit); + + /* Let the scheduler know that this vcpu is trying to yield */ + set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags); +} + +static int +csched_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct csched_dom * const sdom = CSCHED_DOM(d); + struct csched_private *prv = CSCHED_PRIV(ops); + unsigned long flags; + int rc = 0; + + /* Protect both get and put branches with the pluggable scheduler + * lock. Runq lock not needed anywhere in here. */ + spin_lock_irqsave(&prv->lock, flags); + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + op->u.credit.weight = sdom->weight; + op->u.credit.cap = sdom->cap; + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + if ( op->u.credit.weight != 0 ) + { + if ( !list_empty(&sdom->active_sdom_elem) ) + { + prv->weight -= sdom->weight * sdom->active_unit_count; + prv->weight += op->u.credit.weight * sdom->active_unit_count; + } + sdom->weight = op->u.credit.weight; + } + + if ( op->u.credit.cap != (uint16_t)~0U ) + sdom->cap = op->u.credit.cap; + break; + default: + rc = -EINVAL; + break; + } + + spin_unlock_irqrestore(&prv->lock, flags); + + return rc; +} + +static void +csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, + const cpumask_t *hard, const cpumask_t *soft) +{ + struct csched_unit *svc = CSCHED_UNIT(unit); + + if ( !hard ) + return; + + /* Are we becoming exclusively pinned? 
*/ + if ( cpumask_weight(hard) == 1 ) + set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); + else + clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); +} + +static inline void +__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms) +{ + prv->tslice = MILLISECS(timeslice_ms); + prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; + if ( timeslice_ms < prv->ticks_per_tslice ) + prv->ticks_per_tslice = 1; + prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice; + prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms; + prv->credit = prv->credits_per_tslice * prv->ncpus; +} + +static int +csched_sys_cntl(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + int rc = -EINVAL; + struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; + struct csched_private *prv = CSCHED_PRIV(ops); + unsigned long flags; + + switch ( sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX + || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN + || (params->ratelimit_us + && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX + || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) + || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) + || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) + goto out; + + spin_lock_irqsave(&prv->lock, flags); + __csched_set_tslice(prv, params->tslice_ms); + if ( !prv->ratelimit && params->ratelimit_us ) + printk(XENLOG_INFO "Enabling context switch rate limiting\n"); + else if ( prv->ratelimit && !params->ratelimit_us ) + printk(XENLOG_INFO "Disabling context switch rate limiting\n"); + prv->ratelimit = MICROSECS(params->ratelimit_us); + prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us); + spin_unlock_irqrestore(&prv->lock, flags); + + /* FALLTHRU */ + case XEN_SYSCTL_SCHEDOP_getinfo: + params->tslice_ms = prv->tslice / MILLISECS(1); + params->ratelimit_us = prv->ratelimit / MICROSECS(1); + params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1); + rc = 0; + break; + } + out: + return rc; +} + +static void * +csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + struct csched_dom *sdom; + + sdom = xzalloc(struct csched_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + /* Initialize credit and weight */ + INIT_LIST_HEAD(&sdom->active_unit); + INIT_LIST_HEAD(&sdom->active_sdom_elem); + sdom->dom = dom; + sdom->weight = CSCHED_DEFAULT_WEIGHT; + + return sdom; +} + +static void +csched_free_domdata(const struct scheduler *ops, void *data) +{ + xfree(data); +} + +/* + * This is a O(n) optimized sort of the runq. + * + * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk + * through the runq and move up any UNDERs that are preceded by OVERS. We + * remember the last UNDER to make the move up operation O(1). 
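+ * For instance, a runq ordered U O U O U (U=UNDER, O=OVER) comes out as
+ * U U U O O, with the relative order within each class preserved.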
+ */ +static void +csched_runq_sort(struct csched_private *prv, unsigned int cpu) +{ + struct csched_pcpu * const spc = CSCHED_PCPU(cpu); + struct list_head *runq, *elem, *next, *last_under; + struct csched_unit *svc_elem; + spinlock_t *lock; + unsigned long flags; + int sort_epoch; + + sort_epoch = prv->runq_sort; + if ( sort_epoch == spc->runq_sort_last ) + return; + + spc->runq_sort_last = sort_epoch; + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + runq = &spc->runq; + elem = runq->next; + last_under = runq; + + while ( elem != runq ) + { + next = elem->next; + svc_elem = __runq_elem(elem); + + if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) + { + /* does elem need to move up the runq? */ + if ( elem->prev != last_under ) + { + list_del(elem); + list_add(elem, last_under); + } + last_under = elem; + } + + elem = next; + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); +} + +static void +csched_acct(void* dummy) +{ + struct csched_private *prv = dummy; + unsigned long flags; + struct list_head *iter_unit, *next_unit; + struct list_head *iter_sdom, *next_sdom; + struct csched_unit *svc; + struct csched_dom *sdom; + uint32_t credit_total; + uint32_t weight_total; + uint32_t weight_left; + uint32_t credit_fair; + uint32_t credit_peak; + uint32_t credit_cap; + int credit_balance; + int credit_xtra; + int credit; + + + spin_lock_irqsave(&prv->lock, flags); + + weight_total = prv->weight; + credit_total = prv->credit; + + /* Converge balance towards 0 when it drops negative */ + if ( prv->credit_balance < 0 ) + { + credit_total -= prv->credit_balance; + SCHED_STAT_CRANK(acct_balance); + } + + if ( unlikely(weight_total == 0) ) + { + prv->credit_balance = 0; + spin_unlock_irqrestore(&prv->lock, flags); + SCHED_STAT_CRANK(acct_no_work); + goto out; + } + + SCHED_STAT_CRANK(acct_run); + + weight_left = weight_total; + credit_balance = 0; + credit_xtra = 0; + credit_cap = 0U; + + list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) + { + sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); + + BUG_ON( is_idle_domain(sdom->dom) ); + BUG_ON( sdom->active_unit_count == 0 ); + BUG_ON( sdom->weight == 0 ); + BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left ); + + weight_left -= ( sdom->weight * sdom->active_unit_count ); + + /* + * A domain's fair share is computed using its weight in competition + * with that of all other active domains. + * + * At most, a domain can use credits to run all its active UNITs + * for one full accounting period. We allow a domain to earn more + * only when the system-wide credit balance is negative. + */ + credit_peak = sdom->active_unit_count * prv->credits_per_tslice; + if ( prv->credit_balance < 0 ) + { + credit_peak += ( ( -prv->credit_balance + * sdom->weight + * sdom->active_unit_count) + + (weight_total - 1) + ) / weight_total; + } + + if ( sdom->cap != 0U ) + { + credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; + if ( credit_cap < credit_peak ) + credit_peak = credit_cap; + + /* FIXME -- set cap per-unit as well...? 
*/ + credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 ) + ) / sdom->active_unit_count; + } + + credit_fair = ( ( credit_total + * sdom->weight + * sdom->active_unit_count ) + + (weight_total - 1) + ) / weight_total; + + if ( credit_fair < credit_peak ) + { + credit_xtra = 1; + } + else + { + if ( weight_left != 0U ) + { + /* Give other domains a chance at unused credits */ + credit_total += ( ( ( credit_fair - credit_peak + ) * weight_total + ) + ( weight_left - 1 ) + ) / weight_left; + } + + if ( credit_xtra ) + { + /* + * Lazily keep domains with extra credits at the head of + * the queue to give others a chance at them in future + * accounting periods. + */ + SCHED_STAT_CRANK(acct_reorder); + list_del(&sdom->active_sdom_elem); + list_add(&sdom->active_sdom_elem, &prv->active_sdom); + } + + credit_fair = credit_peak; + } + + /* Compute fair share per UNIT */ + credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 ) + ) / sdom->active_unit_count; + + + list_for_each_safe( iter_unit, next_unit, &sdom->active_unit ) + { + svc = list_entry(iter_unit, struct csched_unit, active_unit_elem); + BUG_ON( sdom != svc->sdom ); + + /* Increment credit */ + atomic_add(credit_fair, &svc->credit); + credit = atomic_read(&svc->credit); + + /* + * Recompute priority or, if UNIT is idling, remove it from + * the active list. + */ + if ( credit < 0 ) + { + svc->pri = CSCHED_PRI_TS_OVER; + + /* Park running UNITs of capped-out domains */ + if ( sdom->cap != 0U && + credit < -credit_cap && + !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + SCHED_STAT_CRANK(unit_park); + sched_unit_pause_nosync(svc->unit); + } + + /* Lower bound on credits */ + if ( credit < -prv->credits_per_tslice ) + { + SCHED_STAT_CRANK(acct_min_credit); + credit = -prv->credits_per_tslice; + atomic_set(&svc->credit, credit); + } + } + else + { + svc->pri = CSCHED_PRI_TS_UNDER; + + /* Unpark any capped domains whose credits go positive */ + if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) + { + /* + * It's important to unset the flag AFTER the unpause() + * call to make sure the UNIT's priority is not boosted + * if it is woken up here. + */ + SCHED_STAT_CRANK(unit_unpark); + sched_unit_unpause(svc->unit); + clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags); + } + + /* Upper bound on credits means UNIT stops earning */ + if ( credit > prv->credits_per_tslice ) + { + __csched_unit_acct_stop_locked(prv, svc); + /* Divide credits in half, so that when it starts + * accounting again, it starts a little bit "ahead" */ + credit /= 2; + atomic_set(&svc->credit, credit); + } + } + + SCHED_UNIT_STAT_SET(svc, credit_last, credit); + SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair); + credit_balance += credit; + } + } + + prv->credit_balance = credit_balance; + + spin_unlock_irqrestore(&prv->lock, flags); + + /* Inform each CPU that its runq needs to be sorted */ + prv->runq_sort++; + +out: + set_timer( &prv->master_ticker, NOW() + prv->tslice); +} + +static void +csched_tick(void *_cpu) +{ + unsigned int cpu = (unsigned long)_cpu; + struct sched_resource *sr = get_sched_res(cpu); + struct csched_pcpu *spc = CSCHED_PCPU(cpu); + struct csched_private *prv = CSCHED_PRIV(sr->scheduler); + + spc->tick++; + + /* + * Accounting for running UNIT + */ + if ( !is_idle_unit(current->sched_unit) ) + csched_unit_acct(prv, cpu); + + /* + * Check if runq needs to be sorted + * + * Every physical CPU resorts the runq after the accounting master has + * modified priorities. 
This is a special O(n) sort and runs at most + * once per accounting period (currently 30 milliseconds). + */ + csched_runq_sort(prv, cpu); + + set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); +} + +static struct csched_unit * +csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) +{ + struct sched_resource *sr = get_sched_res(cpu); + const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler); + const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); + struct csched_unit *speer; + struct list_head *iter; + struct sched_unit *unit; + + ASSERT(peer_pcpu != NULL); + + /* + * Don't steal from an idle CPU's runq because it's about to + * pick up work from it itself. + */ + if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) ) + goto out; + + list_for_each( iter, &peer_pcpu->runq ) + { + speer = __runq_elem(iter); + + /* + * If next available UNIT here is not of strictly higher + * priority than ours, this PCPU is useless to us. + */ + if ( speer->pri <= pri ) + break; + + /* Is this UNIT runnable on our PCPU? */ + unit = speer->unit; + BUG_ON( is_idle_unit(unit) ); + + /* + * If the unit is still in peer_cpu's scheduling tail, or if it + * has no useful soft affinity, skip it. + * + * In fact, what we want is to check if we have any "soft-affine + * work" to steal, before starting to look at "hard-affine work". + * + * Notice that, if not even one unit on this runq has a useful + * soft affinity, we could have avoid considering this runq for + * a soft balancing step in the first place. This, for instance, + * can be implemented by taking note of on what runq there are + * units with useful soft affinities in some sort of bitmap + * or counter. + */ + if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(unit)) ) + continue; + + affinity_balance_cpumask(unit, balance_step, cpumask_scratch); + if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) ) + { + /* We got a candidate. Grab it! */ + TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu, + unit->domain->domain_id, unit->unit_id); + SCHED_UNIT_STAT_CRANK(speer, migrate_q); + SCHED_STAT_CRANK(migrate_queued); + runq_remove(speer); + sched_set_res(unit, get_sched_res(cpu)); + /* + * speer will start executing directly on cpu, without having to + * go through runq_insert(). So we must update the runnable count + * for cpu here. + */ + inc_nr_runnable(cpu); + return speer; + } + } + out: + SCHED_STAT_CRANK(steal_peer_idle); + return NULL; +} + +static struct csched_unit * +csched_load_balance(struct csched_private *prv, int cpu, + struct csched_unit *snext, bool *stolen) +{ + struct cpupool *c = get_sched_res(cpu)->cpupool; + struct csched_unit *speer; + cpumask_t workers; + cpumask_t *online = c->res_valid; + int peer_cpu, first_cpu, peer_node, bstep; + int node = cpu_to_node(cpu); + + BUG_ON(get_sched_res(cpu) != snext->unit->res); + + /* + * If this CPU is going offline, or is not (yet) part of any cpupool + * (as it happens, e.g., during cpu bringup), we shouldn't steal work. + */ + if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) ) + goto out; + + if ( snext->pri == CSCHED_PRI_IDLE ) + SCHED_STAT_CRANK(load_balance_idle); + else if ( snext->pri == CSCHED_PRI_TS_OVER ) + SCHED_STAT_CRANK(load_balance_over); + else + SCHED_STAT_CRANK(load_balance_other); + + /* + * Let's look around for work to steal, taking both hard affinity + * and soft affinity into account. More specifically, we check all + * the non-idle CPUs' runq, looking for: + * 1. 
any "soft-affine work" to steal first, + * 2. if not finding anything, any "hard-affine work" to steal. + */ + for_each_affinity_balance_step( bstep ) + { + /* + * We peek at the non-idling CPUs in a node-wise fashion. In fact, + * it is more likely that we find some affine work on our same + * node, not to mention that migrating units within the same node + * could well expected to be cheaper than across-nodes (memory + * stays local, there might be some node-wide cache[s], etc.). + */ + peer_node = node; + do + { + /* Select the pCPUs in this node that have work we can steal. */ + cpumask_andnot(&workers, online, prv->idlers); + cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); + __cpumask_clear_cpu(cpu, &workers); + + first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); + if ( first_cpu >= nr_cpu_ids ) + goto next_node; + peer_cpu = first_cpu; + do + { + spinlock_t *lock; + + /* + * If there is only one runnable unit on peer_cpu, it means + * there's no one to be stolen in its runqueue, so skip it. + * + * Checking this without holding the lock is racy... But that's + * the whole point of this optimization! + * + * In more details: + * - if we race with dec_nr_runnable(), we may try to take the + * lock and call csched_runq_steal() for no reason. This is + * not a functional issue, and should be infrequent enough. + * And we can avoid that by re-checking nr_runnable after + * having grabbed the lock, if we want; + * - if we race with inc_nr_runnable(), we skip a pCPU that may + * have runnable units in its runqueue, but that's not a + * problem because: + * + if racing with csched_unit_insert() or csched_unit_wake(), + * __runq_tickle() will be called afterwords, so the unit + * won't get stuck in the runqueue for too long; + * + if racing with csched_runq_steal(), it may be that an + * unit that we could have picked up, stays in a runqueue + * until someone else tries to steal it again. But this is + * no worse than what can happen already (without this + * optimization), it the pCPU would schedule right after we + * have taken the lock, and hence block on it. + */ + if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) + { + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); + goto next_cpu; + } + + /* + * Get ahold of the scheduler lock for this peer CPU. + * + * Note: We don't spin on this lock but simply try it. Spinning + * could cause a deadlock if the peer CPU is also load + * balancing and trying to lock this CPU. + */ + lock = pcpu_schedule_trylock(peer_cpu); + SCHED_STAT_CRANK(steal_trylock); + if ( !lock ) + { + SCHED_STAT_CRANK(steal_trylock_failed); + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); + goto next_cpu; + } + + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); + + /* Any work over there to steal? */ + speer = cpumask_test_cpu(peer_cpu, online) ? + csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; + pcpu_schedule_unlock(lock, peer_cpu); + + /* As soon as one unit is found, balancing ends */ + if ( speer != NULL ) + { + *stolen = true; + /* + * Next time we'll look for work to steal on this node, we + * will start from the next pCPU, with respect to this one, + * so we don't risk stealing always from the same ones. 
+ */ + prv->balance_bias[peer_node] = peer_cpu; + return speer; + } + + next_cpu: + peer_cpu = cpumask_cycle(peer_cpu, &workers); + + } while( peer_cpu != first_cpu ); + + next_node: + peer_node = cycle_node(peer_node, node_online_map); + } while( peer_node != node ); + } + + out: + /* Failed to find more important work elsewhere... */ + __runq_remove(snext); + return snext; +} + +/* + * This function is in the critical path. It is designed to be simple and + * fast for the common case. + */ +static void csched_schedule( + const struct scheduler *ops, struct sched_unit *unit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu); + struct list_head * const runq = RUNQ(sched_cpu); + struct csched_unit * const scurr = CSCHED_UNIT(unit); + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_unit *snext; + s_time_t runtime, tslice; + bool migrated = false; + + SCHED_STAT_CRANK(schedule); + CSCHED_UNIT_CHECK(unit); + + /* + * Here in Credit1 code, we usually just call TRACE_nD() helpers, and + * don't care about packing. But scheduling happens very often, so it + * actually is important that the record is as small as possible. + */ + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, tasklet:8, idle:8; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + d.idle = is_idle_unit(unit); + __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), + (unsigned char *)&d); + } + + runtime = now - unit->state_entry_time; + if ( runtime < 0 ) /* Does this ever happen? */ + runtime = 0; + + if ( !is_idle_unit(unit) ) + { + /* Update credits of a non-idle UNIT. */ + burn_credits(scurr, now); + scurr->start_time -= now; + scurr->last_sched_time = now; + } + else + { + /* Re-instate a boosted idle UNIT as normal-idle. */ + scurr->pri = CSCHED_PRI_IDLE; + } + + /* Choices, choices: + * - If we have a tasklet, we need to run the idle unit no matter what. + * - If sched rate limiting is in effect, and the current unit has + * run for less than that amount of time, continue the current one, + * but with a shorter timeslice and return it immediately + * - Otherwise, chose the one with the highest priority (which may + * be the one currently running) + * - If the currently running one is TS_OVER, see if there + * is a higher priority one waiting on the runqueue of another + * cpu and steal it. + */ + + /* + * If we have schedule rate limiting enabled, check to see + * how long we've run for. + * + * If scurr is yielding, however, we don't let rate limiting kick in. + * In fact, it may be the case that scurr is about to spin, and there's + * no point forcing it to do so until rate limiting expires. + */ + if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags) + && !tasklet_work_scheduled + && prv->ratelimit + && unit_runnable_state(unit) + && !is_idle_unit(unit) + && runtime < prv->ratelimit ) + { + snext = scurr; + snext->start_time += now; + perfc_incr(delay_ms); + /* + * Next timeslice must last just until we'll have executed for + * ratelimit. However, to avoid setting a really short timer, which + * will most likely be inaccurate and counterproductive, we never go + * below CSCHED_MIN_TIMER. 
+ */ + tslice = prv->ratelimit - runtime; + if ( unlikely(runtime < CSCHED_MIN_TIMER) ) + tslice = CSCHED_MIN_TIMER; + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned runtime; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.runtime = runtime; + __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), + (unsigned char *)&d); + } + + goto out; + } + tslice = prv->tslice; + + /* + * Select next runnable local UNIT (ie top of local runq) + */ + if ( unit_runnable(unit) ) + __runq_insert(scurr); + else + { + BUG_ON( is_idle_unit(unit) || list_empty(runq) ); + /* Current has blocked. Update the runnable counter for this cpu. */ + dec_nr_runnable(sched_cpu); + } + + /* + * Clear YIELD flag before scheduling out + */ + clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags); + + do { + snext = __runq_elem(runq->next); + + /* Tasklet work (which runs in idle UNIT context) overrides all else. */ + if ( tasklet_work_scheduled ) + { + TRACE_0D(TRC_CSCHED_SCHED_TASKLET); + snext = CSCHED_UNIT(sched_idle_unit(sched_cpu)); + snext->pri = CSCHED_PRI_TS_BOOST; + } + + /* + * SMP Load balance: + * + * If the next highest priority local runnable UNIT has already eaten + * through its credits, look on other PCPUs to see if we have more + * urgent work... If not, csched_load_balance() will return snext, but + * already removed from the runq. + */ + if ( snext->pri > CSCHED_PRI_TS_OVER ) + __runq_remove(snext); + else + snext = csched_load_balance(prv, sched_cpu, snext, &migrated); + + } while ( !unit_runnable_state(snext->unit) ); + + /* + * Update idlers mask if necessary. When we're idling, other CPUs + * will tickle us when they get extra work. + */ + if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) + { + if ( !cpumask_test_cpu(sched_cpu, prv->idlers) ) + cpumask_set_cpu(sched_cpu, prv->idlers); + } + else if ( cpumask_test_cpu(sched_cpu, prv->idlers) ) + { + cpumask_clear_cpu(sched_cpu, prv->idlers); + } + + if ( !is_idle_unit(snext->unit) ) + snext->start_time += now; + +out: + /* + * Return task to run next... + */ + unit->next_time = (is_idle_unit(snext->unit) ? + -1 : tslice); + unit->next_task = snext->unit; + snext->unit->migrated = migrated; + + /* Stop credit tick when going to idle, restart it when coming from idle. 
*/ + if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) ) + stop_timer(&spc->ticker); + if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) ) + set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) + - now % MICROSECS(prv->tick_period_us) ); + + CSCHED_UNIT_CHECK(unit->next_task); +} + +static void +csched_dump_unit(struct csched_unit *svc) +{ + struct csched_dom * const sdom = svc->sdom; + + printk("[%i.%i] pri=%i flags=%x cpu=%i", + svc->unit->domain->domain_id, + svc->unit->unit_id, + svc->pri, + svc->flags, + sched_unit_master(svc->unit)); + + if ( sdom ) + { + printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), + sdom->weight, sdom->cap); +#ifdef CSCHED_STATS + printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", + svc->stats.credit_last, + svc->stats.credit_incr, + svc->stats.state_active, + svc->stats.state_idle, + svc->stats.migrate_q, + svc->stats.migrate_r, + svc->stats.kicked_away); +#endif + } + + printk("\n"); +} + +static void +csched_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct list_head *runq, *iter; + struct csched_private *prv = CSCHED_PRIV(ops); + struct csched_pcpu *spc; + struct csched_unit *svc; + spinlock_t *lock; + unsigned long flags; + int loop; + + /* + * We need both locks: + * - csched_dump_unit() wants to access domains' scheduling + * parameters, which are protected by the private scheduler lock; + * - we scan through the runqueue, so we need the proper runqueue + * lock (the one of the runqueue of this cpu). + */ + spin_lock_irqsave(&prv->lock, flags); + lock = pcpu_schedule_lock(cpu); + + spc = CSCHED_PCPU(cpu); + runq = &spc->runq; + + printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n", + cpu, spc->nr_runnable, spc->runq_sort_last, + CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + + /* current UNIT (nothing to say if that's the idle unit). 
*/ + svc = CSCHED_UNIT(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + printk("\trun: "); + csched_dump_unit(svc); + } + + loop = 0; + list_for_each( iter, runq ) + { + svc = __runq_elem(iter); + if ( svc ) + { + printk("\t%3d: ", ++loop); + csched_dump_unit(svc); + } + } + + pcpu_schedule_unlock(lock, cpu); + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void +csched_dump(const struct scheduler *ops) +{ + struct list_head *iter_sdom, *iter_svc; + struct csched_private *prv = CSCHED_PRIV(ops); + int loop; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + printk("info:\n" + "\tncpus = %u\n" + "\tmaster = %u\n" + "\tcredit = %u\n" + "\tcredit balance = %d\n" + "\tweight = %u\n" + "\trunq_sort = %u\n" + "\tdefault-weight = %d\n" + "\ttslice = %"PRI_stime"ms\n" + "\tratelimit = %"PRI_stime"us\n" + "\tcredits per msec = %d\n" + "\tticks per tslice = %d\n" + "\tmigration delay = %"PRI_stime"us\n", + prv->ncpus, + prv->master, + prv->credit, + prv->credit_balance, + prv->weight, + prv->runq_sort, + CSCHED_DEFAULT_WEIGHT, + prv->tslice / MILLISECS(1), + prv->ratelimit / MICROSECS(1), + CSCHED_CREDITS_PER_MSEC, + prv->ticks_per_tslice, + prv->unit_migr_delay/ MICROSECS(1)); + + printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers)); + + printk("active units:\n"); + loop = 0; + list_for_each( iter_sdom, &prv->active_sdom ) + { + struct csched_dom *sdom; + sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); + + list_for_each( iter_svc, &sdom->active_unit ) + { + struct csched_unit *svc; + spinlock_t *lock; + + svc = list_entry(iter_svc, struct csched_unit, active_unit_elem); + lock = unit_schedule_lock(svc->unit); + + printk("\t%3d: ", ++loop); + csched_dump_unit(svc); + + unit_schedule_unlock(lock, svc->unit); + } + } + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static int __init +csched_global_init(void) +{ + if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || + sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) + { + printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" + " Resetting to default %u\n", + XEN_SYSCTL_CSCHED_TSLICE_MIN, + XEN_SYSCTL_CSCHED_TSLICE_MAX, + CSCHED_DEFAULT_TSLICE_MS); + sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; + } + + if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) + printk("WARNING: sched_ratelimit_us >" + "sched_credit_tslice_ms is undefined\n" + "Setting ratelimit to tslice\n"); + + if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) + { + vcpu_migration_delay_us = 0; + printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n" + "Resetting to default: %u\n", + XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us); + } + + return 0; +} + +static int +csched_init(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = xzalloc(struct csched_private); + if ( prv == NULL ) + return -ENOMEM; + + prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); + if ( prv->balance_bias == NULL ) + { + xfree(prv); + return -ENOMEM; + } + + if ( !zalloc_cpumask_var(&prv->cpus) || + !zalloc_cpumask_var(&prv->idlers) ) + { + free_cpumask_var(prv->cpus); + xfree(prv->balance_bias); + xfree(prv); + return -ENOMEM; + } + + ops->sched_data = prv; + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->active_sdom); + prv->master = UINT_MAX; + + __csched_set_tslice(prv, sched_credit_tslice_ms); + + if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) + prv->ratelimit = prv->tslice; + else + 
prv->ratelimit = MICROSECS(sched_ratelimit_us); + + prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us); + + return 0; +} + +static void +csched_deinit(struct scheduler *ops) +{ + struct csched_private *prv; + + prv = CSCHED_PRIV(ops); + if ( prv != NULL ) + { + ops->sched_data = NULL; + free_cpumask_var(prv->cpus); + free_cpumask_var(prv->idlers); + xfree(prv->balance_bias); + xfree(prv); + } +} + +static const struct scheduler sched_credit_def = { + .name = "SMP Credit Scheduler", + .opt_name = "credit", + .sched_id = XEN_SCHEDULER_CREDIT, + .sched_data = NULL, + + .global_init = csched_global_init, + + .insert_unit = csched_unit_insert, + .remove_unit = csched_unit_remove, + + .sleep = csched_unit_sleep, + .wake = csched_unit_wake, + .yield = csched_unit_yield, + + .adjust = csched_dom_cntl, + .adjust_affinity= csched_aff_cntl, + .adjust_global = csched_sys_cntl, + + .pick_resource = csched_res_pick, + .do_schedule = csched_schedule, + + .dump_cpu_state = csched_dump_pcpu, + .dump_settings = csched_dump, + .init = csched_init, + .deinit = csched_deinit, + .alloc_udata = csched_alloc_udata, + .free_udata = csched_free_udata, + .alloc_pdata = csched_alloc_pdata, + .init_pdata = csched_init_pdata, + .deinit_pdata = csched_deinit_pdata, + .free_pdata = csched_free_pdata, + .switch_sched = csched_switch_sched, + .alloc_domdata = csched_alloc_domdata, + .free_domdata = csched_free_domdata, +}; + +REGISTER_SCHEDULER(sched_credit_def); diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c new file mode 100644 index 0000000000..f7c477053c --- /dev/null +++ b/xen/common/sched/credit2.c @@ -0,0 +1,4122 @@ + +/**************************************************************************** + * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd + **************************************************************************** + * + * File: common/sched_credit2.c + * Author: George Dunlap + * + * Description: Credit-based SMP CPU scheduler + * Based on an earlier verson by Emmanuel Ackaouy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Meant only for helping developers during debugging. */ +/* #define d2printk printk */ +#define d2printk(x...) + + +/* + * Credit2 tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. 
+ */ +#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) +#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) +#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) +#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) +#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) +#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) +#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) +#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) +#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) +#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) +#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) +#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) +#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) +#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) +#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) +#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) +#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) +#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) +#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) +#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) +#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) +#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) + +/* + * TODO: + * + Hyperthreading + * - "Discount" time run on a thread with busy siblings + * + Algorithm: + * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., + * a flash animation in the background) can we schedule it with low enough latency + * so that audio doesn't skip? + * + Optimizing + * - Profiling, making new algorithms, making math more efficient (no long division) + */ + +/* + * Design: + * + * VMs "burn" credits based on their weight; higher weight means + * credits burn more slowly. The highest weight unit burns credits at + * a rate of 1 credit per nanosecond. Others burn proportionally + * more. + * + * units are inserted into the runqueue by credit order. + * + * Credits are "reset" when the next unit in the runqueue is less than + * or equal to zero. At that point, everyone's credits are "clipped" + * to a small value, and a fixed credit is added to everyone. + */ + +/* + * Utilization cap: + * + * Setting an pCPU utilization cap for a domain means the following: + * + * - a domain can have a cap, expressed in terms of % of physical CPU time. + * A domain that must not use more than 1/4 of _one_ physical CPU, will + * be given a cap of 25%; a domain that must not use more than 1+1/2 of + * physical CPU time, will be given a cap of 150%; + * + * - caps are per-domain (not per-unit). If a domain has only 1 unit, and + * a 40% cap, that one unit will use 40% of one pCPU. If a somain has 4 + * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be + * split among the v units. How much each of the units will actually get, + * during any given interval of time, is unspecified (as it depends on + * various aspects: workload, system load, etc.). For instance, it is + * possible that, during a given time interval, 2 units use 100% each, + * and the other two use nothing; while during another time interval, + * two units use 80%, one uses 10% and the other 30%; or that each use + * 50% (and so on and so forth). 
+ *
+ * For implementing this, we use the following approach:
+ *
+ * - each domain is given a 'budget', and each domain has a timer, which
+ * replenishes the domain's budget periodically. The budget is the amount
+ * of time the units of the domain can use every 'period';
+ *
+ * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
+ * (but each domain has its own timer; so they are all periodic by the same
+ * period, but replenishments of the budgets of the various domains, at
+ * period boundaries, are not synchronous);
+ *
+ * - when units run, they consume budget. When they don't run, they don't
+ * consume budget. If there is no budget left for the domain, no unit of
+ * that domain can run. If an unit tries to run and finds that there is no
+ * budget, it blocks.
+ * At whatever time an unit wants to run, it must check the domain's budget,
+ * and if there is some, it can use it.
+ *
+ * - budget is replenished to the top of the capacity for the domain once
+ * per period. Even if there was some leftover budget from the previous
+ * period, though, the budget after a replenishment will always be at most
+ * equal to the total capacity of the domain ('tot_budget');
+ *
+ * - when a budget replenishment occurs, if there are units that had been
+ * blocked because of lack of budget, they'll be unblocked, and they will
+ * (potentially) be able to run again.
+ *
+ * Finally, some even more implementation-related details:
+ *
+ * - budget is stored in a domain-wide pool. Units of the domain that want
+ * to run go to such pool, and grab some. When they do so, the amount
+ * they grabbed is _immediately_ removed from the pool. This happens in
+ * unit_grab_budget();
+ *
+ * - when units stop running, if they've not consumed all the budget they
+ * took, the leftover is put back in the pool. This happens in
+ * unit_return_budget();
+ *
+ * - the above means that an unit can find out that there is no budget and
+ * block, not only if the cap has actually been reached (for this period),
+ * but also if some other units, in order to run, have grabbed a certain
+ * quota of budget, no matter whether they've already used it all or not.
+ * An unit blocking because (any form of) lack of budget is said to be
+ * "parked", and such blocking happens in park_unit();
+ *
+ * - when an unit stops running, and puts back some budget in the domain pool,
+ * we need to check whether there is someone that has been parked and that
+ * can be unparked. This happens in unpark_parked_units(), called from
+ * csched2_context_saved();
+ *
+ * - of course, unparking happens also as a consequence of the domain's budget
+ * being replenished by the periodic timer. This also occurs by means of
+ * calling csched2_context_saved() (but from replenish_domain_budget());
+ *
+ * - parked units of a domain are kept in a (per-domain) list, called
+ * 'parked_units'. Manipulation of the list and of the domain-wide budget
+ * pool must occur only when holding the 'budget_lock'.
+ */
+
+/*
+ * Locking:
+ *
+ * - runqueue lock
+ * + it is per-runqueue, so:
+ * * cpus in a runqueue take the runqueue lock, when using
+ * pcpu_schedule_lock() / unit_schedule_lock() (and friends),
+ * * a cpu may (try to) take a "remote" runqueue lock, e.g., for
+ * load balancing;
+ * + serializes runqueue operations (removing and inserting units);
+ * + protects runqueue-wide data in csched2_runqueue_data;
+ * + protects unit parameters in csched2_unit for the unit in the
+ * runqueue.
+ * + * - Private scheduler lock + * + protects scheduler-wide data in csched2_private, such as: + * * the list of domains active in this scheduler, + * * what cpus and what runqueues are active and in what + * runqueue each cpu is; + * + serializes the operation of changing the weights of domains; + * + * - Budget lock + * + it is per-domain; + * + protects, in domains that have an utilization cap; + * * manipulation of the total budget of the domain (as it is shared + * among all units of the domain), + * * manipulation of the list of units that are blocked waiting for + * some budget to be available. + * + * - Type: + * + runqueue locks are 'regular' spinlocks; + * + the private scheduler lock can be an rwlock. In fact, data + * it protects is modified only during initialization, cpupool + * manipulation and when changing weights, and read in all + * other cases (e.g., during load balancing); + * + budget locks are 'regular' spinlocks. + * + * Ordering: + * + tylock must be used when wanting to take a runqueue lock, + * if we already hold another one; + * + if taking both a runqueue lock and the private scheduler + * lock is, the latter must always be taken for first; + * + if taking both a runqueue lock and a budget lock, the former + * must always be taken for first. + */ + +/* + * Basic constants + */ +/* Default weight: How much a new domain starts with. */ +#define CSCHED2_DEFAULT_WEIGHT 256 +/* + * Min timer: Minimum length a timer will be set, to + * achieve efficiency. + */ +#define CSCHED2_MIN_TIMER MICROSECS(500) +/* + * Amount of credit VMs begin with, and are reset to. + * ATM, set so that highest-weight VMs can only run for 10ms + * before a reset event. + */ +#define CSCHED2_CREDIT_INIT MILLISECS(10) +/* + * Amount of credit the idle units have. It never changes, as idle + * units does not consume credits, and it must be lower than whatever + * amount of credit 'regular' unit would end up with. + */ +#define CSCHED2_IDLE_CREDIT (-(1U<<30)) +/* + * Carryover: How much "extra" credit may be carried over after + * a reset. + */ +#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER +/* + * Stickiness: Cross-L2 migration resistance. Should be less than + * MIN_TIMER. + */ +#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) +/* How much to "compensate" an unit for L2 migration. */ +#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50) +/* How tolerant we should be when peeking at runtime of units on other cpus */ +#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50) +/* Reset: Value below which credit will be reset. */ +#define CSCHED2_CREDIT_RESET 0 +/* Max timer: Maximum time a guest can be run for. */ +#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT +/* Period of the cap replenishment timer. */ +#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) + +/* + * Flags + */ +/* + * CSFLAG_scheduled: Is this unit either running on, or context-switching off, + * a physical cpu? + * + Accessed only with runqueue lock held + * + Set when chosen as next in csched2_schedule(). + * + Cleared after context switch has been saved in csched2_context_saved() + * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should + * set CSFLAG_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __CSFLAG_scheduled 1 +#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) +/* + * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done + * being context switched out? 
+ * + Set when scheduling out in csched2_schedule() if prev is runnable + * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set + * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and + * clears the bit. + */ +#define __CSFLAG_delayed_runq_add 2 +#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) +/* + * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a + * credit2-initiated runq migrate request; migrate it to the runqueue indicated + * in the svc struct. + */ +#define __CSFLAG_runq_migrate_request 3 +#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) +/* + * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The + * scheduler is invoked to see if we can give the cpu to someone else, and + * get back to the yielding unit in a while. + */ +#define __CSFLAG_unit_yield 4 +#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield) +/* + * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard + * affinity set to one and only 1 cpu (and, hence, can only run there). + */ +#define __CSFLAG_pinned 5 +#define CSFLAG_pinned (1U<<__CSFLAG_pinned) + +static unsigned int __read_mostly opt_migrate_resist = 500; +integer_param("sched_credit2_migrate_resist", opt_migrate_resist); + +/* + * Load tracking and load balancing + * + * Load history of runqueues and units is accounted for by using an + * exponential weighted moving average algorithm. However, instead of using + * fractions,we shift everything to left by the number of bits we want to + * use for representing the fractional part (Q-format). + * + * We may also want to reduce the precision of time accounting, to + * accommodate 'longer windows'. So, if that is the case, we just need to + * shift all time samples to the right. + * + * The details of the formulas used for load tracking are explained close to + * update_runq_load(). Let's just say here that, with full nanosecond time + * granularity, a 30 bits wide 'decaying window' is ~1 second long. + * + * We want to consider the following equations: + * + * avg[0] = load*P + * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W + * + * where W is the length of the window, P the multiplier for transitiong into + * Q-format fixed point arithmetic and load is the instantaneous load of a + * runqueue, which basically is the number of runnable units there are on the + * runqueue (for the meaning of the other terms, look at the doc comment to + * update_runq_load()). + * + * So, again, with full nanosecond granularity, and 1 second window, we have: + * + * W = 2^30 + * P = 2^18 + * + * The maximum possible value for the average load, which we want to store in + * s_time_t type variables (i.e., we have 63 bits available) is load*P. This + * means that, with P 18 bits wide, load can occupy 45 bits. This in turn + * means we can have 2^45 units in each runqueue, before overflow occurs! + * + * However, it can happen that, at step j+1, if: + * + * avg[j] = load*P + * delta = W + * + * then: + * + * avg[j+i] = avg[j] + W*load*P/W - W*load*P/W + * + * So we must be able to deal with W*load*P. This means load can't be higher + * than: + * + * 2^(63 - 30 - 18) = 2^15 = 32768 + * + * So 32768 is the maximum number of units the we can have in a runqueue, + * at any given time, and still not have problems with the load tracking + * calculations... and this is more than fine. + * + * As a matter of fact, since we are using microseconds granularity, we have + * W=2^20. 
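+ * (For intuition, the recurrence above can be rewritten as
+ * avg[i+1] = avg[i] + (delta/W) * (load*P - avg[i]), i.e., each sample
+ * moves the stored average a fraction delta/W of the way toward the
+ * instantaneous load*P.)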
So, still with 18 fractional bits and a 1 second long window, there + * may be 2^25 = 33554432 units in a runq before we have to start thinking + * about overflow. + */ + +/* If >0, decreases the granularity of time samples used for load tracking. */ +#define LOADAVG_GRANULARITY_SHIFT (10) +/* Time window during which we still give value to previous load history. */ +#define LOADAVG_WINDOW_SHIFT (30) +/* 18 bits by default (and not less than 4) for decimals. */ +#define LOADAVG_PRECISION_SHIFT (18) +#define LOADAVG_PRECISION_SHIFT_MIN (4) + +/* + * Both the length of the window and the number of fractional bits can be + * decided with boot parameters. + * + * The length of the window is always expressed in nanoseconds. The actual + * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. + */ +static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; +integer_param("credit2_load_window_shift", opt_load_window_shift); +static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; +integer_param("credit2_load_precision_shift", opt_load_precision_shift); + +static int __read_mostly opt_underload_balance_tolerance = 0; +integer_param("credit2_balance_under", opt_underload_balance_tolerance); +static int __read_mostly opt_overload_balance_tolerance = -3; +integer_param("credit2_balance_over", opt_overload_balance_tolerance); +/* + * Domains subject to a cap receive a replenishment of their runtime budget + * once every opt_cap_period interval. Default is 10 ms. The amount of budget + * they receive depends on their cap. For instance, a domain with a 50% cap + * will receive 50% of 10 ms, so 5 ms. + */ +static unsigned int __read_mostly opt_cap_period = 10; /* ms */ +integer_param("credit2_cap_period_ms", opt_cap_period); + +/* + * Runqueue organization. + * + * The various cpus are to be assigned each one to a runqueue, and we + * want that to happen basing on topology. At the moment, it is possible + * to choose to arrange runqueues to be: + * + * - per-cpu: meaning that there will be one runqueue per logical cpu. This + * will happen when if the opt_runqueue parameter is set to 'cpu'. + * + * - per-core: meaning that there will be one runqueue per each physical + * core of the host. This will happen if the opt_runqueue + * parameter is set to 'core'; + * + * - per-socket: meaning that there will be one runqueue per each physical + * socket (AKA package, which often, but not always, also + * matches a NUMA node) of the host; This will happen if + * the opt_runqueue parameter is set to 'socket'; + * + * - per-node: meaning that there will be one runqueue per each physical + * NUMA node of the host. This will happen if the opt_runqueue + * parameter is set to 'node'; + * + * - global: meaning that there will be only one runqueue to which all the + * (logical) processors of the host belong. This will happen if + * the opt_runqueue parameter is set to 'all'. + * + * Depending on the value of opt_runqueue, therefore, cpus that are part of + * either the same physical core, the same physical socket, the same NUMA + * node, or just all of them, will be put together to form runqueues. 
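These arrangements are selected at boot via the credit2_runqueue= parameter, parsed just below. As a minimal standalone model of the grouping rule (fake topology structs and hypothetical names; the real matching is done by cpu_to_runqueue() further down):

    #include <stdbool.h>
    #include <stdio.h>

    enum { RQ_CPU, RQ_CORE, RQ_SOCKET, RQ_NODE, RQ_ALL };

    struct topo { unsigned int core, socket, node; };

    /* Do two distinct cpus share a runqueue under the given policy? */
    static bool same_runqueue(int policy, struct topo a, struct topo b)
    {
        switch ( policy )
        {
        case RQ_CPU:    return false;                 /* one runqueue per cpu   */
        case RQ_CORE:   return a.socket == b.socket && a.core == b.core;
        case RQ_SOCKET: return a.socket == b.socket;
        case RQ_NODE:   return a.node == b.node;
        case RQ_ALL:    return true;                  /* single global runqueue */
        }
        return false;
    }

    int main(void)
    {
        struct topo cpu0 = { .core = 0, .socket = 0, .node = 0 };
        struct topo cpu1 = { .core = 1, .socket = 0, .node = 0 };

        /* Same socket, different core: grouped for socket/node/all, not core. */
        printf("%d %d\n", same_runqueue(RQ_CORE, cpu0, cpu1),
               same_runqueue(RQ_SOCKET, cpu0, cpu1));
        return 0;
    }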
+ */ +#define OPT_RUNQUEUE_CPU 0 +#define OPT_RUNQUEUE_CORE 1 +#define OPT_RUNQUEUE_SOCKET 2 +#define OPT_RUNQUEUE_NODE 3 +#define OPT_RUNQUEUE_ALL 4 +static const char *const opt_runqueue_str[] = { + [OPT_RUNQUEUE_CPU] = "cpu", + [OPT_RUNQUEUE_CORE] = "core", + [OPT_RUNQUEUE_SOCKET] = "socket", + [OPT_RUNQUEUE_NODE] = "node", + [OPT_RUNQUEUE_ALL] = "all" +}; +static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; + +static int __init parse_credit2_runqueue(const char *s) +{ + unsigned int i; + + for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) + { + if ( !strcmp(s, opt_runqueue_str[i]) ) + { + opt_runqueue = i; + return 0; + } + } + + return -EINVAL; +} +custom_param("credit2_runqueue", parse_credit2_runqueue); + +/* + * Per-runqueue data + */ +struct csched2_runqueue_data { + spinlock_t lock; /* Lock for this runqueue */ + + struct list_head runq; /* Ordered list of runnable vms */ + unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */ + int id; /* ID of this runqueue (-1 if invalid) */ + + int load; /* Instantaneous load (num of non-idle units) */ + s_time_t load_last_update; /* Last time average was updated */ + s_time_t avgload; /* Decaying queue load */ + s_time_t b_avgload; /* Decaying queue load modified by balancing */ + + cpumask_t active, /* CPUs enabled for this runqueue */ + smt_idle, /* Fully idle-and-untickled cores (see below) */ + tickled, /* Have been asked to go through schedule */ + idle; /* Currently idle pcpus */ + + struct list_head svc; /* List of all units assigned to the runqueue */ + unsigned int max_weight; /* Max weight of the units in this runqueue */ + unsigned int pick_bias; /* Last picked pcpu. Start from it next time */ +}; + +/* + * System-wide private data + */ +struct csched2_private { + rwlock_t lock; /* Private scheduler lock */ + + unsigned int load_precision_shift; /* Precision of load calculations */ + unsigned int load_window_shift; /* Lenght of load decaying window */ + unsigned int ratelimit_us; /* Rate limiting for this scheduler */ + + cpumask_t active_queues; /* Runqueues with (maybe) active cpus */ + struct csched2_runqueue_data *rqd; /* Data of the various runqueues */ + + cpumask_t initialized; /* CPUs part of this scheduler */ + struct list_head sdom; /* List of domains (for debug key) */ +}; + +/* + * Physical CPU + */ +struct csched2_pcpu { + cpumask_t sibling_mask; /* Siblings in the same runqueue */ + int runq_id; +}; + +/* + * Schedule Unit + */ +struct csched2_unit { + struct csched2_dom *sdom; /* Up-pointer to domain */ + struct sched_unit *unit; /* Up-pointer, to schedule unit */ + struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ + + int credit; /* Current amount of credit */ + unsigned int weight; /* Weight of this unit */ + unsigned int residual; /* Reminder of div(max_weight/weight) */ + unsigned flags; /* Status flags (16 bits would be ok, */ + s_time_t budget; /* Current budget (if domains has cap) */ + /* but clear_bit() does not like that) */ + s_time_t budget_quota; /* Budget to which unit is entitled */ + + s_time_t start_time; /* Time we were scheduled (for credit) */ + + /* Individual contribution to load */ + s_time_t load_last_update; /* Last time average was updated */ + s_time_t avgload; /* Decaying queue load */ + + struct list_head runq_elem; /* On the runqueue (rqd->runq) */ + struct list_head parked_elem; /* On the parked_units list */ + struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ + struct csched2_runqueue_data *migrate_rqd; /* Pre-determined 
migr. target */ + int tickled_cpu; /* Cpu that will pick us (-1 if none) */ +}; + +/* + * Domain + */ +struct csched2_dom { + struct domain *dom; /* Up-pointer to domain */ + + spinlock_t budget_lock; /* Serialized budget calculations */ + s_time_t tot_budget; /* Total amount of budget */ + s_time_t budget; /* Currently available budget */ + + struct timer repl_timer; /* Timer for periodic replenishment of budget */ + s_time_t next_repl; /* Time at which next replenishment occurs */ + struct list_head parked_units; /* List of CPUs waiting for budget */ + + struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */ + uint16_t weight; /* User specified weight */ + uint16_t cap; /* User specified cap */ + uint16_t nr_units; /* Number of units of this domain */ +}; + +/* + * Accessor helpers functions. + */ +static inline struct csched2_private *csched2_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu) +{ + return get_sched_res(cpu)->sched_priv; +} + +static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline struct csched2_dom *csched2_dom(const struct domain *d) +{ + return d->sched_priv; +} + +/* CPU to runq_id macro */ +static inline int c2r(unsigned int cpu) +{ + return csched2_pcpu(cpu)->runq_id; +} + +/* CPU to runqueue struct macro */ +static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops, + unsigned int cpu) +{ + return &csched2_priv(ops)->rqd[c2r(cpu)]; +} + +/* Does the domain of this unit have a cap? */ +static inline bool has_cap(const struct csched2_unit *svc) +{ + return svc->budget != STIME_MAX; +} + +/* + * Hyperthreading (SMT) support. + * + * We use a special per-runq mask (smt_idle) and update it according to the + * following logic: + * - when _all_ the SMT sibling in a core are idle, all their corresponding + * bits are set in the smt_idle mask; + * - when even _just_one_ of the SMT siblings in a core is not idle, all the + * bits correspondings to it and to all its siblings are clear in the + * smt_idle mask. + * + * Once we have such a mask, it is easy to implement a policy that, either: + * - uses fully idle cores first: it is enough to try to schedule the units + * on pcpus from smt_idle mask first. This is what happens if + * sched_smt_power_savings was not set at boot (default), and it maximizes + * true parallelism, and hence performance; + * - uses already busy cores first: it is enough to try to schedule the units + * on pcpus that are idle, but are not in smt_idle. This is what happens if + * sched_smt_power_savings is set at boot, and it allows as more cores as + * possible to stay in low power states, minimizing power consumption. + * + * This logic is entirely implemented in runq_tickle(), and that is enough. + * In fact, in this scheduler, placement of an unit on one of the pcpus of a + * runq, _always_ happens by means of tickling: + * - when an unit wakes up, it calls csched2_unit_wake(), which calls + * runq_tickle(); + * - when a migration is initiated in schedule.c, we call csched2_res_pick(), + * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake(). + * csched2_res_pick() looks for the least loaded runq and return just any + * of its processors. 
Then, csched2_unit_migrate() just moves the unit to + * the chosen runq, and it is again runq_tickle(), called by + * csched2_unit_wake() that actually decides what pcpu to use within the + * chosen runq; + * - when a migration is initiated in sched_credit2.c, by calling migrate() + * directly, that again temporarily use a random pcpu from the new runq, + * and then calls runq_tickle(), by itself. + */ + +/* + * If all the siblings of cpu (including cpu itself) are both idle and + * untickled, set all their bits in mask. + * + * NB that rqd->smt_idle is different than rqd->idle. rqd->idle + * records pcpus that at are merely idle (i.e., at the moment do not + * have an unit running on them). But you have to manually filter out + * which pcpus have been tickled in order to find cores that are not + * going to be busy soon. Filtering out tickled cpus pairwise is a + * lot of extra pain; so for rqd->smt_idle, we explicitly make so that + * the bits of a pcpu are set only if all the threads on its core are + * both idle *and* untickled. + * + * This means changing the mask when either rqd->idle or rqd->tickled + * changes. + */ +static inline +void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, + cpumask_t *mask) +{ + const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; + + if ( cpumask_subset(cpu_siblings, idlers) ) + cpumask_or(mask, mask, cpu_siblings); +} + +/* + * Clear the bits of all the siblings of cpu from mask (if necessary). + */ +static inline +void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) +{ + const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; + + if ( cpumask_subset(cpu_siblings, mask) ) + cpumask_andnot(mask, mask, cpu_siblings); +} + +/* + * In csched2_res_pick(), it may not be possible to actually look at remote + * runqueues (the trylock-s on their spinlocks can fail!). If that happens, + * we pick, in order of decreasing preference: + * 1) svc's current pcpu, if it is part of svc's soft affinity; + * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; + * 3) svc's current pcpu, if it is part of svc's hard affinity; + * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; + * 5) just one valid pcpu from svc's hard affinity + * + * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also + * note that at least 5 is guaranteed to _always_ return at least one pcpu. + */ +static int get_fallback_cpu(struct csched2_unit *svc) +{ + struct sched_unit *unit = svc->unit; + unsigned int bs; + + SCHED_STAT_CRANK(need_fallback_cpu); + + for_each_affinity_balance_step( bs ) + { + int cpu = sched_unit_master(unit); + + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + cpupool_domain_master_cpumask(unit->domain)); + + /* + * This is cases 1 or 3 (depending on bs): if processor is (still) + * in our affinity, go for it, for cache betterness. + */ + if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) + return cpu; + + /* + * This is cases 2 or 4 (depending on bs): v->processor isn't there + * any longer, check if we at least can stay in our current runq. 
+ */ + if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), + &svc->rqd->active)) ) + { + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &svc->rqd->active); + return cpumask_first(cpumask_scratch_cpu(cpu)); + } + + /* + * We may well pick any valid pcpu from our soft-affinity, outside + * of our current runqueue, but we decide not to. In fact, changing + * runqueue is slow, affects load distribution, and is a source of + * overhead for the units running on the other runqueue (we need the + * lock). So, better do that as a consequence of a well informed + * decision (or if we really don't have any other chance, as we will, + * at step 5, if we get to there). + * + * Also, being here, looking for a fallback, is an unfortunate and + * infrequent event, while the decision of putting us in the runqueue + * wehere we are was (likely) made taking all the relevant factors + * into account. So let's not disrupt that, just for the sake of + * soft-affinity, and let's wait here to be able to made (hopefully, + * soon), another similar well informed decision. + */ + if ( bs == BALANCE_SOFT_AFFINITY ) + continue; + + /* + * This is cases 5: last stand, just one valid pcpu from our hard + * affinity. It's guaranteed that there is at least one valid cpu, + * and therefore we are sure that we return it, and never really + * exit the loop. + */ + ASSERT(bs == BALANCE_HARD_AFFINITY && + !cpumask_empty(cpumask_scratch_cpu(cpu))); + cpu = cpumask_first(cpumask_scratch_cpu(cpu)); + if ( likely(cpu < nr_cpu_ids) ) + return cpu; + } + ASSERT_UNREACHABLE(); + /* + * We can't be here. But if that somehow happen (in non-debug builds), + * at least return something which both online and in our hard-affinity. + */ + return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit))); +} + +/* + * Time-to-credit, credit-to-time. + * + * We keep track of the "residual" time to make sure that frequent short + * schedules still get accounted for in the end. + * + * FIXME: Do pre-calculated division? + */ +static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time, + struct csched2_unit *svc) +{ + uint64_t val = time * rqd->max_weight + svc->residual; + + svc->residual = do_div(val, svc->weight); + svc->credit -= val; +} + +static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc) +{ + return credit * svc->weight / rqd->max_weight; +} + +/* + * Runqueue related code. 
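Before moving on to the runqueue code, a standalone sketch of the conversion implemented by t2c_update() and c2t() just above: credit burns at a rate of max_weight/weight, so a unit with half of its runqueue's maximum weight consumes credit twice as fast. Residual handling is omitted and the function names are made up; illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    static int64_t time_to_credit(int64_t time, unsigned int max_weight,
                                  unsigned int weight)
    {
        return time * max_weight / weight;
    }

    static int64_t credit_to_time(int64_t credit, unsigned int max_weight,
                                  unsigned int weight)
    {
        return credit * weight / max_weight;
    }

    int main(void)
    {
        /* 1ms of runtime, on a runqueue whose heaviest unit has weight 256. */
        printf("%lld\n", (long long)time_to_credit(1000000, 256, 256)); /* 1000000 */
        printf("%lld\n", (long long)time_to_credit(1000000, 256, 128)); /* 2000000 */
        /* Conversely, 1ms worth of credit lasts half as long at weight 128. */
        printf("%lld\n", (long long)credit_to_time(1000000, 256, 128)); /*  500000 */
        return 0;
    }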
+ */ + +static inline int unit_on_runq(struct csched2_unit *svc) +{ + return !list_empty(&svc->runq_elem); +} + +static inline struct csched2_unit * runq_elem(struct list_head *elem) +{ + return list_entry(elem, struct csched2_unit, runq_elem); +} + +static void activate_runqueue(struct csched2_private *prv, int rqi) +{ + struct csched2_runqueue_data *rqd; + + rqd = prv->rqd + rqi; + + BUG_ON(!cpumask_empty(&rqd->active)); + + rqd->max_weight = 1; + rqd->id = rqi; + INIT_LIST_HEAD(&rqd->svc); + INIT_LIST_HEAD(&rqd->runq); + spin_lock_init(&rqd->lock); + + __cpumask_set_cpu(rqi, &prv->active_queues); +} + +static void deactivate_runqueue(struct csched2_private *prv, int rqi) +{ + struct csched2_runqueue_data *rqd; + + rqd = prv->rqd + rqi; + + BUG_ON(!cpumask_empty(&rqd->active)); + + rqd->id = -1; + + __cpumask_clear_cpu(rqi, &prv->active_queues); +} + +static inline bool same_node(unsigned int cpua, unsigned int cpub) +{ + return cpu_to_node(cpua) == cpu_to_node(cpub); +} + +static inline bool same_socket(unsigned int cpua, unsigned int cpub) +{ + return cpu_to_socket(cpua) == cpu_to_socket(cpub); +} + +static inline bool same_core(unsigned int cpua, unsigned int cpub) +{ + return same_socket(cpua, cpub) && + cpu_to_core(cpua) == cpu_to_core(cpub); +} + +static unsigned int +cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu) +{ + struct csched2_runqueue_data *rqd; + unsigned int rqi; + + for ( rqi = 0; rqi < nr_cpu_ids; rqi++ ) + { + unsigned int peer_cpu; + + /* + * As soon as we come across an uninitialized runqueue, use it. + * In fact, either: + * - we are initializing the first cpu, and we assign it to + * runqueue 0. This is handy, especially if we are dealing + * with the boot cpu (if credit2 is the default scheduler), + * as we would not be able to use cpu_to_socket() and similar + * helpers anyway (they're result of which is not reliable yet); + * - we have gone through all the active runqueues, and have not + * found anyone whose cpus' topology matches the one we are + * dealing with, so activating a new runqueue is what we want. + */ + if ( prv->rqd[rqi].id == -1 ) + break; + + rqd = prv->rqd + rqi; + BUG_ON(cpumask_empty(&rqd->active)); + + peer_cpu = cpumask_first(&rqd->active); + BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID || + cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); + + if (opt_runqueue == OPT_RUNQUEUE_CPU) + continue; + if ( opt_runqueue == OPT_RUNQUEUE_ALL || + (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || + (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || + (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) ) + break; + } + + /* We really expect to be able to assign each cpu to a runqueue. */ + BUG_ON(rqi >= nr_cpu_ids); + + return rqi; +} + +/* Find the domain with the highest weight. 
*/ +static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, + int old_weight) +{ + /* Try to avoid brute-force search: + * - If new_weight is larger, max_weigth <- new_weight + * - If old_weight != max_weight, someone else is still max_weight + * (No action required) + * - If old_weight == max_weight, brute-force search for max weight + */ + if ( new_weight > rqd->max_weight ) + { + rqd->max_weight = new_weight; + SCHED_STAT_CRANK(upd_max_weight_quick); + } + else if ( old_weight == rqd->max_weight ) + { + struct list_head *iter; + int max_weight = 1; + + list_for_each( iter, &rqd->svc ) + { + struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem); + + if ( svc->weight > max_weight ) + max_weight = svc->weight; + } + + rqd->max_weight = max_weight; + SCHED_STAT_CRANK(upd_max_weight_full); + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned rqi:16, max_weight:16; + } d; + d.rqi = rqd->id; + d.max_weight = rqd->max_weight; + __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +/* Add and remove from runqueue assignment (not active run queue) */ +static void +_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd) +{ + + svc->rqd = rqd; + list_add_tail(&svc->rqd_elem, &svc->rqd->svc); + + update_max_weight(svc->rqd, svc->weight, 0); + + /* Expected new load based on adding this unit */ + rqd->b_avgload += svc->avgload; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned rqi:16; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.rqi=rqd->id; + __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, + sizeof(d), + (unsigned char *)&d); + } + +} + +static void +runq_assign(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + + ASSERT(svc->rqd == NULL); + + _runq_assign(svc, c2rqd(ops, sched_unit_master(unit))); +} + +static void +_runq_deassign(struct csched2_unit *svc) +{ + struct csched2_runqueue_data *rqd = svc->rqd; + + ASSERT(!unit_on_runq(svc)); + ASSERT(!(svc->flags & CSFLAG_scheduled)); + + list_del_init(&svc->rqd_elem); + update_max_weight(rqd, 0, svc->weight); + + /* Expected new load based on removing this unit */ + rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); + + svc->rqd = NULL; +} + +static void +runq_deassign(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + _runq_deassign(svc); +} + +/* + * Track the runq load by gathering instantaneous load samples, and using + * exponentially weighted moving average (EWMA) for the 'decaying'. + * + * We consider a window of length W=2^(prv->load_window_shift) nsecs + * (which takes LOADAVG_GRANULARITY_SHIFT into account). + * + * If load is the instantaneous load, the formula for EWMA looks as follows, + * for the i-eth sample: + * + * avg[i] = a*load + (1 - a)*avg[i-1] + * + * where avg[i] is the new value of the average load, avg[i-1] is the value + * of the average load calculated so far, and a is a coefficient less or + * equal to 1. + * + * So, for us, it becomes: + * + * avgload = a*load + (1 - a)*avgload + * + * For determining a, we consider _when_ we are doing the load update, wrt + * the length of the window. 
We define delta as follows: + * + * delta = t - load_last_update + * + * where t is current time (i.e., time at which we are both sampling and + * updating the load average) and load_last_update is the last time we did + * that. + * + * There are two possible situations: + * + * a) delta <= W + * this means that, during the last window of length W, the runeuque load + * was avgload for (W - detla) time, and load for delta time: + * + * |----------- W ---------| + * | | + * | load_last_update t + * -------------------------|---------|--- + * | | | + * \__W - delta__/\_delta__/ + * | | | + * |___avgload___|__load___| + * + * So, what about using delta/W as our smoothing coefficient a. If we do, + * here's what happens: + * + * a = delta / W + * 1 - a = 1 - (delta / W) = (W - delta) / W + * + * Which matches the above description of what happened in the last + * window of length W. + * + * Note that this also means that the weight that we assign to both the + * latest load sample, and to previous history, varies at each update. + * The longer the latest load sample has been in efect, within the last + * window, the higher it weights (and the lesser the previous history + * weights). + * + * This is some sort of extension of plain EWMA to fit even better to our + * use case. + * + * b) delta > W + * this means more than a full window has passed since the last update: + * + * |----------- W ---------| + * | | + * load_last_update t + * ----|------------------------------|--- + * | | + * \_________________delta________/ + * + * Basically, it means the last load sample has been in effect for more + * than W time, and hence we should just use it, and forget everything + * before that. + * + * This can be seen as a 'reset condition', occurring when, for whatever + * reason, load has not been updated for longer than we expected. (It is + * also how avgload is assigned its first value.) + * + * The formula for avgload then becomes: + * + * avgload = (delta/W)*load + (W - delta)*avgload/W + * avgload = delta*load/W + W*avgload/W - delta*avgload/W + * avgload = avgload + delta*load/W - delta*avgload/W + * + * So, final form is: + * + * avgload_0 = load + * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W + * + * As a confirmation, let's look at the extremes, when delta is 0 (i.e., + * what happens if we update the load twice, at the same time instant?): + * + * avgload = avgload + 0*load/W - 0*avgload/W + * avgload = avgload + * + * and when delta is W (i.e., what happens if we update at the last + * possible instant before the window 'expires'?): + * + * avgload = avgload + W*load/W - W*avgload/W + * avgload = avgload + load - avgload + * avgload = load + * + * Which, in both cases, is what we expect. + */ +static void +update_runq_load(const struct scheduler *ops, + struct csched2_runqueue_data *rqd, int change, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + s_time_t delta, load = rqd->load; + unsigned int P, W; + + W = prv->load_window_shift; + P = prv->load_precision_shift; + now >>= LOADAVG_GRANULARITY_SHIFT; + + /* + * To avoid using fractions, we shift to left by load_precision_shift, + * and use the least last load_precision_shift bits as fractional part. 
+ * Looking back at the formula we want to use, we now have: + * + * P = 2^(load_precision_shift) + * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) + * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W + * + * And if we are ok storing and using P*avgload, we can rewrite this as: + * + * P*avgload = avgload' + * avgload' = avgload' + delta*P*load/W - delta*avgload'/W + * + * Coupled with, of course: + * + * avgload_0' = P*load + */ + + if ( rqd->load_last_update + (1ULL << W) < now ) + { + rqd->avgload = load << P; + rqd->b_avgload = load << P; + } + else + { + delta = now - rqd->load_last_update; + if ( unlikely(delta < 0) ) + { + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", + __func__, now, rqd->load_last_update); + delta = 0; + } + + /* + * Note that, if we were to enforce (or check) some relationship + * between P and W, we may save one shift. E.g., if we are sure + * that P < W, we could write: + * + * (delta * (load << P)) >> W + * + * as: + * + * (delta * load) >> (W - P) + */ + rqd->avgload = rqd->avgload + + ((delta * (load << P)) >> W) - + ((delta * rqd->avgload) >> W); + rqd->b_avgload = rqd->b_avgload + + ((delta * (load << P)) >> W) - + ((delta * rqd->b_avgload) >> W); + } + rqd->load += change; + rqd->load_last_update = now; + + /* Overflow, capable of making the load look negative, must not occur. */ + ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t rq_avgload, b_avgload; + unsigned rq_load:16, rq_id:8, shift:8; + } d; + d.rq_id = rqd->id; + d.rq_load = rqd->load; + d.rq_avgload = rqd->avgload; + d.b_avgload = rqd->b_avgload; + d.shift = P; + __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static void +update_svc_load(const struct scheduler *ops, + struct csched2_unit *svc, int change, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + s_time_t delta, unit_load; + unsigned int P, W; + + if ( change == -1 ) + unit_load = 1; + else if ( change == 1 ) + unit_load = 0; + else + unit_load = unit_runnable(svc->unit); + + W = prv->load_window_shift; + P = prv->load_precision_shift; + now >>= LOADAVG_GRANULARITY_SHIFT; + + if ( svc->load_last_update + (1ULL << W) < now ) + { + svc->avgload = unit_load << P; + } + else + { + delta = now - svc->load_last_update; + if ( unlikely(delta < 0) ) + { + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", + __func__, now, svc->load_last_update); + delta = 0; + } + + svc->avgload = svc->avgload + + ((delta * (unit_load << P)) >> W) - + ((delta * svc->avgload) >> W); + } + svc->load_last_update = now; + + /* Overflow, capable of making the load look negative, must not occur. 
*/ + ASSERT(svc->avgload >= 0); + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t v_avgload; + unsigned unit:16, dom:16; + unsigned shift; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.v_avgload = svc->avgload; + d.shift = P; + __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static void +update_load(const struct scheduler *ops, + struct csched2_runqueue_data *rqd, + struct csched2_unit *svc, int change, s_time_t now) +{ + trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); + + update_runq_load(ops, rqd, change, now); + if ( svc ) + update_svc_load(ops, svc, change, now); +} + +static void +runq_insert(const struct scheduler *ops, struct csched2_unit *svc) +{ + struct list_head *iter; + unsigned int cpu = sched_unit_master(svc->unit); + struct list_head * runq = &c2rqd(ops, cpu)->runq; + int pos = 0; + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + ASSERT(!unit_on_runq(svc)); + ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit))); + + ASSERT(&svc->rqd->runq == runq); + ASSERT(!is_idle_unit(svc->unit)); + ASSERT(!svc->unit->is_running); + ASSERT(!(svc->flags & CSFLAG_scheduled)); + + list_for_each( iter, runq ) + { + struct csched2_unit * iter_svc = runq_elem(iter); + + if ( svc->credit > iter_svc->credit ) + break; + + pos++; + } + list_add_tail(&svc->runq_elem, iter); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned pos; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.pos = pos; + __trace_var(TRC_CSCHED2_RUNQ_POS, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +static inline void runq_remove(struct csched2_unit *svc) +{ + ASSERT(unit_on_runq(svc)); + list_del_init(&svc->runq_elem); +} + +void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t); + +static inline void +tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) +{ + __cpumask_set_cpu(cpu, &rqd->tickled); + smt_idle_mask_clear(cpu, &rqd->smt_idle); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); +} + +/* + * What we want to know is whether svc, which we assume to be running on some + * pcpu, can be interrupted and preempted (which, so far, basically means + * whether or not it already run for more than the ratelimit, to which we + * apply some tolerance). + */ +static inline bool is_preemptable(const struct csched2_unit *svc, + s_time_t now, s_time_t ratelimit) +{ + if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) + return true; + + ASSERT(svc->unit->is_running); + return now - svc->unit->state_entry_time > + ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; +} + +/* + * Score to preempt the target cpu. Return a negative number if the + * credit isn't high enough; if it is, favor a preemption on cpu in + * this order: + * - cpu is in new's soft-affinity, not in cur's soft-affinity + * (2 x CSCHED2_CREDIT_INIT score bonus); + * - cpu is in new's soft-affinity and cur's soft-affinity, or + * cpu is not in new's soft-affinity, nor in cur's soft-affinity + * (1x CSCHED2_CREDIT_INIT score bonus); + * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity + * (no bonus). + * + * Within the same class, the highest difference of credit. 
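The scoring just described condenses to a few lines. A standalone sketch, with constants mirroring the defaults defined earlier (CSCHED2_CREDIT_INIT = 10ms, opt_migrate_resist = 500us) and hypothetical names; not the hypervisor code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CREDIT_INIT    10000000LL   /* 10ms, in ns   */
    #define MIGRATE_RESIST   500000LL   /* 500us default */

    static int64_t preempt_score(int64_t new_credit, int64_t cur_credit,
                                 bool same_cpu, bool in_new_soft, bool in_cur_soft)
    {
        int64_t score = new_credit - cur_credit;

        if ( !same_cpu )
            score -= MIGRATE_RESIST;

        if ( score > 0 )
        {
            if ( in_new_soft )          /* cpu is in new's soft affinity... */
                score += CREDIT_INIT;
            if ( !in_cur_soft )         /* ...and/or not in cur's           */
                score += CREDIT_INIT;
        }
        return score;                   /* negative: don't preempt this cpu */
    }

    int main(void)
    {
        /* Remote cpu in new's soft affinity but not in cur's: full 2x bonus. */
        printf("%lld\n",
               (long long)preempt_score(2000000, 1000000, false, true, false));
        return 0;
    }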
+ */ +static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, + struct csched2_unit *new, unsigned int cpu) +{ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu)); + struct csched2_private *prv = csched2_priv(ops); + s_time_t score; + + /* + * We are dealing with cpus that are marked non-idle (i.e., that are not + * in rqd->idle). However, some of them may be running their idle unit, + * if taking care of tasklets. In that case, we want to leave it alone. + */ + if ( unlikely(is_idle_unit(cur->unit) || + !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) + return -1; + + burn_credits(rqd, cur, now); + + score = new->credit - cur->credit; + if ( sched_unit_master(new->unit) != cpu ) + score -= CSCHED2_MIGRATE_RESIST; + + /* + * If score is positive, it means new has enough credits (i.e., + * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). + * + * Let's compute the bonuses for soft-affinities. + */ + if ( score > 0 ) + { + if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) ) + score += CSCHED2_CREDIT_INIT; + + if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) ) + score += CSCHED2_CREDIT_INIT; + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit, score; + } d; + d.dom = cur->unit->domain->domain_id; + d.unit = cur->unit->unit_id; + d.credit = cur->credit; + d.score = score; + __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + return score; +} + +/* + * Check what processor it is best to 'wake', for picking up an unit that has + * just been put (back) in the runqueue. Logic is as follows: + * 1. if there are idle processors in the runq, wake one of them; + * 2. if there aren't idle processor, check the one were the unit was + * running before to see if we can preempt what's running there now + * (and hence doing just one migration); + * 3. last stand: check all processors and see if the unit is in right + * of preempting any of the other units running on them (this requires + * two migrations, and that's indeed why it is left as the last stand). + * + * Note that when we say 'idle processors' what we really mean is (pretty + * much always) both _idle_ and _not_already_tickled_. In fact, if a + * processor has been tickled, it will run csched2_schedule() shortly, and + * pick up some work, so it would be wrong to consider it idle. + */ +static void +runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now) +{ + int i, ipid = -1; + s_time_t max = 0; + struct sched_unit *unit = new->unit; + unsigned int bs, cpu = sched_unit_master(unit); + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); + cpumask_t mask; + + ASSERT(new->rqd == rqd); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned processor; + int credit; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.processor = cpu; + d.credit = new->credit; + __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* + * Exclusive pinning is when an unit has hard-affinity with only one + * cpu, and there is no other unit that has hard-affinity with that + * same cpu. This is infrequent, but if it happens, is for achieving + * the most possible determinism, and least possible overhead for + * the units in question. 
+ * + * Try to identify the vast majority of these situations, and deal + * with them quickly. + */ + if ( unlikely((new->flags & CSFLAG_pinned) && + cpumask_test_cpu(cpu, &rqd->idle) && + !cpumask_test_cpu(cpu, &rqd->tickled)) ) + { + ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); + SCHED_STAT_CRANK(tickled_idle_cpu_excl); + ipid = cpu; + goto tickle; + } + + for_each_affinity_balance_step( bs ) + { + /* Just skip first step, if we don't have a soft affinity */ + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + + /* + * First of all, consider idle cpus, checking if we can just + * re-use the pcpu where we were running before. + * + * If there are cores where all the siblings are idle, consider + * them first, honoring whatever the spreading-vs-consolidation + * SMT policy wants us to do. + */ + if ( unlikely(sched_smt_power_savings) ) + { + cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); + cpumask_and(&mask, &mask, online); + } + else + cpumask_and(&mask, &rqd->smt_idle, online); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + i = cpumask_test_or_cycle(cpu, &mask); + if ( i < nr_cpu_ids ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + ipid = i; + goto tickle; + } + + /* + * If there are no fully idle cores, check all idlers, after + * having filtered out pcpus that have been tickled but haven't + * gone through the scheduler yet. + */ + cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + i = cpumask_test_or_cycle(cpu, &mask); + if ( i < nr_cpu_ids ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + ipid = i; + goto tickle; + } + } + + /* + * Note that, if we are here, it means we have done the hard-affinity + * balancing step of the loop, and hence what we have in cpumask_scratch + * is what we put there for last, i.e., new's unit_hard_affinity & online + * which is exactly what we need for the next part of the function. + */ + + /* + * Otherwise, look for the non-idle (and non-tickled) processors with + * the lowest credit, among the ones new is allowed to run on. Again, + * the cpu were it was running on would be the best candidate. + * + * For deciding which cpu to tickle, we use tickle_score(), which will + * factor in both new's soft-affinity, and the soft-affinity of the + * unit running on each cpu that we consider. 
+ */ + cpumask_andnot(&mask, &rqd->active, &rqd->idle); + cpumask_andnot(&mask, &mask, &rqd->tickled); + cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); + if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) + { + s_time_t score = tickle_score(ops, now, new, cpu); + + if ( score > max ) + { + max = score; + ipid = cpu; + + /* If this is in new's soft affinity, just take it */ + if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) ) + { + SCHED_STAT_CRANK(tickled_busy_cpu); + goto tickle; + } + } + } + + for_each_cpu(i, &mask) + { + s_time_t score; + + /* Already looked at this one above */ + ASSERT(i != cpu); + + score = tickle_score(ops, now, new, i); + + if ( score > max ) + { + max = score; + ipid = i; + } + } + + if ( ipid == -1 ) + { + SCHED_STAT_CRANK(tickled_no_cpu); + return; + } + + ASSERT(!is_idle_unit(curr_on_cpu(ipid))); + SCHED_STAT_CRANK(tickled_busy_cpu); + tickle: + BUG_ON(ipid == -1); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, pad:16; + } d; + d.cpu = ipid; d.pad = 0; + __trace_var(TRC_CSCHED2_TICKLE, 1, + sizeof(d), + (unsigned char *)&d); + } + + tickle_cpu(ipid, rqd); + + if ( unlikely(new->tickled_cpu != -1) ) + SCHED_STAT_CRANK(tickled_cpu_overwritten); + new->tickled_cpu = ipid; +} + +/* + * Credit-related code + */ +static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, + struct csched2_unit *snext) +{ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct list_head *iter; + int m; + + /* + * Under normal circumstances, snext->credit should never be less + * than -CSCHED2_MIN_TIMER. However, under some circumstances, an + * unit with low credits may be allowed to run long enough that + * its credits are actually less than -CSCHED2_CREDIT_INIT. + * (Instances have been observed, for example, where an unit with + * 200us of credit was allowed to run for 11ms, giving it -10.8ms + * of credit. Thus it was still negative even after the reset.) + * + * If this is the case for snext, we simply want to keep moving + * everyone up until it is in the black again. This fair because + * none of the other units want to run at the moment. + * + * Rather than looping, however, we just calculate a multiplier, + * avoiding an integer division and multiplication in the common + * case. + */ + m = 1; + if ( snext->credit < -CSCHED2_CREDIT_INIT ) + m += (-snext->credit) / CSCHED2_CREDIT_INIT; + + list_for_each( iter, &rqd->svc ) + { + unsigned int svc_cpu; + struct csched2_unit * svc; + int start_credit; + + svc = list_entry(iter, struct csched2_unit, rqd_elem); + svc_cpu = sched_unit_master(svc->unit); + + ASSERT(!is_idle_unit(svc->unit)); + ASSERT(svc->rqd == rqd); + + /* + * If svc is running, it is our responsibility to make sure, here, + * that the credit it has spent so far get accounted. + */ + if ( svc->unit == curr_on_cpu(svc_cpu) ) + { + burn_credits(rqd, svc, now); + /* + * And, similarly, in case it has run out of budget, as a + * consequence of this round of accounting, we also must inform + * its pCPU that it's time to park it, and pick up someone else. + */ + if ( unlikely(svc->budget <= 0) ) + tickle_cpu(svc_cpu, rqd); + } + + start_credit = svc->credit; + + /* + * Add INIT * m, avoiding integer multiplication in the common case. 
+ */ + if ( likely(m==1) ) + svc->credit += CSCHED2_CREDIT_INIT; + else + svc->credit += m * CSCHED2_CREDIT_INIT; + + /* "Clip" credits to max carryover */ + if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) + svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; + + svc->start_time = now; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit_start, credit_end; + unsigned multiplier; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.credit_start = start_credit; + d.credit_end = svc->credit; + d.multiplier = m; + __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, + sizeof(d), + (unsigned char *)&d); + } + } + + SCHED_STAT_CRANK(credit_reset); + + /* No need to resort runqueue, as everyone's order should be the same. */ +} + +void burn_credits(struct csched2_runqueue_data *rqd, + struct csched2_unit *svc, s_time_t now) +{ + s_time_t delta; + + ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit)))); + + if ( unlikely(is_idle_unit(svc->unit)) ) + { + ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); + return; + } + + delta = now - svc->start_time; + + if ( unlikely(delta <= 0) ) + { + if ( unlikely(delta < 0) ) + d2printk("WARNING: %s: Time went backwards? now %"PRI_stime + " start_time %"PRI_stime"\n", __func__, now, + svc->start_time); + goto out; + } + + SCHED_STAT_CRANK(burn_credits_t2c); + t2c_update(rqd, delta, svc); + + if ( has_cap(svc) ) + svc->budget -= delta; + + svc->start_time = now; + + out: + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + int credit, budget; + int delta; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.credit = svc->credit; + d.budget = has_cap(svc) ? svc->budget : INT_MIN; + d.delta = delta; + __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, + sizeof(d), + (unsigned char *)&d); + } +} + +/* + * Budget-related code. + */ + +static void park_unit(struct csched2_unit *svc) +{ + struct sched_unit *unit = svc->unit; + + ASSERT(spin_is_locked(&svc->sdom->budget_lock)); + + /* + * It was impossible to find budget for this unit, so it has to be + * "parked". This implies it is not runnable, so we mark it as such in + * its pause_flags. If the unit is currently scheduled (which means we + * are here after being called from within csched_schedule()), flagging + * is enough, as we'll choose someone else, and then context_saved() + * will take care of updating the load properly. + * + * If, OTOH, the unit is sitting in the runqueue (which means we are here + * after being called from within runq_candidate()), we must go all the + * way down to taking it out of there, and updating the load accordingly. + * + * In both cases, we also add it to the list of parked units of the domain. + */ + sched_set_pause_flags(unit, _VPF_parked); + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); + } + list_add(&svc->parked_elem, &svc->sdom->parked_units); +} + +static bool unit_grab_budget(struct csched2_unit *svc) +{ + struct csched2_dom *sdom = svc->sdom; + unsigned int cpu = sched_unit_master(svc->unit); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + if ( svc->budget > 0 ) + return true; + + /* budget_lock nests inside runqueue lock. */ + spin_lock(&sdom->budget_lock); + + /* + * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if + * above!). 
That basically means the unit has overrun a bit --because of + * various reasons-- and we want to take that into account. With the +=, + * we are actually subtracting the amount of budget the unit has + * overconsumed, from the total domain budget. + */ + sdom->budget += svc->budget; + + if ( sdom->budget > 0 ) + { + s_time_t budget; + + /* Get our quota, if there's at least as much budget */ + if ( likely(sdom->budget >= svc->budget_quota) ) + budget = svc->budget_quota; + else + budget = sdom->budget; + + svc->budget = budget; + sdom->budget -= budget; + } + else + { + svc->budget = 0; + park_unit(svc); + } + + spin_unlock(&sdom->budget_lock); + + return svc->budget > 0; +} + +static void +unit_return_budget(struct csched2_unit *svc, struct list_head *parked) +{ + struct csched2_dom *sdom = svc->sdom; + unsigned int cpu = sched_unit_master(svc->unit); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + ASSERT(list_empty(parked)); + + /* budget_lock nests inside runqueue lock. */ + spin_lock(&sdom->budget_lock); + + /* + * The unit is stopping running (e.g., because it's blocking, or it has + * been preempted). If it hasn't consumed all the budget it got when, + * starting to run, put that remaining amount back in the domain's budget + * pool. + */ + sdom->budget += svc->budget; + svc->budget = 0; + + /* + * Making budget available again to the domain means that parked units + * may be unparked and run. They are, if any, in the domain's parked_units + * list, so we want to go through that and unpark them (so they can try + * to get some budget). + * + * Touching the list requires the budget_lock, which we hold. Let's + * therefore put everyone in that list in another, temporary list, which + * then the caller will traverse, unparking the units it finds there. + * + * In fact, we can't do the actual unparking here, because that requires + * taking the runqueue lock of the units being unparked, and we can't + * take any runqueue locks while we hold a budget_lock. + */ + if ( sdom->budget > 0 ) + list_splice_init(&sdom->parked_units, parked); + + spin_unlock(&sdom->budget_lock); +} + +static void +unpark_parked_units(const struct scheduler *ops, struct list_head *units) +{ + struct csched2_unit *svc, *tmp; + spinlock_t *lock; + + list_for_each_entry_safe ( svc, tmp, units, parked_elem ) + { + unsigned long flags; + s_time_t now; + + lock = unit_schedule_lock_irqsave(svc->unit, &flags); + + sched_clear_pause_flags(svc->unit, _VPF_parked); + if ( unlikely(svc->flags & CSFLAG_scheduled) ) + { + /* + * We end here if a budget replenishment arrived between + * csched2_schedule() (and, in particular, after a call to + * unit_grab_budget() that returned false), and + * context_saved(). By setting __CSFLAG_delayed_runq_add, + * we tell context_saved() to put the unit back in the + * runqueue, from where it will compete with the others + * for the newly replenished budget. + */ + ASSERT( svc->rqd != NULL ); + ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd ); + __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); + } + else if ( unit_runnable(svc->unit) ) + { + /* + * The unit should go back to the runqueue, and compete for + * the newly replenished budget, but only if it is actually + * runnable (and was therefore offline only because of the + * lack of budget). 
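+ * If the unit is not runnable for some other reason, clearing _VPF_parked
+ * above is enough: it will be put back on a runqueue by csched2_unit_wake()
+ * when it eventually wakes up.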
+ */ + now = NOW(); + update_load(ops, svc->rqd, svc, 1, now); + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + } + list_del_init(&svc->parked_elem); + + unit_schedule_unlock_irqrestore(lock, flags, svc->unit); + } +} + +static inline void do_replenish(struct csched2_dom *sdom) +{ + sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; + sdom->budget += sdom->tot_budget; +} + +static void replenish_domain_budget(void* data) +{ + struct csched2_dom *sdom = data; + unsigned long flags; + s_time_t now; + LIST_HEAD(parked); + + spin_lock_irqsave(&sdom->budget_lock, flags); + + now = NOW(); + + /* + * Let's do the replenishment. Note, though, that a domain may overrun, + * which means the budget would have gone below 0 (reasons may be system + * overbooking, accounting issues, etc.). It also may happen that we are + * handling the replenishment (much) later than we should (reasons may + * again be overbooking, or issues with timers). + * + * Even in cases of overrun or delay, however, we expect that in 99% of + * cases, doing just one replenishment will be good enough for being able + * to unpark the units that are waiting for some budget. + */ + do_replenish(sdom); + + /* + * And now, the special cases: + * 1) if we are late enough to have skipped (at least) one full period, + * what we must do is doing more replenishments. Note that, however, + * every time we add tot_budget to the budget, we also move next_repl + * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always + * respected. + */ + if ( unlikely(sdom->next_repl <= now) ) + { + do + do_replenish(sdom); + while ( sdom->next_repl <= now ); + } + /* + * 2) if we overrun by more than tot_budget, then budget+tot_budget is + * still < 0, which means that we can't unpark the units. Let's bail, + * and wait for future replenishments. + */ + if ( unlikely(sdom->budget <= 0) ) + { + spin_unlock_irqrestore(&sdom->budget_lock, flags); + goto out; + } + + /* Since we do more replenishments, make sure we didn't overshot. */ + sdom->budget = min(sdom->budget, sdom->tot_budget); + + /* + * As above, let's prepare the temporary list, out of the domain's + * parked_units list, now that we hold the budget_lock. Then, drop such + * lock, and pass the list to the unparking function. + */ + list_splice_init(&sdom->parked_units, &parked); + + spin_unlock_irqrestore(&sdom->budget_lock, flags); + + unpark_parked_units(sdom->dom->cpupool->sched, &parked); + + out: + set_timer(&sdom->repl_timer, sdom->next_repl); +} + +#ifndef NDEBUG +static inline void +csched2_unit_check(struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + struct csched2_dom * const sdom = svc->sdom; + + BUG_ON( svc->unit != unit ); + BUG_ON( sdom != csched2_dom(unit->domain) ); + if ( sdom ) + { + BUG_ON( is_idle_unit(unit) ); + BUG_ON( sdom->dom != unit->domain ); + } + else + { + BUG_ON( !is_idle_unit(unit) ); + } + SCHED_STAT_CRANK(unit_check); +} +#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit)) +#else +#define CSCHED2_UNIT_CHECK(unit) +#endif + +static void * +csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, + void *dd) +{ + struct csched2_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct csched2_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->rqd_elem); + INIT_LIST_HEAD(&svc->runq_elem); + + svc->sdom = dd; + svc->unit = unit; + svc->flags = 0U; + + if ( ! 
is_idle_unit(unit) ) + { + ASSERT(svc->sdom != NULL); + svc->credit = CSCHED2_CREDIT_INIT; + svc->weight = svc->sdom->weight; + /* Starting load of 50% */ + svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); + svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; + } + else + { + ASSERT(svc->sdom == NULL); + svc->credit = CSCHED2_IDLE_CREDIT; + svc->weight = 0; + } + svc->tickled_cpu = -1; + + svc->budget = STIME_MAX; + svc->budget_quota = 0; + INIT_LIST_HEAD(&svc->parked_elem); + + SCHED_STAT_CRANK(unit_alloc); + + return svc; +} + +static void +csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + + ASSERT(!is_idle_unit(unit)); + SCHED_STAT_CRANK(unit_sleep); + + if ( curr_on_cpu(sched_unit_master(unit)) == unit ) + { + tickle_cpu(sched_unit_master(unit), svc->rqd); + } + else if ( unit_on_runq(svc) ) + { + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + update_load(ops, svc->rqd, svc, -1, NOW()); + runq_remove(svc); + } + else + __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); +} + +static void +csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + unsigned int cpu = sched_unit_master(unit); + s_time_t now; + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + ASSERT(!is_idle_unit(unit)); + + if ( unlikely(curr_on_cpu(cpu) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + goto out; + } + + if ( unlikely(unit_on_runq(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + goto out; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* If the context hasn't been saved for this unit yet, we can't put it on + * another runqueue. Instead, we set a flag so that it will be put on the runqueue + * after the context has been saved. */ + if ( unlikely(svc->flags & CSFLAG_scheduled) ) + { + __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); + goto out; + } + + /* Add into the new runqueue if necessary */ + if ( svc->rqd == NULL ) + runq_assign(ops, unit); + else + ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd ); + + now = NOW(); + + update_load(ops, svc->rqd, svc, 1, now); + + /* Put the UNIT on the runq */ + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + +out: + return; +} + +static void +csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + + __set_bit(__CSFLAG_unit_yield, &svc->flags); +} + +static void +csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock_irq(unit); + s_time_t now = NOW(); + LIST_HEAD(were_parked); + + BUG_ON( !is_idle_unit(unit) && + svc->rqd != c2rqd(ops, sched_unit_master(unit))); + ASSERT(is_idle_unit(unit) || + svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + /* This unit is now eligible to be put on the runqueue again */ + __clear_bit(__CSFLAG_scheduled, &svc->flags); + + if ( unlikely(has_cap(svc) && svc->budget > 0) ) + unit_return_budget(svc, &were_parked); + + /* If someone wants it on the runqueue, put it there. */ + /* + * NB: We can get rid of CSFLAG_scheduled by checking for + * vc->is_running and unit_on_runq(svc) here. 
However, + * since we're accessing the flags cacheline anyway, + * it seems a bit pointless; especially as we have plenty of + * bits free. + */ + if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) + && likely(unit_runnable(unit)) ) + { + ASSERT(!unit_on_runq(svc)); + + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + } + else if ( !is_idle_unit(unit) ) + update_load(ops, svc->rqd, svc, -1, now); + + unit_schedule_unlock_irq(lock, unit); + + unpark_parked_units(ops, &were_parked); +} + +#define MAX_LOAD (STIME_MAX) +static struct sched_resource * +csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + struct csched2_private *prv = csched2_priv(ops); + int i, min_rqi = -1, min_s_rqi = -1; + unsigned int new_cpu, cpu = sched_unit_master(unit); + struct csched2_unit *svc = csched2_unit(unit); + s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; + bool has_soft; + + ASSERT(!cpumask_empty(&prv->active_queues)); + + SCHED_STAT_CRANK(pick_resource); + + /* Locking: + * - Runqueue lock of vc->processor is already locked + * - Need to grab prv lock to make sure active runqueues don't + * change + * - Need to grab locks for other runqueues while checking + * avgload + * Locking constraint is: + * - Lock prv before runqueue locks + * - Trylock between runqueue locks (no ordering) + * + * Since one of the runqueue locks is already held, we can't + * just grab the prv lock. Instead, we'll have to trylock, and + * do something else reasonable if we fail. + */ + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + if ( !read_trylock(&prv->lock) ) + { + /* We may be here because someone requested us to migrate. */ + __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); + new_cpu = get_fallback_cpu(svc); + /* + * Tracing of runq and its load won't be accurate, since we could + * not get the lock, but at least we will output the chosen pcpu. + */ + goto out; + } + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + /* + * First check to see if we're here because someone else suggested a place + * for us to move. + */ + if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) + { + if ( unlikely(svc->migrate_rqd->id < 0) ) + { + printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", + __func__); + } + else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), + &svc->migrate_rqd->active) ) + { + /* + * If we've been asked to move to migrate_rqd, we should just do + * that, which we actually do by returning one cpu from that runq. + * There is no need to take care of soft affinity, as that will + * happen in runq_tickle(). + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &svc->migrate_rqd->active); + new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, + cpumask_scratch_cpu(cpu)); + + svc->migrate_rqd->pick_bias = new_cpu; + goto out_up; + } + /* Fall-through to normal cpu pick */ + } + + /* + * What we want is: + * - if we have soft affinity, the runqueue with the lowest average + * load, among the ones that contain cpus in our soft affinity; this + * represents the best runq on which we would want to run. + * - the runqueue with the lowest average load among the ones that + * contains cpus in our hard affinity; this represent the best runq + * on which we can run. + * + * Find both runqueues in one pass. 
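+ * Failing the trylock on a remote runqueue below simply means that runqueue
+ * keeps its MAX_LOAD placeholder, and is therefore skipped for this pick.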
+ */ + has_soft = has_soft_affinity(unit); + for_each_cpu(i, &prv->active_queues) + { + struct csched2_runqueue_data *rqd; + s_time_t rqd_avgload = MAX_LOAD; + + rqd = prv->rqd + i; + + /* + * If none of the cpus of this runqueue is in svc's hard-affinity, + * skip the runqueue. + * + * Note that, in case svc's hard-affinity has changed, this is the + * first time when we see such change, so it is indeed possible + * that we end up skipping svc's current runqueue. + */ + if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) + continue; + + /* + * If checking a different runqueue, grab the lock, read the avg, + * and then release the lock. + * + * If on our own runqueue, don't grab or release the lock; + * but subtract our own load from the runqueue load to simulate + * impartiality. + */ + if ( rqd == svc->rqd ) + { + rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); + } + else if ( spin_trylock(&rqd->lock) ) + { + rqd_avgload = rqd->b_avgload; + spin_unlock(&rqd->lock); + } + + /* + * if svc has a soft-affinity, and some cpus of rqd are part of it, + * see if we need to update the "soft-affinity minimum". + */ + if ( has_soft && + rqd_avgload < min_s_avgload ) + { + cpumask_t mask; + + cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); + if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) ) + { + min_s_avgload = rqd_avgload; + min_s_rqi = i; + } + } + /* In any case, keep the "hard-affinity minimum" updated too. */ + if ( rqd_avgload < min_avgload ) + { + min_avgload = rqd_avgload; + min_rqi = i; + } + } + + if ( has_soft && min_s_rqi != -1 ) + { + /* + * We have soft affinity, and we have a candidate runq, so go for it. + * + * Note that, to obtain the soft-affinity mask, we "just" put what we + * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is + * ok because: + * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have + * a non-empty intersection (because has_soft is true); + * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask() + * already in cpumask_scratch, we do save a lot doing like this. + * + * It's kind of like open coding affinity_balance_cpumask() but, in + * this specific case, calling that would mean a lot of (unnecessary) + * cpumask operations. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + unit->cpu_soft_affinity); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->rqd[min_s_rqi].active); + } + else if ( min_rqi != -1 ) + { + /* + * Either we don't have soft-affinity, or we do, but we did not find + * any suitable runq. But we did find one when considering hard + * affinity, so go for it. + * + * cpumask_scratch already has unit->cpu_hard_affinity & + * cpupool_domain_master_cpumask() in it, so it's enough that we filter + * with the cpus of the runq. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->rqd[min_rqi].active); + } + else + { + /* + * We didn't find anyone at all (most likely because of spinlock + * contention). 
+ */ + new_cpu = get_fallback_cpu(svc); + min_rqi = c2r(new_cpu); + min_avgload = prv->rqd[min_rqi].b_avgload; + goto out_up; + } + + new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias, + cpumask_scratch_cpu(cpu)); + prv->rqd[min_rqi].pick_bias = new_cpu; + BUG_ON(new_cpu >= nr_cpu_ids); + + out_up: + read_unlock(&prv->lock); + out: + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t b_avgload; + unsigned unit:16, dom:16; + unsigned rq_id:16, new_cpu:16; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.rq_id = min_rqi; + d.b_avgload = min_avgload; + d.new_cpu = new_cpu; + __trace_var(TRC_CSCHED2_PICKED_CPU, 1, + sizeof(d), + (unsigned char *)&d); + } + + return get_sched_res(new_cpu); +} + +/* Working state of the load-balancing algorithm */ +typedef struct { + /* NB: Modified by consider() */ + s_time_t load_delta; + struct csched2_unit * best_push_svc, *best_pull_svc; + /* NB: Read by consider() */ + struct csched2_runqueue_data *lrqd; + struct csched2_runqueue_data *orqd; +} balance_state_t; + +static void consider(balance_state_t *st, + struct csched2_unit *push_svc, + struct csched2_unit *pull_svc) +{ + s_time_t l_load, o_load, delta; + + l_load = st->lrqd->b_avgload; + o_load = st->orqd->b_avgload; + if ( push_svc ) + { + /* What happens to the load on both if we push? */ + l_load -= push_svc->avgload; + o_load += push_svc->avgload; + } + if ( pull_svc ) + { + /* What happens to the load on both if we pull? */ + l_load += pull_svc->avgload; + o_load -= pull_svc->avgload; + } + + delta = l_load - o_load; + if ( delta < 0 ) + delta = -delta; + + if ( delta < st->load_delta ) + { + st->load_delta = delta; + st->best_push_svc=push_svc; + st->best_pull_svc=pull_svc; + } +} + + +static void migrate(const struct scheduler *ops, + struct csched2_unit *svc, + struct csched2_runqueue_data *trqd, + s_time_t now) +{ + struct sched_unit *unit = svc->unit; + int cpu = sched_unit_master(unit); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned rqi:16, trqi:16; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.rqi = svc->rqd->id; + d.trqi = trqd->id; + __trace_var(TRC_CSCHED2_MIGRATE, 1, + sizeof(d), + (unsigned char *)&d); + } + + if ( svc->flags & CSFLAG_scheduled ) + { + /* It's running; mark it to migrate. */ + svc->migrate_rqd = trqd; + sched_set_pause_flags(unit, _VPF_migrating); + __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); + SCHED_STAT_CRANK(migrate_requested); + tickle_cpu(cpu, svc->rqd); + } + else + { + int on_runq = 0; + /* It's not running; just move it */ + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(ops, svc->rqd, NULL, -1, now); + on_runq = 1; + } + _runq_deassign(svc); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &trqd->active); + sched_set_res(unit, + get_sched_res(cpumask_cycle(trqd->pick_bias, + cpumask_scratch_cpu(cpu)))); + trqd->pick_bias = sched_unit_master(unit); + ASSERT(sched_unit_master(unit) < nr_cpu_ids); + + _runq_assign(svc, trqd); + if ( on_runq ) + { + update_load(ops, svc->rqd, NULL, 1, now); + runq_insert(ops, svc); + runq_tickle(ops, svc, now); + SCHED_STAT_CRANK(migrate_on_runq); + } + else + SCHED_STAT_CRANK(migrate_no_runq); + } +} + +/* + * It makes sense considering migrating svc to rqd, if: + * - svc is not already flagged to migrate, + * - if svc is allowed to run on at least one of the pcpus of rqd. 
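consider() above scores a candidate push/pull pair purely by the absolute difference between the two runqueues' average loads after the hypothetical move, keeping the pair with the smallest such delta. The same arithmetic in a self-contained form, with scalar loads and made-up numbers standing in for csched2_unit and runqueue state:

    #include <stdint.h>
    #include <stdio.h>

    /* |local - other| after optionally pushing and/or pulling a unit. */
    static int64_t delta_after(int64_t l_load, int64_t o_load,
                               int64_t push_load, int64_t pull_load)
    {
        l_load -= push_load;   /* pushed unit leaves the local runqueue */
        o_load += push_load;   /* ... and lands on the other one        */
        l_load += pull_load;   /* pulled unit arrives locally           */
        o_load -= pull_load;

        return l_load > o_load ? l_load - o_load : o_load - l_load;
    }

    int main(void)
    {
        const int64_t lrqd = 1000, orqd = 400;   /* current delta: 600 */

        /* Push-only shrinks the gap to |750 - 650| = 100, push+pull gives
         * |850 - 550| = 300, so the push-only option wins the comparison. */
        printf("push only: %lld\n", (long long)delta_after(lrqd, orqd, 250, 0));
        printf("push+pull: %lld\n", (long long)delta_after(lrqd, orqd, 250, 100));
        return 0;
    }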
+ */ +static bool unit_is_migrateable(struct csched2_unit *svc, + struct csched2_runqueue_data *rqd) +{ + struct sched_unit *unit = svc->unit; + int cpu = sched_unit_master(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + return !(svc->flags & CSFLAG_runq_migrate_request) && + cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); +} + +static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) +{ + struct csched2_private *prv = csched2_priv(ops); + int i, max_delta_rqi; + struct list_head *push_iter, *pull_iter; + bool inner_load_updated = 0; + + balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; + + /* + * Basic algorithm: Push, pull, or swap. + * - Find the runqueue with the furthest load distance + * - Find a pair that makes the difference the least (where one + * on either side may be empty). + */ + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + st.lrqd = c2rqd(ops, cpu); + + update_runq_load(ops, st.lrqd, 0, now); + +retry: + max_delta_rqi = -1; + if ( !read_trylock(&prv->lock) ) + return; + + st.load_delta = 0; + + for_each_cpu(i, &prv->active_queues) + { + s_time_t delta; + + st.orqd = prv->rqd + i; + + if ( st.orqd == st.lrqd + || !spin_trylock(&st.orqd->lock) ) + continue; + + update_runq_load(ops, st.orqd, 0, now); + + delta = st.lrqd->b_avgload - st.orqd->b_avgload; + if ( delta < 0 ) + delta = -delta; + + if ( delta > st.load_delta ) + { + st.load_delta = delta; + max_delta_rqi = i; + } + + spin_unlock(&st.orqd->lock); + } + + /* Minimize holding the private scheduler lock. */ + read_unlock(&prv->lock); + if ( max_delta_rqi == -1 ) + goto out; + + { + s_time_t load_max; + int cpus_max; + + + load_max = st.lrqd->b_avgload; + if ( st.orqd->b_avgload > load_max ) + load_max = st.orqd->b_avgload; + + cpus_max = st.lrqd->nr_cpus; + i = st.orqd->nr_cpus; + if ( i > cpus_max ) + cpus_max = i; + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned lrq_id:16, orq_id:16; + unsigned load_delta; + } d; + d.lrq_id = st.lrqd->id; + d.orq_id = st.orqd->id; + d.load_delta = st.load_delta; + __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* + * If we're under 100% capacaty, only shift if load difference + * is > 1. otherwise, shift if under 12.5% + */ + if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) + { + if ( st.load_delta < (1ULL << (prv->load_precision_shift + + opt_underload_balance_tolerance)) ) + goto out; + } + else + if ( st.load_delta < (1ULL << (prv->load_precision_shift + + opt_overload_balance_tolerance)) ) + goto out; + } + + /* Try to grab the other runqueue lock; if it's been taken in the + * meantime, try the process over again. This can't deadlock + * because if it doesn't get any other rqd locks, it will simply + * give up and return. 
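The under/overload check above compares the measured delta against 1 << (load_precision_shift + tolerance), where 1 << load_precision_shift represents one fully busy pCPU. The worked numbers below show how that becomes the "difference of 1" and "12.5%" thresholds the comment refers to; the shift and tolerance values are plausible examples, not necessarily the build's defaults:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Illustrative parameters: 2^18 == "one pCPU worth of load". */
        const int load_precision_shift = 18;
        const int underload_tolerance  = 0;    /* example value */
        const int overload_tolerance   = -3;   /* example value */

        uint64_t one_pcpu = 1ULL << load_precision_shift;
        uint64_t under = 1ULL << (load_precision_shift + underload_tolerance);
        uint64_t over  = 1ULL << (load_precision_shift + overload_tolerance);

        /* 262144 == 1.0 pCPUs of load; 32768 == 0.125 pCPUs, i.e. 12.5%. */
        printf("underloaded: balance only if delta > %llu (%.3f pCPUs)\n",
               (unsigned long long)under, (double)under / one_pcpu);
        printf("overloaded:  balance only if delta > %llu (%.3f pCPUs)\n",
               (unsigned long long)over, (double)over / one_pcpu);
        return 0;
    }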
*/ + st.orqd = prv->rqd + max_delta_rqi; + if ( !spin_trylock(&st.orqd->lock) ) + goto retry; + + /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ + if ( unlikely(st.orqd->id < 0) ) + goto out_up; + + if ( unlikely(tb_init_done) ) + { + struct { + uint64_t lb_avgload, ob_avgload; + unsigned lrq_id:16, orq_id:16; + } d; + d.lrq_id = st.lrqd->id; + d.lb_avgload = st.lrqd->b_avgload; + d.orq_id = st.orqd->id; + d.ob_avgload = st.orqd->b_avgload; + __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, + sizeof(d), + (unsigned char *)&d); + } + + SCHED_STAT_CRANK(acct_load_balance); + + /* Look for "swap" which gives the best load average + * FIXME: O(n^2)! */ + + /* Reuse load delta (as we're trying to minimize it) */ + list_for_each( push_iter, &st.lrqd->svc ) + { + struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem); + + update_svc_load(ops, push_svc, 0, now); + + if ( !unit_is_migrateable(push_svc, st.orqd) ) + continue; + + list_for_each( pull_iter, &st.orqd->svc ) + { + struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); + + if ( !inner_load_updated ) + update_svc_load(ops, pull_svc, 0, now); + + if ( !unit_is_migrateable(pull_svc, st.lrqd) ) + continue; + + consider(&st, push_svc, pull_svc); + } + + inner_load_updated = 1; + + /* Consider push only */ + consider(&st, push_svc, NULL); + } + + list_for_each( pull_iter, &st.orqd->svc ) + { + struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); + + if ( !unit_is_migrateable(pull_svc, st.lrqd) ) + continue; + + /* Consider pull only */ + consider(&st, NULL, pull_svc); + } + + /* OK, now we have some candidates; do the moving */ + if ( st.best_push_svc ) + migrate(ops, st.best_push_svc, st.orqd, now); + if ( st.best_pull_svc ) + migrate(ops, st.best_pull_svc, st.lrqd, now); + + out_up: + spin_unlock(&st.orqd->lock); + out: + return; +} + +static void +csched2_unit_migrate( + const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu) +{ + struct domain *d = unit->domain; + struct csched2_unit * const svc = csched2_unit(unit); + struct csched2_runqueue_data *trqd; + s_time_t now = NOW(); + + /* + * Being passed a target pCPU which is outside of our cpupool is only + * valid if we are shutting down (or doing ACPI suspend), and we are + * moving everyone to BSP, no matter whether or not BSP is inside our + * cpupool. + * + * And since there indeed is the chance that it is not part of it, all + * we must do is remove _and_ unassign the unit from any runqueue, as + * well as updating v->processor with the target, so that the suspend + * process can continue. + * + * It will then be during resume that a new, meaningful, value for + * v->processor will be chosen, and during actual domain unpause that + * the unit will be assigned to and added to the proper runqueue. + */ + if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) ) + { + ASSERT(system_state == SYS_STATE_suspend); + if ( unit_on_runq(svc) ) + { + runq_remove(svc); + update_load(ops, svc->rqd, NULL, -1, now); + } + _runq_deassign(svc); + sched_set_res(unit, get_sched_res(new_cpu)); + return; + } + + /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */ + ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); + ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity)); + + trqd = c2rqd(ops, new_cpu); + + /* + * Do the actual movement toward new_cpu, and update vc->processor. 
+ * If we are changing runqueue, migrate() takes care of everything. + * If we are not changing runqueue, we need to update vc->processor + * here. In fact, if, for instance, we are here because the unit's + * hard affinity changed, we don't want to risk leaving vc->processor + * pointing to a pcpu where we can't run any longer. + */ + if ( trqd != svc->rqd ) + migrate(ops, svc, trqd, now); + else + sched_set_res(unit, get_sched_res(new_cpu)); +} + +static int +csched2_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct csched2_dom * const sdom = csched2_dom(d); + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + struct sched_unit *unit; + int rc = 0; + + /* + * Locking: + * - we must take the private lock for accessing the weights of the + * units of d, and/or the cap; + * - in the putinfo case, we also need the runqueue lock(s), for + * updating the max waight of the runqueue(s). + * If changing the cap, we also need the budget_lock, for updating + * the value of the domain budget pool (and the runqueue lock, + * for adjusting the parameters and rescheduling any unit that is + * running at the time of the change). + */ + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + read_lock_irqsave(&prv->lock, flags); + op->u.credit2.weight = sdom->weight; + op->u.credit2.cap = sdom->cap; + read_unlock_irqrestore(&prv->lock, flags); + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + write_lock_irqsave(&prv->lock, flags); + /* Weight */ + if ( op->u.credit2.weight != 0 ) + { + int old_weight; + + old_weight = sdom->weight; + + sdom->weight = op->u.credit2.weight; + + /* Update weights for units, and max_weight for runqueues on which they reside */ + for_each_sched_unit ( d, unit ) + { + struct csched2_unit *svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock(unit); + + ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); + + svc->weight = sdom->weight; + update_max_weight(svc->rqd, svc->weight, old_weight); + + unit_schedule_unlock(lock, unit); + } + } + /* Cap */ + if ( op->u.credit2.cap != 0 ) + { + struct csched2_unit *svc; + spinlock_t *lock; + + /* Cap is only valid if it's below 100 * nr_of_units */ + if ( op->u.credit2.cap > 100 * sdom->nr_units ) + { + rc = -EINVAL; + write_unlock_irqrestore(&prv->lock, flags); + break; + } + + spin_lock(&sdom->budget_lock); + sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); + sdom->tot_budget /= 100; + spin_unlock(&sdom->budget_lock); + + /* + * When trying to get some budget and run, each unit will grab + * from the pool 1/N (with N = nr of units of the domain) of + * the total budget. Roughly speaking, this means each unit will + * have at least one chance to run during every period. + */ + for_each_sched_unit ( d, unit ) + { + svc = csched2_unit(unit); + lock = unit_schedule_lock(unit); + /* + * Too small quotas would in theory cause a lot of overhead, + * which then won't happen because, in csched2_runtime(), + * CSCHED2_MIN_TIMER is what would be used anyway. + */ + svc->budget_quota = max(sdom->tot_budget / sdom->nr_units, + CSCHED2_MIN_TIMER); + unit_schedule_unlock(lock, unit); + } + + if ( sdom->cap == 0 ) + { + /* + * We give to the domain the budget to which it is entitled, + * and queue its first replenishment event. + * + * Since cap is currently disabled for this domain, we + * know no unit is messing with the domain's budget, and + * the replenishment timer is still off. 
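To put numbers on the budget scheme configured above: the domain's per-period pool is cap percent of one replenishment period, and each unit's grab quota is that pool split across the domain's units, floored at the minimum timer. A worked example with invented values for the period, minimum timer, cap and unit count:

    #include <stdint.h>
    #include <stdio.h>

    #define MS(x) ((int64_t)(x) * 1000000)   /* milliseconds, expressed in ns */

    static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

    int main(void)
    {
        /* Illustrative parameters only. */
        const int64_t repl_period = MS(10);      /* example replenishment period */
        const int64_t min_timer   = MS(1) / 2;   /* example minimum timeslice    */
        const unsigned int cap = 150;            /* 150% == 1.5 pCPUs of time    */
        const unsigned int nr_units = 4;

        int64_t tot_budget   = repl_period * cap / 100;                 /* 15 ms   */
        int64_t budget_quota = max64(tot_budget / nr_units, min_timer); /* 3.75 ms */

        printf("per-period pool: %lld ns, per-unit quota: %lld ns\n",
               (long long)tot_budget, (long long)budget_quota);
        return 0;
    }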
+ * For these reasons, it is safe to do the following without + * taking the budget_lock. + */ + sdom->budget = sdom->tot_budget; + sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; + set_timer(&sdom->repl_timer, sdom->next_repl); + + /* + * Now, let's enable budget accounting for all the units. + * For making sure that they will start to honour the domain's + * cap, we set their budget to 0. + * This way, as soon as they will try to run, they will have + * to get some budget. + * + * For the units that are already running, we trigger the + * scheduler on their pCPU. When, as a consequence of this, + * csched2_schedule() will run, it will figure out there is + * no budget, and the unit will try to get some (and be parked, + * if there's none, and we'll switch to someone else). + */ + for_each_sched_unit ( d, unit ) + { + svc = csched2_unit(unit); + lock = unit_schedule_lock(unit); + if ( unit->is_running ) + { + unsigned int cpu = sched_unit_master(unit); + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + + ASSERT(curr_on_cpu(cpu) == unit); + + /* + * We are triggering a reschedule on the unit's + * pCPU. That will run burn_credits() and, since + * the unit is capped now, it would charge all the + * execution time of this last round as budget as + * well. That will make the unit budget go negative, + * potentially by a large amount, and it's unfair. + * + * To avoid that, call burn_credit() here, to do the + * accounting of this current running instance now, + * with budgetting still disabled. This does not + * prevent some small amount of budget being charged + * to the unit (i.e., the amount of time it runs from + * now, to when scheduling happens). The budget will + * also go below 0, but a lot less than how it would + * if we don't do this. + */ + burn_credits(rqd, svc, NOW()); + __cpumask_set_cpu(cpu, &rqd->tickled); + ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + } + svc->budget = 0; + unit_schedule_unlock(lock, unit); + } + } + + sdom->cap = op->u.credit2.cap; + } + else if ( sdom->cap != 0 ) + { + LIST_HEAD(parked); + + stop_timer(&sdom->repl_timer); + + /* Disable budget accounting for all the units. */ + for_each_sched_unit ( d, unit ) + { + struct csched2_unit *svc = csched2_unit(unit); + spinlock_t *lock = unit_schedule_lock(unit); + + svc->budget = STIME_MAX; + svc->budget_quota = 0; + + unit_schedule_unlock(lock, unit); + } + sdom->cap = 0; + /* + * We are disabling the cap for this domain, which may have + * units waiting for a replenishment, so we unpark them all. + * Note that, since we have already disabled budget accounting + * for all the units of the domain, no currently running unit + * will be added to the parked units list any longer. + */ + spin_lock(&sdom->budget_lock); + list_splice_init(&sdom->parked_units, &parked); + spin_unlock(&sdom->budget_lock); + + unpark_parked_units(ops, &parked); + } + write_unlock_irqrestore(&prv->lock, flags); + break; + default: + rc = -EINVAL; + break; + } + + + return rc; +} + +static void +csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, + const cpumask_t *hard, const cpumask_t *soft) +{ + struct csched2_unit *svc = csched2_unit(unit); + + if ( !hard ) + return; + + /* Are we becoming exclusively pinned? 
*/ + if ( cpumask_weight(hard) == 1 ) + __set_bit(__CSFLAG_pinned, &svc->flags); + else + __clear_bit(__CSFLAG_pinned, &svc->flags); +} + +static int csched2_sys_cntl(const struct scheduler *ops, + struct xen_sysctl_scheduler_op *sc) +{ + struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + + switch (sc->cmd ) + { + case XEN_SYSCTL_SCHEDOP_putinfo: + if ( params->ratelimit_us && + (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || + params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) + return -EINVAL; + + write_lock_irqsave(&prv->lock, flags); + if ( !prv->ratelimit_us && params->ratelimit_us ) + printk(XENLOG_INFO "Enabling context switch rate limiting\n"); + else if ( prv->ratelimit_us && !params->ratelimit_us ) + printk(XENLOG_INFO "Disabling context switch rate limiting\n"); + prv->ratelimit_us = params->ratelimit_us; + write_unlock_irqrestore(&prv->lock, flags); + + /* FALLTHRU */ + case XEN_SYSCTL_SCHEDOP_getinfo: + params->ratelimit_us = prv->ratelimit_us; + break; + } + + return 0; +} + +static void * +csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + struct csched2_private *prv = csched2_priv(ops); + struct csched2_dom *sdom; + unsigned long flags; + + sdom = xzalloc(struct csched2_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + /* Initialize credit, cap and weight */ + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + sdom->weight = CSCHED2_DEFAULT_WEIGHT; + sdom->cap = 0U; + sdom->nr_units = 0; + + init_timer(&sdom->repl_timer, replenish_domain_budget, sdom, + cpumask_any(cpupool_domain_master_cpumask(dom))); + spin_lock_init(&sdom->budget_lock); + INIT_LIST_HEAD(&sdom->parked_units); + + write_lock_irqsave(&prv->lock, flags); + + list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); + + write_unlock_irqrestore(&prv->lock, flags); + + return sdom; +} + +static void +csched2_free_domdata(const struct scheduler *ops, void *data) +{ + struct csched2_dom *sdom = data; + struct csched2_private *prv = csched2_priv(ops); + + if ( sdom ) + { + unsigned long flags; + + kill_timer(&sdom->repl_timer); + + write_lock_irqsave(&prv->lock, flags); + list_del_init(&sdom->sdom_elem); + write_unlock_irqrestore(&prv->lock, flags); + + xfree(sdom); + } +} + +static void +csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit *svc = unit->priv; + struct csched2_dom * const sdom = svc->sdom; + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + ASSERT(list_empty(&svc->runq_elem)); + + /* csched2_res_pick() expects the pcpu lock to be held */ + lock = unit_schedule_lock_irq(unit); + + sched_set_res(unit, csched2_res_pick(ops, unit)); + + spin_unlock_irq(lock); + + lock = unit_schedule_lock_irq(unit); + + /* Add unit to runqueue of initial processor */ + runq_assign(ops, unit); + + unit_schedule_unlock_irq(lock, unit); + + sdom->nr_units++; + + SCHED_STAT_CRANK(unit_insert); + + CSCHED2_UNIT_CHECK(unit); +} + +static void +csched2_free_udata(const struct scheduler *ops, void *priv) +{ + struct csched2_unit *svc = priv; + + xfree(svc); +} + +static void +csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + ASSERT(list_empty(&svc->runq_elem)); + + SCHED_STAT_CRANK(unit_remove); + + /* Remove from runqueue */ + lock = unit_schedule_lock_irq(unit); + + runq_deassign(ops, unit); + + 
unit_schedule_unlock_irq(lock, unit); + + svc->sdom->nr_units--; +} + +/* How long should we let this unit run for? */ +static s_time_t +csched2_runtime(const struct scheduler *ops, int cpu, + struct csched2_unit *snext, s_time_t now) +{ + s_time_t time, min_time; + int rt_credit; /* Proposed runtime measured in credits */ + struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); + struct list_head *runq = &rqd->runq; + struct csched2_private *prv = csched2_priv(ops); + + /* + * If we're idle, just stay so. Others (or external events) + * will poke us when necessary. + */ + if ( is_idle_unit(snext->unit) ) + return -1; + + /* General algorithm: + * 1) Run until snext's credit will be 0. + * 2) But if someone is waiting, run until snext's credit is equal + * to his. + * 3) But, if we are capped, never run more than our budget. + * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or + * the ratelimit time. + */ + + /* Calculate mintime */ + min_time = CSCHED2_MIN_TIMER; + if ( prv->ratelimit_us ) + { + s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); + if ( snext->unit->is_running ) + ratelimit_min = snext->unit->state_entry_time + + MICROSECS(prv->ratelimit_us) - now; + if ( ratelimit_min > min_time ) + min_time = ratelimit_min; + } + + /* 1) Run until snext's credit will be 0. */ + rt_credit = snext->credit; + + /* + * 2) If there's someone waiting whose credit is positive, + * run until your credit ~= his. + */ + if ( ! list_empty(runq) ) + { + struct csched2_unit *swait = runq_elem(runq->next); + + if ( ! is_idle_unit(swait->unit) + && swait->credit > 0 ) + { + rt_credit = snext->credit - swait->credit; + } + } + + /* + * The next guy on the runqueue may actually have a higher credit, + * if we've tried to avoid migrating him from a different cpu. + * Setting time=0 will ensure the minimum timeslice is chosen. + * + * FIXME: See if we can eliminate this conversion if we know time + * will be outside (MIN,MAX). Probably requires pre-calculating + * credit values of MIN,MAX per unit, since each unit burns credit + * at a different rate. + */ + if ( rt_credit > 0 ) + time = c2t(rqd, rt_credit, snext); + else + time = 0; + + /* + * 3) But, if capped, never run more than our budget. + */ + if ( has_cap(snext) ) + time = snext->budget < time ? snext->budget : time; + + /* + * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or + * the rate_limit time. + */ + if ( time < min_time ) + { + time = min_time; + SCHED_STAT_CRANK(runtime_min_timer); + } + else if (time > CSCHED2_MAX_TIMER) + { + time = CSCHED2_MAX_TIMER; + SCHED_STAT_CRANK(runtime_max_timer); + } + + return time; +} + +/* + * Find a candidate. + */ +static struct csched2_unit * +runq_candidate(struct csched2_runqueue_data *rqd, + struct csched2_unit *scurr, + int cpu, s_time_t now, + unsigned int *skipped) +{ + struct list_head *iter, *temp; + struct sched_resource *sr = get_sched_res(cpu); + struct csched2_unit *snext = NULL; + struct csched2_private *prv = csched2_priv(sr->scheduler); + bool yield = false, soft_aff_preempt = false; + + *skipped = 0; + + if ( unlikely(is_idle_unit(scurr->unit)) ) + { + snext = scurr; + goto check_runq; + } + + yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags); + + /* + * Return the current unit if it has executed for less than ratelimit. + * Adjuststment for the selected unit's credit and decision + * for how long it will run will be taken in csched2_runtime. + * + * Note that, if scurr is yielding, we don't let rate limiting kick in. 
+ * In fact, it may be the case that scurr is about to spin, and there's + * no point forcing it to do so until rate limiting expires. + */ + if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) && + (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) ) + { + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned runtime; + } d; + d.dom = scurr->unit->domain->domain_id; + d.unit = scurr->unit->unit_id; + d.runtime = now - scurr->unit->state_entry_time; + __trace_var(TRC_CSCHED2_RATELIMIT, 1, + sizeof(d), + (unsigned char *)&d); + } + return scurr; + } + + /* If scurr has a soft-affinity, let's check whether cpu is part of it */ + if ( has_soft_affinity(scurr->unit) ) + { + affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY, + cpumask_scratch); + if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) + { + cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain); + + /* Ok, is any of the pcpus in scurr soft-affinity idle? */ + cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); + cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); + soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); + } + } + + /* + * If scurr is runnable, and this cpu is in its soft-affinity, default to + * it. We also default to it, even if cpu is not in its soft-affinity, if + * there aren't any idle and not tickled cpu in its soft-affinity. In + * fact, we don't want to risk leaving scurr in the runq and this cpu idle + * only because scurr is running outside of its soft-affinity. + * + * On the other hand, if cpu is not in scurr's soft-affinity, and there + * looks to be better options, go for them. That happens by defaulting to + * idle here, which means scurr will be preempted, put back in runq, and + * one of those idle and not tickled cpus from its soft-affinity will be + * tickled to pick it up. + * + * Finally, if scurr does not have a valid soft-affinity, we also let it + * continue to run here (in fact, soft_aff_preempt will still be false, + * in this case). + * + * Of course, we also default to idle also if scurr is not runnable. + */ + if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt ) + snext = scurr; + else + snext = csched2_unit(sched_idle_unit(cpu)); + + check_runq: + list_for_each_safe( iter, temp, &rqd->runq ) + { + struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem); + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* Only consider units that are allowed to run on this processor. */ + if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) ) + { + (*skipped)++; + continue; + } + + /* + * If an unit is meant to be picked up by another processor, and such + * processor has not scheduled yet, leave it in the runqueue for him. + */ + if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && + cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) + { + (*skipped)++; + SCHED_STAT_CRANK(deferred_to_tickled_cpu); + continue; + } + + /* + * If this is on a different processor, don't pull it unless + * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
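The resistance check just described only pulls a unit queued on another pCPU when its credit lead over the locally chosen candidate exceeds a margin, so marginal differences do not make units bounce between caches. Restated compactly (the margin value is a placeholder, not the scheduler's actual constant):

    #include <stdbool.h>
    #include <stdio.h>

    #define MIGRATE_RESIST 500   /* placeholder margin, in credit units */

    /* Pull a remote candidate only if its credit lead beats the margin. */
    static bool worth_pulling(int remote_credit, int local_credit, bool remote)
    {
        if ( !remote )
            return true;                 /* already local: no resistance */
        return remote_credit >= local_credit + MIGRATE_RESIST;
    }

    int main(void)
    {
        printf("%d\n", worth_pulling(10200, 10000, true));  /* 0: lead too small */
        printf("%d\n", worth_pulling(10600, 10000, true));  /* 1: worth pulling  */
        return 0;
    }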
+ */ + if ( sched_unit_master(svc->unit) != cpu + && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) + { + (*skipped)++; + SCHED_STAT_CRANK(migrate_resisted); + continue; + } + + /* + * If the one in the runqueue has more credit than current (or idle, + * if current is not runnable), or if current is yielding, and also + * if the one in runqueue either is not capped, or is capped but has + * some budget, then choose it. + */ + if ( (yield || svc->credit > snext->credit) && + (!has_cap(svc) || unit_grab_budget(svc)) && + unit_runnable_state(svc->unit) ) + snext = svc; + + /* In any case, if we got this far, break. */ + break; + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned unit:16, dom:16; + unsigned tickled_cpu, skipped; + int credit; + } d; + d.dom = snext->unit->domain->domain_id; + d.unit = snext->unit->unit_id; + d.credit = snext->credit; + d.tickled_cpu = snext->tickled_cpu; + d.skipped = *skipped; + __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, + sizeof(d), + (unsigned char *)&d); + } + + if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) + SCHED_STAT_CRANK(tickled_cpu_overridden); + + /* + * If snext is from a capped domain, it must have budget (or it + * wouldn't have been in the runq). If it is not, it'd be STIME_MAX, + * which still is >= 0. + */ + ASSERT(snext->budget >= 0); + + return snext; +} + +/* + * This function is in the critical path. It is designed to be simple and + * fast for the common case. + */ +static void csched2_schedule( + const struct scheduler *ops, struct sched_unit *currunit, s_time_t now, + bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct csched2_runqueue_data *rqd; + struct csched2_unit * const scurr = csched2_unit(currunit); + struct csched2_unit *snext = NULL; + unsigned int skipped_units = 0; + bool tickled; + bool migrated = false; + + SCHED_STAT_CRANK(schedule); + CSCHED2_UNIT_CHECK(currunit); + + BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized)); + + rqd = c2rqd(ops, sched_cpu); + BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active)); + + ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock)); + + BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd); + + /* Clear "tickled" bit now that we've been scheduled */ + tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled); + if ( tickled ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->tickled); + cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); + smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); + } + + if ( unlikely(tb_init_done) ) + { + struct { + unsigned cpu:16, rq_id:16; + unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; + } d; + d.cpu = cur_cpu; + d.rq_id = c2r(sched_cpu); + d.tasklet = tasklet_work_scheduled; + d.idle = is_idle_unit(currunit); + d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle); + d.tickled = tickled; + __trace_var(TRC_CSCHED2_SCHEDULE, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* Update credits (and budget, if necessary). */ + burn_credits(rqd, scurr, now); + + /* + * Below 0, means that we are capped and we have overrun our budget. + * Let's try to get some more but, if we fail (e.g., because of the + * other running units), we will be parked. + */ + if ( unlikely(scurr->budget <= 0) ) + unit_grab_budget(scurr); + + /* + * Select next runnable local UNIT (ie top of local runq). 
+ * + * If the current unit is runnable, and has higher credit than + * the next guy on the queue (or there is noone else), we want to + * run him again. + * + * If there's tasklet work to do, we want to chose the idle unit + * for this processor, and mark the current for delayed runqueue + * add. + * + * If the current unit is runnable, and there's another runnable + * candidate, we want to mark current for delayed runqueue add, + * and remove the next guy from the queue. + * + * If the current unit is not runnable, we want to chose the idle + * unit for this processor. + */ + if ( tasklet_work_scheduled ) + { + __clear_bit(__CSFLAG_unit_yield, &scurr->flags); + trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); + snext = csched2_unit(sched_idle_unit(sched_cpu)); + } + else + snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units); + + /* If switching from a non-idle runnable unit, put it + * back on the runqueue. */ + if ( snext != scurr + && !is_idle_unit(currunit) + && unit_runnable(currunit) ) + __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); + + /* Accounting for non-idle tasks */ + if ( !is_idle_unit(snext->unit) ) + { + /* If switching, remove this from the runqueue and mark it scheduled */ + if ( snext != scurr ) + { + ASSERT(snext->rqd == rqd); + ASSERT(!snext->unit->is_running); + + runq_remove(snext); + __set_bit(__CSFLAG_scheduled, &snext->flags); + } + + /* Clear the idle mask if necessary */ + if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->idle); + smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); + } + + /* + * The reset condition is "has a scheduler epoch come to an end?". + * The way this is enforced is checking whether the unit at the top + * of the runqueue has negative credits. This means the epochs have + * variable length, as in one epoch expores when: + * 1) the unit at the top of the runqueue has executed for + * around 10 ms (with default parameters); + * 2) no other unit with higher credits wants to run. + * + * Here, where we want to check for reset, we need to make sure the + * proper unit is being used. In fact, runqueue_candidate() may have + * not returned the first unit in the runqueue, for various reasons + * (e.g., affinity). Only trigger a reset when it does. + */ + if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) + { + reset_credit(ops, sched_cpu, now, snext); + balance_load(ops, sched_cpu, now); + } + + snext->start_time = now; + snext->tickled_cpu = -1; + + /* Safe because lock for old processor is held */ + if ( sched_unit_master(snext->unit) != sched_cpu ) + { + snext->credit += CSCHED2_MIGRATE_COMPENSATION; + sched_set_res(snext->unit, get_sched_res(sched_cpu)); + SCHED_STAT_CRANK(migrated); + migrated = true; + } + } + else + { + /* + * Update the idle mask if necessary. Note that, if we're scheduling + * idle in order to carry on some tasklet work, we want to play busy! + */ + if ( tasklet_work_scheduled ) + { + if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_clear_cpu(sched_cpu, &rqd->idle); + smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); + } + } + else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) ) + { + __cpumask_set_cpu(sched_cpu, &rqd->idle); + cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); + smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); + } + /* Make sure avgload gets updated periodically even + * if there's no activity */ + update_load(ops, rqd, NULL, 0, now); + } + + /* + * Return task to run next... 
+ */ + currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now); + currunit->next_task = snext->unit; + snext->unit->migrated = migrated; + + CSCHED2_UNIT_CHECK(currunit->next_task); +} + +static void +csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc) +{ + printk("[%i.%i] flags=%x cpu=%i", + svc->unit->domain->domain_id, + svc->unit->unit_id, + svc->flags, + sched_unit_master(svc->unit)); + + printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); + + if ( has_cap(svc) ) + printk(" budget=%"PRI_stime"(%"PRI_stime")", + svc->budget, svc->budget_quota); + + printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, + (svc->avgload * 100) >> prv->load_precision_shift); + + printk("\n"); +} + +static inline void +dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct csched2_private *prv = csched2_priv(ops); + struct csched2_unit *svc; + + printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n", + cpu, c2r(cpu), + CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + + /* current UNIT (nothing to say if that's the idle unit) */ + svc = csched2_unit(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + printk("\trun: "); + csched2_dump_unit(prv, svc); + } +} + +static void +csched2_dump(const struct scheduler *ops) +{ + struct list_head *iter_sdom; + struct csched2_private *prv = csched2_priv(ops); + unsigned long flags; + unsigned int i, j, loop; + + /* + * We need the private scheduler lock as we access global + * scheduler data and (below) the list of active domains. + */ + read_lock_irqsave(&prv->lock, flags); + + printk("Active queues: %d\n" + "\tdefault-weight = %d\n", + cpumask_weight(&prv->active_queues), + CSCHED2_DEFAULT_WEIGHT); + for_each_cpu(i, &prv->active_queues) + { + s_time_t fraction; + + fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift; + + printk("Runqueue %d:\n" + "\tncpus = %u\n" + "\tcpus = %*pbl\n" + "\tmax_weight = %u\n" + "\tpick_bias = %u\n" + "\tinstload = %d\n" + "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", + i, + prv->rqd[i].nr_cpus, + CPUMASK_PR(&prv->rqd[i].active), + prv->rqd[i].max_weight, + prv->rqd[i].pick_bias, + prv->rqd[i].load, + prv->rqd[i].avgload, + fraction); + + printk("\tidlers: %*pb\n" + "\ttickled: %*pb\n" + "\tfully idle cores: %*pb\n", + CPUMASK_PR(&prv->rqd[i].idle), + CPUMASK_PR(&prv->rqd[i].tickled), + CPUMASK_PR(&prv->rqd[i].smt_idle)); + } + + printk("Domain info:\n"); + loop = 0; + list_for_each( iter_sdom, &prv->sdom ) + { + struct csched2_dom *sdom; + struct sched_unit *unit; + + sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); + + printk("\tDomain: %d w %d c %u v %d\n", + sdom->dom->domain_id, + sdom->weight, + sdom->cap, + sdom->nr_units); + + for_each_sched_unit ( sdom->dom, unit ) + { + struct csched2_unit * const svc = csched2_unit(unit); + spinlock_t *lock; + + lock = unit_schedule_lock(unit); + + printk("\t%3d: ", ++loop); + csched2_dump_unit(prv, svc); + + unit_schedule_unlock(lock, unit); + } + } + + for_each_cpu(i, &prv->active_queues) + { + struct csched2_runqueue_data *rqd = prv->rqd + i; + struct list_head *iter, *runq = &rqd->runq; + int loop = 0; + + /* We need the lock to scan the runqueue. 
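csched2_dump_unit() above turns the fixed-point average load into a percentage with (avgload * 100) >> load_precision_shift. With the same illustrative shift of 18 used earlier, the 50% starting load given to a freshly allocated unit prints exactly as expected:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const unsigned int shift = 18;              /* illustrative value    */
        uint64_t avgload = 1ULL << (shift - 1);     /* the 50% starting load */

        /* (131072 * 100) >> 18 == 50 */
        printf("avgload=%llu -> ~%llu%%\n",
               (unsigned long long)avgload,
               (unsigned long long)((avgload * 100) >> shift));
        return 0;
    }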
*/ + spin_lock(&rqd->lock); + + printk("Runqueue %d:\n", i); + + for_each_cpu(j, &rqd->active) + dump_pcpu(ops, j); + + printk("RUNQ:\n"); + list_for_each( iter, runq ) + { + struct csched2_unit *svc = runq_elem(iter); + + if ( svc ) + { + printk("\t%3d: ", loop++); + csched2_dump_unit(prv, svc); + } + } + spin_unlock(&rqd->lock); + } + + read_unlock_irqrestore(&prv->lock, flags); +} + +static void * +csched2_alloc_pdata(const struct scheduler *ops, int cpu) +{ + struct csched2_pcpu *spc; + + spc = xzalloc(struct csched2_pcpu); + if ( spc == NULL ) + return ERR_PTR(-ENOMEM); + + /* Not in any runqueue yet */ + spc->runq_id = -1; + + return spc; +} + +/* Returns the ID of the runqueue the cpu is assigned to. */ +static unsigned +init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc, + unsigned int cpu) +{ + struct csched2_runqueue_data *rqd; + unsigned int rcpu; + + ASSERT(rw_is_write_locked(&prv->lock)); + ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); + /* CPU data needs to be allocated, but still uninitialized. */ + ASSERT(spc && spc->runq_id == -1); + + /* Figure out which runqueue to put it in */ + spc->runq_id = cpu_to_runqueue(prv, cpu); + + rqd = prv->rqd + spc->runq_id; + + printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id); + if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) ) + { + printk(XENLOG_INFO " First cpu on runqueue, activating\n"); + activate_runqueue(prv, spc->runq_id); + } + + __cpumask_set_cpu(cpu, &spc->sibling_mask); + + if ( rqd->nr_cpus > 0 ) + for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) ) + if ( cpumask_test_cpu(rcpu, &rqd->active) ) + { + __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); + __cpumask_set_cpu(rcpu, &spc->sibling_mask); + } + + __cpumask_set_cpu(cpu, &rqd->idle); + __cpumask_set_cpu(cpu, &rqd->active); + __cpumask_set_cpu(cpu, &prv->initialized); + __cpumask_set_cpu(cpu, &rqd->smt_idle); + + rqd->nr_cpus++; + ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); + + if ( rqd->nr_cpus == 1 ) + rqd->pick_bias = cpu; + + return spc->runq_id; +} + +static void +csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct csched2_private *prv = csched2_priv(ops); + spinlock_t *old_lock; + unsigned long flags; + unsigned rqi; + + write_lock_irqsave(&prv->lock, flags); + old_lock = pcpu_schedule_lock(cpu); + + rqi = init_pdata(prv, pdata, cpu); + /* Move the scheduler lock to the new runq lock. */ + get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock; + + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ + spin_unlock(old_lock); + write_unlock_irqrestore(&prv->lock, flags); +} + +/* Change the scheduler of cpu to us (Credit2). */ +static spinlock_t * +csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct csched2_private *prv = csched2_priv(new_ops); + struct csched2_unit *svc = vdata; + unsigned rqi; + + ASSERT(pdata && svc && is_idle_unit(svc->unit)); + + /* + * We own one runqueue lock already (from schedule_cpu_switch()). This + * looks like it violates this scheduler's locking rules, but it does + * not, as what we own is the lock of another scheduler, that hence has + * no particular (ordering) relationship with our private global lock. + * And owning exactly that one (the lock of the old scheduler of this + * cpu) is what is necessary to prevent races. 
+ */ + ASSERT(!local_irq_is_enabled()); + write_lock(&prv->lock); + + sched_idle_unit(cpu)->priv = vdata; + + rqi = init_pdata(prv, pdata, cpu); + + /* + * Now that we know what runqueue we'll go in, double check what's said + * above: the lock we already hold is not the one of this runqueue of + * this scheduler, and so it's safe to have taken it /before/ our + * private global lock. + */ + ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock); + + write_unlock(&prv->lock); + + return &prv->rqd[rqi].lock; +} + +static void +csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + unsigned long flags; + struct csched2_private *prv = csched2_priv(ops); + struct csched2_runqueue_data *rqd; + struct csched2_pcpu *spc = pcpu; + unsigned int rcpu; + + write_lock_irqsave(&prv->lock, flags); + + /* + * alloc_pdata is not implemented, so pcpu must be NULL. On the other + * hand, init_pdata must have been called for this pCPU. + */ + /* + * Scheduler specific data for this pCPU must still be there and and be + * valid. In fact, if we are here: + * 1. alloc_pdata must have been called for this cpu, and free_pdata + * must not have been called on it before us, + * 2. init_pdata must have been called on this cpu, and deinit_pdata + * (us!) must not have been called on it already. + */ + ASSERT(spc && spc->runq_id != -1); + ASSERT(cpumask_test_cpu(cpu, &prv->initialized)); + + /* Find the old runqueue and remove this cpu from it */ + rqd = prv->rqd + spc->runq_id; + + /* No need to save IRQs here, they're already disabled */ + spin_lock(&rqd->lock); + + printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id); + + __cpumask_clear_cpu(cpu, &rqd->idle); + __cpumask_clear_cpu(cpu, &rqd->smt_idle); + __cpumask_clear_cpu(cpu, &rqd->active); + + for_each_cpu ( rcpu, &rqd->active ) + __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); + + rqd->nr_cpus--; + ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); + + if ( rqd->nr_cpus == 0 ) + { + printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); + deactivate_runqueue(prv, spc->runq_id); + } + else if ( rqd->pick_bias == cpu ) + rqd->pick_bias = cpumask_first(&rqd->active); + + spc->runq_id = -1; + + spin_unlock(&rqd->lock); + + __cpumask_clear_cpu(cpu, &prv->initialized); + + write_unlock_irqrestore(&prv->lock, flags); + + return; +} + +static void +csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct csched2_pcpu *spc = pcpu; + + /* + * pcpu either points to a valid struct csched2_pcpu, or is NULL (if + * CPU bringup failed, and we're beeing called from CPU_UP_CANCELLED). + * xfree() does not really mind, but we want to be sure that either + * init_pdata has never been called, or deinit_pdata has been called + * already. 
+ */ + ASSERT(!pcpu || spc->runq_id == -1); + ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized)); + + xfree(pcpu); +} + +static int __init +csched2_global_init(void) +{ + if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) + { + printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n", + __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); + opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; + } + + if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) + { + printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n", + __func__, opt_load_window_shift); + opt_load_window_shift = LOADAVG_WINDOW_SHIFT; + } + + if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) + { + printk("WARNING: %s: opt_cap_period %u too small, resetting\n", + __func__, opt_cap_period); + opt_cap_period = 10; /* ms */ + } + + return 0; +} + +static int +csched2_init(struct scheduler *ops) +{ + int i; + struct csched2_private *prv; + + printk("Initializing Credit2 scheduler\n"); + + printk(XENLOG_INFO " load_precision_shift: %d\n" + XENLOG_INFO " load_window_shift: %d\n" + XENLOG_INFO " underload_balance_tolerance: %d\n" + XENLOG_INFO " overload_balance_tolerance: %d\n" + XENLOG_INFO " runqueues arrangement: %s\n" + XENLOG_INFO " cap enforcement granularity: %dms\n", + opt_load_precision_shift, + opt_load_window_shift, + opt_underload_balance_tolerance, + opt_overload_balance_tolerance, + opt_runqueue_str[opt_runqueue], + opt_cap_period); + + printk(XENLOG_INFO "load tracking window length %llu ns\n", + 1ULL << opt_load_window_shift); + + /* + * Basically no CPU information is available at this point; just + * set up basic structures, and a callback when the CPU info is + * available. + */ + + prv = xzalloc(struct csched2_private); + if ( prv == NULL ) + return -ENOMEM; + ops->sched_data = prv; + + rwlock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); + + /* Allocate all runqueues and mark them as un-initialized */ + prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids); + if ( !prv->rqd ) + { + xfree(prv); + return -ENOMEM; + } + for ( i = 0; i < nr_cpu_ids; i++ ) + prv->rqd[i].id = -1; + + /* initialize ratelimit */ + prv->ratelimit_us = sched_ratelimit_us; + + prv->load_precision_shift = opt_load_precision_shift; + prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT; + ASSERT(opt_load_window_shift > 0); + + return 0; +} + +static void +csched2_deinit(struct scheduler *ops) +{ + struct csched2_private *prv; + + prv = csched2_priv(ops); + ops->sched_data = NULL; + if ( prv ) + xfree(prv->rqd); + xfree(prv); +} + +static const struct scheduler sched_credit2_def = { + .name = "SMP Credit Scheduler rev2", + .opt_name = "credit2", + .sched_id = XEN_SCHEDULER_CREDIT2, + .sched_data = NULL, + + .global_init = csched2_global_init, + + .insert_unit = csched2_unit_insert, + .remove_unit = csched2_unit_remove, + + .sleep = csched2_unit_sleep, + .wake = csched2_unit_wake, + .yield = csched2_unit_yield, + + .adjust = csched2_dom_cntl, + .adjust_affinity= csched2_aff_cntl, + .adjust_global = csched2_sys_cntl, + + .pick_resource = csched2_res_pick, + .migrate = csched2_unit_migrate, + .do_schedule = csched2_schedule, + .context_saved = csched2_context_saved, + + .dump_settings = csched2_dump, + .init = csched2_init, + .deinit = csched2_deinit, + .alloc_udata = csched2_alloc_udata, + .free_udata = csched2_free_udata, + .alloc_pdata = csched2_alloc_pdata, + .init_pdata = csched2_init_pdata, + .deinit_pdata = 
csched2_deinit_pdata, + .free_pdata = csched2_free_pdata, + .switch_sched = csched2_switch_sched, + .alloc_domdata = csched2_alloc_domdata, + .free_domdata = csched2_free_domdata, +}; + +REGISTER_SCHEDULER(sched_credit2_def); diff --git a/xen/common/sched/null.c b/xen/common/sched/null.c new file mode 100644 index 0000000000..3f3418c9b1 --- /dev/null +++ b/xen/common/sched/null.c @@ -0,0 +1,1034 @@ +/* + * xen/common/sched_null.c + * + * Copyright (c) 2017, Dario Faggioli, Citrix Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; If not, see . + */ + +/* + * The 'null' scheduler always choose to run, on each pCPU, either nothing + * (i.e., the pCPU stays idle) or always the same unit. + * + * It is aimed at supporting static scenarios, where there always are + * less units than pCPUs (and the units don't need to move among pCPUs + * for any reason) with the least possible overhead. + * + * Typical usecase are embedded applications, but also HPC, especially + * if the scheduler is used inside a cpupool. + */ + +#include +#include +#include +#include + +/* + * null tracing events. Check include/public/trace.h for more details. + */ +#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1) +#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2) +#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3) +#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4) +#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5) +#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6) + +/* + * Locking: + * - Scheduler-lock (a.k.a. runqueue lock): + * + is per-pCPU; + * + serializes assignment and deassignment of units to a pCPU. + * - Private data lock (a.k.a. private scheduler lock): + * + is scheduler-wide; + * + serializes accesses to the list of domains in this scheduler. + * - Waitqueue lock: + * + is scheduler-wide; + * + serialize accesses to the list of units waiting to be assigned + * to pCPUs. + * + * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH, + * waitqueue lock nests inside runqueue lock which nests inside private + * lock. More specifically: + * + if we need both runqueue and private locks, we must acquire the + * private lock for first; + * + if we need both runqueue and waitqueue locks, we must acquire + * the runqueue lock for first; + * + if we need both private and waitqueue locks, we must acquire + * the private lock for first; + * + if we already own a runqueue lock, we must never acquire + * the private lock; + * + if we already own the waitqueue lock, we must never acquire + * the runqueue lock or the private lock. 
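The rules above reduce to a single ordering: private lock outermost, then a runqueue lock, then the waitqueue lock, and never the reverse. A self-contained sketch of the only legal full nesting under that discipline, with pthread mutexes standing in for the scheduler's spinlocks:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t priv_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t runq_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t waitq_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Legal: private outermost, then runqueue, waitqueue innermost. */
    static void full_nesting(void)
    {
        pthread_mutex_lock(&priv_lock);
        pthread_mutex_lock(&runq_lock);
        pthread_mutex_lock(&waitq_lock);

        /* ... e.g. move a waiting unit onto a freed pCPU ... */

        pthread_mutex_unlock(&waitq_lock);
        pthread_mutex_unlock(&runq_lock);
        pthread_mutex_unlock(&priv_lock);
    }

    /* Paths that already hold an inner lock and need an outer one must
     * either drop the inner lock first or use trylock and back off. */

    int main(void)
    {
        full_nesting();
        printf("lock ordering respected\n");
        return 0;
    }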
+ */ + +/* + * System-wide private data + */ +struct null_private { + spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */ + struct list_head ndom; /* Domains of this scheduler */ + struct list_head waitq; /* units not assigned to any pCPU */ + spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */ + cpumask_t cpus_free; /* CPUs without a unit associated to them */ +}; + +/* + * Physical CPU + */ +struct null_pcpu { + struct sched_unit *unit; +}; +DEFINE_PER_CPU(struct null_pcpu, npc); + +/* + * Schedule unit + */ +struct null_unit { + struct list_head waitq_elem; + struct sched_unit *unit; +}; + +/* + * Domain + */ +struct null_dom { + struct list_head ndom_elem; + struct domain *dom; +}; + +/* + * Accessor helpers functions + */ +static inline struct null_private *null_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct null_unit *null_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline bool unit_check_affinity(struct sched_unit *unit, + unsigned int cpu, + unsigned int balance_step) +{ + affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + cpupool_domain_master_cpumask(unit->domain)); + + return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu)); +} + +static int null_init(struct scheduler *ops) +{ + struct null_private *prv; + + printk("Initializing null scheduler\n" + "WARNING: This is experimental software in development.\n" + "Use at your own risk.\n"); + + prv = xzalloc(struct null_private); + if ( prv == NULL ) + return -ENOMEM; + + spin_lock_init(&prv->lock); + spin_lock_init(&prv->waitq_lock); + INIT_LIST_HEAD(&prv->ndom); + INIT_LIST_HEAD(&prv->waitq); + + ops->sched_data = prv; + + return 0; +} + +static void null_deinit(struct scheduler *ops) +{ + xfree(ops->sched_data); + ops->sched_data = NULL; +} + +static void init_pdata(struct null_private *prv, unsigned int cpu) +{ + /* Mark the pCPU as free, and with no unit assigned */ + cpumask_set_cpu(cpu, &prv->cpus_free); + per_cpu(npc, cpu).unit = NULL; +} + +static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct null_private *prv = null_priv(ops); + + /* alloc_pdata is not implemented, so we want this to be NULL. 
*/ + ASSERT(!pdata); + + init_pdata(prv, cpu); +} + +static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + struct null_private *prv = null_priv(ops); + + /* alloc_pdata not implemented, so this must have stayed NULL */ + ASSERT(!pcpu); + + cpumask_clear_cpu(cpu, &prv->cpus_free); + per_cpu(npc, cpu).unit = NULL; +} + +static void *null_alloc_udata(const struct scheduler *ops, + struct sched_unit *unit, void *dd) +{ + struct null_unit *nvc; + + nvc = xzalloc(struct null_unit); + if ( nvc == NULL ) + return NULL; + + INIT_LIST_HEAD(&nvc->waitq_elem); + nvc->unit = unit; + + SCHED_STAT_CRANK(unit_alloc); + + return nvc; +} + +static void null_free_udata(const struct scheduler *ops, void *priv) +{ + struct null_unit *nvc = priv; + + xfree(nvc); +} + +static void * null_alloc_domdata(const struct scheduler *ops, + struct domain *d) +{ + struct null_private *prv = null_priv(ops); + struct null_dom *ndom; + unsigned long flags; + + ndom = xzalloc(struct null_dom); + if ( ndom == NULL ) + return ERR_PTR(-ENOMEM); + + ndom->dom = d; + + spin_lock_irqsave(&prv->lock, flags); + list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom); + spin_unlock_irqrestore(&prv->lock, flags); + + return ndom; +} + +static void null_free_domdata(const struct scheduler *ops, void *data) +{ + struct null_dom *ndom = data; + struct null_private *prv = null_priv(ops); + + if ( ndom ) + { + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + list_del_init(&ndom->ndom_elem); + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(ndom); + } +} + +/* + * unit to pCPU assignment and placement. This _only_ happens: + * - on insert, + * - on migrate. + * + * Insert occurs when a unit joins this scheduler for the first time + * (e.g., when the domain it's part of is moved to the scheduler's + * cpupool). + * + * Migration may be necessary if a pCPU (with a unit assigned to it) + * is removed from the scheduler's cpupool. + * + * So this is not part of any hot path. + */ +static struct sched_resource * +pick_res(struct null_private *prv, const struct sched_unit *unit) +{ + unsigned int bs; + unsigned int cpu = sched_unit_master(unit), new_cpu; + cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain); + + ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); + + for_each_affinity_balance_step( bs ) + { + if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) + continue; + + affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus); + + /* + * If our processor is free, or we are assigned to it, and it is also + * still valid and part of our affinity, just go for it. + * (Note that we may call unit_check_affinity(), but we deliberately + * don't, so we get to keep in the scratch cpumask what we have just + * put in it.) + */ + if ( likely((per_cpu(npc, cpu).unit == NULL || + per_cpu(npc, cpu).unit == unit) + && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) + { + new_cpu = cpu; + goto out; + } + + /* If not, just go for a free pCPU, within our affinity, if any */ + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->cpus_free); + new_cpu = cpumask_first(cpumask_scratch_cpu(cpu)); + + if ( likely(new_cpu != nr_cpu_ids) ) + goto out; + } + + /* + * If we didn't find any free pCPU, just pick any valid pcpu, even if + * it has another unit assigned. 
This will happen during shutdown and + * suspend/resume, but it may also happen during "normal operation", if + * all the pCPUs are busy. + * + * In fact, there must always be something sane in v->processor, or + * unit_schedule_lock() and friends won't work. This is not a problem, + * as we will actually assign the unit to the pCPU we return from here, + * only if the pCPU is free. + */ + cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity); + new_cpu = cpumask_any(cpumask_scratch_cpu(cpu)); + + out: + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t new_cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.new_cpu = new_cpu; + __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d); + } + + return get_sched_res(new_cpu); +} + +static void unit_assign(struct null_private *prv, struct sched_unit *unit, + unsigned int cpu) +{ + ASSERT(is_unit_online(unit)); + + per_cpu(npc, cpu).unit = unit; + sched_set_res(unit, get_sched_res(cpu)); + cpumask_clear_cpu(cpu, &prv->cpus_free); + + dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = cpu; + __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d); + } +} + +/* Returns true if a cpu was tickled */ +static bool unit_deassign(struct null_private *prv, struct sched_unit *unit) +{ + unsigned int bs; + unsigned int cpu = sched_unit_master(unit); + struct null_unit *wvc; + + ASSERT(list_empty(&null_unit(unit)->waitq_elem)); + ASSERT(per_cpu(npc, cpu).unit == unit); + ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free)); + + per_cpu(npc, cpu).unit = NULL; + cpumask_set_cpu(cpu, &prv->cpus_free); + + dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain, + unit->unit_id); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint32_t cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = cpu; + __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d); + } + + spin_lock(&prv->waitq_lock); + + /* + * If unit is assigned to a pCPU, let's see if there is someone waiting, + * suitable to be assigned to it (prioritizing units that have + * soft-affinity with cpu). + */ + for_each_affinity_balance_step( bs ) + { + list_for_each_entry( wvc, &prv->waitq, waitq_elem ) + { + if ( bs == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(wvc->unit) ) + continue; + + if ( unit_check_affinity(wvc->unit, cpu, bs) ) + { + list_del_init(&wvc->waitq_elem); + unit_assign(prv, wvc->unit, cpu); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + spin_unlock(&prv->waitq_lock); + return true; + } + } + } + spin_unlock(&prv->waitq_lock); + + return false; +} + +/* Change the scheduler of cpu to us (null). */ +static spinlock_t *null_switch_sched(struct scheduler *new_ops, + unsigned int cpu, + void *pdata, void *vdata) +{ + struct sched_resource *sr = get_sched_res(cpu); + struct null_private *prv = null_priv(new_ops); + struct null_unit *nvc = vdata; + + ASSERT(nvc && is_idle_unit(nvc->unit)); + + sched_idle_unit(cpu)->priv = vdata; + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It actually may or may not be the 'right' + * one for this cpu, but that is ok for preventing races. 
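+     * (The &sr->_lock we return below is what the common code will install
+     * as this cpu's schedule_lock once the switch is complete.)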
+ */ + ASSERT(!local_irq_is_enabled()); + + init_pdata(prv, cpu); + + return &sr->_lock; +} + +static void null_unit_insert(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + unsigned int cpu; + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + + lock = unit_schedule_lock_irq(unit); + + if ( unlikely(!is_unit_online(unit)) ) + { + unit_schedule_unlock_irq(lock, unit); + return; + } + + retry: + sched_set_res(unit, pick_res(prv, unit)); + cpu = sched_unit_master(unit); + + spin_unlock(lock); + + lock = unit_schedule_lock(unit); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + + /* If the pCPU is free, we assign unit to it */ + if ( likely(per_cpu(npc, cpu).unit == NULL) ) + { + /* + * Insert is followed by vcpu_wake(), so there's no need to poke + * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that. + */ + unit_assign(prv, unit, cpu); + } + else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) ) + { + /* + * If the pCPU is not free (e.g., because we raced with another + * insert or a migrate), but there are other free pCPUs, we can + * try to pick again. + */ + goto retry; + } + else + { + /* + * If the pCPU is not free, and there aren't any (valid) others, + * we have no alternatives than to go into the waitqueue. + */ + spin_lock(&prv->waitq_lock); + list_add_tail(&nvc->waitq_elem, &prv->waitq); + dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n", + unit->domain, unit->unit_id); + spin_unlock(&prv->waitq_lock); + } + spin_unlock_irq(lock); + + SCHED_STAT_CRANK(unit_insert); +} + +static void null_unit_remove(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + spinlock_t *lock; + + ASSERT(!is_idle_unit(unit)); + + lock = unit_schedule_lock_irq(unit); + + /* If offline, the unit shouldn't be assigned, nor in the waitqueue */ + if ( unlikely(!is_unit_online(unit)) ) + { + ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit); + ASSERT(list_empty(&nvc->waitq_elem)); + goto out; + } + + /* If unit is in waitqueue, just get it out of there and bail */ + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + + goto out; + } + + unit_deassign(prv, unit); + + out: + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_remove); +} + +static void null_unit_wake(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + unsigned int cpu = sched_unit_master(unit); + + ASSERT(!is_idle_unit(unit)); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + /* Not exactly "on runq", but close enough for reusing the counter */ + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + if ( likely(per_cpu(npc, cpu).unit == unit) ) + { + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + return; + } + + /* + * If a unit is neither on a pCPU nor in the waitqueue, it means it was + * offline, and that it is now coming back being online. 
If we're lucky, + * and its previous resource is free (and affinities match), we can just + * assign the unit to it (we own the proper lock already) and be done. + */ + if ( per_cpu(npc, cpu).unit == NULL && + unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) ) + { + if ( !has_soft_affinity(unit) || + unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) ) + { + unit_assign(prv, unit, cpu); + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + return; + } + } + + /* + * If the resource is not free (or affinities do not match) we need + * to assign unit to some other one, but we can't do it here, as: + * - we don't own the proper lock, + * - we can't change v->processor under vcpu_wake()'s feet. + * So we add it to the waitqueue, and tickle all the free CPUs (if any) + * on which unit can run. The first one that schedules will pick it up. + */ + spin_lock(&prv->waitq_lock); + list_add_tail(&nvc->waitq_elem, &prv->waitq); + spin_unlock(&prv->waitq_lock); + + cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, + cpupool_domain_master_cpumask(unit->domain)); + cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), + &prv->cpus_free); + + if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) + dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", + unit->domain->domain_id, unit->unit_id); + else + cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ); +} + +static void null_unit_sleep(const struct scheduler *ops, + struct sched_unit *unit) +{ + struct null_private *prv = null_priv(ops); + unsigned int cpu = sched_unit_master(unit); + bool tickled = false; + + ASSERT(!is_idle_unit(unit)); + + /* + * Check if the unit is in the process of being offlined. If yes, + * we need to remove it from either its pCPU or the waitqueue. + */ + if ( unlikely(!is_unit_online(unit)) ) + { + struct null_unit *nvc = null_unit(unit); + + if ( unlikely(!list_empty(&nvc->waitq_elem)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + } + else if ( per_cpu(npc, cpu).unit == unit ) + tickled = unit_deassign(prv, unit); + } + + /* If unit is not assigned to a pCPU, or is not running, no need to bother */ + if ( likely(!tickled && curr_on_cpu(cpu) == unit) ) + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); + + SCHED_STAT_CRANK(unit_sleep); +} + +static struct sched_resource * +null_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + ASSERT(!is_idle_unit(unit)); + return pick_res(null_priv(ops), unit); +} + +static void null_unit_migrate(const struct scheduler *ops, + struct sched_unit *unit, unsigned int new_cpu) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc = null_unit(unit); + + ASSERT(!is_idle_unit(unit)); + + if ( sched_unit_master(unit) == new_cpu ) + return; + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t unit, dom; + uint16_t cpu, new_cpu; + } d; + d.dom = unit->domain->domain_id; + d.unit = unit->unit_id; + d.cpu = sched_unit_master(unit); + d.new_cpu = new_cpu; + __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d); + } + + /* + * If unit is assigned to a pCPU, then such pCPU becomes free, and we + * should look in the waitqueue if anyone else can be assigned to it. 
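+     * (unit_deassign() below does exactly that: it scans the waitqueue and
+     * tickles the now-free cpu if it finds a suitable unit for it.)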
+ */ + if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) ) + { + unit_deassign(prv, unit); + SCHED_STAT_CRANK(migrate_running); + } + else if ( !list_empty(&nvc->waitq_elem) ) + SCHED_STAT_CRANK(migrate_on_runq); + + SCHED_STAT_CRANK(migrated); + + /* + * If a unit is (going) offline, we want it to be neither assigned + * to a pCPU, nor in the waitqueue. + * + * If it was on a cpu, we've removed it from there above. If it is + * in the waitqueue, we remove it from there now. And then we bail. + */ + if ( unlikely(!is_unit_online(unit)) ) + { + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + goto out; + } + + /* + * Let's now consider new_cpu, which is where unit is being sent. It can be + * either free, or have a unit already assigned to it. + * + * In the former case we should assign unit to it, and try to get it to run, + * if possible, according to affinity. + * + * In latter, all we can do is to park unit in the waitqueue. + */ + if ( per_cpu(npc, new_cpu).unit == NULL && + unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) ) + { + /* unit might have been in the waitqueue, so remove it */ + spin_lock(&prv->waitq_lock); + list_del_init(&nvc->waitq_elem); + spin_unlock(&prv->waitq_lock); + + unit_assign(prv, unit, new_cpu); + } + else + { + /* Put unit in the waitqueue, if it wasn't there already */ + spin_lock(&prv->waitq_lock); + if ( list_empty(&nvc->waitq_elem) ) + { + list_add_tail(&nvc->waitq_elem, &prv->waitq); + dprintk(XENLOG_G_WARNING, + "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain, + unit->unit_id); + } + spin_unlock(&prv->waitq_lock); + } + + /* + * Whatever all the above, we always at least override v->processor. + * This is especially important for shutdown or suspend/resume paths, + * when it is important to let our caller (cpu_disable_scheduler()) + * know that the migration did happen, to the best of our possibilities, + * at least. In case of suspend, any temporary inconsistency caused + * by this, will be fixed-up during resume. + */ + out: + sched_set_res(unit, get_sched_res(new_cpu)); +} + +#ifndef NDEBUG +static inline void null_unit_check(struct sched_unit *unit) +{ + struct null_unit * const nvc = null_unit(unit); + struct null_dom * const ndom = unit->domain->sched_priv; + + BUG_ON(nvc->unit != unit); + + if ( ndom ) + BUG_ON(is_idle_unit(unit)); + else + BUG_ON(!is_idle_unit(unit)); + + SCHED_STAT_CRANK(unit_check); +} +#define NULL_UNIT_CHECK(unit) (null_unit_check(unit)) +#else +#define NULL_UNIT_CHECK(unit) +#endif + + +/* + * The most simple scheduling function of all times! We either return: + * - the unit assigned to the pCPU, if there's one and it can run; + * - the idle unit, otherwise. 
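+ * (In code terms, tasklet work and the waitqueue scan aside: prev->next_task
+ * is per_cpu(npc, sched_cpu).unit when that is set and runnable, and
+ * sched_idle_unit(sched_cpu) otherwise.)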
+ */ +static void null_schedule(const struct scheduler *ops, struct sched_unit *prev, + s_time_t now, bool tasklet_work_scheduled) +{ + unsigned int bs; + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct null_private *prv = null_priv(ops); + struct null_unit *wvc; + + SCHED_STAT_CRANK(schedule); + NULL_UNIT_CHECK(current->sched_unit); + + if ( unlikely(tb_init_done) ) + { + struct { + uint16_t tasklet, cpu; + int16_t unit, dom; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + if ( per_cpu(npc, sched_cpu).unit == NULL ) + { + d.unit = d.dom = -1; + } + else + { + d.unit = per_cpu(npc, sched_cpu).unit->unit_id; + d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id; + } + __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d); + } + + if ( tasklet_work_scheduled ) + { + trace_var(TRC_SNULL_TASKLET, 1, 0, NULL); + prev->next_task = sched_idle_unit(sched_cpu); + } + else + prev->next_task = per_cpu(npc, sched_cpu).unit; + prev->next_time = -1; + + /* + * We may be new in the cpupool, or just coming back online. In which + * case, there may be units in the waitqueue that we can assign to us + * and run. + */ + if ( unlikely(prev->next_task == NULL) ) + { + bool unit_found; + + spin_lock(&prv->waitq_lock); + + if ( list_empty(&prv->waitq) ) + goto unlock; + + /* + * We scan the waitqueue twice, for prioritizing units that have + * soft-affinity with cpu. This may look like something expensive to + * do here in null_schedule(), but it's actually fine, because we do + * it only in cases where a pcpu has no unit associated (e.g., as + * said above, the cpu has just joined a cpupool). + */ + unit_found = false; + for_each_affinity_balance_step( bs ) + { + list_for_each_entry( wvc, &prv->waitq, waitq_elem ) + { + if ( bs == BALANCE_SOFT_AFFINITY && + !has_soft_affinity(wvc->unit) ) + continue; + + if ( unit_check_affinity(wvc->unit, sched_cpu, bs) ) + { + spinlock_t *lock; + + unit_found = true; + + /* + * If the unit in the waitqueue has just come up online, + * we risk racing with vcpu_wake(). To avoid this, sync + * on the spinlock that vcpu_wake() holds, but only with + * trylock, to avoid deadlock). + */ + lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit)); + + /* + * We know the vcpu's lock is not this resource's lock. In + * fact, if it were, since this cpu is free, vcpu_wake() + * would have assigned the unit to here directly. + */ + ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock); + + if ( lock ) { + unit_assign(prv, wvc->unit, sched_cpu); + list_del_init(&wvc->waitq_elem); + prev->next_task = wvc->unit; + spin_unlock(lock); + goto unlock; + } + } + } + } + /* + * If we did find a unit with suitable affinity in the waitqueue, but + * we could not pick it up (due to lock contention), and hence we are + * still free, plan for another try. In fact, we don't want such unit + * to be stuck in the waitqueue, when there are free cpus where it + * could run. 
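+     * (Raising SCHEDULE_SOFTIRQ on ourselves simply gets null_schedule()
+     * invoked again on this cpu, so we can retry grabbing that unit's lock.)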
+ */ + if ( unlikely( unit_found && prev->next_task == NULL && + !list_empty(&prv->waitq)) ) + cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ); + unlock: + spin_unlock(&prv->waitq_lock); + + if ( prev->next_task == NULL && + !cpumask_test_cpu(sched_cpu, &prv->cpus_free) ) + cpumask_set_cpu(sched_cpu, &prv->cpus_free); + } + + if ( unlikely(prev->next_task == NULL || + !unit_runnable_state(prev->next_task)) ) + prev->next_task = sched_idle_unit(sched_cpu); + + NULL_UNIT_CHECK(prev->next_task); + + prev->next_task->migrated = false; +} + +static inline void dump_unit(struct null_private *prv, struct null_unit *nvc) +{ + printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id, + nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ? + sched_unit_master(nvc->unit) : -1); +} + +static void null_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct null_private *prv = null_priv(ops); + struct null_unit *nvc; + spinlock_t *lock; + unsigned long flags; + + lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}", + cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), + CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); + if ( per_cpu(npc, cpu).unit != NULL ) + printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain, + per_cpu(npc, cpu).unit->unit_id); + printk("\n"); + + /* current unit (nothing to say if that's the idle unit) */ + nvc = null_unit(curr_on_cpu(cpu)); + if ( nvc && !is_idle_unit(nvc->unit) ) + { + printk("\trun: "); + dump_unit(prv, nvc); + printk("\n"); + } + + pcpu_schedule_unlock_irqrestore(lock, flags, cpu); +} + +static void null_dump(const struct scheduler *ops) +{ + struct null_private *prv = null_priv(ops); + struct list_head *iter; + unsigned long flags; + unsigned int loop; + + spin_lock_irqsave(&prv->lock, flags); + + printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free)); + + printk("Domain info:\n"); + loop = 0; + list_for_each( iter, &prv->ndom ) + { + struct null_dom *ndom; + struct sched_unit *unit; + + ndom = list_entry(iter, struct null_dom, ndom_elem); + + printk("\tDomain: %d\n", ndom->dom->domain_id); + for_each_sched_unit( ndom->dom, unit ) + { + struct null_unit * const nvc = null_unit(unit); + spinlock_t *lock; + + lock = unit_schedule_lock(unit); + + printk("\t%3d: ", ++loop); + dump_unit(prv, nvc); + printk("\n"); + + unit_schedule_unlock(lock, unit); + } + } + + printk("Waitqueue: "); + loop = 0; + spin_lock(&prv->waitq_lock); + list_for_each( iter, &prv->waitq ) + { + struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem); + + if ( loop++ != 0 ) + printk(", "); + if ( loop % 24 == 0 ) + printk("\n\t"); + printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id); + } + printk("\n"); + spin_unlock(&prv->waitq_lock); + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static const struct scheduler sched_null_def = { + .name = "null Scheduler", + .opt_name = "null", + .sched_id = XEN_SCHEDULER_NULL, + .sched_data = NULL, + + .init = null_init, + .deinit = null_deinit, + .init_pdata = null_init_pdata, + .switch_sched = null_switch_sched, + .deinit_pdata = null_deinit_pdata, + + .alloc_udata = null_alloc_udata, + .free_udata = null_free_udata, + .alloc_domdata = null_alloc_domdata, + .free_domdata = null_free_domdata, + + .insert_unit = null_unit_insert, + .remove_unit = null_unit_remove, + + .wake = null_unit_wake, + .sleep = null_unit_sleep, + .pick_resource = null_res_pick, + .migrate = null_unit_migrate, + .do_schedule = null_schedule, + + .dump_cpu_state = null_dump_pcpu, + .dump_settings = null_dump, +}; + 
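+
+/*
+ * Usage note (illustrative only, not exercised by this file): the scheduler
+ * is selected by its opt_name, so "sched=null" on the Xen command line makes
+ * it the system-wide default, while a dedicated cpupool can be set up with
+ * the xl toolstack along the lines of:
+ *
+ *   xl cpupool-create name="pool-null" sched="null" cpus=["2","3"]
+ *   xl cpupool-migrate <domain> pool-null
+ */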
+REGISTER_SCHEDULER(sched_null_def); diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c new file mode 100644 index 0000000000..c40a7e4990 --- /dev/null +++ b/xen/common/sched/rt.c @@ -0,0 +1,1571 @@ +/***************************************************************************** + * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen + * EDF scheduling is a real-time scheduling algorithm used in embedded field. + * + * by Sisu Xi, 2013, Washington University in Saint Louis + * Meng Xu, 2014-2016, University of Pennsylvania + * + * Conversion toward event driven model by Tianyang Chen + * and Dagaen Golomb, 2016, University of Pennsylvania + * + * based on the code of credit Scheduler + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TODO: + * + * Migration compensation and resist like credit2 to better use cache; + * Lock Holder Problem, using yield? + * Self switch problem: UNITs of the same domain may preempt each other; + */ + +/* + * Design: + * + * This scheduler follows the Preemptive Global Earliest Deadline First (EDF) + * theory in real-time field. + * At any scheduling point, the UNIT with earlier deadline has higher priority. + * The scheduler always picks highest priority UNIT to run on a feasible PCPU. + * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle or + * has a lower-priority UNIT running on it.) + * + * Each UNIT has a dedicated period, budget and a extratime flag + * The deadline of an UNIT is at the end of each period; + * An UNIT has its budget replenished at the beginning of each period; + * While scheduled, an UNIT burns its budget. + * The UNIT needs to finish its budget before its deadline in each period; + * The UNIT discards its unused budget at the end of each period. + * When an UNIT runs out of budget in a period, if its extratime flag is set, + * the UNIT increases its priority_level by 1 and refills its budget; otherwise, + * it has to wait until next period. + * + * Each UNIT is implemented as a deferable server. + * When an UNIT has a task running on it, its budget is continuously burned; + * When an UNIT has no task but with budget left, its budget is preserved. + * + * Queue scheme: + * A global runqueue and a global depletedqueue for each CPU pool. + * The runqueue holds all runnable UNITs with budget, + * sorted by priority_level and deadline; + * The depletedqueue holds all UNITs without budget, unsorted; + * + * Note: cpumask and cpupool is supported. + */ + +/* + * Locking: + * A global system lock is used to protect the RunQ and DepletedQ. + * The global lock is referenced by sched_res->schedule_lock + * from all physical cpus. + * + * The lock is already grabbed when calling wake/sleep/schedule/ functions + * in schedule.c + * + * The functions involes RunQ and needs to grab locks are: + * unit_insert, unit_remove, context_saved, runq_insert + */ + + +/* + * Default parameters: + * Period and budget in default is 10 and 4 ms, respectively + */ +#define RTDS_DEFAULT_PERIOD (MICROSECS(10000)) +#define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) + +/* + * Max period: max delta of time type, because period is added to the time + * an unit activates, so this must not overflow. + * Min period: 10 us, considering the scheduling overhead (when period is + * too low, scheduling is invoked too frequently, causing high overhead). 
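+ *
+ * For example, with the defaults above (10 ms period, 4 ms budget) a unit is
+ * reserved 40% of a pCPU in every 10 ms window; rt_dom_cntl() rejects, with
+ * -EINVAL, any request with period < RTDS_MIN_PERIOD, period > RTDS_MAX_PERIOD,
+ * budget < RTDS_MIN_BUDGET or budget > period.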
+ */ +#define RTDS_MAX_PERIOD (STIME_DELTA_MAX) +#define RTDS_MIN_PERIOD (MICROSECS(10)) + +/* + * Min budget: 10 us, considering the scheduling overhead (when budget is + * consumed too fast, scheduling is invoked too frequently, causing + * high overhead). + */ +#define RTDS_MIN_BUDGET (MICROSECS(10)) + +/* + * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding + * the next deadline, performing addition could be faster if the difference + * between cur_deadline and now is small. If the difference is bigger than + * 1024 * period, use multiplication. + */ +#define UPDATE_LIMIT_SHIFT 10 + +/* + * Flags + */ +/* + * RTDS_scheduled: Is this unit either running on, or context-switching off, + * a physical cpu? + * + Accessed only with global lock held. + * + Set when chosen as next in rt_schedule(). + * + Cleared after context switch has been saved in rt_context_saved() + * + Checked in unit_wake to see if we can add to the Runqueue, or if we should + * set RTDS_delayed_runq_add + * + Checked to be false in runq_insert. + */ +#define __RTDS_scheduled 1 +#define RTDS_scheduled (1<<__RTDS_scheduled) +/* + * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ + * once it's done being context switching out? + * + Set when scheduling out in rt_schedule() if prev is runable + * + Set in rt_unit_wake if it finds RTDS_scheduled set + * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ + * and clears the bit. + */ +#define __RTDS_delayed_runq_add 2 +#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) + +/* + * RTDS_depleted: Does this vcp run out of budget? + * This flag is + * + set in burn_budget() if an unit has zero budget left; + * + cleared and checked in the repenishment handler, + * for the units that are being replenished. + */ +#define __RTDS_depleted 3 +#define RTDS_depleted (1<<__RTDS_depleted) + +/* + * RTDS_extratime: Can the unit run in the time that is + * not part of any real-time reservation, and would therefore + * be otherwise left idle? + */ +#define __RTDS_extratime 4 +#define RTDS_extratime (1<<__RTDS_extratime) + +/* + * rt tracing events ("only" 512 available!). Check + * include/public/trace.h for more details. + */ +#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1) +#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2) +#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3) +#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4) +#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5) +#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6) + +static void repl_timer_handler(void *data); + +/* + * System-wide private data, include global RunQueue/DepletedQ + * Global lock is referenced by sched_res->schedule_lock from all + * physical cpus. 
It can be grabbed via unit_schedule_lock_irq() + */ +struct rt_private { + spinlock_t lock; /* the global coarse-grained lock */ + struct list_head sdom; /* list of availalbe domains, used for dump */ + + struct list_head runq; /* ordered list of runnable units */ + struct list_head depletedq; /* unordered list of depleted units */ + + struct timer repl_timer; /* replenishment timer */ + struct list_head replq; /* ordered list of units that need replenishment */ + + cpumask_t tickled; /* cpus been tickled */ +}; + +/* + * Virtual CPU + */ +struct rt_unit { + struct list_head q_elem; /* on the runq/depletedq list */ + struct list_head replq_elem; /* on the replenishment events list */ + + /* UNIT parameters, in nanoseconds */ + s_time_t period; + s_time_t budget; + + /* UNIT current information in nanosecond */ + s_time_t cur_budget; /* current budget */ + s_time_t last_start; /* last start time */ + s_time_t cur_deadline; /* current deadline for EDF */ + + /* Up-pointers */ + struct rt_dom *sdom; + struct sched_unit *unit; + + unsigned priority_level; + + unsigned flags; /* mark __RTDS_scheduled, etc.. */ +}; + +/* + * Domain + */ +struct rt_dom { + struct list_head sdom_elem; /* link list on rt_priv */ + struct domain *dom; /* pointer to upper domain */ +}; + +/* + * Useful inline functions + */ +static inline struct rt_private *rt_priv(const struct scheduler *ops) +{ + return ops->sched_data; +} + +static inline struct rt_unit *rt_unit(const struct sched_unit *unit) +{ + return unit->priv; +} + +static inline struct list_head *rt_runq(const struct scheduler *ops) +{ + return &rt_priv(ops)->runq; +} + +static inline struct list_head *rt_depletedq(const struct scheduler *ops) +{ + return &rt_priv(ops)->depletedq; +} + +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ + return &rt_priv(ops)->replq; +} + +static inline bool has_extratime(const struct rt_unit *svc) +{ + return svc->flags & RTDS_extratime; +} + +/* + * Helper functions for manipulating the runqueue, the depleted queue, + * and the replenishment events queue. + */ +static int +unit_on_q(const struct rt_unit *svc) +{ + return !list_empty(&svc->q_elem); +} + +static struct rt_unit * +q_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_unit, q_elem); +} + +static struct rt_unit * +replq_elem(struct list_head *elem) +{ + return list_entry(elem, struct rt_unit, replq_elem); +} + +static int +unit_on_replq(const struct rt_unit *svc) +{ + return !list_empty(&svc->replq_elem); +} + +/* + * If v1 priority >= v2 priority, return value > 0 + * Otherwise, return value < 0 + */ +static s_time_t +compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2) +{ + int prio = v2->priority_level - v1->priority_level; + + if ( prio == 0 ) + return v2->cur_deadline - v1->cur_deadline; + + return prio; +} + +/* + * Debug related code, dump unit/cpu information + */ +static void +rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc) +{ + cpumask_t *cpupool_mask, *mask; + + ASSERT(svc != NULL); + /* idle unit */ + if( svc->sdom == NULL ) + { + printk("\n"); + return; + } + + /* + * We can't just use 'cpumask_scratch' because the dumping can + * happen from a pCPU outside of this scheduler's cpupool, and + * hence it's not right to use its pCPU's scratch mask. + * On the other hand, it is safe to use sched_unit_master(svc->unit)'s + * own scratch space, since we hold the runqueue lock. 
+ */ + mask = cpumask_scratch_cpu(sched_unit_master(svc->unit)); + + cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain); + cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity); + printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," + " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n" + " \t\t priority_level=%d has_extratime=%d\n" + " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n", + svc->unit->domain->domain_id, + svc->unit->unit_id, + sched_unit_master(svc->unit), + svc->period, + svc->budget, + svc->cur_budget, + svc->cur_deadline, + svc->last_start, + svc->priority_level, + has_extratime(svc), + unit_on_q(svc), + unit_runnable(svc->unit), + svc->flags, CPUMASK_PR(mask)); +} + +static void +rt_dump_pcpu(const struct scheduler *ops, int cpu) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + printk("CPU[%02d]\n", cpu); + /* current UNIT (nothing to say if that's the idle unit). */ + svc = rt_unit(curr_on_cpu(cpu)); + if ( svc && !is_idle_unit(svc->unit) ) + { + rt_dump_unit(ops, svc); + } + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void +rt_dump(const struct scheduler *ops) +{ + struct list_head *runq, *depletedq, *replq, *iter; + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + struct rt_dom *sdom; + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + + if ( list_empty(&prv->sdom) ) + goto out; + + runq = rt_runq(ops); + depletedq = rt_depletedq(ops); + replq = rt_replq(ops); + + printk("Global RunQueue info:\n"); + list_for_each ( iter, runq ) + { + svc = q_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Global DepletedQueue info:\n"); + list_for_each ( iter, depletedq ) + { + svc = q_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Global Replenishment Events info:\n"); + list_for_each ( iter, replq ) + { + svc = replq_elem(iter); + rt_dump_unit(ops, svc); + } + + printk("Domain info:\n"); + list_for_each ( iter, &prv->sdom ) + { + struct sched_unit *unit; + + sdom = list_entry(iter, struct rt_dom, sdom_elem); + printk("\tdomain: %d\n", sdom->dom->domain_id); + + for_each_sched_unit ( sdom->dom, unit ) + { + svc = rt_unit(unit); + rt_dump_unit(ops, svc); + } + } + + out: + spin_unlock_irqrestore(&prv->lock, flags); +} + +/* + * update deadline and budget when now >= cur_deadline + * it needs to be updated to the deadline of the current period + */ +static void +rt_update_deadline(s_time_t now, struct rt_unit *svc) +{ + ASSERT(now >= svc->cur_deadline); + ASSERT(svc->period != 0); + + if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now ) + { + do + svc->cur_deadline += svc->period; + while ( svc->cur_deadline <= now ); + } + else + { + long count = ((now - svc->cur_deadline) / svc->period) + 1; + svc->cur_deadline += count * svc->period; + } + + /* + * svc may be scheduled to run immediately after it misses deadline + * Then rt_update_deadline is called before rt_schedule, which + * should only deduct the time spent in current period from the budget + */ + svc->last_start = now; + svc->cur_budget = svc->budget; + svc->priority_level = 0; + + /* TRACE */ + { + struct __packed { + unsigned unit:16, dom:16; + unsigned priority_level; + uint64_t cur_deadline, cur_budget; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.priority_level = svc->priority_level; + d.cur_deadline = (uint64_t) svc->cur_deadline; + d.cur_budget = (uint64_t) svc->cur_budget; 
+ trace_var(TRC_RTDS_BUDGET_REPLENISH, 1, + sizeof(d), + (unsigned char *) &d); + } + + return; +} + +/* + * Helpers for removing and inserting an unit in a queue + * that is being kept ordered by the units' deadlines (as EDF + * mandates). + * + * For callers' convenience, the unit removing helper returns + * true if the unit removed was the one at the front of the + * queue; similarly, the inserting helper returns true if the + * inserted ended at the front of the queue (i.e., in both + * cases, if the unit with the earliest deadline is what we + * are dealing with). + */ +static inline bool +deadline_queue_remove(struct list_head *queue, struct list_head *elem) +{ + int pos = 0; + + if ( queue->next != elem ) + pos = 1; + + list_del_init(elem); + return !pos; +} + +static inline bool +deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *), + struct rt_unit *svc, struct list_head *elem, + struct list_head *queue) +{ + struct list_head *iter; + int pos = 0; + + list_for_each ( iter, queue ) + { + struct rt_unit * iter_svc = (*qelem)(iter); + if ( compare_unit_priority(svc, iter_svc) > 0 ) + break; + pos++; + } + list_add_tail(elem, iter); + return !pos; +} +#define deadline_runq_insert(...) \ + deadline_queue_insert(&q_elem, ##__VA_ARGS__) +#define deadline_replq_insert(...) \ + deadline_queue_insert(&replq_elem, ##__VA_ARGS__) + +static inline void +q_remove(struct rt_unit *svc) +{ + ASSERT( unit_on_q(svc) ); + list_del_init(&svc->q_elem); +} + +static inline void +replq_remove(const struct scheduler *ops, struct rt_unit *svc) +{ + struct rt_private *prv = rt_priv(ops); + struct list_head *replq = rt_replq(ops); + + ASSERT( unit_on_replq(svc) ); + + if ( deadline_queue_remove(replq, &svc->replq_elem) ) + { + /* + * The replenishment timer needs to be set to fire when a + * replenishment for the unit at the front of the replenishment + * queue is due. If it is such unit that we just removed, we may + * need to reprogram the timer. + */ + if ( !list_empty(replq) ) + { + struct rt_unit *svc_next = replq_elem(replq->next); + set_timer(&prv->repl_timer, svc_next->cur_deadline); + } + else + stop_timer(&prv->repl_timer); + } +} + +/* + * Insert svc with budget in RunQ according to EDF: + * units with smaller deadlines go first. + * Insert svc without budget in DepletedQ unsorted; + */ +static void +runq_insert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct rt_private *prv = rt_priv(ops); + struct list_head *runq = rt_runq(ops); + + ASSERT( spin_is_locked(&prv->lock) ); + ASSERT( !unit_on_q(svc) ); + ASSERT( unit_on_replq(svc) ); + + /* add svc to runq if svc still has budget or its extratime is set */ + if ( svc->cur_budget > 0 || + has_extratime(svc) ) + deadline_runq_insert(svc, &svc->q_elem, runq); + else + list_add(&svc->q_elem, &prv->depletedq); +} + +static void +replq_insert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct list_head *replq = rt_replq(ops); + struct rt_private *prv = rt_priv(ops); + + ASSERT( !unit_on_replq(svc) ); + + /* + * The timer may be re-programmed if svc is inserted + * at the front of the event list. + */ + if ( deadline_replq_insert(svc, &svc->replq_elem, replq) ) + set_timer(&prv->repl_timer, svc->cur_deadline); +} + +/* + * Removes and re-inserts an event to the replenishment queue. + * The aim is to update its position inside the queue, as its + * deadline (and hence its replenishment time) could have + * changed. 
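+ * (rt_unit_wake() relies on this when a unit that is still flagged
+ * RTDS_scheduled wakes up after having missed its deadline.)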
+ */ +static void +replq_reinsert(const struct scheduler *ops, struct rt_unit *svc) +{ + struct list_head *replq = rt_replq(ops); + struct rt_unit *rearm_svc = svc; + bool_t rearm = 0; + + ASSERT( unit_on_replq(svc) ); + + /* + * If svc was at the front of the replenishment queue, we certainly + * need to re-program the timer, and we want to use the deadline of + * the unit which is now at the front of the queue (which may still + * be svc or not). + * + * We may also need to re-program, if svc has been put at the front + * of the replenishment queue when being re-inserted. + */ + if ( deadline_queue_remove(replq, &svc->replq_elem) ) + { + deadline_replq_insert(svc, &svc->replq_elem, replq); + rearm_svc = replq_elem(replq->next); + rearm = 1; + } + else + rearm = deadline_replq_insert(svc, &svc->replq_elem, replq); + + if ( rearm ) + set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline); +} + +/* + * Pick a valid resource for the unit vc + * Valid resource of an unit is intesection of unit's affinity + * and available resources + */ +static struct sched_resource * +rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit) +{ + cpumask_t cpus; + cpumask_t *online; + int cpu; + + online = cpupool_domain_master_cpumask(unit->domain); + cpumask_and(&cpus, online, unit->cpu_hard_affinity); + + cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus) + ? sched_unit_master(unit) + : cpumask_cycle(sched_unit_master(unit), &cpus); + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); + + return get_sched_res(cpu); +} + +/* + * Init/Free related code + */ +static int +rt_init(struct scheduler *ops) +{ + int rc = -ENOMEM; + struct rt_private *prv = xzalloc(struct rt_private); + + printk("Initializing RTDS scheduler\n" + "WARNING: This is experimental software in development.\n" + "Use at your own risk.\n"); + + if ( prv == NULL ) + goto err; + + spin_lock_init(&prv->lock); + INIT_LIST_HEAD(&prv->sdom); + INIT_LIST_HEAD(&prv->runq); + INIT_LIST_HEAD(&prv->depletedq); + INIT_LIST_HEAD(&prv->replq); + + ops->sched_data = prv; + rc = 0; + + err: + if ( rc ) + xfree(prv); + + return rc; +} + +static void +rt_deinit(struct scheduler *ops) +{ + struct rt_private *prv = rt_priv(ops); + + ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid || + prv->repl_timer.status == TIMER_STATUS_killed); + + ops->sched_data = NULL; + xfree(prv); +} + +/* + * Point per_cpu spinlock to the global system lock; + * All cpu have same global system lock + */ +static void +rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu) +{ + struct rt_private *prv = rt_priv(ops); + spinlock_t *old_lock; + unsigned long flags; + + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + + /* + * TIMER_STATUS_invalid means we are the first cpu that sees the timer + * allocated but not initialized, and so it's up to us to initialize it. + */ + if ( prv->repl_timer.status == TIMER_STATUS_invalid ) + { + init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu); + dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); + } + + /* Move the scheduler lock to our global runqueue lock. */ + get_sched_res(cpu)->schedule_lock = &prv->lock; + + /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */ + spin_unlock_irqrestore(old_lock, flags); +} + +/* Change the scheduler of cpu to us (RTDS). 
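Called via the switch_sched hook; the &prv->lock returned below becomes this cpu's schedule_lock.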
*/ +static spinlock_t * +rt_switch_sched(struct scheduler *new_ops, unsigned int cpu, + void *pdata, void *vdata) +{ + struct rt_private *prv = rt_priv(new_ops); + struct rt_unit *svc = vdata; + + ASSERT(!pdata && svc && is_idle_unit(svc->unit)); + + /* + * We are holding the runqueue lock already (it's been taken in + * schedule_cpu_switch()). It's actually the runqueue lock of + * another scheduler, but that is how things need to be, for + * preventing races. + */ + ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock); + + /* + * If we are the absolute first cpu being switched toward this + * scheduler (in which case we'll see TIMER_STATUS_invalid), or the + * first one that is added back to the cpupool that had all its cpus + * removed (in which case we'll see TIMER_STATUS_killed), it's our + * job to (re)initialize the timer. + */ + if ( prv->repl_timer.status == TIMER_STATUS_invalid || + prv->repl_timer.status == TIMER_STATUS_killed ) + { + init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu); + dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); + } + + sched_idle_unit(cpu)->priv = vdata; + + return &prv->lock; +} + +static void +rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) +{ + unsigned long flags; + struct rt_private *prv = rt_priv(ops); + + spin_lock_irqsave(&prv->lock, flags); + + if ( prv->repl_timer.cpu == cpu ) + { + cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid; + unsigned int new_cpu = cpumask_cycle(cpu, online); + + /* + * Make sure the timer run on one of the cpus that are still available + * to this scheduler. If there aren't any left, it means it's the time + * to just kill it. + */ + if ( new_cpu >= nr_cpu_ids ) + { + kill_timer(&prv->repl_timer); + dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu); + } + else + { + migrate_timer(&prv->repl_timer, new_cpu); + } + } + + spin_unlock_irqrestore(&prv->lock, flags); +} + +static void * +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom) +{ + unsigned long flags; + struct rt_dom *sdom; + struct rt_private * prv = rt_priv(ops); + + sdom = xzalloc(struct rt_dom); + if ( sdom == NULL ) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&sdom->sdom_elem); + sdom->dom = dom; + + /* spinlock here to insert the dom */ + spin_lock_irqsave(&prv->lock, flags); + list_add_tail(&sdom->sdom_elem, &(prv->sdom)); + spin_unlock_irqrestore(&prv->lock, flags); + + return sdom; +} + +static void +rt_free_domdata(const struct scheduler *ops, void *data) +{ + struct rt_dom *sdom = data; + struct rt_private *prv = rt_priv(ops); + + if ( sdom ) + { + unsigned long flags; + + spin_lock_irqsave(&prv->lock, flags); + list_del_init(&sdom->sdom_elem); + spin_unlock_irqrestore(&prv->lock, flags); + + xfree(sdom); + } +} + +static void * +rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd) +{ + struct rt_unit *svc; + + /* Allocate per-UNIT info */ + svc = xzalloc(struct rt_unit); + if ( svc == NULL ) + return NULL; + + INIT_LIST_HEAD(&svc->q_elem); + INIT_LIST_HEAD(&svc->replq_elem); + svc->flags = 0U; + svc->sdom = dd; + svc->unit = unit; + svc->last_start = 0; + + __set_bit(__RTDS_extratime, &svc->flags); + svc->priority_level = 0; + svc->period = RTDS_DEFAULT_PERIOD; + if ( !is_idle_unit(unit) ) + svc->budget = RTDS_DEFAULT_BUDGET; + + SCHED_STAT_CRANK(unit_alloc); + + return svc; +} + +static void +rt_free_udata(const struct scheduler *ops, void *priv) +{ + struct rt_unit *svc = priv; + + xfree(svc); +} + +/* + * It is called in 
sched_move_domain() and sched_init_vcpu + * in schedule.c. + * When move a domain to a new cpupool. + * It inserts units of moving domain to the scheduler's RunQ in + * dest. cpupool. + */ +static void +rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit *svc = rt_unit(unit); + s_time_t now; + spinlock_t *lock; + + BUG_ON( is_idle_unit(unit) ); + + /* This is safe because unit isn't yet being scheduled */ + sched_set_res(unit, rt_res_pick(ops, unit)); + + lock = unit_schedule_lock_irq(unit); + + now = NOW(); + if ( now >= svc->cur_deadline ) + rt_update_deadline(now, svc); + + if ( !unit_on_q(svc) && unit_runnable(unit) ) + { + replq_insert(ops, svc); + + if ( !unit->is_running ) + runq_insert(ops, svc); + } + unit_schedule_unlock_irq(lock, unit); + + SCHED_STAT_CRANK(unit_insert); +} + +/* + * Remove rt_unit svc from the old scheduler in source cpupool. + */ +static void +rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + struct rt_dom * const sdom = svc->sdom; + spinlock_t *lock; + + SCHED_STAT_CRANK(unit_remove); + + BUG_ON( sdom == NULL ); + + lock = unit_schedule_lock_irq(unit); + if ( unit_on_q(svc) ) + q_remove(svc); + + if ( unit_on_replq(svc) ) + replq_remove(ops,svc); + + unit_schedule_unlock_irq(lock, unit); +} + +/* + * Burn budget in nanosecond granularity + */ +static void +burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now) +{ + s_time_t delta; + + /* don't burn budget for idle UNIT */ + if ( is_idle_unit(svc->unit) ) + return; + + /* burn at nanoseconds level */ + delta = now - svc->last_start; + /* + * delta < 0 only happens in nested virtualization; + * TODO: how should we handle delta < 0 in a better way? + */ + if ( delta < 0 ) + { + printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n", + __func__, delta); + svc->last_start = now; + return; + } + + svc->cur_budget -= delta; + svc->last_start = now; + + if ( svc->cur_budget <= 0 ) + { + if ( has_extratime(svc) ) + { + svc->priority_level++; + svc->cur_budget = svc->budget; + } + else + { + svc->cur_budget = 0; + __set_bit(__RTDS_depleted, &svc->flags); + } + } + + /* TRACE */ + { + struct __packed { + unsigned unit:16, dom:16; + uint64_t cur_budget; + int delta; + unsigned priority_level; + bool has_extratime; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.cur_budget = (uint64_t) svc->cur_budget; + d.delta = delta; + d.priority_level = svc->priority_level; + d.has_extratime = svc->flags & RTDS_extratime; + trace_var(TRC_RTDS_BUDGET_BURN, 1, + sizeof(d), + (unsigned char *) &d); + } +} + +/* + * RunQ is sorted. Pick first one within cpumask. 
If no one, return NULL + * lock is grabbed before calling this function + */ +static struct rt_unit * +runq_pick(const struct scheduler *ops, const cpumask_t *mask) +{ + struct list_head *runq = rt_runq(ops); + struct list_head *iter; + struct rt_unit *svc = NULL; + struct rt_unit *iter_svc = NULL; + cpumask_t cpu_common; + cpumask_t *online; + + list_for_each ( iter, runq ) + { + iter_svc = q_elem(iter); + + /* mask cpu_hard_affinity & cpupool & mask */ + online = cpupool_domain_master_cpumask(iter_svc->unit->domain); + cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity); + cpumask_and(&cpu_common, mask, &cpu_common); + if ( cpumask_empty(&cpu_common) ) + continue; + + ASSERT( iter_svc->cur_budget > 0 ); + + svc = iter_svc; + break; + } + + /* TRACE */ + { + if( svc != NULL ) + { + struct __packed { + unsigned unit:16, dom:16; + uint64_t cur_deadline, cur_budget; + } d; + d.dom = svc->unit->domain->domain_id; + d.unit = svc->unit->unit_id; + d.cur_deadline = (uint64_t) svc->cur_deadline; + d.cur_budget = (uint64_t) svc->cur_budget; + trace_var(TRC_RTDS_RUNQ_PICK, 1, + sizeof(d), + (unsigned char *) &d); + } + } + + return svc; +} + +/* + * schedule function for rt scheduler. + * The lock is already grabbed in schedule.c, no need to lock here + */ +static void +rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, + s_time_t now, bool tasklet_work_scheduled) +{ + const unsigned int cur_cpu = smp_processor_id(); + const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); + struct rt_private *prv = rt_priv(ops); + struct rt_unit *const scurr = rt_unit(currunit); + struct rt_unit *snext = NULL; + bool migrated = false; + + /* TRACE */ + { + struct __packed { + unsigned cpu:16, tasklet:8, tickled:4, idle:4; + } d; + d.cpu = cur_cpu; + d.tasklet = tasklet_work_scheduled; + d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled); + d.idle = is_idle_unit(currunit); + trace_var(TRC_RTDS_SCHEDULE, 1, + sizeof(d), + (unsigned char *)&d); + } + + /* clear ticked bit now that we've been scheduled */ + cpumask_clear_cpu(sched_cpu, &prv->tickled); + + /* burn_budget would return for IDLE UNIT */ + burn_budget(ops, scurr, now); + + if ( tasklet_work_scheduled ) + { + trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL); + snext = rt_unit(sched_idle_unit(sched_cpu)); + } + else + { + snext = runq_pick(ops, cpumask_of(sched_cpu)); + + if ( snext == NULL ) + snext = rt_unit(sched_idle_unit(sched_cpu)); + else if ( !unit_runnable_state(snext->unit) ) + { + q_remove(snext); + snext = rt_unit(sched_idle_unit(sched_cpu)); + } + + /* if scurr has higher priority and budget, still pick scurr */ + if ( !is_idle_unit(currunit) && + unit_runnable_state(currunit) && + scurr->cur_budget > 0 && + ( is_idle_unit(snext->unit) || + compare_unit_priority(scurr, snext) > 0 ) ) + snext = scurr; + } + + if ( snext != scurr && + !is_idle_unit(currunit) && + unit_runnable(currunit) ) + __set_bit(__RTDS_delayed_runq_add, &scurr->flags); + + snext->last_start = now; + currunit->next_time = -1; /* if an idle unit is picked */ + if ( !is_idle_unit(snext->unit) ) + { + if ( snext != scurr ) + { + q_remove(snext); + __set_bit(__RTDS_scheduled, &snext->flags); + } + if ( sched_unit_master(snext->unit) != sched_cpu ) + { + sched_set_res(snext->unit, get_sched_res(sched_cpu)); + migrated = true; + } + /* Invoke the scheduler next time. 
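That is, program the next scheduling decision for the point where snext's remaining budget runs out.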
*/ + currunit->next_time = snext->cur_budget; + } + currunit->next_task = snext->unit; + snext->unit->migrated = migrated; +} + +/* + * Remove UNIT from RunQ + * The lock is already grabbed in schedule.c, no need to lock here + */ +static void +rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + + BUG_ON( is_idle_unit(unit) ); + SCHED_STAT_CRANK(unit_sleep); + + if ( curr_on_cpu(sched_unit_master(unit)) == unit ) + cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); + else if ( unit_on_q(svc) ) + { + q_remove(svc); + replq_remove(ops, svc); + } + else if ( svc->flags & RTDS_delayed_runq_add ) + __clear_bit(__RTDS_delayed_runq_add, &svc->flags); +} + +/* + * Pick a cpu where to run an unit, + * possibly kicking out the unit running there + * Called by wake() and context_saved() + * We have a running candidate here, the kick logic is: + * Among all the cpus that are within the cpu affinity + * 1) if there are any idle CPUs, kick one. + For cache benefit, we check new->cpu as first + * 2) now all pcpus are busy; + * among all the running units, pick lowest priority one + * if snext has higher priority, kick it. + * + * TODO: + * 1) what if these two units belongs to the same domain? + * replace an unit belonging to the same domain introduces more overhead + * + * lock is grabbed before calling this function + */ +static void +runq_tickle(const struct scheduler *ops, struct rt_unit *new) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */ + struct rt_unit *iter_svc; + struct sched_unit *iter_unit; + int cpu = 0, cpu_to_tickle = 0; + cpumask_t not_tickled; + cpumask_t *online; + + if ( new == NULL || is_idle_unit(new->unit) ) + return; + + online = cpupool_domain_master_cpumask(new->unit->domain); + cpumask_and(¬_tickled, online, new->unit->cpu_hard_affinity); + cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); + + /* + * 1) If there are any idle CPUs, kick one. + * For cache benefit,we first search new->cpu. + * The same loop also find the one with lowest priority. + */ + cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), ¬_tickled); + while ( cpu!= nr_cpu_ids ) + { + iter_unit = curr_on_cpu(cpu); + if ( is_idle_unit(iter_unit) ) + { + SCHED_STAT_CRANK(tickled_idle_cpu); + cpu_to_tickle = cpu; + goto out; + } + iter_svc = rt_unit(iter_unit); + if ( latest_deadline_unit == NULL || + compare_unit_priority(iter_svc, latest_deadline_unit) < 0 ) + latest_deadline_unit = iter_svc; + + cpumask_clear_cpu(cpu, ¬_tickled); + cpu = cpumask_cycle(cpu, ¬_tickled); + } + + /* 2) candicate has higher priority, kick out lowest priority unit */ + if ( latest_deadline_unit != NULL && + compare_unit_priority(latest_deadline_unit, new) < 0 ) + { + SCHED_STAT_CRANK(tickled_busy_cpu); + cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit); + goto out; + } + + /* didn't tickle any cpu */ + SCHED_STAT_CRANK(tickled_no_cpu); + return; + out: + /* TRACE */ + { + struct { + unsigned cpu:16, pad:16; + } d; + d.cpu = cpu_to_tickle; + d.pad = 0; + trace_var(TRC_RTDS_TICKLE, 1, + sizeof(d), + (unsigned char *)&d); + } + + cpumask_set_cpu(cpu_to_tickle, &prv->tickled); + cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); + return; +} + +/* + * Should always wake up runnable unit, put it back to RunQ. + * Check priority to raise interrupt + * The lock is already grabbed in schedule.c, no need to lock here + * TODO: what if these two units belongs to the same domain? 
+ */ +static void +rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit * const svc = rt_unit(unit); + s_time_t now; + bool_t missed; + + BUG_ON( is_idle_unit(unit) ); + + if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) + { + SCHED_STAT_CRANK(unit_wake_running); + return; + } + + /* on RunQ/DepletedQ, just update info is ok */ + if ( unlikely(unit_on_q(svc)) ) + { + SCHED_STAT_CRANK(unit_wake_onrunq); + return; + } + + if ( likely(unit_runnable(unit)) ) + SCHED_STAT_CRANK(unit_wake_runnable); + else + SCHED_STAT_CRANK(unit_wake_not_runnable); + + /* + * If a deadline passed while svc was asleep/blocked, we need new + * scheduling parameters (a new deadline and full budget). + */ + now = NOW(); + + missed = ( now >= svc->cur_deadline ); + if ( missed ) + rt_update_deadline(now, svc); + + /* + * If context hasn't been saved for this unit yet, we can't put it on + * the run-queue/depleted-queue. Instead, we set the appropriate flag, + * the unit will be put back on queue after the context has been saved + * (in rt_context_save()). + */ + if ( unlikely(svc->flags & RTDS_scheduled) ) + { + __set_bit(__RTDS_delayed_runq_add, &svc->flags); + /* + * The unit is waking up already, and we didn't even had the time to + * remove its next replenishment event from the replenishment queue + * when it blocked! No big deal. If we did not miss the deadline in + * the meantime, let's just leave it there. If we did, let's remove it + * and queue a new one (to occur at our new deadline). + */ + if ( missed ) + replq_reinsert(ops, svc); + return; + } + + /* Replenishment event got cancelled when we blocked. Add it back. */ + replq_insert(ops, svc); + /* insert svc to runq/depletedq because svc is not in queue now */ + runq_insert(ops, svc); + + runq_tickle(ops, svc); +} + +/* + * scurr has finished context switch, insert it back to the RunQ, + * and then pick the highest priority unit from runq to run + */ +static void +rt_context_saved(const struct scheduler *ops, struct sched_unit *unit) +{ + struct rt_unit *svc = rt_unit(unit); + spinlock_t *lock = unit_schedule_lock_irq(unit); + + __clear_bit(__RTDS_scheduled, &svc->flags); + /* not insert idle unit to runq */ + if ( is_idle_unit(unit) ) + goto out; + + if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && + likely(unit_runnable(unit)) ) + { + runq_insert(ops, svc); + runq_tickle(ops, svc); + } + else + replq_remove(ops, svc); + +out: + unit_schedule_unlock_irq(lock, unit); +} + +/* + * set/get each unit info of each domain + */ +static int +rt_dom_cntl( + const struct scheduler *ops, + struct domain *d, + struct xen_domctl_scheduler_op *op) +{ + struct rt_private *prv = rt_priv(ops); + struct rt_unit *svc; + struct sched_unit *unit; + unsigned long flags; + int rc = 0; + struct xen_domctl_schedparam_vcpu local_sched; + s_time_t period, budget; + uint32_t index = 0; + + switch ( op->cmd ) + { + case XEN_DOMCTL_SCHEDOP_getinfo: + /* Return the default parameters. 
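(10000 us period and 4000 us budget, i.e. RTDS_DEFAULT_PERIOD and RTDS_DEFAULT_BUDGET expressed in microseconds.)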
*/ + op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1); + op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1); + break; + case XEN_DOMCTL_SCHEDOP_putinfo: + if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 ) + { + rc = -EINVAL; + break; + } + spin_lock_irqsave(&prv->lock, flags); + for_each_sched_unit ( d, unit ) + { + svc = rt_unit(unit); + svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */ + svc->budget = MICROSECS(op->u.rtds.budget); + } + spin_unlock_irqrestore(&prv->lock, flags); + break; + case XEN_DOMCTL_SCHEDOP_getvcpuinfo: + case XEN_DOMCTL_SCHEDOP_putvcpuinfo: + while ( index < op->u.v.nr_vcpus ) + { + if ( copy_from_guest_offset(&local_sched, + op->u.v.vcpus, index, 1) ) + { + rc = -EFAULT; + break; + } + if ( local_sched.vcpuid >= d->max_vcpus || + d->vcpu[local_sched.vcpuid] == NULL ) + { + rc = -EINVAL; + break; + } + + if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo ) + { + spin_lock_irqsave(&prv->lock, flags); + svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); + local_sched.u.rtds.budget = svc->budget / MICROSECS(1); + local_sched.u.rtds.period = svc->period / MICROSECS(1); + if ( has_extratime(svc) ) + local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra; + else + local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra; + spin_unlock_irqrestore(&prv->lock, flags); + + if ( copy_to_guest_offset(op->u.v.vcpus, index, + &local_sched, 1) ) + { + rc = -EFAULT; + break; + } + } + else + { + period = MICROSECS(local_sched.u.rtds.period); + budget = MICROSECS(local_sched.u.rtds.budget); + if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET || + budget > period || period < RTDS_MIN_PERIOD ) + { + rc = -EINVAL; + break; + } + + spin_lock_irqsave(&prv->lock, flags); + svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); + svc->period = period; + svc->budget = budget; + if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra ) + __set_bit(__RTDS_extratime, &svc->flags); + else + __clear_bit(__RTDS_extratime, &svc->flags); + spin_unlock_irqrestore(&prv->lock, flags); + } + /* Process a most 64 vCPUs without checking for preemptions. */ + if ( (++index > 63) && hypercall_preempt_check() ) + break; + } + if ( !rc ) + /* notify upper caller how many units have been processed. */ + op->u.v.nr_vcpus = index; + break; + } + + return rc; +} + +/* + * The replenishment timer handler picks units + * from the replq and does the actual replenishment. + */ +static void repl_timer_handler(void *data){ + s_time_t now; + struct scheduler *ops = data; + struct rt_private *prv = rt_priv(ops); + struct list_head *replq = rt_replq(ops); + struct list_head *runq = rt_runq(ops); + struct list_head *iter, *tmp; + struct rt_unit *svc; + LIST_HEAD(tmp_replq); + + spin_lock_irq(&prv->lock); + + now = NOW(); + + /* + * Do the replenishment and move replenished units + * to the temporary list to tickle. + * If svc is on run queue, we need to put it at + * the correct place since its deadline changes. + */ + list_for_each_safe ( iter, tmp, replq ) + { + svc = replq_elem(iter); + + if ( now < svc->cur_deadline ) + break; + + list_del(&svc->replq_elem); + rt_update_deadline(now, svc); + list_add(&svc->replq_elem, &tmp_replq); + + if ( unit_on_q(svc) ) + { + q_remove(svc); + runq_insert(ops, svc); + } + } + + /* + * Iterate through the list of updated units. + * If an updated unit is running, tickle the head of the + * runqueue if it has a higher priority. + * If an updated unit was depleted and on the runqueue, tickle it. 
+ * Finally, reinsert the units back to replenishement events list. + */ + list_for_each_safe ( iter, tmp, &tmp_replq ) + { + svc = replq_elem(iter); + + if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit && + !list_empty(runq) ) + { + struct rt_unit *next_on_runq = q_elem(runq->next); + + if ( compare_unit_priority(svc, next_on_runq) < 0 ) + runq_tickle(ops, next_on_runq); + } + else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) && + unit_on_q(svc) ) + runq_tickle(ops, svc); + + list_del(&svc->replq_elem); + deadline_replq_insert(svc, &svc->replq_elem, replq); + } + + /* + * If there are units left in the replenishment event list, + * set the next replenishment to happen at the deadline of + * the one in the front. + */ + if ( !list_empty(replq) ) + set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline); + + spin_unlock_irq(&prv->lock); +} + +static const struct scheduler sched_rtds_def = { + .name = "SMP RTDS Scheduler", + .opt_name = "rtds", + .sched_id = XEN_SCHEDULER_RTDS, + .sched_data = NULL, + + .dump_cpu_state = rt_dump_pcpu, + .dump_settings = rt_dump, + .init = rt_init, + .deinit = rt_deinit, + .init_pdata = rt_init_pdata, + .switch_sched = rt_switch_sched, + .deinit_pdata = rt_deinit_pdata, + .alloc_domdata = rt_alloc_domdata, + .free_domdata = rt_free_domdata, + .alloc_udata = rt_alloc_udata, + .free_udata = rt_free_udata, + .insert_unit = rt_unit_insert, + .remove_unit = rt_unit_remove, + + .adjust = rt_dom_cntl, + + .pick_resource = rt_res_pick, + .do_schedule = rt_schedule, + .sleep = rt_unit_sleep, + .wake = rt_unit_wake, + .context_saved = rt_context_saved, +}; + +REGISTER_SCHEDULER(sched_rtds_def); diff --git a/xen/common/sched_arinc653.c b/xen/common/sched_arinc653.c deleted file mode 100644 index 565575c326..0000000000 --- a/xen/common/sched_arinc653.c +++ /dev/null @@ -1,739 +0,0 @@ -/****************************************************************************** - * sched_arinc653.c - * - * An ARINC653-compatible scheduling algorithm for use in Xen. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Copyright (c) 2010, DornerWorks, Ltd. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/************************************************************************** - * Private Macros * - **************************************************************************/ - -/** - * Default timeslice for domain 0. 
- */ -#define DEFAULT_TIMESLICE MILLISECS(10) - -/** - * Retrieve the idle UNIT for a given physical CPU - */ -#define IDLETASK(cpu) (sched_idle_unit(cpu)) - -/** - * Return a pointer to the ARINC 653-specific scheduler data information - * associated with the given UNIT (unit) - */ -#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv) - -/** - * Return the global scheduler private data given the scheduler ops pointer - */ -#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) - -/************************************************************************** - * Private Type Definitions * - **************************************************************************/ - -/** - * The arinc653_unit_t structure holds ARINC 653-scheduler-specific - * information for all non-idle UNITs - */ -typedef struct arinc653_unit_s -{ - /* unit points to Xen's struct sched_unit so we can get to it from an - * arinc653_unit_t pointer. */ - struct sched_unit * unit; - /* awake holds whether the UNIT has been woken with vcpu_wake() */ - bool_t awake; - /* list holds the linked list information for the list this UNIT - * is stored in */ - struct list_head list; -} arinc653_unit_t; - -/** - * The sched_entry_t structure holds a single entry of the - * ARINC 653 schedule. - */ -typedef struct sched_entry_s -{ - /* dom_handle holds the handle ("UUID") for the domain that this - * schedule entry refers to. */ - xen_domain_handle_t dom_handle; - /* unit_id holds the UNIT number for the UNIT that this schedule - * entry refers to. */ - int unit_id; - /* runtime holds the number of nanoseconds that the UNIT for this - * schedule entry should be allowed to run per major frame. */ - s_time_t runtime; - /* unit holds a pointer to the Xen sched_unit structure */ - struct sched_unit * unit; -} sched_entry_t; - -/** - * This structure defines data that is global to an instance of the scheduler - */ -typedef struct a653sched_priv_s -{ - /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ - spinlock_t lock; - - /** - * This array holds the active ARINC 653 schedule. - * - * When the system tries to start a new UNIT, this schedule is scanned - * to look for a matching (handle, UNIT #) pair. If both the handle (UUID) - * and UNIT number match, then the UNIT is allowed to run. Its run time - * (per major frame) is given in the third entry of the schedule. - */ - sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; - - /** - * This variable holds the number of entries that are valid in - * the arinc653_schedule table. - * - * This is not necessarily the same as the number of domains in the - * schedule. A domain could be listed multiple times within the schedule, - * or a domain with multiple UNITs could have a different - * schedule entry for each UNIT. - */ - unsigned int num_schedule_entries; - - /** - * the major frame time for the ARINC 653 schedule. - */ - s_time_t major_frame; - - /** - * the time that the next major frame starts - */ - s_time_t next_major_frame; - - /** - * pointers to all Xen UNIT structures for iterating through - */ - struct list_head unit_list; -} a653sched_priv_t; - -/************************************************************************** - * Helper functions * - **************************************************************************/ - -/** - * This function compares two domain handles. - * - * @param h1 Pointer to handle 1 - * @param h2 Pointer to handle 2 - * - * @return
- *     <0: handle 1 is less than handle 2 - *      0: handle 1 is equal to handle 2 - *     >0: handle 1 is greater than handle 2 - *
- */ -static int dom_handle_cmp(const xen_domain_handle_t h1, - const xen_domain_handle_t h2) -{ - return memcmp(h1, h2, sizeof(xen_domain_handle_t)); -} - -/** - * This function searches the unit list to find a UNIT that matches - * the domain handle and UNIT ID specified. - * - * @param ops Pointer to this instance of the scheduler structure - * @param handle Pointer to handler - * @param unit_id UNIT ID - * - * @return
- *     Pointer to the matching UNIT if one is found - *     NULL otherwise - *
- */ -static struct sched_unit *find_unit( - const struct scheduler *ops, - xen_domain_handle_t handle, - int unit_id) -{ - arinc653_unit_t *aunit; - - /* loop through the unit_list looking for the specified UNIT */ - list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list ) - if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0) - && (unit_id == aunit->unit->unit_id) ) - return aunit->unit; - - return NULL; -} - -/** - * This function updates the pointer to the Xen UNIT structure for each entry - * in the ARINC 653 schedule. - * - * @param ops Pointer to this instance of the scheduler structure - * @return - */ -static void update_schedule_units(const struct scheduler *ops) -{ - unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; - - for ( i = 0; i < n_entries; i++ ) - SCHED_PRIV(ops)->schedule[i].unit = - find_unit(ops, - SCHED_PRIV(ops)->schedule[i].dom_handle, - SCHED_PRIV(ops)->schedule[i].unit_id); -} - -/** - * This function is called by the adjust_global scheduler hook to put - * in place a new ARINC653 schedule. - * - * @param ops Pointer to this instance of the scheduler structure - * - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -arinc653_sched_set( - const struct scheduler *ops, - struct xen_sysctl_arinc653_schedule *schedule) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - s_time_t total_runtime = 0; - unsigned int i; - unsigned long flags; - int rc = -EINVAL; - - spin_lock_irqsave(&sched_priv->lock, flags); - - /* Check for valid major frame and number of schedule entries. */ - if ( (schedule->major_frame <= 0) - || (schedule->num_sched_entries < 1) - || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) - goto fail; - - for ( i = 0; i < schedule->num_sched_entries; i++ ) - { - /* Check for a valid run time. */ - if ( schedule->sched_entries[i].runtime <= 0 ) - goto fail; - - /* Add this entry's run time to total run time. */ - total_runtime += schedule->sched_entries[i].runtime; - } - - /* - * Error if the major frame is not large enough to run all entries as - * indicated by comparing the total run time to the major frame length. - */ - if ( total_runtime > schedule->major_frame ) - goto fail; - - /* Copy the new schedule into place. */ - sched_priv->num_schedule_entries = schedule->num_sched_entries; - sched_priv->major_frame = schedule->major_frame; - for ( i = 0; i < schedule->num_sched_entries; i++ ) - { - memcpy(sched_priv->schedule[i].dom_handle, - schedule->sched_entries[i].dom_handle, - sizeof(sched_priv->schedule[i].dom_handle)); - sched_priv->schedule[i].unit_id = - schedule->sched_entries[i].vcpu_id; - sched_priv->schedule[i].runtime = - schedule->sched_entries[i].runtime; - } - update_schedule_units(ops); - - /* - * The newly-installed schedule takes effect immediately. We do not even - * wait for the current major frame to expire. - * - * Signal a new major frame to begin. The next major frame is set up by - * the do_schedule callback function when it is next invoked. - */ - sched_priv->next_major_frame = NOW(); - - rc = 0; - - fail: - spin_unlock_irqrestore(&sched_priv->lock, flags); - return rc; -} - -/** - * This function is called by the adjust_global scheduler hook to read the - * current ARINC 653 schedule - * - * @param ops Pointer to this instance of the scheduler structure - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -arinc653_sched_get( - const struct scheduler *ops, - struct xen_sysctl_arinc653_schedule *schedule) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - unsigned int i; - unsigned long flags; - - spin_lock_irqsave(&sched_priv->lock, flags); - - schedule->num_sched_entries = sched_priv->num_schedule_entries; - schedule->major_frame = sched_priv->major_frame; - for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) - { - memcpy(schedule->sched_entries[i].dom_handle, - sched_priv->schedule[i].dom_handle, - sizeof(sched_priv->schedule[i].dom_handle)); - schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id; - schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; - } - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - return 0; -} - -/************************************************************************** - * Scheduler callback functions * - **************************************************************************/ - -/** - * This function performs initialization for an instance of the scheduler. - * - * @param ops Pointer to this instance of the scheduler structure - * - * @return
- *     0 = success - *     !0 = error - *
- */ -static int -a653sched_init(struct scheduler *ops) -{ - a653sched_priv_t *prv; - - prv = xzalloc(a653sched_priv_t); - if ( prv == NULL ) - return -ENOMEM; - - ops->sched_data = prv; - - prv->next_major_frame = 0; - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->unit_list); - - return 0; -} - -/** - * This function performs deinitialization for an instance of the scheduler - * - * @param ops Pointer to this instance of the scheduler structure - */ -static void -a653sched_deinit(struct scheduler *ops) -{ - xfree(SCHED_PRIV(ops)); - ops->sched_data = NULL; -} - -/** - * This function allocates scheduler-specific data for a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - * - * @return Pointer to the allocated data - */ -static void * -a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - arinc653_unit_t *svc; - unsigned int entry; - unsigned long flags; - - /* - * Allocate memory for the ARINC 653-specific scheduler data information - * associated with the given UNIT (unit). - */ - svc = xmalloc(arinc653_unit_t); - if ( svc == NULL ) - return NULL; - - spin_lock_irqsave(&sched_priv->lock, flags); - - /* - * Add every one of dom0's units to the schedule, as long as there are - * slots available. - */ - if ( unit->domain->domain_id == 0 ) - { - entry = sched_priv->num_schedule_entries; - - if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) - { - sched_priv->schedule[entry].dom_handle[0] = '\0'; - sched_priv->schedule[entry].unit_id = unit->unit_id; - sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; - sched_priv->schedule[entry].unit = unit; - - sched_priv->major_frame += DEFAULT_TIMESLICE; - ++sched_priv->num_schedule_entries; - } - } - - /* - * Initialize our ARINC 653 scheduler-specific information for the UNIT. - * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it - * will call the vcpu_wake scheduler callback function and our scheduler - * will mark the UNIT awake. - */ - svc->unit = unit; - svc->awake = 0; - if ( !is_idle_unit(unit) ) - list_add(&svc->list, &SCHED_PRIV(ops)->unit_list); - update_schedule_units(ops); - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - return svc; -} - -/** - * This function frees scheduler-specific UNIT data - * - * @param ops Pointer to this instance of the scheduler structure - */ -static void -a653sched_free_udata(const struct scheduler *ops, void *priv) -{ - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - arinc653_unit_t *av = priv; - unsigned long flags; - - if (av == NULL) - return; - - spin_lock_irqsave(&sched_priv->lock, flags); - - if ( !is_idle_unit(av->unit) ) - list_del(&av->list); - - xfree(av); - update_schedule_units(ops); - - spin_unlock_irqrestore(&sched_priv->lock, flags); -} - -/** - * Xen scheduler callback function to sleep a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - */ -static void -a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - if ( AUNIT(unit) != NULL ) - AUNIT(unit)->awake = 0; - - /* - * If the UNIT being put to sleep is the same one that is currently - * running, raise a softirq to invoke the scheduler to switch domains. 
- */ - if ( get_sched_res(sched_unit_master(unit))->curr == unit ) - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); -} - -/** - * Xen scheduler callback function to wake up a UNIT - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - */ -static void -a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - if ( AUNIT(unit) != NULL ) - AUNIT(unit)->awake = 1; - - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); -} - -/** - * Xen scheduler callback function to select a UNIT to run. - * This is the main scheduler routine. - * - * @param ops Pointer to this instance of the scheduler structure - * @param now Current time - */ -static void -a653sched_do_schedule( - const struct scheduler *ops, - struct sched_unit *prev, - s_time_t now, - bool tasklet_work_scheduled) -{ - struct sched_unit *new_task = NULL; - static unsigned int sched_index = 0; - static s_time_t next_switch_time; - a653sched_priv_t *sched_priv = SCHED_PRIV(ops); - const unsigned int cpu = sched_get_resource_cpu(smp_processor_id()); - unsigned long flags; - - spin_lock_irqsave(&sched_priv->lock, flags); - - if ( sched_priv->num_schedule_entries < 1 ) - sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; - else if ( now >= sched_priv->next_major_frame ) - { - /* time to enter a new major frame - * the first time this function is called, this will be true */ - /* start with the first domain in the schedule */ - sched_index = 0; - sched_priv->next_major_frame = now + sched_priv->major_frame; - next_switch_time = now + sched_priv->schedule[0].runtime; - } - else - { - while ( (now >= next_switch_time) - && (sched_index < sched_priv->num_schedule_entries) ) - { - /* time to switch to the next domain in this major frame */ - sched_index++; - next_switch_time += sched_priv->schedule[sched_index].runtime; - } - } - - /* - * If we exhausted the domains in the schedule and still have time left - * in the major frame then switch next at the next major frame. - */ - if ( sched_index >= sched_priv->num_schedule_entries ) - next_switch_time = sched_priv->next_major_frame; - - /* - * If there are more domains to run in the current major frame, set - * new_task equal to the address of next domain's sched_unit structure. - * Otherwise, set new_task equal to the address of the idle task's - * sched_unit structure. - */ - new_task = (sched_index < sched_priv->num_schedule_entries) - ? sched_priv->schedule[sched_index].unit - : IDLETASK(cpu); - - /* Check to see if the new task can be run (awake & runnable). */ - if ( !((new_task != NULL) - && (AUNIT(new_task) != NULL) - && AUNIT(new_task)->awake - && unit_runnable_state(new_task)) ) - new_task = IDLETASK(cpu); - BUG_ON(new_task == NULL); - - /* - * Check to make sure we did not miss a major frame. - * This is a good test for robust partitioning. - */ - BUG_ON(now >= sched_priv->next_major_frame); - - spin_unlock_irqrestore(&sched_priv->lock, flags); - - /* Tasklet work (which runs in idle UNIT context) overrides all else. */ - if ( tasklet_work_scheduled ) - new_task = IDLETASK(cpu); - - /* Running this task would result in a migration */ - if ( !is_idle_unit(new_task) - && (sched_unit_master(new_task) != cpu) ) - new_task = IDLETASK(cpu); - - /* - * Return the amount of time the next domain has to run and the address - * of the selected task's UNIT structure. 
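The ARINC 653 slot selection in a653sched_do_schedule() is purely table-driven: given the fixed per-entry runtimes and the start of the current major frame, the scheduler walks the table until it finds the window containing 'now'. A condensed, stateless sketch of that walk (the in-tree code instead keeps sched_index and next_switch_time as running state across invocations):

    #include <stdint.h>

    #define N_ENTRIES 3

    /* Per-slot runtime, ns; the sum is the major frame length. */
    static const int64_t slot_runtime[N_ENTRIES] = { 10000000, 5000000, 5000000 };

    /*
     * Return the index of the schedule entry whose window contains 'now'
     * and report when the next switch is due.  Returns N_ENTRIES if 'now'
     * has run past every entry, matching the "switch next at the next
     * major frame" case above.
     */
    static unsigned int find_slot(int64_t now, int64_t frame_start,
                                  int64_t *next_switch)
    {
        int64_t t = frame_start;
        unsigned int i;

        for ( i = 0; i < N_ENTRIES; i++ )
        {
            t += slot_runtime[i];
            if ( now < t )
            {
                *next_switch = t;
                return i;
            }
        }

        *next_switch = t;       /* end of the major frame */
        return i;
    }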
- */ - prev->next_time = next_switch_time - now; - prev->next_task = new_task; - new_task->migrated = false; - - BUG_ON(prev->next_time <= 0); -} - -/** - * Xen scheduler callback function to select a resource for the UNIT to run on - * - * @param ops Pointer to this instance of the scheduler structure - * @param unit Pointer to struct sched_unit - * - * @return Scheduler resource to run on - */ -static struct sched_resource * -a653sched_pick_resource(const struct scheduler *ops, - const struct sched_unit *unit) -{ - cpumask_t *online; - unsigned int cpu; - - /* - * If present, prefer unit's current processor, else - * just find the first valid unit. - */ - online = cpupool_domain_master_cpumask(unit->domain); - - cpu = cpumask_first(online); - - if ( cpumask_test_cpu(sched_unit_master(unit), online) - || (cpu >= nr_cpu_ids) ) - cpu = sched_unit_master(unit); - - return get_sched_res(cpu); -} - -/** - * Xen scheduler callback to change the scheduler of a cpu - * - * @param new_ops Pointer to this instance of the scheduler structure - * @param cpu The cpu that is changing scheduler - * @param pdata scheduler specific PCPU data (we don't have any) - * @param vdata scheduler specific UNIT data of the idle unit - */ -static spinlock_t * -a653_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - arinc653_unit_t *svc = vdata; - - ASSERT(!pdata && svc && is_idle_unit(svc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - return &sr->_lock; -} - -/** - * Xen scheduler callback function to perform a global (not domain-specific) - * adjustment. It is used by the ARINC 653 scheduler to put in place a new - * ARINC 653 schedule or to retrieve the schedule currently in place. - * - * @param ops Pointer to this instance of the scheduler structure - * @param sc Pointer to the scheduler operation specified by Domain 0 - */ -static int -a653sched_adjust_global(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - struct xen_sysctl_arinc653_schedule local_sched; - int rc = -EINVAL; - - switch ( sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) - { - rc = -EFAULT; - break; - } - - rc = arinc653_sched_set(ops, &local_sched); - break; - case XEN_SYSCTL_SCHEDOP_getinfo: - memset(&local_sched, -1, sizeof(local_sched)); - rc = arinc653_sched_get(ops, &local_sched); - if ( rc ) - break; - - if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) - rc = -EFAULT; - break; - } - - return rc; -} - -/** - * This structure defines our scheduler for Xen. - * The entries tell Xen where to find our scheduler-specific - * callback functions. - * The symbol must be visible to the rest of Xen at link time. 
- */ -static const struct scheduler sched_arinc653_def = { - .name = "ARINC 653 Scheduler", - .opt_name = "arinc653", - .sched_id = XEN_SCHEDULER_ARINC653, - .sched_data = NULL, - - .init = a653sched_init, - .deinit = a653sched_deinit, - - .free_udata = a653sched_free_udata, - .alloc_udata = a653sched_alloc_udata, - - .insert_unit = NULL, - .remove_unit = NULL, - - .sleep = a653sched_unit_sleep, - .wake = a653sched_unit_wake, - .yield = NULL, - .context_saved = NULL, - - .do_schedule = a653sched_do_schedule, - - .pick_resource = a653sched_pick_resource, - - .switch_sched = a653_switch_sched, - - .adjust = NULL, - .adjust_global = a653sched_adjust_global, - - .dump_settings = NULL, - .dump_cpu_state = NULL, -}; - -REGISTER_SCHEDULER(sched_arinc653_def); - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c deleted file mode 100644 index aa41a3301b..0000000000 --- a/xen/common/sched_credit.c +++ /dev/null @@ -1,2284 +0,0 @@ -/**************************************************************************** - * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. - **************************************************************************** - * - * File: common/csched_credit.c - * Author: Emmanuel Ackaouy - * - * Description: Credit-based SMP CPU scheduler - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Locking: - * - Scheduler-lock (a.k.a. runqueue lock): - * + is per-runqueue, and there is one runqueue per-cpu; - * + serializes all runqueue manipulation operations; - * - Private data lock (a.k.a. private scheduler lock): - * + serializes accesses to the scheduler global state (weight, - * credit, balance_credit, etc); - * + serializes updates to the domains' scheduling parameters. - * - * Ordering is "private lock always comes first": - * + if we need both locks, we must acquire the private - * scheduler lock for first; - * + if we already own a runqueue lock, we must never acquire - * the private scheduler lock. - */ - -/* - * Basic constants - */ -#define CSCHED_DEFAULT_WEIGHT 256 -#define CSCHED_TICKS_PER_TSLICE 3 -/* Default timeslice: 30ms */ -#define CSCHED_DEFAULT_TSLICE_MS 30 -#define CSCHED_CREDITS_PER_MSEC 10 -/* Never set a timer shorter than this value. */ -#define CSCHED_MIN_TIMER XEN_SYSCTL_SCHED_RATELIMIT_MIN - - -/* - * Priorities - */ -#define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ -#define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ -#define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ -#define CSCHED_PRI_IDLE -64 /* idle */ - - -/* - * Flags - * - * Note that svc->flags (where these flags live) is protected by an - * inconsistent set of locks. Therefore atomic-safe bit operations must - * be used for accessing it. 
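Because svc->flags can be updated by paths holding different locks, every update has to be a single atomic read-modify-write. A sketch of the same discipline in plain C11 atomics; the flag mask here is hypothetical, since the file itself stores bit numbers and manipulates them with set_bit()/clear_bit()/test_bit():

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Hypothetical mask form of one per-unit flag. */
    #define TOY_FLAG_YIELD  (1u << 1)

    struct toy_flags {
        atomic_uint flags;
    };

    /*
     * Each update is one atomic read-modify-write, so two paths holding
     * different locks can safely flip different bits of the same word;
     * a plain "f->flags |= TOY_FLAG_YIELD" could lose a concurrent
     * update to another bit.
     */
    static void toy_set_yield(struct toy_flags *f)
    {
        atomic_fetch_or(&f->flags, TOY_FLAG_YIELD);
    }

    static bool toy_test_and_clear_yield(struct toy_flags *f)
    {
        return atomic_fetch_and(&f->flags, ~TOY_FLAG_YIELD) & TOY_FLAG_YIELD;
    }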
- */ -#define CSCHED_FLAG_UNIT_PARKED 0x0 /* UNIT over capped credits */ -#define CSCHED_FLAG_UNIT_YIELD 0x1 /* UNIT yielding */ -#define CSCHED_FLAG_UNIT_MIGRATING 0x2 /* UNIT may have moved to a new pcpu */ -#define CSCHED_FLAG_UNIT_PINNED 0x4 /* UNIT can run only on 1 pcpu */ - - -/* - * Useful macros - */ -#define CSCHED_PRIV(_ops) \ - ((struct csched_private *)((_ops)->sched_data)) -#define CSCHED_PCPU(_c) \ - ((struct csched_pcpu *)get_sched_res(_c)->sched_priv) -#define CSCHED_UNIT(unit) ((struct csched_unit *) (unit)->priv) -#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) -#define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) - - -/* - * CSCHED_STATS - * - * Manage very basic per-unit counters and stats. - * - * Useful for debugging live systems. The stats are displayed - * with runq dumps ('r' on the Xen console). - */ -#ifdef SCHED_STATS - -#define CSCHED_STATS - -#define SCHED_UNIT_STATS_RESET(_V) \ - do \ - { \ - memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ - } while ( 0 ) - -#define SCHED_UNIT_STAT_CRANK(_V, _X) (((_V)->stats._X)++) - -#define SCHED_UNIT_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) - -#else /* !SCHED_STATS */ - -#undef CSCHED_STATS - -#define SCHED_UNIT_STATS_RESET(_V) do {} while ( 0 ) -#define SCHED_UNIT_STAT_CRANK(_V, _X) do {} while ( 0 ) -#define SCHED_UNIT_STAT_SET(_V, _X, _Y) do {} while ( 0 ) - -#endif /* SCHED_STATS */ - - -/* - * Credit tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. - */ -#define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) -#define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) -#define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) -#define TRC_CSCHED_STOLEN_UNIT TRC_SCHED_CLASS_EVT(CSCHED, 4) -#define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) -#define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) -#define TRC_CSCHED_BOOST_START TRC_SCHED_CLASS_EVT(CSCHED, 7) -#define TRC_CSCHED_BOOST_END TRC_SCHED_CLASS_EVT(CSCHED, 8) -#define TRC_CSCHED_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED, 9) -#define TRC_CSCHED_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED, 10) -#define TRC_CSCHED_STEAL_CHECK TRC_SCHED_CLASS_EVT(CSCHED, 11) - -/* - * Boot parameters - */ -static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; -integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); - -/* - * Physical CPU - */ -struct csched_pcpu { - struct list_head runq; - uint32_t runq_sort_last; - - unsigned int idle_bias; - unsigned int nr_runnable; - - unsigned int tick; - struct timer ticker; -}; - -/* - * Virtual UNIT - */ -struct csched_unit { - struct list_head runq_elem; - struct list_head active_unit_elem; - - /* Up-pointers */ - struct csched_dom *sdom; - struct sched_unit *unit; - - s_time_t start_time; /* When we were scheduled (used for credit) */ - unsigned flags; - int pri; - - atomic_t credit; - unsigned int residual; - - s_time_t last_sched_time; - -#ifdef CSCHED_STATS - struct { - int credit_last; - uint32_t credit_incr; - uint32_t state_active; - uint32_t state_idle; - uint32_t migrate_q; - uint32_t migrate_r; - uint32_t kicked_away; - } stats; -#endif -}; - -/* - * Domain - */ -struct csched_dom { - struct list_head active_unit; - struct list_head active_sdom_elem; - struct domain *dom; - uint16_t active_unit_count; - uint16_t weight; - uint16_t cap; -}; - -/* - * System-wide private data - */ -struct csched_private { - /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ - spinlock_t lock; - - cpumask_var_t 
idlers; - cpumask_var_t cpus; - uint32_t *balance_bias; - uint32_t runq_sort; - uint32_t ncpus; - - /* Period of master and tick in milliseconds */ - unsigned int tick_period_us, ticks_per_tslice; - s_time_t ratelimit, tslice, unit_migr_delay; - - struct list_head active_sdom; - uint32_t weight; - uint32_t credit; - int credit_balance; - unsigned int credits_per_tslice; - - unsigned int master; - struct timer master_ticker; -}; - -static void csched_tick(void *_cpu); -static void csched_acct(void *dummy); - -static inline int -__unit_on_runq(struct csched_unit *svc) -{ - return !list_empty(&svc->runq_elem); -} - -static inline struct csched_unit * -__runq_elem(struct list_head *elem) -{ - return list_entry(elem, struct csched_unit, runq_elem); -} - -/* Is the first element of cpu's runq (if any) cpu's idle unit? */ -static inline bool_t is_runq_idle(unsigned int cpu) -{ - /* - * We're peeking at cpu's runq, we must hold the proper lock. - */ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - return list_empty(RUNQ(cpu)) || - is_idle_unit(__runq_elem(RUNQ(cpu)->next)->unit); -} - -static inline void -inc_nr_runnable(unsigned int cpu) -{ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - CSCHED_PCPU(cpu)->nr_runnable++; - -} - -static inline void -dec_nr_runnable(unsigned int cpu) -{ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 1); - CSCHED_PCPU(cpu)->nr_runnable--; -} - -static inline void -__runq_insert(struct csched_unit *svc) -{ - unsigned int cpu = sched_unit_master(svc->unit); - const struct list_head * const runq = RUNQ(cpu); - struct list_head *iter; - - BUG_ON( __unit_on_runq(svc) ); - - list_for_each( iter, runq ) - { - const struct csched_unit * const iter_svc = __runq_elem(iter); - if ( svc->pri > iter_svc->pri ) - break; - } - - /* If the unit yielded, try to put it behind one lower-priority - * runnable unit if we can. The next runq_sort will bring it forward - * within 30ms if the queue too long. 
*/ - if ( test_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags) - && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) - { - iter=iter->next; - - /* Some sanity checks */ - BUG_ON(iter == runq); - } - - list_add_tail(&svc->runq_elem, iter); -} - -static inline void -runq_insert(struct csched_unit *svc) -{ - __runq_insert(svc); - inc_nr_runnable(sched_unit_master(svc->unit)); -} - -static inline void -__runq_remove(struct csched_unit *svc) -{ - BUG_ON( !__unit_on_runq(svc) ); - list_del_init(&svc->runq_elem); -} - -static inline void -runq_remove(struct csched_unit *svc) -{ - dec_nr_runnable(sched_unit_master(svc->unit)); - __runq_remove(svc); -} - -static void burn_credits(struct csched_unit *svc, s_time_t now) -{ - s_time_t delta; - uint64_t val; - unsigned int credits; - - /* Assert svc is current */ - ASSERT( svc == CSCHED_UNIT(curr_on_cpu(sched_unit_master(svc->unit))) ); - - if ( (delta = now - svc->start_time) <= 0 ) - return; - - val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; - svc->residual = do_div(val, MILLISECS(1)); - credits = val; - ASSERT(credits == val); /* make sure we haven't truncated val */ - atomic_sub(credits, &svc->credit); - svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; -} - -static bool_t __read_mostly opt_tickle_one_idle = 1; -boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); - -DEFINE_PER_CPU(unsigned int, last_tickle_cpu); - -static inline void __runq_tickle(struct csched_unit *new) -{ - unsigned int cpu = sched_unit_master(new->unit); - struct sched_resource *sr = get_sched_res(cpu); - struct sched_unit *unit = new->unit; - struct csched_unit * const cur = CSCHED_UNIT(curr_on_cpu(cpu)); - struct csched_private *prv = CSCHED_PRIV(sr->scheduler); - cpumask_t mask, idle_mask, *online; - int balance_step, idlers_empty; - - ASSERT(cur); - cpumask_clear(&mask); - - online = cpupool_domain_master_cpumask(new->sdom->dom); - cpumask_and(&idle_mask, prv->idlers, online); - idlers_empty = cpumask_empty(&idle_mask); - - /* - * Exclusive pinning is when a unit has hard-affinity with only one - * cpu, and there is no other unit that has hard-affinity with that - * same cpu. This is infrequent, but if it happens, is for achieving - * the most possible determinism, and least possible overhead for - * the units in question. - * - * Try to identify the vast majority of these situations, and deal - * with them quickly. - */ - if ( unlikely(test_bit(CSCHED_FLAG_UNIT_PINNED, &new->flags) && - cpumask_test_cpu(cpu, &idle_mask)) ) - { - ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); - SCHED_STAT_CRANK(tickled_idle_cpu_excl); - __cpumask_set_cpu(cpu, &mask); - goto tickle; - } - - /* - * If the pcpu is idle, or there are no idlers and the new - * unit is a higher priority than the old unit, run it here. - * - * If there are idle cpus, first try to find one suitable to run - * new, so we can avoid preempting cur. If we cannot find a - * suitable idler on which to run new, run it here, but try to - * find a suitable idler on which to run cur instead. - */ - if ( cur->pri == CSCHED_PRI_IDLE - || (idlers_empty && new->pri > cur->pri) ) - { - if ( cur->pri != CSCHED_PRI_IDLE ) - SCHED_STAT_CRANK(tickled_busy_cpu); - else - SCHED_STAT_CRANK(tickled_idle_cpu); - __cpumask_set_cpu(cpu, &mask); - } - else if ( !idlers_empty ) - { - /* - * Soft and hard affinity balancing loop. For units without - * a useful soft affinity, consider hard affinity only. 
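The burn_credits() helper above converts elapsed wall-clock time into spent credits, carrying the sub-millisecond remainder forward so nothing is lost to rounding. A self-contained sketch of that arithmetic on toy fields (do_div() is replaced by plain division and modulo):

    #include <stdint.h>

    #define CREDITS_PER_MSEC  10
    #define NSEC_PER_MSEC     1000000LL

    struct toy_csched_unit {
        int64_t  start_time;   /* when the unit was last scheduled, ns */
        int64_t  credit;       /* remaining credit */
        int64_t  residual;     /* sub-millisecond remainder carried over */
    };

    static void toy_burn_credits(struct toy_csched_unit *u, int64_t now)
    {
        int64_t delta = now - u->start_time;
        int64_t val, credits;

        if ( delta <= 0 )
            return;

        val = delta * CREDITS_PER_MSEC + u->residual;
        credits = val / NSEC_PER_MSEC;
        u->residual = val % NSEC_PER_MSEC;

        u->credit -= credits;
        u->start_time += credits * NSEC_PER_MSEC / CREDITS_PER_MSEC;
    }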
- */ - for_each_affinity_balance_step( balance_step ) - { - int new_idlers_empty; - - if ( balance_step == BALANCE_SOFT_AFFINITY - && !has_soft_affinity(unit) ) - continue; - - /* Are there idlers suitable for new (for this balance step)? */ - affinity_balance_cpumask(unit, balance_step, - cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), - cpumask_scratch_cpu(cpu), &idle_mask); - new_idlers_empty = cpumask_empty(cpumask_scratch_cpu(cpu)); - - /* - * Let's not be too harsh! If there aren't idlers suitable - * for new in its soft affinity mask, make sure we check its - * hard affinity as well, before taking final decisions. - */ - if ( new_idlers_empty - && balance_step == BALANCE_SOFT_AFFINITY ) - continue; - - /* - * If there are no suitable idlers for new, and it's higher - * priority than cur, check whether we can migrate cur away. - * We have to do it indirectly, via _VPF_migrating (instead - * of just tickling any idler suitable for cur) because cur - * is running. - * - * If there are suitable idlers for new, no matter priorities, - * leave cur alone (as it is running and is, likely, cache-hot) - * and wake some of them (which is waking up and so is, likely, - * cache cold anyway). - */ - if ( new_idlers_empty && new->pri > cur->pri ) - { - if ( cpumask_intersects(unit->cpu_hard_affinity, &idle_mask) ) - { - SCHED_UNIT_STAT_CRANK(cur, kicked_away); - SCHED_UNIT_STAT_CRANK(cur, migrate_r); - SCHED_STAT_CRANK(migrate_kicked_away); - sched_set_pause_flags_atomic(cur->unit, _VPF_migrating); - } - /* Tickle cpu anyway, to let new preempt cur. */ - SCHED_STAT_CRANK(tickled_busy_cpu); - __cpumask_set_cpu(cpu, &mask); - } - else if ( !new_idlers_empty ) - { - /* Which of the idlers suitable for new shall we wake up? */ - SCHED_STAT_CRANK(tickled_idle_cpu); - if ( opt_tickle_one_idle ) - { - this_cpu(last_tickle_cpu) = - cpumask_cycle(this_cpu(last_tickle_cpu), - cpumask_scratch_cpu(cpu)); - __cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); - } - else - cpumask_or(&mask, &mask, cpumask_scratch_cpu(cpu)); - } - - /* Did we find anyone? */ - if ( !cpumask_empty(&mask) ) - break; - } - } - - tickle: - if ( !cpumask_empty(&mask) ) - { - if ( unlikely(tb_init_done) ) - { - /* Avoid TRACE_*: saves checking !tb_init_done each step */ - for_each_cpu(cpu, &mask) - __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu); - } - - /* - * Mark the designated CPUs as busy and send them all the scheduler - * interrupt. We need the for_each_cpu for dealing with the - * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and - * can't use cpumask_andnot(), because prv->idlers needs atomic access. - * - * In the default (and most common) case, when opt_rickle_one_idle is - * true, the loop does only one step, and only one bit is cleared. - */ - for_each_cpu(cpu, &mask) - cpumask_clear_cpu(cpu, prv->idlers); - cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); - } - else - SCHED_STAT_CRANK(tickled_no_cpu); -} - -static void -csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - - /* - * pcpu either points to a valid struct csched_pcpu, or is NULL, if we're - * beeing called from CPU_UP_CANCELLED, because bringing up a pCPU failed - * very early. xfree() does not really mind, but we want to be sure that, - * when we get here, either init_pdata has never been called, or - * deinit_pdata has been called already. 
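The ordering constraints spelled out in the free_pdata/deinit_pdata comments amount to a small lifecycle: alloc_pdata, then init_pdata, then deinit_pdata, then free_pdata, with free_pdata also reachable straight after alloc_pdata when bringing the pCPU up is cancelled early. A toy model of just that contract, with assertions standing in for the real checks:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct toy_pcpu {
        bool initialised;
    };

    static struct toy_pcpu *toy_alloc_pdata(void)
    {
        return calloc(1, sizeof(struct toy_pcpu));
    }

    static void toy_init_pdata(struct toy_pcpu *p)
    {
        assert(p && !p->initialised);
        p->initialised = true;
    }

    static void toy_deinit_pdata(struct toy_pcpu *p)
    {
        assert(p && p->initialised);
        p->initialised = false;
    }

    static void toy_free_pdata(struct toy_pcpu *p)
    {
        /* May see NULL, or a pcpu that was never initialised. */
        assert(!p || !p->initialised);
        free(p);
    }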
- */ - ASSERT(!cpumask_test_cpu(cpu, prv->cpus)); - - xfree(pcpu); -} - -static void -csched_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_pcpu *spc = pcpu; - unsigned int node = cpu_to_node(cpu); - unsigned long flags; - - /* - * Scheduler specific data for this pCPU must still be there and and be - * valid. In fact, if we are here: - * 1. alloc_pdata must have been called for this cpu, and free_pdata - * must not have been called on it before us, - * 2. init_pdata must have been called on this cpu, and deinit_pdata - * (us!) must not have been called on it already. - */ - ASSERT(spc && cpumask_test_cpu(cpu, prv->cpus)); - - spin_lock_irqsave(&prv->lock, flags); - - prv->credit -= prv->credits_per_tslice; - prv->ncpus--; - cpumask_clear_cpu(cpu, prv->idlers); - cpumask_clear_cpu(cpu, prv->cpus); - if ( (prv->master == cpu) && (prv->ncpus > 0) ) - { - prv->master = cpumask_first(prv->cpus); - migrate_timer(&prv->master_ticker, prv->master); - } - if ( prv->balance_bias[node] == cpu ) - { - cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(node)); - if ( !cpumask_empty(cpumask_scratch) ) - prv->balance_bias[node] = cpumask_first(cpumask_scratch); - } - kill_timer(&spc->ticker); - if ( prv->ncpus == 0 ) - kill_timer(&prv->master_ticker); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void * -csched_alloc_pdata(const struct scheduler *ops, int cpu) -{ - struct csched_pcpu *spc; - - /* Allocate per-PCPU info */ - spc = xzalloc(struct csched_pcpu); - if ( spc == NULL ) - return ERR_PTR(-ENOMEM); - - return spc; -} - -static void -init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu) -{ - ASSERT(spin_is_locked(&prv->lock)); - /* cpu data needs to be allocated, but STILL uninitialized. */ - ASSERT(spc && spc->runq.next == NULL && spc->runq.prev == NULL); - - /* Initialize/update system-wide config */ - prv->credit += prv->credits_per_tslice; - prv->ncpus++; - cpumask_set_cpu(cpu, prv->cpus); - if ( prv->ncpus == 1 ) - { - prv->master = cpu; - init_timer(&prv->master_ticker, csched_acct, prv, cpu); - set_timer(&prv->master_ticker, NOW() + prv->tslice); - } - - cpumask_and(cpumask_scratch, prv->cpus, &node_to_cpumask(cpu_to_node(cpu))); - if ( cpumask_weight(cpumask_scratch) == 1 ) - prv->balance_bias[cpu_to_node(cpu)] = cpu; - - init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); - set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); - - INIT_LIST_HEAD(&spc->runq); - spc->runq_sort_last = prv->runq_sort; - spc->idle_bias = nr_cpu_ids - 1; - - /* Start off idling... */ - BUG_ON(!is_idle_unit(curr_on_cpu(cpu))); - cpumask_set_cpu(cpu, prv->idlers); - spc->nr_runnable = 0; -} - -static void -csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - unsigned long flags; - struct csched_private *prv = CSCHED_PRIV(ops); - - spin_lock_irqsave(&prv->lock, flags); - init_pdata(prv, pdata, cpu); - spin_unlock_irqrestore(&prv->lock, flags); -} - -/* Change the scheduler of cpu to us (Credit). */ -static spinlock_t * -csched_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct csched_private *prv = CSCHED_PRIV(new_ops); - struct csched_unit *svc = vdata; - - ASSERT(svc && is_idle_unit(svc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). 
It actually may or may not be the 'right' - * one for this cpu, but that is ok for preventing races. - */ - ASSERT(!local_irq_is_enabled()); - spin_lock(&prv->lock); - init_pdata(prv, pdata, cpu); - spin_unlock(&prv->lock); - - return &sr->_lock; -} - -#ifndef NDEBUG -static inline void -__csched_unit_check(struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - struct csched_dom * const sdom = svc->sdom; - - BUG_ON( svc->unit != unit ); - BUG_ON( sdom != CSCHED_DOM(unit->domain) ); - if ( sdom ) - { - BUG_ON( is_idle_unit(unit) ); - BUG_ON( sdom->dom != unit->domain ); - } - else - { - BUG_ON( !is_idle_unit(unit) ); - } - - SCHED_STAT_CRANK(unit_check); -} -#define CSCHED_UNIT_CHECK(unit) (__csched_unit_check(unit)) -#else -#define CSCHED_UNIT_CHECK(unit) -#endif - -/* - * Delay, in microseconds, between migrations of a UNIT between PCPUs. - * This prevents rapid fluttering of a UNIT between CPUs, and reduces the - * implicit overheads such as cache-warming. 1ms (1000) has been measured - * as a good value. - */ -static unsigned int vcpu_migration_delay_us; -integer_param("vcpu_migration_delay", vcpu_migration_delay_us); - -static inline bool -__csched_vcpu_is_cache_hot(const struct csched_private *prv, - const struct csched_unit *svc) -{ - bool hot = prv->unit_migr_delay && - (NOW() - svc->last_sched_time) < prv->unit_migr_delay; - - if ( hot ) - SCHED_STAT_CRANK(unit_hot); - - return hot; -} - -static inline int -__csched_unit_is_migrateable(const struct csched_private *prv, - struct sched_unit *unit, - int dest_cpu, cpumask_t *mask) -{ - const struct csched_unit *svc = CSCHED_UNIT(unit); - /* - * Don't pick up work that's hot on peer PCPU, or that can't (or - * would prefer not to) run on cpu. - * - * The caller is supposed to have already checked that unit is also - * not running. - */ - ASSERT(!unit->is_running); - - return !__csched_vcpu_is_cache_hot(prv, svc) && - cpumask_test_cpu(dest_cpu, mask); -} - -static int -_csched_cpu_pick(const struct scheduler *ops, const struct sched_unit *unit, - bool_t commit) -{ - int cpu = sched_unit_master(unit); - /* We must always use cpu's scratch space */ - cpumask_t *cpus = cpumask_scratch_cpu(cpu); - cpumask_t idlers; - cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); - struct csched_pcpu *spc = NULL; - int balance_step; - - for_each_affinity_balance_step( balance_step ) - { - affinity_balance_cpumask(unit, balance_step, cpus); - cpumask_and(cpus, online, cpus); - /* - * We want to pick up a pcpu among the ones that are online and - * can accommodate vc. As far as hard affinity is concerned, there - * always will be at least one of these pcpus in the scratch cpumask, - * hence, the calls to cpumask_cycle() and cpumask_test_cpu() below - * are ok. - * - * On the other hand, when considering soft affinity, it is possible - * that the mask is empty (for instance, if the domain has been put - * in a cpupool that does not contain any of the pcpus in its soft - * affinity), which would result in the ASSERT()-s inside cpumask_*() - * operations triggering (in debug builds). - * - * Therefore, if that is the case, we just skip the soft affinity - * balancing step all together. - */ - if ( balance_step == BALANCE_SOFT_AFFINITY && - (!has_soft_affinity(unit) || cpumask_empty(cpus)) ) - continue; - - /* If present, prefer vc's current processor */ - cpu = cpumask_test_cpu(sched_unit_master(unit), cpus) - ? 
sched_unit_master(unit) - : cpumask_cycle(sched_unit_master(unit), cpus); - ASSERT(cpumask_test_cpu(cpu, cpus)); - - /* - * Try to find an idle processor within the above constraints. - * - * In multi-core and multi-threaded CPUs, not all idle execution - * vehicles are equal! - * - * We give preference to the idle execution vehicle with the most - * idling neighbours in its grouping. This distributes work across - * distinct cores first and guarantees we don't do something stupid - * like run two UNITs on co-hyperthreads while there are idle cores - * or sockets. - * - * Notice that, when computing the "idleness" of cpu, we may want to - * discount unit. That is, iff unit is the currently running and the - * only runnable unit on cpu, we add cpu to the idlers. - */ - cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); - if ( sched_unit_master(unit) == cpu && is_runq_idle(cpu) ) - __cpumask_set_cpu(cpu, &idlers); - cpumask_and(cpus, &idlers, cpus); - - /* - * It is important that cpu points to an idle processor, if a suitable - * one exists (and we can use cpus to check and, possibly, choose a new - * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and - * cpu points to a busy thread with an idle sibling, both the threads - * will be considered the same, from the "idleness" calculation point - * of view", preventing unit from being moved to the thread that is - * actually idle. - * - * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so - * we check for it first. - */ - if ( !cpumask_test_cpu(cpu, cpus) && !cpumask_empty(cpus) ) - cpu = cpumask_cycle(cpu, cpus); - __cpumask_clear_cpu(cpu, cpus); - - while ( !cpumask_empty(cpus) ) - { - cpumask_t cpu_idlers; - cpumask_t nxt_idlers; - int nxt, weight_cpu, weight_nxt; - int migrate_factor; - - nxt = cpumask_cycle(cpu, cpus); - - if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) - { - /* We're on the same socket, so check the busy-ness of threads. - * Migrate if # of idlers is less at all */ - ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 1; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, - cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, - nxt)); - } - else - { - /* We're on different sockets, so check the busy-ness of cores. - * Migrate only if the other core is twice as idle */ - ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 2; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); - } - - weight_cpu = cpumask_weight(&cpu_idlers); - weight_nxt = cpumask_weight(&nxt_idlers); - /* smt_power_savings: consolidate work rather than spreading it */ - if ( sched_smt_power_savings ? 
- weight_cpu > weight_nxt : - weight_cpu * migrate_factor < weight_nxt ) - { - cpumask_and(&nxt_idlers, &nxt_idlers, cpus); - spc = CSCHED_PCPU(nxt); - cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); - cpumask_andnot(cpus, cpus, per_cpu(cpu_sibling_mask, cpu)); - } - else - { - cpumask_andnot(cpus, cpus, &nxt_idlers); - } - } - - /* Stop if cpu is idle */ - if ( cpumask_test_cpu(cpu, &idlers) ) - break; - } - - if ( commit && spc ) - spc->idle_bias = cpu; - - TRACE_3D(TRC_CSCHED_PICKED_CPU, unit->domain->domain_id, unit->unit_id, - cpu); - - return cpu; -} - -static struct sched_resource * -csched_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - struct csched_unit *svc = CSCHED_UNIT(unit); - - /* - * We have been called by vcpu_migrate() (in schedule.c), as part - * of the process of seeing if vc can be migrated to another pcpu. - * We make a note about this in svc->flags so that later, in - * csched_unit_wake() (still called from vcpu_migrate()) we won't - * get boosted, which we don't deserve as we are "only" migrating. - */ - set_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); - return get_sched_res(_csched_cpu_pick(ops, unit, 1)); -} - -static inline void -__csched_unit_acct_start(struct csched_private *prv, struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - if ( list_empty(&svc->active_unit_elem) ) - { - SCHED_UNIT_STAT_CRANK(svc, state_active); - SCHED_STAT_CRANK(acct_unit_active); - - sdom->active_unit_count++; - list_add(&svc->active_unit_elem, &sdom->active_unit); - /* Make weight per-unit */ - prv->weight += sdom->weight; - if ( list_empty(&sdom->active_sdom_elem) ) - { - list_add(&sdom->active_sdom_elem, &prv->active_sdom); - } - } - - TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, - svc->unit->unit_id, sdom->active_unit_count); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static inline void -__csched_unit_acct_stop_locked(struct csched_private *prv, - struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - - BUG_ON( list_empty(&svc->active_unit_elem) ); - - SCHED_UNIT_STAT_CRANK(svc, state_idle); - SCHED_STAT_CRANK(acct_unit_idle); - - BUG_ON( prv->weight < sdom->weight ); - sdom->active_unit_count--; - list_del_init(&svc->active_unit_elem); - prv->weight -= sdom->weight; - if ( list_empty(&sdom->active_unit) ) - { - list_del_init(&sdom->active_sdom_elem); - } - - TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, - svc->unit->unit_id, sdom->active_unit_count); -} - -static void -csched_unit_acct(struct csched_private *prv, unsigned int cpu) -{ - struct sched_unit *currunit = current->sched_unit; - struct csched_unit * const svc = CSCHED_UNIT(currunit); - struct sched_resource *sr = get_sched_res(cpu); - const struct scheduler *ops = sr->scheduler; - - ASSERT( sched_unit_master(currunit) == cpu ); - ASSERT( svc->sdom != NULL ); - ASSERT( !is_idle_unit(svc->unit) ); - - /* - * If this UNIT's priority was boosted when it last awoke, reset it. - * If the UNIT is found here, then it's consuming a non-negligeable - * amount of CPU resources and should no longer be boosted. - */ - if ( svc->pri == CSCHED_PRI_TS_BOOST ) - { - svc->pri = CSCHED_PRI_TS_UNDER; - TRACE_2D(TRC_CSCHED_BOOST_END, svc->sdom->dom->domain_id, - svc->unit->unit_id); - } - - /* - * Update credits - */ - burn_credits(svc, NOW()); - - /* - * Put this UNIT and domain back on the active list if it was - * idling. 
- */ - if ( list_empty(&svc->active_unit_elem) ) - { - __csched_unit_acct_start(prv, svc); - } - else - { - unsigned int new_cpu; - unsigned long flags; - spinlock_t *lock = unit_schedule_lock_irqsave(currunit, &flags); - - /* - * If it's been active a while, check if we'd be better off - * migrating it to run elsewhere (see multi-core and multi-thread - * support in csched_res_pick()). - */ - new_cpu = _csched_cpu_pick(ops, currunit, 0); - - unit_schedule_unlock_irqrestore(lock, flags, currunit); - - if ( new_cpu != cpu ) - { - SCHED_UNIT_STAT_CRANK(svc, migrate_r); - SCHED_STAT_CRANK(migrate_running); - sched_set_pause_flags_atomic(currunit, _VPF_migrating); - /* - * As we are about to tickle cpu, we should clear its bit in - * idlers. But, if we are here, it means there is someone running - * on it, and hence the bit must be zero already. - */ - ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(ops)->idlers)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - } -} - -static void * -csched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - struct csched_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct csched_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->runq_elem); - INIT_LIST_HEAD(&svc->active_unit_elem); - svc->sdom = dd; - svc->unit = unit; - svc->pri = is_idle_unit(unit) ? - CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; - SCHED_UNIT_STATS_RESET(svc); - SCHED_STAT_CRANK(unit_alloc); - return svc; -} - -static void -csched_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit *svc = unit->priv; - spinlock_t *lock; - - BUG_ON( is_idle_unit(unit) ); - - /* csched_res_pick() looks in vc->processor's runq, so we need the lock. */ - lock = unit_schedule_lock_irq(unit); - - sched_set_res(unit, csched_res_pick(ops, unit)); - - spin_unlock_irq(lock); - - lock = unit_schedule_lock_irq(unit); - - if ( !__unit_on_runq(svc) && unit_runnable(unit) && !unit->is_running ) - runq_insert(svc); - - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_insert); -} - -static void -csched_free_udata(const struct scheduler *ops, void *priv) -{ - struct csched_unit *svc = priv; - - BUG_ON( !list_empty(&svc->runq_elem) ); - - xfree(svc); -} - -static void -csched_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_unit * const svc = CSCHED_UNIT(unit); - struct csched_dom * const sdom = svc->sdom; - - SCHED_STAT_CRANK(unit_remove); - - ASSERT(!__unit_on_runq(svc)); - - if ( test_and_clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - SCHED_STAT_CRANK(unit_unpark); - sched_unit_unpause(svc->unit); - } - - spin_lock_irq(&prv->lock); - - if ( !list_empty(&svc->active_unit_elem) ) - __csched_unit_acct_stop_locked(prv, svc); - - spin_unlock_irq(&prv->lock); - - BUG_ON( sdom == NULL ); -} - -static void -csched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - unsigned int cpu = sched_unit_master(unit); - struct sched_resource *sr = get_sched_res(cpu); - - SCHED_STAT_CRANK(unit_sleep); - - BUG_ON( is_idle_unit(unit) ); - - if ( curr_on_cpu(cpu) == unit ) - { - /* - * We are about to tickle cpu, so we should clear its bit in idlers. - * But, we are here because unit is going to sleep while running on cpu, - * so the bit must be zero already. 
- */ - ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(sr->scheduler)->idlers)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - else if ( __unit_on_runq(svc) ) - runq_remove(svc); -} - -static void -csched_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - bool_t migrating; - - BUG_ON( is_idle_unit(unit) ); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - if ( unlikely(__unit_on_runq(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* - * We temporarily boost the priority of awaking UNITs! - * - * If this UNIT consumes a non negligible amount of CPU, it - * will eventually find itself in the credit accounting code - * path where its priority will be reset to normal. - * - * If on the other hand the UNIT consumes little CPU and is - * blocking and awoken a lot (doing I/O for example), its - * priority will remain boosted, optimizing it's wake-to-run - * latencies. - * - * This allows wake-to-run latency sensitive UNITs to preempt - * more CPU resource intensive UNITs without impacting overall - * system fairness. - * - * There are two cases, when we don't want to boost: - * - UNITs that are waking up after a migration, rather than - * after having block; - * - UNITs of capped domains unpausing after earning credits - * they had overspent. - */ - migrating = test_and_clear_bit(CSCHED_FLAG_UNIT_MIGRATING, &svc->flags); - - if ( !migrating && svc->pri == CSCHED_PRI_TS_UNDER && - !test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - TRACE_2D(TRC_CSCHED_BOOST_START, unit->domain->domain_id, - unit->unit_id); - SCHED_STAT_CRANK(unit_boost); - svc->pri = CSCHED_PRI_TS_BOOST; - } - - /* Put the UNIT on the runq and tickle CPUs */ - runq_insert(svc); - __runq_tickle(svc); -} - -static void -csched_unit_yield(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched_unit * const svc = CSCHED_UNIT(unit); - - /* Let the scheduler know that this vcpu is trying to yield */ - set_bit(CSCHED_FLAG_UNIT_YIELD, &svc->flags); -} - -static int -csched_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct csched_dom * const sdom = CSCHED_DOM(d); - struct csched_private *prv = CSCHED_PRIV(ops); - unsigned long flags; - int rc = 0; - - /* Protect both get and put branches with the pluggable scheduler - * lock. Runq lock not needed anywhere in here. */ - spin_lock_irqsave(&prv->lock, flags); - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - op->u.credit.weight = sdom->weight; - op->u.credit.cap = sdom->cap; - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - if ( op->u.credit.weight != 0 ) - { - if ( !list_empty(&sdom->active_sdom_elem) ) - { - prv->weight -= sdom->weight * sdom->active_unit_count; - prv->weight += op->u.credit.weight * sdom->active_unit_count; - } - sdom->weight = op->u.credit.weight; - } - - if ( op->u.credit.cap != (uint16_t)~0U ) - sdom->cap = op->u.credit.cap; - break; - default: - rc = -EINVAL; - break; - } - - spin_unlock_irqrestore(&prv->lock, flags); - - return rc; -} - -static void -csched_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, - const cpumask_t *hard, const cpumask_t *soft) -{ - struct csched_unit *svc = CSCHED_UNIT(unit); - - if ( !hard ) - return; - - /* Are we becoming exclusively pinned? 
*/ - if ( cpumask_weight(hard) == 1 ) - set_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); - else - clear_bit(CSCHED_FLAG_UNIT_PINNED, &svc->flags); -} - -static inline void -__csched_set_tslice(struct csched_private *prv, unsigned int timeslice_ms) -{ - prv->tslice = MILLISECS(timeslice_ms); - prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; - if ( timeslice_ms < prv->ticks_per_tslice ) - prv->ticks_per_tslice = 1; - prv->tick_period_us = timeslice_ms * 1000 / prv->ticks_per_tslice; - prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * timeslice_ms; - prv->credit = prv->credits_per_tslice * prv->ncpus; -} - -static int -csched_sys_cntl(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - int rc = -EINVAL; - struct xen_sysctl_credit_schedule *params = &sc->u.sched_credit; - struct csched_private *prv = CSCHED_PRIV(ops); - unsigned long flags; - - switch ( sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX - || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN - || (params->ratelimit_us - && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX - || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) - || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) - || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) - goto out; - - spin_lock_irqsave(&prv->lock, flags); - __csched_set_tslice(prv, params->tslice_ms); - if ( !prv->ratelimit && params->ratelimit_us ) - printk(XENLOG_INFO "Enabling context switch rate limiting\n"); - else if ( prv->ratelimit && !params->ratelimit_us ) - printk(XENLOG_INFO "Disabling context switch rate limiting\n"); - prv->ratelimit = MICROSECS(params->ratelimit_us); - prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us); - spin_unlock_irqrestore(&prv->lock, flags); - - /* FALLTHRU */ - case XEN_SYSCTL_SCHEDOP_getinfo: - params->tslice_ms = prv->tslice / MILLISECS(1); - params->ratelimit_us = prv->ratelimit / MICROSECS(1); - params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1); - rc = 0; - break; - } - out: - return rc; -} - -static void * -csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - struct csched_dom *sdom; - - sdom = xzalloc(struct csched_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - /* Initialize credit and weight */ - INIT_LIST_HEAD(&sdom->active_unit); - INIT_LIST_HEAD(&sdom->active_sdom_elem); - sdom->dom = dom; - sdom->weight = CSCHED_DEFAULT_WEIGHT; - - return sdom; -} - -static void -csched_free_domdata(const struct scheduler *ops, void *data) -{ - xfree(data); -} - -/* - * This is a O(n) optimized sort of the runq. - * - * Time-share UNITs can only be one of two priorities, UNDER or OVER. We walk - * through the runq and move up any UNDERs that are preceded by OVERS. We - * remember the last UNDER to make the move up operation O(1). 
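 *
 * [Editor's addition - an illustrative walk-through, not part of the
 *  original file.] As an example, take a runq that currently reads
 *      head -> O1 -> U1 -> O2 -> U2 -> O3
 *  (U = UNDER, O = OVER). One pass of the loop below moves U1 right
 *  after head, then U2 right after U1 (last_under tracks the tail of
 *  the UNDER prefix), giving
 *      head -> U1 -> U2 -> O1 -> O2 -> O3
 *  Each move is a constant-time list_del()/list_add() pair, and the
 *  relative order within each priority class is preserved.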
- */ -static void -csched_runq_sort(struct csched_private *prv, unsigned int cpu) -{ - struct csched_pcpu * const spc = CSCHED_PCPU(cpu); - struct list_head *runq, *elem, *next, *last_under; - struct csched_unit *svc_elem; - spinlock_t *lock; - unsigned long flags; - int sort_epoch; - - sort_epoch = prv->runq_sort; - if ( sort_epoch == spc->runq_sort_last ) - return; - - spc->runq_sort_last = sort_epoch; - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - runq = &spc->runq; - elem = runq->next; - last_under = runq; - - while ( elem != runq ) - { - next = elem->next; - svc_elem = __runq_elem(elem); - - if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) - { - /* does elem need to move up the runq? */ - if ( elem->prev != last_under ) - { - list_del(elem); - list_add(elem, last_under); - } - last_under = elem; - } - - elem = next; - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); -} - -static void -csched_acct(void* dummy) -{ - struct csched_private *prv = dummy; - unsigned long flags; - struct list_head *iter_unit, *next_unit; - struct list_head *iter_sdom, *next_sdom; - struct csched_unit *svc; - struct csched_dom *sdom; - uint32_t credit_total; - uint32_t weight_total; - uint32_t weight_left; - uint32_t credit_fair; - uint32_t credit_peak; - uint32_t credit_cap; - int credit_balance; - int credit_xtra; - int credit; - - - spin_lock_irqsave(&prv->lock, flags); - - weight_total = prv->weight; - credit_total = prv->credit; - - /* Converge balance towards 0 when it drops negative */ - if ( prv->credit_balance < 0 ) - { - credit_total -= prv->credit_balance; - SCHED_STAT_CRANK(acct_balance); - } - - if ( unlikely(weight_total == 0) ) - { - prv->credit_balance = 0; - spin_unlock_irqrestore(&prv->lock, flags); - SCHED_STAT_CRANK(acct_no_work); - goto out; - } - - SCHED_STAT_CRANK(acct_run); - - weight_left = weight_total; - credit_balance = 0; - credit_xtra = 0; - credit_cap = 0U; - - list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) - { - sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); - - BUG_ON( is_idle_domain(sdom->dom) ); - BUG_ON( sdom->active_unit_count == 0 ); - BUG_ON( sdom->weight == 0 ); - BUG_ON( (sdom->weight * sdom->active_unit_count) > weight_left ); - - weight_left -= ( sdom->weight * sdom->active_unit_count ); - - /* - * A domain's fair share is computed using its weight in competition - * with that of all other active domains. - * - * At most, a domain can use credits to run all its active UNITs - * for one full accounting period. We allow a domain to earn more - * only when the system-wide credit balance is negative. - */ - credit_peak = sdom->active_unit_count * prv->credits_per_tslice; - if ( prv->credit_balance < 0 ) - { - credit_peak += ( ( -prv->credit_balance - * sdom->weight - * sdom->active_unit_count) + - (weight_total - 1) - ) / weight_total; - } - - if ( sdom->cap != 0U ) - { - credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; - if ( credit_cap < credit_peak ) - credit_peak = credit_cap; - - /* FIXME -- set cap per-unit as well...? 
*/ - credit_cap = ( credit_cap + ( sdom->active_unit_count - 1 ) - ) / sdom->active_unit_count; - } - - credit_fair = ( ( credit_total - * sdom->weight - * sdom->active_unit_count ) - + (weight_total - 1) - ) / weight_total; - - if ( credit_fair < credit_peak ) - { - credit_xtra = 1; - } - else - { - if ( weight_left != 0U ) - { - /* Give other domains a chance at unused credits */ - credit_total += ( ( ( credit_fair - credit_peak - ) * weight_total - ) + ( weight_left - 1 ) - ) / weight_left; - } - - if ( credit_xtra ) - { - /* - * Lazily keep domains with extra credits at the head of - * the queue to give others a chance at them in future - * accounting periods. - */ - SCHED_STAT_CRANK(acct_reorder); - list_del(&sdom->active_sdom_elem); - list_add(&sdom->active_sdom_elem, &prv->active_sdom); - } - - credit_fair = credit_peak; - } - - /* Compute fair share per UNIT */ - credit_fair = ( credit_fair + ( sdom->active_unit_count - 1 ) - ) / sdom->active_unit_count; - - - list_for_each_safe( iter_unit, next_unit, &sdom->active_unit ) - { - svc = list_entry(iter_unit, struct csched_unit, active_unit_elem); - BUG_ON( sdom != svc->sdom ); - - /* Increment credit */ - atomic_add(credit_fair, &svc->credit); - credit = atomic_read(&svc->credit); - - /* - * Recompute priority or, if UNIT is idling, remove it from - * the active list. - */ - if ( credit < 0 ) - { - svc->pri = CSCHED_PRI_TS_OVER; - - /* Park running UNITs of capped-out domains */ - if ( sdom->cap != 0U && - credit < -credit_cap && - !test_and_set_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - SCHED_STAT_CRANK(unit_park); - sched_unit_pause_nosync(svc->unit); - } - - /* Lower bound on credits */ - if ( credit < -prv->credits_per_tslice ) - { - SCHED_STAT_CRANK(acct_min_credit); - credit = -prv->credits_per_tslice; - atomic_set(&svc->credit, credit); - } - } - else - { - svc->pri = CSCHED_PRI_TS_UNDER; - - /* Unpark any capped domains whose credits go positive */ - if ( test_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags) ) - { - /* - * It's important to unset the flag AFTER the unpause() - * call to make sure the UNIT's priority is not boosted - * if it is woken up here. - */ - SCHED_STAT_CRANK(unit_unpark); - sched_unit_unpause(svc->unit); - clear_bit(CSCHED_FLAG_UNIT_PARKED, &svc->flags); - } - - /* Upper bound on credits means UNIT stops earning */ - if ( credit > prv->credits_per_tslice ) - { - __csched_unit_acct_stop_locked(prv, svc); - /* Divide credits in half, so that when it starts - * accounting again, it starts a little bit "ahead" */ - credit /= 2; - atomic_set(&svc->credit, credit); - } - } - - SCHED_UNIT_STAT_SET(svc, credit_last, credit); - SCHED_UNIT_STAT_SET(svc, credit_incr, credit_fair); - credit_balance += credit; - } - } - - prv->credit_balance = credit_balance; - - spin_unlock_irqrestore(&prv->lock, flags); - - /* Inform each CPU that its runq needs to be sorted */ - prv->runq_sort++; - -out: - set_timer( &prv->master_ticker, NOW() + prv->tslice); -} - -static void -csched_tick(void *_cpu) -{ - unsigned int cpu = (unsigned long)_cpu; - struct sched_resource *sr = get_sched_res(cpu); - struct csched_pcpu *spc = CSCHED_PCPU(cpu); - struct csched_private *prv = CSCHED_PRIV(sr->scheduler); - - spc->tick++; - - /* - * Accounting for running UNIT - */ - if ( !is_idle_unit(current->sched_unit) ) - csched_unit_acct(prv, cpu); - - /* - * Check if runq needs to be sorted - * - * Every physical CPU resorts the runq after the accounting master has - * modified priorities. 
This is a special O(n) sort and runs at most - * once per accounting period (currently 30 milliseconds). - */ - csched_runq_sort(prv, cpu); - - set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); -} - -static struct csched_unit * -csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) -{ - struct sched_resource *sr = get_sched_res(cpu); - const struct csched_private * const prv = CSCHED_PRIV(sr->scheduler); - const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); - struct csched_unit *speer; - struct list_head *iter; - struct sched_unit *unit; - - ASSERT(peer_pcpu != NULL); - - /* - * Don't steal from an idle CPU's runq because it's about to - * pick up work from it itself. - */ - if ( unlikely(is_idle_unit(curr_on_cpu(peer_cpu))) ) - goto out; - - list_for_each( iter, &peer_pcpu->runq ) - { - speer = __runq_elem(iter); - - /* - * If next available UNIT here is not of strictly higher - * priority than ours, this PCPU is useless to us. - */ - if ( speer->pri <= pri ) - break; - - /* Is this UNIT runnable on our PCPU? */ - unit = speer->unit; - BUG_ON( is_idle_unit(unit) ); - - /* - * If the unit is still in peer_cpu's scheduling tail, or if it - * has no useful soft affinity, skip it. - * - * In fact, what we want is to check if we have any "soft-affine - * work" to steal, before starting to look at "hard-affine work". - * - * Notice that, if not even one unit on this runq has a useful - * soft affinity, we could have avoid considering this runq for - * a soft balancing step in the first place. This, for instance, - * can be implemented by taking note of on what runq there are - * units with useful soft affinities in some sort of bitmap - * or counter. - */ - if ( unit->is_running || (balance_step == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(unit)) ) - continue; - - affinity_balance_cpumask(unit, balance_step, cpumask_scratch); - if ( __csched_unit_is_migrateable(prv, unit, cpu, cpumask_scratch) ) - { - /* We got a candidate. Grab it! */ - TRACE_3D(TRC_CSCHED_STOLEN_UNIT, peer_cpu, - unit->domain->domain_id, unit->unit_id); - SCHED_UNIT_STAT_CRANK(speer, migrate_q); - SCHED_STAT_CRANK(migrate_queued); - runq_remove(speer); - sched_set_res(unit, get_sched_res(cpu)); - /* - * speer will start executing directly on cpu, without having to - * go through runq_insert(). So we must update the runnable count - * for cpu here. - */ - inc_nr_runnable(cpu); - return speer; - } - } - out: - SCHED_STAT_CRANK(steal_peer_idle); - return NULL; -} - -static struct csched_unit * -csched_load_balance(struct csched_private *prv, int cpu, - struct csched_unit *snext, bool *stolen) -{ - struct cpupool *c = get_sched_res(cpu)->cpupool; - struct csched_unit *speer; - cpumask_t workers; - cpumask_t *online = c->res_valid; - int peer_cpu, first_cpu, peer_node, bstep; - int node = cpu_to_node(cpu); - - BUG_ON(get_sched_res(cpu) != snext->unit->res); - - /* - * If this CPU is going offline, or is not (yet) part of any cpupool - * (as it happens, e.g., during cpu bringup), we shouldn't steal work. - */ - if ( unlikely(!cpumask_test_cpu(cpu, online) || c == NULL) ) - goto out; - - if ( snext->pri == CSCHED_PRI_IDLE ) - SCHED_STAT_CRANK(load_balance_idle); - else if ( snext->pri == CSCHED_PRI_TS_OVER ) - SCHED_STAT_CRANK(load_balance_over); - else - SCHED_STAT_CRANK(load_balance_other); - - /* - * Let's look around for work to steal, taking both hard affinity - * and soft affinity into account. More specifically, we check all - * the non-idle CPUs' runq, looking for: - * 1. 
any "soft-affine work" to steal first, - * 2. if not finding anything, any "hard-affine work" to steal. - */ - for_each_affinity_balance_step( bstep ) - { - /* - * We peek at the non-idling CPUs in a node-wise fashion. In fact, - * it is more likely that we find some affine work on our same - * node, not to mention that migrating units within the same node - * could well expected to be cheaper than across-nodes (memory - * stays local, there might be some node-wide cache[s], etc.). - */ - peer_node = node; - do - { - /* Select the pCPUs in this node that have work we can steal. */ - cpumask_andnot(&workers, online, prv->idlers); - cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); - __cpumask_clear_cpu(cpu, &workers); - - first_cpu = cpumask_cycle(prv->balance_bias[peer_node], &workers); - if ( first_cpu >= nr_cpu_ids ) - goto next_node; - peer_cpu = first_cpu; - do - { - spinlock_t *lock; - - /* - * If there is only one runnable unit on peer_cpu, it means - * there's no one to be stolen in its runqueue, so skip it. - * - * Checking this without holding the lock is racy... But that's - * the whole point of this optimization! - * - * In more details: - * - if we race with dec_nr_runnable(), we may try to take the - * lock and call csched_runq_steal() for no reason. This is - * not a functional issue, and should be infrequent enough. - * And we can avoid that by re-checking nr_runnable after - * having grabbed the lock, if we want; - * - if we race with inc_nr_runnable(), we skip a pCPU that may - * have runnable units in its runqueue, but that's not a - * problem because: - * + if racing with csched_unit_insert() or csched_unit_wake(), - * __runq_tickle() will be called afterwords, so the unit - * won't get stuck in the runqueue for too long; - * + if racing with csched_runq_steal(), it may be that an - * unit that we could have picked up, stays in a runqueue - * until someone else tries to steal it again. But this is - * no worse than what can happen already (without this - * optimization), it the pCPU would schedule right after we - * have taken the lock, and hence block on it. - */ - if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) - { - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ 0); - goto next_cpu; - } - - /* - * Get ahold of the scheduler lock for this peer CPU. - * - * Note: We don't spin on this lock but simply try it. Spinning - * could cause a deadlock if the peer CPU is also load - * balancing and trying to lock this CPU. - */ - lock = pcpu_schedule_trylock(peer_cpu); - SCHED_STAT_CRANK(steal_trylock); - if ( !lock ) - { - SCHED_STAT_CRANK(steal_trylock_failed); - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); - goto next_cpu; - } - - TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); - - /* Any work over there to steal? */ - speer = cpumask_test_cpu(peer_cpu, online) ? - csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; - pcpu_schedule_unlock(lock, peer_cpu); - - /* As soon as one unit is found, balancing ends */ - if ( speer != NULL ) - { - *stolen = true; - /* - * Next time we'll look for work to steal on this node, we - * will start from the next pCPU, with respect to this one, - * so we don't risk stealing always from the same ones. 
- */ - prv->balance_bias[peer_node] = peer_cpu; - return speer; - } - - next_cpu: - peer_cpu = cpumask_cycle(peer_cpu, &workers); - - } while( peer_cpu != first_cpu ); - - next_node: - peer_node = cycle_node(peer_node, node_online_map); - } while( peer_node != node ); - } - - out: - /* Failed to find more important work elsewhere... */ - __runq_remove(snext); - return snext; -} - -/* - * This function is in the critical path. It is designed to be simple and - * fast for the common case. - */ -static void csched_schedule( - const struct scheduler *ops, struct sched_unit *unit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct csched_pcpu *spc = CSCHED_PCPU(cur_cpu); - struct list_head * const runq = RUNQ(sched_cpu); - struct csched_unit * const scurr = CSCHED_UNIT(unit); - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_unit *snext; - s_time_t runtime, tslice; - bool migrated = false; - - SCHED_STAT_CRANK(schedule); - CSCHED_UNIT_CHECK(unit); - - /* - * Here in Credit1 code, we usually just call TRACE_nD() helpers, and - * don't care about packing. But scheduling happens very often, so it - * actually is important that the record is as small as possible. - */ - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, tasklet:8, idle:8; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - d.idle = is_idle_unit(unit); - __trace_var(TRC_CSCHED_SCHEDULE, 1, sizeof(d), - (unsigned char *)&d); - } - - runtime = now - unit->state_entry_time; - if ( runtime < 0 ) /* Does this ever happen? */ - runtime = 0; - - if ( !is_idle_unit(unit) ) - { - /* Update credits of a non-idle UNIT. */ - burn_credits(scurr, now); - scurr->start_time -= now; - scurr->last_sched_time = now; - } - else - { - /* Re-instate a boosted idle UNIT as normal-idle. */ - scurr->pri = CSCHED_PRI_IDLE; - } - - /* Choices, choices: - * - If we have a tasklet, we need to run the idle unit no matter what. - * - If sched rate limiting is in effect, and the current unit has - * run for less than that amount of time, continue the current one, - * but with a shorter timeslice and return it immediately - * - Otherwise, chose the one with the highest priority (which may - * be the one currently running) - * - If the currently running one is TS_OVER, see if there - * is a higher priority one waiting on the runqueue of another - * cpu and steal it. - */ - - /* - * If we have schedule rate limiting enabled, check to see - * how long we've run for. - * - * If scurr is yielding, however, we don't let rate limiting kick in. - * In fact, it may be the case that scurr is about to spin, and there's - * no point forcing it to do so until rate limiting expires. - */ - if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags) - && !tasklet_work_scheduled - && prv->ratelimit - && unit_runnable_state(unit) - && !is_idle_unit(unit) - && runtime < prv->ratelimit ) - { - snext = scurr; - snext->start_time += now; - perfc_incr(delay_ms); - /* - * Next timeslice must last just until we'll have executed for - * ratelimit. However, to avoid setting a really short timer, which - * will most likely be inaccurate and counterproductive, we never go - * below CSCHED_MIN_TIMER. 
- */ - tslice = prv->ratelimit - runtime; - if ( unlikely(runtime < CSCHED_MIN_TIMER) ) - tslice = CSCHED_MIN_TIMER; - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned runtime; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.runtime = runtime; - __trace_var(TRC_CSCHED_RATELIMIT, 1, sizeof(d), - (unsigned char *)&d); - } - - goto out; - } - tslice = prv->tslice; - - /* - * Select next runnable local UNIT (ie top of local runq) - */ - if ( unit_runnable(unit) ) - __runq_insert(scurr); - else - { - BUG_ON( is_idle_unit(unit) || list_empty(runq) ); - /* Current has blocked. Update the runnable counter for this cpu. */ - dec_nr_runnable(sched_cpu); - } - - /* - * Clear YIELD flag before scheduling out - */ - clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags); - - do { - snext = __runq_elem(runq->next); - - /* Tasklet work (which runs in idle UNIT context) overrides all else. */ - if ( tasklet_work_scheduled ) - { - TRACE_0D(TRC_CSCHED_SCHED_TASKLET); - snext = CSCHED_UNIT(sched_idle_unit(sched_cpu)); - snext->pri = CSCHED_PRI_TS_BOOST; - } - - /* - * SMP Load balance: - * - * If the next highest priority local runnable UNIT has already eaten - * through its credits, look on other PCPUs to see if we have more - * urgent work... If not, csched_load_balance() will return snext, but - * already removed from the runq. - */ - if ( snext->pri > CSCHED_PRI_TS_OVER ) - __runq_remove(snext); - else - snext = csched_load_balance(prv, sched_cpu, snext, &migrated); - - } while ( !unit_runnable_state(snext->unit) ); - - /* - * Update idlers mask if necessary. When we're idling, other CPUs - * will tickle us when they get extra work. - */ - if ( !tasklet_work_scheduled && snext->pri == CSCHED_PRI_IDLE ) - { - if ( !cpumask_test_cpu(sched_cpu, prv->idlers) ) - cpumask_set_cpu(sched_cpu, prv->idlers); - } - else if ( cpumask_test_cpu(sched_cpu, prv->idlers) ) - { - cpumask_clear_cpu(sched_cpu, prv->idlers); - } - - if ( !is_idle_unit(snext->unit) ) - snext->start_time += now; - -out: - /* - * Return task to run next... - */ - unit->next_time = (is_idle_unit(snext->unit) ? - -1 : tslice); - unit->next_task = snext->unit; - snext->unit->migrated = migrated; - - /* Stop credit tick when going to idle, restart it when coming from idle. 
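 *
 * [Editor's addition - worked example, not part of the original file.]
 * The restart expression below aligns the tick with the next period
 * boundary instead of waiting a full period from 'now': with a tick
 * period of 10000us and 'now' at 123456us on the same time base,
 * now % period = 3456us, so the timer is set to fire at
 * 123456 + 10000 - 3456 = 130000us, i.e. the next multiple of the
 * tick period.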
*/ - if ( !is_idle_unit(unit) && is_idle_unit(unit->next_task) ) - stop_timer(&spc->ticker); - if ( is_idle_unit(unit) && !is_idle_unit(unit->next_task) ) - set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) - - now % MICROSECS(prv->tick_period_us) ); - - CSCHED_UNIT_CHECK(unit->next_task); -} - -static void -csched_dump_unit(struct csched_unit *svc) -{ - struct csched_dom * const sdom = svc->sdom; - - printk("[%i.%i] pri=%i flags=%x cpu=%i", - svc->unit->domain->domain_id, - svc->unit->unit_id, - svc->pri, - svc->flags, - sched_unit_master(svc->unit)); - - if ( sdom ) - { - printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), - sdom->weight, sdom->cap); -#ifdef CSCHED_STATS - printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", - svc->stats.credit_last, - svc->stats.credit_incr, - svc->stats.state_active, - svc->stats.state_idle, - svc->stats.migrate_q, - svc->stats.migrate_r, - svc->stats.kicked_away); -#endif - } - - printk("\n"); -} - -static void -csched_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct list_head *runq, *iter; - struct csched_private *prv = CSCHED_PRIV(ops); - struct csched_pcpu *spc; - struct csched_unit *svc; - spinlock_t *lock; - unsigned long flags; - int loop; - - /* - * We need both locks: - * - csched_dump_unit() wants to access domains' scheduling - * parameters, which are protected by the private scheduler lock; - * - we scan through the runqueue, so we need the proper runqueue - * lock (the one of the runqueue of this cpu). - */ - spin_lock_irqsave(&prv->lock, flags); - lock = pcpu_schedule_lock(cpu); - - spc = CSCHED_PCPU(cpu); - runq = &spc->runq; - - printk("CPU[%02d] nr_run=%d, sort=%d, sibling={%*pbl}, core={%*pbl}\n", - cpu, spc->nr_runnable, spc->runq_sort_last, - CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - - /* current UNIT (nothing to say if that's the idle unit). 
*/ - svc = CSCHED_UNIT(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - printk("\trun: "); - csched_dump_unit(svc); - } - - loop = 0; - list_for_each( iter, runq ) - { - svc = __runq_elem(iter); - if ( svc ) - { - printk("\t%3d: ", ++loop); - csched_dump_unit(svc); - } - } - - pcpu_schedule_unlock(lock, cpu); - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void -csched_dump(const struct scheduler *ops) -{ - struct list_head *iter_sdom, *iter_svc; - struct csched_private *prv = CSCHED_PRIV(ops); - int loop; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - printk("info:\n" - "\tncpus = %u\n" - "\tmaster = %u\n" - "\tcredit = %u\n" - "\tcredit balance = %d\n" - "\tweight = %u\n" - "\trunq_sort = %u\n" - "\tdefault-weight = %d\n" - "\ttslice = %"PRI_stime"ms\n" - "\tratelimit = %"PRI_stime"us\n" - "\tcredits per msec = %d\n" - "\tticks per tslice = %d\n" - "\tmigration delay = %"PRI_stime"us\n", - prv->ncpus, - prv->master, - prv->credit, - prv->credit_balance, - prv->weight, - prv->runq_sort, - CSCHED_DEFAULT_WEIGHT, - prv->tslice / MILLISECS(1), - prv->ratelimit / MICROSECS(1), - CSCHED_CREDITS_PER_MSEC, - prv->ticks_per_tslice, - prv->unit_migr_delay/ MICROSECS(1)); - - printk("idlers: %*pb\n", CPUMASK_PR(prv->idlers)); - - printk("active units:\n"); - loop = 0; - list_for_each( iter_sdom, &prv->active_sdom ) - { - struct csched_dom *sdom; - sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); - - list_for_each( iter_svc, &sdom->active_unit ) - { - struct csched_unit *svc; - spinlock_t *lock; - - svc = list_entry(iter_svc, struct csched_unit, active_unit_elem); - lock = unit_schedule_lock(svc->unit); - - printk("\t%3d: ", ++loop); - csched_dump_unit(svc); - - unit_schedule_unlock(lock, svc->unit); - } - } - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static int __init -csched_global_init(void) -{ - if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || - sched_credit_tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN ) - { - printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" - " Resetting to default %u\n", - XEN_SYSCTL_CSCHED_TSLICE_MIN, - XEN_SYSCTL_CSCHED_TSLICE_MAX, - CSCHED_DEFAULT_TSLICE_MS); - sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; - } - - if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) - printk("WARNING: sched_ratelimit_us >" - "sched_credit_tslice_ms is undefined\n" - "Setting ratelimit to tslice\n"); - - if ( vcpu_migration_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US ) - { - vcpu_migration_delay_us = 0; - printk("WARNING: vcpu_migration_delay outside of valid range [0,%d]us.\n" - "Resetting to default: %u\n", - XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us); - } - - return 0; -} - -static int -csched_init(struct scheduler *ops) -{ - struct csched_private *prv; - - prv = xzalloc(struct csched_private); - if ( prv == NULL ) - return -ENOMEM; - - prv->balance_bias = xzalloc_array(uint32_t, MAX_NUMNODES); - if ( prv->balance_bias == NULL ) - { - xfree(prv); - return -ENOMEM; - } - - if ( !zalloc_cpumask_var(&prv->cpus) || - !zalloc_cpumask_var(&prv->idlers) ) - { - free_cpumask_var(prv->cpus); - xfree(prv->balance_bias); - xfree(prv); - return -ENOMEM; - } - - ops->sched_data = prv; - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->active_sdom); - prv->master = UINT_MAX; - - __csched_set_tslice(prv, sched_credit_tslice_ms); - - if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) - prv->ratelimit = prv->tslice; - else - 
prv->ratelimit = MICROSECS(sched_ratelimit_us); - - prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us); - - return 0; -} - -static void -csched_deinit(struct scheduler *ops) -{ - struct csched_private *prv; - - prv = CSCHED_PRIV(ops); - if ( prv != NULL ) - { - ops->sched_data = NULL; - free_cpumask_var(prv->cpus); - free_cpumask_var(prv->idlers); - xfree(prv->balance_bias); - xfree(prv); - } -} - -static const struct scheduler sched_credit_def = { - .name = "SMP Credit Scheduler", - .opt_name = "credit", - .sched_id = XEN_SCHEDULER_CREDIT, - .sched_data = NULL, - - .global_init = csched_global_init, - - .insert_unit = csched_unit_insert, - .remove_unit = csched_unit_remove, - - .sleep = csched_unit_sleep, - .wake = csched_unit_wake, - .yield = csched_unit_yield, - - .adjust = csched_dom_cntl, - .adjust_affinity= csched_aff_cntl, - .adjust_global = csched_sys_cntl, - - .pick_resource = csched_res_pick, - .do_schedule = csched_schedule, - - .dump_cpu_state = csched_dump_pcpu, - .dump_settings = csched_dump, - .init = csched_init, - .deinit = csched_deinit, - .alloc_udata = csched_alloc_udata, - .free_udata = csched_free_udata, - .alloc_pdata = csched_alloc_pdata, - .init_pdata = csched_init_pdata, - .deinit_pdata = csched_deinit_pdata, - .free_pdata = csched_free_pdata, - .switch_sched = csched_switch_sched, - .alloc_domdata = csched_alloc_domdata, - .free_domdata = csched_free_domdata, -}; - -REGISTER_SCHEDULER(sched_credit_def); diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c deleted file mode 100644 index f7c477053c..0000000000 --- a/xen/common/sched_credit2.c +++ /dev/null @@ -1,4122 +0,0 @@ - -/**************************************************************************** - * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd - **************************************************************************** - * - * File: common/sched_credit2.c - * Author: George Dunlap - * - * Description: Credit-based SMP CPU scheduler - * Based on an earlier verson by Emmanuel Ackaouy. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Meant only for helping developers during debugging. */ -/* #define d2printk printk */ -#define d2printk(x...) - - -/* - * Credit2 tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. 
- */ -#define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) -#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) -#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) -#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) -#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) -#define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) -#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) -#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) -#define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) -#define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) -#define TRC_CSCHED2_UPDATE_UNIT_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) -#define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) -#define TRC_CSCHED2_TICKLE_NEW TRC_SCHED_CLASS_EVT(CSCHED2, 13) -#define TRC_CSCHED2_RUNQ_MAX_WEIGHT TRC_SCHED_CLASS_EVT(CSCHED2, 14) -#define TRC_CSCHED2_MIGRATE TRC_SCHED_CLASS_EVT(CSCHED2, 15) -#define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) -#define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) -#define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) -#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) -#define TRC_CSCHED2_SCHEDULE TRC_SCHED_CLASS_EVT(CSCHED2, 21) -#define TRC_CSCHED2_RATELIMIT TRC_SCHED_CLASS_EVT(CSCHED2, 22) -#define TRC_CSCHED2_RUNQ_CAND_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 23) - -/* - * TODO: - * + Hyperthreading - * - "Discount" time run on a thread with busy siblings - * + Algorithm: - * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., - * a flash animation in the background) can we schedule it with low enough latency - * so that audio doesn't skip? - * + Optimizing - * - Profiling, making new algorithms, making math more efficient (no long division) - */ - -/* - * Design: - * - * VMs "burn" credits based on their weight; higher weight means - * credits burn more slowly. The highest weight unit burns credits at - * a rate of 1 credit per nanosecond. Others burn proportionally - * more. - * - * units are inserted into the runqueue by credit order. - * - * Credits are "reset" when the next unit in the runqueue is less than - * or equal to zero. At that point, everyone's credits are "clipped" - * to a small value, and a fixed credit is added to everyone. - */ - -/* - * Utilization cap: - * - * Setting an pCPU utilization cap for a domain means the following: - * - * - a domain can have a cap, expressed in terms of % of physical CPU time. - * A domain that must not use more than 1/4 of _one_ physical CPU, will - * be given a cap of 25%; a domain that must not use more than 1+1/2 of - * physical CPU time, will be given a cap of 150%; - * - * - caps are per-domain (not per-unit). If a domain has only 1 unit, and - * a 40% cap, that one unit will use 40% of one pCPU. If a somain has 4 - * units, and a 200% cap, the equivalent of 100% time on 2 pCPUs will be - * split among the v units. How much each of the units will actually get, - * during any given interval of time, is unspecified (as it depends on - * various aspects: workload, system load, etc.). For instance, it is - * possible that, during a given time interval, 2 units use 100% each, - * and the other two use nothing; while during another time interval, - * two units use 80%, one uses 10% and the other 30%; or that each use - * 50% (and so on and so forth). 
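 *
 * [Editor's addition - a worked figure, not part of the original
 *  file.] In terms of actual CPU time, with the default 10 ms
 *  replenishment period (see opt_cap_period below), a 40% cap
 *  corresponds to 4 ms of pCPU time per period for the whole domain,
 *  and a 200% cap to 20 ms per period, however many units the domain
 *  has.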
- * - * For implementing this, we use the following approach: - * - * - each domain is given a 'budget', an each domain has a timer, which - * replenishes the domain's budget periodically. The budget is the amount - * of time the units of the domain can use every 'period'; - * - * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains - * (but each domain has its own timer; so the all are periodic by the same - * period, but replenishment of the budgets of the various domains, at - * periods boundaries, are not synchronous); - * - * - when units run, they consume budget. When they don't run, they don't - * consume budget. If there is no budget left for the domain, no unit of - * that domain can run. If an unit tries to run and finds that there is no - * budget, it blocks. - * At whatever time an unit wants to run, it must check the domain's budget, - * and if there is some, it can use it. - * - * - budget is replenished to the top of the capacity for the domain once - * per period. Even if there was some leftover budget from previous period, - * though, the budget after a replenishment will always be at most equal - * to the total capacify of the domain ('tot_budget'); - * - * - when a budget replenishment occurs, if there are units that had been - * blocked because of lack of budget, they'll be unblocked, and they will - * (potentially) be able to run again. - * - * Finally, some even more implementation related detail: - * - * - budget is stored in a domain-wide pool. Units of the domain that want - * to run go to such pool, and grub some. When they do so, the amount - * they grabbed is _immediately_ removed from the pool. This happens in - * unit_grab_budget(); - * - * - when units stop running, if they've not consumed all the budget they - * took, the leftover is put back in the pool. This happens in - * unit_return_budget(); - * - * - the above means that an unit can find out that there is no budget and - * block, not only if the cap has actually been reached (for this period), - * but also if some other units, in order to run, have grabbed a certain - * quota of budget, no matter whether they've already used it all or not. - * An unit blocking because (any form of) lack of budget is said to be - * "parked", and such blocking happens in park_unit(); - * - * - when an unit stops running, and puts back some budget in the domain pool, - * we need to check whether there is someone which has been parked and that - * can be unparked. This happens in unpark_parked_units(), called from - * csched2_context_saved(); - * - * - of course, unparking happens also as a consequence of the domain's budget - * being replenished by the periodic timer. This also occurs by means of - * calling csched2_context_saved() (but from replenish_domain_budget()); - * - * - parked units of a domain are kept in a (per-domain) list, called - * 'parked_units'). Manipulation of the list and of the domain-wide budget - * pool, must occur only when holding the 'budget_lock'. - */ - -/* - * Locking: - * - * - runqueue lock - * + it is per-runqueue, so: - * * cpus in a runqueue take the runqueue lock, when using - * pcpu_schedule_lock() / unit_schedule_lock() (and friends), - * * a cpu may (try to) take a "remote" runqueue lock, e.g., for - * load balancing; - * + serializes runqueue operations (removing and inserting units); - * + protects runqueue-wide data in csched2_runqueue_data; - * + protects unit parameters in csched2_unit for the unit in the - * runqueue. 
- * - * - Private scheduler lock - * + protects scheduler-wide data in csched2_private, such as: - * * the list of domains active in this scheduler, - * * what cpus and what runqueues are active and in what - * runqueue each cpu is; - * + serializes the operation of changing the weights of domains; - * - * - Budget lock - * + it is per-domain; - * + protects, in domains that have an utilization cap; - * * manipulation of the total budget of the domain (as it is shared - * among all units of the domain), - * * manipulation of the list of units that are blocked waiting for - * some budget to be available. - * - * - Type: - * + runqueue locks are 'regular' spinlocks; - * + the private scheduler lock can be an rwlock. In fact, data - * it protects is modified only during initialization, cpupool - * manipulation and when changing weights, and read in all - * other cases (e.g., during load balancing); - * + budget locks are 'regular' spinlocks. - * - * Ordering: - * + tylock must be used when wanting to take a runqueue lock, - * if we already hold another one; - * + if taking both a runqueue lock and the private scheduler - * lock is, the latter must always be taken for first; - * + if taking both a runqueue lock and a budget lock, the former - * must always be taken for first. - */ - -/* - * Basic constants - */ -/* Default weight: How much a new domain starts with. */ -#define CSCHED2_DEFAULT_WEIGHT 256 -/* - * Min timer: Minimum length a timer will be set, to - * achieve efficiency. - */ -#define CSCHED2_MIN_TIMER MICROSECS(500) -/* - * Amount of credit VMs begin with, and are reset to. - * ATM, set so that highest-weight VMs can only run for 10ms - * before a reset event. - */ -#define CSCHED2_CREDIT_INIT MILLISECS(10) -/* - * Amount of credit the idle units have. It never changes, as idle - * units does not consume credits, and it must be lower than whatever - * amount of credit 'regular' unit would end up with. - */ -#define CSCHED2_IDLE_CREDIT (-(1U<<30)) -/* - * Carryover: How much "extra" credit may be carried over after - * a reset. - */ -#define CSCHED2_CARRYOVER_MAX CSCHED2_MIN_TIMER -/* - * Stickiness: Cross-L2 migration resistance. Should be less than - * MIN_TIMER. - */ -#define CSCHED2_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) -/* How much to "compensate" an unit for L2 migration. */ -#define CSCHED2_MIGRATE_COMPENSATION MICROSECS(50) -/* How tolerant we should be when peeking at runtime of units on other cpus */ -#define CSCHED2_RATELIMIT_TICKLE_TOLERANCE MICROSECS(50) -/* Reset: Value below which credit will be reset. */ -#define CSCHED2_CREDIT_RESET 0 -/* Max timer: Maximum time a guest can be run for. */ -#define CSCHED2_MAX_TIMER CSCHED2_CREDIT_INIT -/* Period of the cap replenishment timer. */ -#define CSCHED2_BDGT_REPL_PERIOD ((opt_cap_period)*MILLISECS(1)) - -/* - * Flags - */ -/* - * CSFLAG_scheduled: Is this unit either running on, or context-switching off, - * a physical cpu? - * + Accessed only with runqueue lock held - * + Set when chosen as next in csched2_schedule(). - * + Cleared after context switch has been saved in csched2_context_saved() - * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should - * set CSFLAG_delayed_runq_add - * + Checked to be false in runq_insert. - */ -#define __CSFLAG_scheduled 1 -#define CSFLAG_scheduled (1U<<__CSFLAG_scheduled) -/* - * CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done - * being context switched out? 
- * + Set when scheduling out in csched2_schedule() if prev is runnable - * + Set in csched2_unit_wake if it finds CSFLAG_scheduled set - * + Read in csched2_context_saved(). If set, it adds prev to the runqueue and - * clears the bit. - */ -#define __CSFLAG_delayed_runq_add 2 -#define CSFLAG_delayed_runq_add (1U<<__CSFLAG_delayed_runq_add) -/* - * CSFLAG_runq_migrate_request: This unit is being migrated as a result of a - * credit2-initiated runq migrate request; migrate it to the runqueue indicated - * in the svc struct. - */ -#define __CSFLAG_runq_migrate_request 3 -#define CSFLAG_runq_migrate_request (1U<<__CSFLAG_runq_migrate_request) -/* - * CSFLAG_unit_yield: this unit was running, and has called vcpu_yield(). The - * scheduler is invoked to see if we can give the cpu to someone else, and - * get back to the yielding unit in a while. - */ -#define __CSFLAG_unit_yield 4 -#define CSFLAG_unit_yield (1U<<__CSFLAG_unit_yield) -/* - * CSFLAGS_pinned: this unit is currently 'pinned', i.e., has its hard - * affinity set to one and only 1 cpu (and, hence, can only run there). - */ -#define __CSFLAG_pinned 5 -#define CSFLAG_pinned (1U<<__CSFLAG_pinned) - -static unsigned int __read_mostly opt_migrate_resist = 500; -integer_param("sched_credit2_migrate_resist", opt_migrate_resist); - -/* - * Load tracking and load balancing - * - * Load history of runqueues and units is accounted for by using an - * exponential weighted moving average algorithm. However, instead of using - * fractions,we shift everything to left by the number of bits we want to - * use for representing the fractional part (Q-format). - * - * We may also want to reduce the precision of time accounting, to - * accommodate 'longer windows'. So, if that is the case, we just need to - * shift all time samples to the right. - * - * The details of the formulas used for load tracking are explained close to - * update_runq_load(). Let's just say here that, with full nanosecond time - * granularity, a 30 bits wide 'decaying window' is ~1 second long. - * - * We want to consider the following equations: - * - * avg[0] = load*P - * avg[i+1] = avg[i] + delta*load*P/W - delta*avg[i]/W, 0 <= delta <= W - * - * where W is the length of the window, P the multiplier for transitiong into - * Q-format fixed point arithmetic and load is the instantaneous load of a - * runqueue, which basically is the number of runnable units there are on the - * runqueue (for the meaning of the other terms, look at the doc comment to - * update_runq_load()). - * - * So, again, with full nanosecond granularity, and 1 second window, we have: - * - * W = 2^30 - * P = 2^18 - * - * The maximum possible value for the average load, which we want to store in - * s_time_t type variables (i.e., we have 63 bits available) is load*P. This - * means that, with P 18 bits wide, load can occupy 45 bits. This in turn - * means we can have 2^45 units in each runqueue, before overflow occurs! - * - * However, it can happen that, at step j+1, if: - * - * avg[j] = load*P - * delta = W - * - * then: - * - * avg[j+i] = avg[j] + W*load*P/W - W*load*P/W - * - * So we must be able to deal with W*load*P. This means load can't be higher - * than: - * - * 2^(63 - 30 - 18) = 2^15 = 32768 - * - * So 32768 is the maximum number of units the we can have in a runqueue, - * at any given time, and still not have problems with the load tracking - * calculations... and this is more than fine. - * - * As a matter of fact, since we are using microseconds granularity, we have - * W=2^20. 
So, still with 18 fractional bits and a 1 second long window, there - * may be 2^25 = 33554432 units in a runq before we have to start thinking - * about overflow. - */ - -/* If >0, decreases the granularity of time samples used for load tracking. */ -#define LOADAVG_GRANULARITY_SHIFT (10) -/* Time window during which we still give value to previous load history. */ -#define LOADAVG_WINDOW_SHIFT (30) -/* 18 bits by default (and not less than 4) for decimals. */ -#define LOADAVG_PRECISION_SHIFT (18) -#define LOADAVG_PRECISION_SHIFT_MIN (4) - -/* - * Both the length of the window and the number of fractional bits can be - * decided with boot parameters. - * - * The length of the window is always expressed in nanoseconds. The actual - * value used by default is LOADAVG_WINDOW_SHIFT - LOADAVG_GRANULARITY_SHIFT. - */ -static unsigned int __read_mostly opt_load_window_shift = LOADAVG_WINDOW_SHIFT; -integer_param("credit2_load_window_shift", opt_load_window_shift); -static unsigned int __read_mostly opt_load_precision_shift = LOADAVG_PRECISION_SHIFT; -integer_param("credit2_load_precision_shift", opt_load_precision_shift); - -static int __read_mostly opt_underload_balance_tolerance = 0; -integer_param("credit2_balance_under", opt_underload_balance_tolerance); -static int __read_mostly opt_overload_balance_tolerance = -3; -integer_param("credit2_balance_over", opt_overload_balance_tolerance); -/* - * Domains subject to a cap receive a replenishment of their runtime budget - * once every opt_cap_period interval. Default is 10 ms. The amount of budget - * they receive depends on their cap. For instance, a domain with a 50% cap - * will receive 50% of 10 ms, so 5 ms. - */ -static unsigned int __read_mostly opt_cap_period = 10; /* ms */ -integer_param("credit2_cap_period_ms", opt_cap_period); - -/* - * Runqueue organization. - * - * The various cpus are to be assigned each one to a runqueue, and we - * want that to happen basing on topology. At the moment, it is possible - * to choose to arrange runqueues to be: - * - * - per-cpu: meaning that there will be one runqueue per logical cpu. This - * will happen when if the opt_runqueue parameter is set to 'cpu'. - * - * - per-core: meaning that there will be one runqueue per each physical - * core of the host. This will happen if the opt_runqueue - * parameter is set to 'core'; - * - * - per-socket: meaning that there will be one runqueue per each physical - * socket (AKA package, which often, but not always, also - * matches a NUMA node) of the host; This will happen if - * the opt_runqueue parameter is set to 'socket'; - * - * - per-node: meaning that there will be one runqueue per each physical - * NUMA node of the host. This will happen if the opt_runqueue - * parameter is set to 'node'; - * - * - global: meaning that there will be only one runqueue to which all the - * (logical) processors of the host belong. This will happen if - * the opt_runqueue parameter is set to 'all'. - * - * Depending on the value of opt_runqueue, therefore, cpus that are part of - * either the same physical core, the same physical socket, the same NUMA - * node, or just all of them, will be put together to form runqueues. 
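 *
 * [Editor's addition - usage note, not part of the original file.]
 * The arrangement is selected with the "credit2_runqueue" boot
 * parameter handled below: for instance, booting Xen with
 * "credit2_runqueue=core" gives one runqueue per physical core, while
 * "credit2_runqueue=all" collapses everything into a single, global
 * runqueue. When the parameter is not given, opt_runqueue defaults to
 * per-socket runqueues.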
- */ -#define OPT_RUNQUEUE_CPU 0 -#define OPT_RUNQUEUE_CORE 1 -#define OPT_RUNQUEUE_SOCKET 2 -#define OPT_RUNQUEUE_NODE 3 -#define OPT_RUNQUEUE_ALL 4 -static const char *const opt_runqueue_str[] = { - [OPT_RUNQUEUE_CPU] = "cpu", - [OPT_RUNQUEUE_CORE] = "core", - [OPT_RUNQUEUE_SOCKET] = "socket", - [OPT_RUNQUEUE_NODE] = "node", - [OPT_RUNQUEUE_ALL] = "all" -}; -static int __read_mostly opt_runqueue = OPT_RUNQUEUE_SOCKET; - -static int __init parse_credit2_runqueue(const char *s) -{ - unsigned int i; - - for ( i = 0; i < ARRAY_SIZE(opt_runqueue_str); i++ ) - { - if ( !strcmp(s, opt_runqueue_str[i]) ) - { - opt_runqueue = i; - return 0; - } - } - - return -EINVAL; -} -custom_param("credit2_runqueue", parse_credit2_runqueue); - -/* - * Per-runqueue data - */ -struct csched2_runqueue_data { - spinlock_t lock; /* Lock for this runqueue */ - - struct list_head runq; /* Ordered list of runnable vms */ - unsigned int nr_cpus; /* How many CPUs are sharing this runqueue */ - int id; /* ID of this runqueue (-1 if invalid) */ - - int load; /* Instantaneous load (num of non-idle units) */ - s_time_t load_last_update; /* Last time average was updated */ - s_time_t avgload; /* Decaying queue load */ - s_time_t b_avgload; /* Decaying queue load modified by balancing */ - - cpumask_t active, /* CPUs enabled for this runqueue */ - smt_idle, /* Fully idle-and-untickled cores (see below) */ - tickled, /* Have been asked to go through schedule */ - idle; /* Currently idle pcpus */ - - struct list_head svc; /* List of all units assigned to the runqueue */ - unsigned int max_weight; /* Max weight of the units in this runqueue */ - unsigned int pick_bias; /* Last picked pcpu. Start from it next time */ -}; - -/* - * System-wide private data - */ -struct csched2_private { - rwlock_t lock; /* Private scheduler lock */ - - unsigned int load_precision_shift; /* Precision of load calculations */ - unsigned int load_window_shift; /* Lenght of load decaying window */ - unsigned int ratelimit_us; /* Rate limiting for this scheduler */ - - cpumask_t active_queues; /* Runqueues with (maybe) active cpus */ - struct csched2_runqueue_data *rqd; /* Data of the various runqueues */ - - cpumask_t initialized; /* CPUs part of this scheduler */ - struct list_head sdom; /* List of domains (for debug key) */ -}; - -/* - * Physical CPU - */ -struct csched2_pcpu { - cpumask_t sibling_mask; /* Siblings in the same runqueue */ - int runq_id; -}; - -/* - * Schedule Unit - */ -struct csched2_unit { - struct csched2_dom *sdom; /* Up-pointer to domain */ - struct sched_unit *unit; /* Up-pointer, to schedule unit */ - struct csched2_runqueue_data *rqd; /* Up-pointer to the runqueue */ - - int credit; /* Current amount of credit */ - unsigned int weight; /* Weight of this unit */ - unsigned int residual; /* Reminder of div(max_weight/weight) */ - unsigned flags; /* Status flags (16 bits would be ok, */ - s_time_t budget; /* Current budget (if domains has cap) */ - /* but clear_bit() does not like that) */ - s_time_t budget_quota; /* Budget to which unit is entitled */ - - s_time_t start_time; /* Time we were scheduled (for credit) */ - - /* Individual contribution to load */ - s_time_t load_last_update; /* Last time average was updated */ - s_time_t avgload; /* Decaying queue load */ - - struct list_head runq_elem; /* On the runqueue (rqd->runq) */ - struct list_head parked_elem; /* On the parked_units list */ - struct list_head rqd_elem; /* On csched2_runqueue_data's svc list */ - struct csched2_runqueue_data *migrate_rqd; /* Pre-determined 
migr. target */ - int tickled_cpu; /* Cpu that will pick us (-1 if none) */ -}; - -/* - * Domain - */ -struct csched2_dom { - struct domain *dom; /* Up-pointer to domain */ - - spinlock_t budget_lock; /* Serialized budget calculations */ - s_time_t tot_budget; /* Total amount of budget */ - s_time_t budget; /* Currently available budget */ - - struct timer repl_timer; /* Timer for periodic replenishment of budget */ - s_time_t next_repl; /* Time at which next replenishment occurs */ - struct list_head parked_units; /* List of CPUs waiting for budget */ - - struct list_head sdom_elem; /* On csched2_runqueue_data's sdom list */ - uint16_t weight; /* User specified weight */ - uint16_t cap; /* User specified cap */ - uint16_t nr_units; /* Number of units of this domain */ -}; - -/* - * Accessor helpers functions. - */ -static inline struct csched2_private *csched2_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu) -{ - return get_sched_res(cpu)->sched_priv; -} - -static inline struct csched2_unit *csched2_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline struct csched2_dom *csched2_dom(const struct domain *d) -{ - return d->sched_priv; -} - -/* CPU to runq_id macro */ -static inline int c2r(unsigned int cpu) -{ - return csched2_pcpu(cpu)->runq_id; -} - -/* CPU to runqueue struct macro */ -static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops, - unsigned int cpu) -{ - return &csched2_priv(ops)->rqd[c2r(cpu)]; -} - -/* Does the domain of this unit have a cap? */ -static inline bool has_cap(const struct csched2_unit *svc) -{ - return svc->budget != STIME_MAX; -} - -/* - * Hyperthreading (SMT) support. - * - * We use a special per-runq mask (smt_idle) and update it according to the - * following logic: - * - when _all_ the SMT sibling in a core are idle, all their corresponding - * bits are set in the smt_idle mask; - * - when even _just_one_ of the SMT siblings in a core is not idle, all the - * bits correspondings to it and to all its siblings are clear in the - * smt_idle mask. - * - * Once we have such a mask, it is easy to implement a policy that, either: - * - uses fully idle cores first: it is enough to try to schedule the units - * on pcpus from smt_idle mask first. This is what happens if - * sched_smt_power_savings was not set at boot (default), and it maximizes - * true parallelism, and hence performance; - * - uses already busy cores first: it is enough to try to schedule the units - * on pcpus that are idle, but are not in smt_idle. This is what happens if - * sched_smt_power_savings is set at boot, and it allows as more cores as - * possible to stay in low power states, minimizing power consumption. - * - * This logic is entirely implemented in runq_tickle(), and that is enough. - * In fact, in this scheduler, placement of an unit on one of the pcpus of a - * runq, _always_ happens by means of tickling: - * - when an unit wakes up, it calls csched2_unit_wake(), which calls - * runq_tickle(); - * - when a migration is initiated in schedule.c, we call csched2_res_pick(), - * csched2_unit_migrate() (which calls migrate()) and csched2_unit_wake(). - * csched2_res_pick() looks for the least loaded runq and return just any - * of its processors. 
Then, csched2_unit_migrate() just moves the unit to - * the chosen runq, and it is again runq_tickle(), called by - * csched2_unit_wake() that actually decides what pcpu to use within the - * chosen runq; - * - when a migration is initiated in sched_credit2.c, by calling migrate() - * directly, that again temporarily use a random pcpu from the new runq, - * and then calls runq_tickle(), by itself. - */ - -/* - * If all the siblings of cpu (including cpu itself) are both idle and - * untickled, set all their bits in mask. - * - * NB that rqd->smt_idle is different than rqd->idle. rqd->idle - * records pcpus that at are merely idle (i.e., at the moment do not - * have an unit running on them). But you have to manually filter out - * which pcpus have been tickled in order to find cores that are not - * going to be busy soon. Filtering out tickled cpus pairwise is a - * lot of extra pain; so for rqd->smt_idle, we explicitly make so that - * the bits of a pcpu are set only if all the threads on its core are - * both idle *and* untickled. - * - * This means changing the mask when either rqd->idle or rqd->tickled - * changes. - */ -static inline -void smt_idle_mask_set(unsigned int cpu, const cpumask_t *idlers, - cpumask_t *mask) -{ - const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; - - if ( cpumask_subset(cpu_siblings, idlers) ) - cpumask_or(mask, mask, cpu_siblings); -} - -/* - * Clear the bits of all the siblings of cpu from mask (if necessary). - */ -static inline -void smt_idle_mask_clear(unsigned int cpu, cpumask_t *mask) -{ - const cpumask_t *cpu_siblings = &csched2_pcpu(cpu)->sibling_mask; - - if ( cpumask_subset(cpu_siblings, mask) ) - cpumask_andnot(mask, mask, cpu_siblings); -} - -/* - * In csched2_res_pick(), it may not be possible to actually look at remote - * runqueues (the trylock-s on their spinlocks can fail!). If that happens, - * we pick, in order of decreasing preference: - * 1) svc's current pcpu, if it is part of svc's soft affinity; - * 2) a pcpu in svc's current runqueue that is also in svc's soft affinity; - * 3) svc's current pcpu, if it is part of svc's hard affinity; - * 4) a pcpu in svc's current runqueue that is also in svc's hard affinity; - * 5) just one valid pcpu from svc's hard affinity - * - * Of course, 1, 2 and 3 makes sense only if svc has a soft affinity. Also - * note that at least 5 is guaranteed to _always_ return at least one pcpu. - */ -static int get_fallback_cpu(struct csched2_unit *svc) -{ - struct sched_unit *unit = svc->unit; - unsigned int bs; - - SCHED_STAT_CRANK(need_fallback_cpu); - - for_each_affinity_balance_step( bs ) - { - int cpu = sched_unit_master(unit); - - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - cpupool_domain_master_cpumask(unit->domain)); - - /* - * This is cases 1 or 3 (depending on bs): if processor is (still) - * in our affinity, go for it, for cache betterness. - */ - if ( likely(cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) - return cpu; - - /* - * This is cases 2 or 4 (depending on bs): v->processor isn't there - * any longer, check if we at least can stay in our current runq. 
- */ - if ( likely(cpumask_intersects(cpumask_scratch_cpu(cpu), - &svc->rqd->active)) ) - { - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &svc->rqd->active); - return cpumask_first(cpumask_scratch_cpu(cpu)); - } - - /* - * We may well pick any valid pcpu from our soft-affinity, outside - * of our current runqueue, but we decide not to. In fact, changing - * runqueue is slow, affects load distribution, and is a source of - * overhead for the units running on the other runqueue (we need the - * lock). So, better do that as a consequence of a well informed - * decision (or if we really don't have any other chance, as we will, - * at step 5, if we get to there). - * - * Also, being here, looking for a fallback, is an unfortunate and - * infrequent event, while the decision of putting us in the runqueue - * wehere we are was (likely) made taking all the relevant factors - * into account. So let's not disrupt that, just for the sake of - * soft-affinity, and let's wait here to be able to made (hopefully, - * soon), another similar well informed decision. - */ - if ( bs == BALANCE_SOFT_AFFINITY ) - continue; - - /* - * This is cases 5: last stand, just one valid pcpu from our hard - * affinity. It's guaranteed that there is at least one valid cpu, - * and therefore we are sure that we return it, and never really - * exit the loop. - */ - ASSERT(bs == BALANCE_HARD_AFFINITY && - !cpumask_empty(cpumask_scratch_cpu(cpu))); - cpu = cpumask_first(cpumask_scratch_cpu(cpu)); - if ( likely(cpu < nr_cpu_ids) ) - return cpu; - } - ASSERT_UNREACHABLE(); - /* - * We can't be here. But if that somehow happen (in non-debug builds), - * at least return something which both online and in our hard-affinity. - */ - return cpumask_any(cpumask_scratch_cpu(sched_unit_master(unit))); -} - -/* - * Time-to-credit, credit-to-time. - * - * We keep track of the "residual" time to make sure that frequent short - * schedules still get accounted for in the end. - * - * FIXME: Do pre-calculated division? - */ -static void t2c_update(struct csched2_runqueue_data *rqd, s_time_t time, - struct csched2_unit *svc) -{ - uint64_t val = time * rqd->max_weight + svc->residual; - - svc->residual = do_div(val, svc->weight); - svc->credit -= val; -} - -static s_time_t c2t(struct csched2_runqueue_data *rqd, s_time_t credit, struct csched2_unit *svc) -{ - return credit * svc->weight / rqd->max_weight; -} - -/* - * Runqueue related code. 
- */ - -static inline int unit_on_runq(struct csched2_unit *svc) -{ - return !list_empty(&svc->runq_elem); -} - -static inline struct csched2_unit * runq_elem(struct list_head *elem) -{ - return list_entry(elem, struct csched2_unit, runq_elem); -} - -static void activate_runqueue(struct csched2_private *prv, int rqi) -{ - struct csched2_runqueue_data *rqd; - - rqd = prv->rqd + rqi; - - BUG_ON(!cpumask_empty(&rqd->active)); - - rqd->max_weight = 1; - rqd->id = rqi; - INIT_LIST_HEAD(&rqd->svc); - INIT_LIST_HEAD(&rqd->runq); - spin_lock_init(&rqd->lock); - - __cpumask_set_cpu(rqi, &prv->active_queues); -} - -static void deactivate_runqueue(struct csched2_private *prv, int rqi) -{ - struct csched2_runqueue_data *rqd; - - rqd = prv->rqd + rqi; - - BUG_ON(!cpumask_empty(&rqd->active)); - - rqd->id = -1; - - __cpumask_clear_cpu(rqi, &prv->active_queues); -} - -static inline bool same_node(unsigned int cpua, unsigned int cpub) -{ - return cpu_to_node(cpua) == cpu_to_node(cpub); -} - -static inline bool same_socket(unsigned int cpua, unsigned int cpub) -{ - return cpu_to_socket(cpua) == cpu_to_socket(cpub); -} - -static inline bool same_core(unsigned int cpua, unsigned int cpub) -{ - return same_socket(cpua, cpub) && - cpu_to_core(cpua) == cpu_to_core(cpub); -} - -static unsigned int -cpu_to_runqueue(struct csched2_private *prv, unsigned int cpu) -{ - struct csched2_runqueue_data *rqd; - unsigned int rqi; - - for ( rqi = 0; rqi < nr_cpu_ids; rqi++ ) - { - unsigned int peer_cpu; - - /* - * As soon as we come across an uninitialized runqueue, use it. - * In fact, either: - * - we are initializing the first cpu, and we assign it to - * runqueue 0. This is handy, especially if we are dealing - * with the boot cpu (if credit2 is the default scheduler), - * as we would not be able to use cpu_to_socket() and similar - * helpers anyway (they're result of which is not reliable yet); - * - we have gone through all the active runqueues, and have not - * found anyone whose cpus' topology matches the one we are - * dealing with, so activating a new runqueue is what we want. - */ - if ( prv->rqd[rqi].id == -1 ) - break; - - rqd = prv->rqd + rqi; - BUG_ON(cpumask_empty(&rqd->active)); - - peer_cpu = cpumask_first(&rqd->active); - BUG_ON(cpu_to_socket(cpu) == XEN_INVALID_SOCKET_ID || - cpu_to_socket(peer_cpu) == XEN_INVALID_SOCKET_ID); - - if (opt_runqueue == OPT_RUNQUEUE_CPU) - continue; - if ( opt_runqueue == OPT_RUNQUEUE_ALL || - (opt_runqueue == OPT_RUNQUEUE_CORE && same_core(peer_cpu, cpu)) || - (opt_runqueue == OPT_RUNQUEUE_SOCKET && same_socket(peer_cpu, cpu)) || - (opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu)) ) - break; - } - - /* We really expect to be able to assign each cpu to a runqueue. */ - BUG_ON(rqi >= nr_cpu_ids); - - return rqi; -} - -/* Find the domain with the highest weight. 
*/ -static void update_max_weight(struct csched2_runqueue_data *rqd, int new_weight, - int old_weight) -{ - /* Try to avoid brute-force search: - * - If new_weight is larger, max_weigth <- new_weight - * - If old_weight != max_weight, someone else is still max_weight - * (No action required) - * - If old_weight == max_weight, brute-force search for max weight - */ - if ( new_weight > rqd->max_weight ) - { - rqd->max_weight = new_weight; - SCHED_STAT_CRANK(upd_max_weight_quick); - } - else if ( old_weight == rqd->max_weight ) - { - struct list_head *iter; - int max_weight = 1; - - list_for_each( iter, &rqd->svc ) - { - struct csched2_unit * svc = list_entry(iter, struct csched2_unit, rqd_elem); - - if ( svc->weight > max_weight ) - max_weight = svc->weight; - } - - rqd->max_weight = max_weight; - SCHED_STAT_CRANK(upd_max_weight_full); - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned rqi:16, max_weight:16; - } d; - d.rqi = rqd->id; - d.max_weight = rqd->max_weight; - __trace_var(TRC_CSCHED2_RUNQ_MAX_WEIGHT, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -/* Add and remove from runqueue assignment (not active run queue) */ -static void -_runq_assign(struct csched2_unit *svc, struct csched2_runqueue_data *rqd) -{ - - svc->rqd = rqd; - list_add_tail(&svc->rqd_elem, &svc->rqd->svc); - - update_max_weight(svc->rqd, svc->weight, 0); - - /* Expected new load based on adding this unit */ - rqd->b_avgload += svc->avgload; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned rqi:16; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.rqi=rqd->id; - __trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, - sizeof(d), - (unsigned char *)&d); - } - -} - -static void -runq_assign(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - - ASSERT(svc->rqd == NULL); - - _runq_assign(svc, c2rqd(ops, sched_unit_master(unit))); -} - -static void -_runq_deassign(struct csched2_unit *svc) -{ - struct csched2_runqueue_data *rqd = svc->rqd; - - ASSERT(!unit_on_runq(svc)); - ASSERT(!(svc->flags & CSFLAG_scheduled)); - - list_del_init(&svc->rqd_elem); - update_max_weight(rqd, 0, svc->weight); - - /* Expected new load based on removing this unit */ - rqd->b_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); - - svc->rqd = NULL; -} - -static void -runq_deassign(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - _runq_deassign(svc); -} - -/* - * Track the runq load by gathering instantaneous load samples, and using - * exponentially weighted moving average (EWMA) for the 'decaying'. - * - * We consider a window of length W=2^(prv->load_window_shift) nsecs - * (which takes LOADAVG_GRANULARITY_SHIFT into account). - * - * If load is the instantaneous load, the formula for EWMA looks as follows, - * for the i-eth sample: - * - * avg[i] = a*load + (1 - a)*avg[i-1] - * - * where avg[i] is the new value of the average load, avg[i-1] is the value - * of the average load calculated so far, and a is a coefficient less or - * equal to 1. - * - * So, for us, it becomes: - * - * avgload = a*load + (1 - a)*avgload - * - * For determining a, we consider _when_ we are doing the load update, wrt - * the length of the window. 
We define delta as follows: - * - * delta = t - load_last_update - * - * where t is current time (i.e., time at which we are both sampling and - * updating the load average) and load_last_update is the last time we did - * that. - * - * There are two possible situations: - * - * a) delta <= W - * this means that, during the last window of length W, the runeuque load - * was avgload for (W - detla) time, and load for delta time: - * - * |----------- W ---------| - * | | - * | load_last_update t - * -------------------------|---------|--- - * | | | - * \__W - delta__/\_delta__/ - * | | | - * |___avgload___|__load___| - * - * So, what about using delta/W as our smoothing coefficient a. If we do, - * here's what happens: - * - * a = delta / W - * 1 - a = 1 - (delta / W) = (W - delta) / W - * - * Which matches the above description of what happened in the last - * window of length W. - * - * Note that this also means that the weight that we assign to both the - * latest load sample, and to previous history, varies at each update. - * The longer the latest load sample has been in efect, within the last - * window, the higher it weights (and the lesser the previous history - * weights). - * - * This is some sort of extension of plain EWMA to fit even better to our - * use case. - * - * b) delta > W - * this means more than a full window has passed since the last update: - * - * |----------- W ---------| - * | | - * load_last_update t - * ----|------------------------------|--- - * | | - * \_________________delta________/ - * - * Basically, it means the last load sample has been in effect for more - * than W time, and hence we should just use it, and forget everything - * before that. - * - * This can be seen as a 'reset condition', occurring when, for whatever - * reason, load has not been updated for longer than we expected. (It is - * also how avgload is assigned its first value.) - * - * The formula for avgload then becomes: - * - * avgload = (delta/W)*load + (W - delta)*avgload/W - * avgload = delta*load/W + W*avgload/W - delta*avgload/W - * avgload = avgload + delta*load/W - delta*avgload/W - * - * So, final form is: - * - * avgload_0 = load - * avgload = avgload + delta*load/W - delta*avgload/W, 0<=delta<=W - * - * As a confirmation, let's look at the extremes, when delta is 0 (i.e., - * what happens if we update the load twice, at the same time instant?): - * - * avgload = avgload + 0*load/W - 0*avgload/W - * avgload = avgload - * - * and when delta is W (i.e., what happens if we update at the last - * possible instant before the window 'expires'?): - * - * avgload = avgload + W*load/W - W*avgload/W - * avgload = avgload + load - avgload - * avgload = load - * - * Which, in both cases, is what we expect. - */ -static void -update_runq_load(const struct scheduler *ops, - struct csched2_runqueue_data *rqd, int change, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - s_time_t delta, load = rqd->load; - unsigned int P, W; - - W = prv->load_window_shift; - P = prv->load_precision_shift; - now >>= LOADAVG_GRANULARITY_SHIFT; - - /* - * To avoid using fractions, we shift to left by load_precision_shift, - * and use the least last load_precision_shift bits as fractional part. 
- * Looking back at the formula we want to use, we now have: - * - * P = 2^(load_precision_shift) - * P*avgload = P*(avgload + delta*load/W - delta*avgload/W) - * P*avgload = P*avgload + delta*load*P/W - delta*P*avgload/W - * - * And if we are ok storing and using P*avgload, we can rewrite this as: - * - * P*avgload = avgload' - * avgload' = avgload' + delta*P*load/W - delta*avgload'/W - * - * Coupled with, of course: - * - * avgload_0' = P*load - */ - - if ( rqd->load_last_update + (1ULL << W) < now ) - { - rqd->avgload = load << P; - rqd->b_avgload = load << P; - } - else - { - delta = now - rqd->load_last_update; - if ( unlikely(delta < 0) ) - { - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", - __func__, now, rqd->load_last_update); - delta = 0; - } - - /* - * Note that, if we were to enforce (or check) some relationship - * between P and W, we may save one shift. E.g., if we are sure - * that P < W, we could write: - * - * (delta * (load << P)) >> W - * - * as: - * - * (delta * load) >> (W - P) - */ - rqd->avgload = rqd->avgload + - ((delta * (load << P)) >> W) - - ((delta * rqd->avgload) >> W); - rqd->b_avgload = rqd->b_avgload + - ((delta * (load << P)) >> W) - - ((delta * rqd->b_avgload) >> W); - } - rqd->load += change; - rqd->load_last_update = now; - - /* Overflow, capable of making the load look negative, must not occur. */ - ASSERT(rqd->avgload >= 0 && rqd->b_avgload >= 0); - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t rq_avgload, b_avgload; - unsigned rq_load:16, rq_id:8, shift:8; - } d; - d.rq_id = rqd->id; - d.rq_load = rqd->load; - d.rq_avgload = rqd->avgload; - d.b_avgload = rqd->b_avgload; - d.shift = P; - __trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static void -update_svc_load(const struct scheduler *ops, - struct csched2_unit *svc, int change, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - s_time_t delta, unit_load; - unsigned int P, W; - - if ( change == -1 ) - unit_load = 1; - else if ( change == 1 ) - unit_load = 0; - else - unit_load = unit_runnable(svc->unit); - - W = prv->load_window_shift; - P = prv->load_precision_shift; - now >>= LOADAVG_GRANULARITY_SHIFT; - - if ( svc->load_last_update + (1ULL << W) < now ) - { - svc->avgload = unit_load << P; - } - else - { - delta = now - svc->load_last_update; - if ( unlikely(delta < 0) ) - { - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" llu %"PRI_stime"\n", - __func__, now, svc->load_last_update); - delta = 0; - } - - svc->avgload = svc->avgload + - ((delta * (unit_load << P)) >> W) - - ((delta * svc->avgload) >> W); - } - svc->load_last_update = now; - - /* Overflow, capable of making the load look negative, must not occur. 
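As a side note, the fixed-point EWMA update spelled out in the comment above can be condensed into a few lines. The following is only an illustrative, self-contained sketch; struct ewma_state, ewma_update() and the two shift parameters are hypothetical names, not part of the scheduler:

    #include <stdint.h>

    struct ewma_state {
        int64_t avgload;      /* P-scaled average load */
        int64_t last_update;  /* time of the last sample, window-granularity units */
    };

    /*
     * One EWMA step, as derived in the comment above:
     *   avgload' += delta*(load << P)/W - delta*avgload'/W
     * with a full reset when more than one window (W = 2^window_shift)
     * has passed. window_shift and precision_shift are hypothetical inputs.
     */
    static void ewma_update(struct ewma_state *st, int64_t now, unsigned int load,
                            unsigned int window_shift, unsigned int precision_shift)
    {
        if ( st->last_update + (1LL << window_shift) < now )
        {
            /* Reset condition: forget history, keep only the latest sample. */
            st->avgload = (int64_t)load << precision_shift;
        }
        else
        {
            int64_t delta = now - st->last_update;

            st->avgload += ((delta * ((int64_t)load << precision_shift))
                            >> window_shift)
                           - ((delta * st->avgload) >> window_shift);
        }
        st->last_update = now;
    }

With delta == 0 the average is left unchanged, and with delta equal to a full window it collapses to the latest sample, matching the two extreme cases worked out above.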
*/ - ASSERT(svc->avgload >= 0); - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t v_avgload; - unsigned unit:16, dom:16; - unsigned shift; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.v_avgload = svc->avgload; - d.shift = P; - __trace_var(TRC_CSCHED2_UPDATE_UNIT_LOAD, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static void -update_load(const struct scheduler *ops, - struct csched2_runqueue_data *rqd, - struct csched2_unit *svc, int change, s_time_t now) -{ - trace_var(TRC_CSCHED2_UPDATE_LOAD, 1, 0, NULL); - - update_runq_load(ops, rqd, change, now); - if ( svc ) - update_svc_load(ops, svc, change, now); -} - -static void -runq_insert(const struct scheduler *ops, struct csched2_unit *svc) -{ - struct list_head *iter; - unsigned int cpu = sched_unit_master(svc->unit); - struct list_head * runq = &c2rqd(ops, cpu)->runq; - int pos = 0; - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - ASSERT(!unit_on_runq(svc)); - ASSERT(c2r(cpu) == c2r(sched_unit_master(svc->unit))); - - ASSERT(&svc->rqd->runq == runq); - ASSERT(!is_idle_unit(svc->unit)); - ASSERT(!svc->unit->is_running); - ASSERT(!(svc->flags & CSFLAG_scheduled)); - - list_for_each( iter, runq ) - { - struct csched2_unit * iter_svc = runq_elem(iter); - - if ( svc->credit > iter_svc->credit ) - break; - - pos++; - } - list_add_tail(&svc->runq_elem, iter); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned pos; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.pos = pos; - __trace_var(TRC_CSCHED2_RUNQ_POS, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -static inline void runq_remove(struct csched2_unit *svc) -{ - ASSERT(unit_on_runq(svc)); - list_del_init(&svc->runq_elem); -} - -void burn_credits(struct csched2_runqueue_data *rqd, struct csched2_unit *, s_time_t); - -static inline void -tickle_cpu(unsigned int cpu, struct csched2_runqueue_data *rqd) -{ - __cpumask_set_cpu(cpu, &rqd->tickled); - smt_idle_mask_clear(cpu, &rqd->smt_idle); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); -} - -/* - * What we want to know is whether svc, which we assume to be running on some - * pcpu, can be interrupted and preempted (which, so far, basically means - * whether or not it already run for more than the ratelimit, to which we - * apply some tolerance). - */ -static inline bool is_preemptable(const struct csched2_unit *svc, - s_time_t now, s_time_t ratelimit) -{ - if ( ratelimit <= CSCHED2_RATELIMIT_TICKLE_TOLERANCE ) - return true; - - ASSERT(svc->unit->is_running); - return now - svc->unit->state_entry_time > - ratelimit - CSCHED2_RATELIMIT_TICKLE_TOLERANCE; -} - -/* - * Score to preempt the target cpu. Return a negative number if the - * credit isn't high enough; if it is, favor a preemption on cpu in - * this order: - * - cpu is in new's soft-affinity, not in cur's soft-affinity - * (2 x CSCHED2_CREDIT_INIT score bonus); - * - cpu is in new's soft-affinity and cur's soft-affinity, or - * cpu is not in new's soft-affinity, nor in cur's soft-affinity - * (1x CSCHED2_CREDIT_INIT score bonus); - * - cpu is not in new's soft-affinity, while it is in cur's soft-affinity - * (no bonus). - * - * Within the same class, the highest difference of credit. 
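For illustration only, the three classes listed above can be read as the following bonus arithmetic; the real computation is tickle_score() just below, and the boolean parameters here merely stand in for the cpumask_test_cpu() checks on the two soft-affinity masks:

    /*
     * Illustrative only: credit_init stands in for CSCHED2_CREDIT_INIT,
     * and score is the (already migrate-resist adjusted) credit difference.
     */
    static long long apply_soft_affinity_bonus(long long score,
                                               long long credit_init,
                                               int cpu_in_new_soft,
                                               int cpu_in_cur_soft)
    {
        if ( score <= 0 )
            return score;             /* not enough credit: no preemption */

        if ( cpu_in_new_soft )
            score += credit_init;     /* good for the unit being woken */
        if ( !cpu_in_cur_soft )
            score += credit_init;     /* does not hurt the running unit */

        return score;
    }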
- */ -static s_time_t tickle_score(const struct scheduler *ops, s_time_t now, - struct csched2_unit *new, unsigned int cpu) -{ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct csched2_unit * cur = csched2_unit(curr_on_cpu(cpu)); - struct csched2_private *prv = csched2_priv(ops); - s_time_t score; - - /* - * We are dealing with cpus that are marked non-idle (i.e., that are not - * in rqd->idle). However, some of them may be running their idle unit, - * if taking care of tasklets. In that case, we want to leave it alone. - */ - if ( unlikely(is_idle_unit(cur->unit) || - !is_preemptable(cur, now, MICROSECS(prv->ratelimit_us))) ) - return -1; - - burn_credits(rqd, cur, now); - - score = new->credit - cur->credit; - if ( sched_unit_master(new->unit) != cpu ) - score -= CSCHED2_MIGRATE_RESIST; - - /* - * If score is positive, it means new has enough credits (i.e., - * new->credit > cur->credit+CSCHED2_MIGRATE_RESIST). - * - * Let's compute the bonuses for soft-affinities. - */ - if ( score > 0 ) - { - if ( cpumask_test_cpu(cpu, new->unit->cpu_soft_affinity) ) - score += CSCHED2_CREDIT_INIT; - - if ( !cpumask_test_cpu(cpu, cur->unit->cpu_soft_affinity) ) - score += CSCHED2_CREDIT_INIT; - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit, score; - } d; - d.dom = cur->unit->domain->domain_id; - d.unit = cur->unit->unit_id; - d.credit = cur->credit; - d.score = score; - __trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - return score; -} - -/* - * Check what processor it is best to 'wake', for picking up an unit that has - * just been put (back) in the runqueue. Logic is as follows: - * 1. if there are idle processors in the runq, wake one of them; - * 2. if there aren't idle processor, check the one were the unit was - * running before to see if we can preempt what's running there now - * (and hence doing just one migration); - * 3. last stand: check all processors and see if the unit is in right - * of preempting any of the other units running on them (this requires - * two migrations, and that's indeed why it is left as the last stand). - * - * Note that when we say 'idle processors' what we really mean is (pretty - * much always) both _idle_ and _not_already_tickled_. In fact, if a - * processor has been tickled, it will run csched2_schedule() shortly, and - * pick up some work, so it would be wrong to consider it idle. - */ -static void -runq_tickle(const struct scheduler *ops, struct csched2_unit *new, s_time_t now) -{ - int i, ipid = -1; - s_time_t max = 0; - struct sched_unit *unit = new->unit; - unsigned int bs, cpu = sched_unit_master(unit); - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - cpumask_t *online = cpupool_domain_master_cpumask(unit->domain); - cpumask_t mask; - - ASSERT(new->rqd == rqd); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned processor; - int credit; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.processor = cpu; - d.credit = new->credit; - __trace_var(TRC_CSCHED2_TICKLE_NEW, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* - * Exclusive pinning is when an unit has hard-affinity with only one - * cpu, and there is no other unit that has hard-affinity with that - * same cpu. This is infrequent, but if it happens, is for achieving - * the most possible determinism, and least possible overhead for - * the units in question. 
- * - * Try to identify the vast majority of these situations, and deal - * with them quickly. - */ - if ( unlikely((new->flags & CSFLAG_pinned) && - cpumask_test_cpu(cpu, &rqd->idle) && - !cpumask_test_cpu(cpu, &rqd->tickled)) ) - { - ASSERT(cpumask_cycle(cpu, unit->cpu_hard_affinity) == cpu); - SCHED_STAT_CRANK(tickled_idle_cpu_excl); - ipid = cpu; - goto tickle; - } - - for_each_affinity_balance_step( bs ) - { - /* Just skip first step, if we don't have a soft affinity */ - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - - /* - * First of all, consider idle cpus, checking if we can just - * re-use the pcpu where we were running before. - * - * If there are cores where all the siblings are idle, consider - * them first, honoring whatever the spreading-vs-consolidation - * SMT policy wants us to do. - */ - if ( unlikely(sched_smt_power_savings) ) - { - cpumask_andnot(&mask, &rqd->idle, &rqd->smt_idle); - cpumask_and(&mask, &mask, online); - } - else - cpumask_and(&mask, &rqd->smt_idle, online); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - i = cpumask_test_or_cycle(cpu, &mask); - if ( i < nr_cpu_ids ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - ipid = i; - goto tickle; - } - - /* - * If there are no fully idle cores, check all idlers, after - * having filtered out pcpus that have been tickled but haven't - * gone through the scheduler yet. - */ - cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), online); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - i = cpumask_test_or_cycle(cpu, &mask); - if ( i < nr_cpu_ids ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - ipid = i; - goto tickle; - } - } - - /* - * Note that, if we are here, it means we have done the hard-affinity - * balancing step of the loop, and hence what we have in cpumask_scratch - * is what we put there for last, i.e., new's unit_hard_affinity & online - * which is exactly what we need for the next part of the function. - */ - - /* - * Otherwise, look for the non-idle (and non-tickled) processors with - * the lowest credit, among the ones new is allowed to run on. Again, - * the cpu were it was running on would be the best candidate. - * - * For deciding which cpu to tickle, we use tickle_score(), which will - * factor in both new's soft-affinity, and the soft-affinity of the - * unit running on each cpu that we consider. 
- */ - cpumask_andnot(&mask, &rqd->active, &rqd->idle); - cpumask_andnot(&mask, &mask, &rqd->tickled); - cpumask_and(&mask, &mask, cpumask_scratch_cpu(cpu)); - if ( __cpumask_test_and_clear_cpu(cpu, &mask) ) - { - s_time_t score = tickle_score(ops, now, new, cpu); - - if ( score > max ) - { - max = score; - ipid = cpu; - - /* If this is in new's soft affinity, just take it */ - if ( cpumask_test_cpu(cpu, unit->cpu_soft_affinity) ) - { - SCHED_STAT_CRANK(tickled_busy_cpu); - goto tickle; - } - } - } - - for_each_cpu(i, &mask) - { - s_time_t score; - - /* Already looked at this one above */ - ASSERT(i != cpu); - - score = tickle_score(ops, now, new, i); - - if ( score > max ) - { - max = score; - ipid = i; - } - } - - if ( ipid == -1 ) - { - SCHED_STAT_CRANK(tickled_no_cpu); - return; - } - - ASSERT(!is_idle_unit(curr_on_cpu(ipid))); - SCHED_STAT_CRANK(tickled_busy_cpu); - tickle: - BUG_ON(ipid == -1); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, pad:16; - } d; - d.cpu = ipid; d.pad = 0; - __trace_var(TRC_CSCHED2_TICKLE, 1, - sizeof(d), - (unsigned char *)&d); - } - - tickle_cpu(ipid, rqd); - - if ( unlikely(new->tickled_cpu != -1) ) - SCHED_STAT_CRANK(tickled_cpu_overwritten); - new->tickled_cpu = ipid; -} - -/* - * Credit-related code - */ -static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, - struct csched2_unit *snext) -{ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct list_head *iter; - int m; - - /* - * Under normal circumstances, snext->credit should never be less - * than -CSCHED2_MIN_TIMER. However, under some circumstances, an - * unit with low credits may be allowed to run long enough that - * its credits are actually less than -CSCHED2_CREDIT_INIT. - * (Instances have been observed, for example, where an unit with - * 200us of credit was allowed to run for 11ms, giving it -10.8ms - * of credit. Thus it was still negative even after the reset.) - * - * If this is the case for snext, we simply want to keep moving - * everyone up until it is in the black again. This fair because - * none of the other units want to run at the moment. - * - * Rather than looping, however, we just calculate a multiplier, - * avoiding an integer division and multiplication in the common - * case. - */ - m = 1; - if ( snext->credit < -CSCHED2_CREDIT_INIT ) - m += (-snext->credit) / CSCHED2_CREDIT_INIT; - - list_for_each( iter, &rqd->svc ) - { - unsigned int svc_cpu; - struct csched2_unit * svc; - int start_credit; - - svc = list_entry(iter, struct csched2_unit, rqd_elem); - svc_cpu = sched_unit_master(svc->unit); - - ASSERT(!is_idle_unit(svc->unit)); - ASSERT(svc->rqd == rqd); - - /* - * If svc is running, it is our responsibility to make sure, here, - * that the credit it has spent so far get accounted. - */ - if ( svc->unit == curr_on_cpu(svc_cpu) ) - { - burn_credits(rqd, svc, now); - /* - * And, similarly, in case it has run out of budget, as a - * consequence of this round of accounting, we also must inform - * its pCPU that it's time to park it, and pick up someone else. - */ - if ( unlikely(svc->budget <= 0) ) - tickle_cpu(svc_cpu, rqd); - } - - start_credit = svc->credit; - - /* - * Add INIT * m, avoiding integer multiplication in the common case. 
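As a purely illustrative aside, the multiplier described above can be isolated as below (credit_init is a stand-in for CSCHED2_CREDIT_INIT). For example, with credit_init = 10 and snext at -25 credits, m becomes 3, so every unit is topped up by 3*credit_init and snext ends at +5, i.e. back in the black:

    static int reset_multiplier(long long snext_credit, long long credit_init)
    {
        int m = 1;

        /* Only units that overran by more than one full allotment need m > 1. */
        if ( snext_credit < -credit_init )
            m += (-snext_credit) / credit_init;

        return m;
    }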
- */ - if ( likely(m==1) ) - svc->credit += CSCHED2_CREDIT_INIT; - else - svc->credit += m * CSCHED2_CREDIT_INIT; - - /* "Clip" credits to max carryover */ - if ( svc->credit > CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX ) - svc->credit = CSCHED2_CREDIT_INIT + CSCHED2_CARRYOVER_MAX; - - svc->start_time = now; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit_start, credit_end; - unsigned multiplier; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.credit_start = start_credit; - d.credit_end = svc->credit; - d.multiplier = m; - __trace_var(TRC_CSCHED2_CREDIT_RESET, 1, - sizeof(d), - (unsigned char *)&d); - } - } - - SCHED_STAT_CRANK(credit_reset); - - /* No need to resort runqueue, as everyone's order should be the same. */ -} - -void burn_credits(struct csched2_runqueue_data *rqd, - struct csched2_unit *svc, s_time_t now) -{ - s_time_t delta; - - ASSERT(svc == csched2_unit(curr_on_cpu(sched_unit_master(svc->unit)))); - - if ( unlikely(is_idle_unit(svc->unit)) ) - { - ASSERT(svc->credit == CSCHED2_IDLE_CREDIT); - return; - } - - delta = now - svc->start_time; - - if ( unlikely(delta <= 0) ) - { - if ( unlikely(delta < 0) ) - d2printk("WARNING: %s: Time went backwards? now %"PRI_stime - " start_time %"PRI_stime"\n", __func__, now, - svc->start_time); - goto out; - } - - SCHED_STAT_CRANK(burn_credits_t2c); - t2c_update(rqd, delta, svc); - - if ( has_cap(svc) ) - svc->budget -= delta; - - svc->start_time = now; - - out: - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - int credit, budget; - int delta; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.credit = svc->credit; - d.budget = has_cap(svc) ? svc->budget : INT_MIN; - d.delta = delta; - __trace_var(TRC_CSCHED2_CREDIT_BURN, 1, - sizeof(d), - (unsigned char *)&d); - } -} - -/* - * Budget-related code. - */ - -static void park_unit(struct csched2_unit *svc) -{ - struct sched_unit *unit = svc->unit; - - ASSERT(spin_is_locked(&svc->sdom->budget_lock)); - - /* - * It was impossible to find budget for this unit, so it has to be - * "parked". This implies it is not runnable, so we mark it as such in - * its pause_flags. If the unit is currently scheduled (which means we - * are here after being called from within csched_schedule()), flagging - * is enough, as we'll choose someone else, and then context_saved() - * will take care of updating the load properly. - * - * If, OTOH, the unit is sitting in the runqueue (which means we are here - * after being called from within runq_candidate()), we must go all the - * way down to taking it out of there, and updating the load accordingly. - * - * In both cases, we also add it to the list of parked units of the domain. - */ - sched_set_pause_flags(unit, _VPF_parked); - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW()); - } - list_add(&svc->parked_elem, &svc->sdom->parked_units); -} - -static bool unit_grab_budget(struct csched2_unit *svc) -{ - struct csched2_dom *sdom = svc->sdom; - unsigned int cpu = sched_unit_master(svc->unit); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - if ( svc->budget > 0 ) - return true; - - /* budget_lock nests inside runqueue lock. */ - spin_lock(&sdom->budget_lock); - - /* - * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if - * above!). 
That basically means the unit has overrun a bit --because of - * various reasons-- and we want to take that into account. With the +=, - * we are actually subtracting the amount of budget the unit has - * overconsumed, from the total domain budget. - */ - sdom->budget += svc->budget; - - if ( sdom->budget > 0 ) - { - s_time_t budget; - - /* Get our quota, if there's at least as much budget */ - if ( likely(sdom->budget >= svc->budget_quota) ) - budget = svc->budget_quota; - else - budget = sdom->budget; - - svc->budget = budget; - sdom->budget -= budget; - } - else - { - svc->budget = 0; - park_unit(svc); - } - - spin_unlock(&sdom->budget_lock); - - return svc->budget > 0; -} - -static void -unit_return_budget(struct csched2_unit *svc, struct list_head *parked) -{ - struct csched2_dom *sdom = svc->sdom; - unsigned int cpu = sched_unit_master(svc->unit); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - ASSERT(list_empty(parked)); - - /* budget_lock nests inside runqueue lock. */ - spin_lock(&sdom->budget_lock); - - /* - * The unit is stopping running (e.g., because it's blocking, or it has - * been preempted). If it hasn't consumed all the budget it got when, - * starting to run, put that remaining amount back in the domain's budget - * pool. - */ - sdom->budget += svc->budget; - svc->budget = 0; - - /* - * Making budget available again to the domain means that parked units - * may be unparked and run. They are, if any, in the domain's parked_units - * list, so we want to go through that and unpark them (so they can try - * to get some budget). - * - * Touching the list requires the budget_lock, which we hold. Let's - * therefore put everyone in that list in another, temporary list, which - * then the caller will traverse, unparking the units it finds there. - * - * In fact, we can't do the actual unparking here, because that requires - * taking the runqueue lock of the units being unparked, and we can't - * take any runqueue locks while we hold a budget_lock. - */ - if ( sdom->budget > 0 ) - list_splice_init(&sdom->parked_units, parked); - - spin_unlock(&sdom->budget_lock); -} - -static void -unpark_parked_units(const struct scheduler *ops, struct list_head *units) -{ - struct csched2_unit *svc, *tmp; - spinlock_t *lock; - - list_for_each_entry_safe ( svc, tmp, units, parked_elem ) - { - unsigned long flags; - s_time_t now; - - lock = unit_schedule_lock_irqsave(svc->unit, &flags); - - sched_clear_pause_flags(svc->unit, _VPF_parked); - if ( unlikely(svc->flags & CSFLAG_scheduled) ) - { - /* - * We end here if a budget replenishment arrived between - * csched2_schedule() (and, in particular, after a call to - * unit_grab_budget() that returned false), and - * context_saved(). By setting __CSFLAG_delayed_runq_add, - * we tell context_saved() to put the unit back in the - * runqueue, from where it will compete with the others - * for the newly replenished budget. - */ - ASSERT( svc->rqd != NULL ); - ASSERT( c2rqd(ops, sched_unit_master(svc->unit)) == svc->rqd ); - __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); - } - else if ( unit_runnable(svc->unit) ) - { - /* - * The unit should go back to the runqueue, and compete for - * the newly replenished budget, but only if it is actually - * runnable (and was therefore offline only because of the - * lack of budget). 
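For reference, here is a minimal sketch (hypothetical names, not the in-tree interface) of the budget pool bookkeeping that unit_grab_budget() and unit_return_budget() perform above: the overrun carried in svc_budget is charged back to the pool before a fresh quota, capped by what is left, is handed out:

    /*
     * Illustrative only. A non-positive svc_budget is the overrun charged
     * back to the domain pool; the return value is the unit's new budget,
     * with 0 meaning "park me".
     */
    static long long grab_budget(long long *pool, long long svc_budget,
                                 long long quota)
    {
        *pool += svc_budget;            /* charge back any overrun */

        if ( *pool <= 0 )
            return 0;                   /* no budget left in the pool */

        svc_budget = (*pool >= quota) ? quota : *pool;
        *pool -= svc_budget;

        return svc_budget;
    }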
- */ - now = NOW(); - update_load(ops, svc->rqd, svc, 1, now); - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - } - list_del_init(&svc->parked_elem); - - unit_schedule_unlock_irqrestore(lock, flags, svc->unit); - } -} - -static inline void do_replenish(struct csched2_dom *sdom) -{ - sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD; - sdom->budget += sdom->tot_budget; -} - -static void replenish_domain_budget(void* data) -{ - struct csched2_dom *sdom = data; - unsigned long flags; - s_time_t now; - LIST_HEAD(parked); - - spin_lock_irqsave(&sdom->budget_lock, flags); - - now = NOW(); - - /* - * Let's do the replenishment. Note, though, that a domain may overrun, - * which means the budget would have gone below 0 (reasons may be system - * overbooking, accounting issues, etc.). It also may happen that we are - * handling the replenishment (much) later than we should (reasons may - * again be overbooking, or issues with timers). - * - * Even in cases of overrun or delay, however, we expect that in 99% of - * cases, doing just one replenishment will be good enough for being able - * to unpark the units that are waiting for some budget. - */ - do_replenish(sdom); - - /* - * And now, the special cases: - * 1) if we are late enough to have skipped (at least) one full period, - * what we must do is doing more replenishments. Note that, however, - * every time we add tot_budget to the budget, we also move next_repl - * away by CSCHED2_BDGT_REPL_PERIOD, to make sure the cap is always - * respected. - */ - if ( unlikely(sdom->next_repl <= now) ) - { - do - do_replenish(sdom); - while ( sdom->next_repl <= now ); - } - /* - * 2) if we overrun by more than tot_budget, then budget+tot_budget is - * still < 0, which means that we can't unpark the units. Let's bail, - * and wait for future replenishments. - */ - if ( unlikely(sdom->budget <= 0) ) - { - spin_unlock_irqrestore(&sdom->budget_lock, flags); - goto out; - } - - /* Since we do more replenishments, make sure we didn't overshot. */ - sdom->budget = min(sdom->budget, sdom->tot_budget); - - /* - * As above, let's prepare the temporary list, out of the domain's - * parked_units list, now that we hold the budget_lock. Then, drop such - * lock, and pass the list to the unparking function. - */ - list_splice_init(&sdom->parked_units, &parked); - - spin_unlock_irqrestore(&sdom->budget_lock, flags); - - unpark_parked_units(sdom->dom->cpupool->sched, &parked); - - out: - set_timer(&sdom->repl_timer, sdom->next_repl); -} - -#ifndef NDEBUG -static inline void -csched2_unit_check(struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - struct csched2_dom * const sdom = svc->sdom; - - BUG_ON( svc->unit != unit ); - BUG_ON( sdom != csched2_dom(unit->domain) ); - if ( sdom ) - { - BUG_ON( is_idle_unit(unit) ); - BUG_ON( sdom->dom != unit->domain ); - } - else - { - BUG_ON( !is_idle_unit(unit) ); - } - SCHED_STAT_CRANK(unit_check); -} -#define CSCHED2_UNIT_CHECK(unit) (csched2_unit_check(unit)) -#else -#define CSCHED2_UNIT_CHECK(unit) -#endif - -static void * -csched2_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - struct csched2_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct csched2_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->rqd_elem); - INIT_LIST_HEAD(&svc->runq_elem); - - svc->sdom = dd; - svc->unit = unit; - svc->flags = 0U; - - if ( ! 
is_idle_unit(unit) ) - { - ASSERT(svc->sdom != NULL); - svc->credit = CSCHED2_CREDIT_INIT; - svc->weight = svc->sdom->weight; - /* Starting load of 50% */ - svc->avgload = 1ULL << (csched2_priv(ops)->load_precision_shift - 1); - svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; - } - else - { - ASSERT(svc->sdom == NULL); - svc->credit = CSCHED2_IDLE_CREDIT; - svc->weight = 0; - } - svc->tickled_cpu = -1; - - svc->budget = STIME_MAX; - svc->budget_quota = 0; - INIT_LIST_HEAD(&svc->parked_elem); - - SCHED_STAT_CRANK(unit_alloc); - - return svc; -} - -static void -csched2_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - - ASSERT(!is_idle_unit(unit)); - SCHED_STAT_CRANK(unit_sleep); - - if ( curr_on_cpu(sched_unit_master(unit)) == unit ) - { - tickle_cpu(sched_unit_master(unit), svc->rqd); - } - else if ( unit_on_runq(svc) ) - { - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - update_load(ops, svc->rqd, svc, -1, NOW()); - runq_remove(svc); - } - else - __clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); -} - -static void -csched2_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - unsigned int cpu = sched_unit_master(unit); - s_time_t now; - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - ASSERT(!is_idle_unit(unit)); - - if ( unlikely(curr_on_cpu(cpu) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - goto out; - } - - if ( unlikely(unit_on_runq(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - goto out; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* If the context hasn't been saved for this unit yet, we can't put it on - * another runqueue. Instead, we set a flag so that it will be put on the runqueue - * after the context has been saved. */ - if ( unlikely(svc->flags & CSFLAG_scheduled) ) - { - __set_bit(__CSFLAG_delayed_runq_add, &svc->flags); - goto out; - } - - /* Add into the new runqueue if necessary */ - if ( svc->rqd == NULL ) - runq_assign(ops, unit); - else - ASSERT(c2rqd(ops, sched_unit_master(unit)) == svc->rqd ); - - now = NOW(); - - update_load(ops, svc->rqd, svc, 1, now); - - /* Put the UNIT on the runq */ - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - -out: - return; -} - -static void -csched2_unit_yield(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - - __set_bit(__CSFLAG_unit_yield, &svc->flags); -} - -static void -csched2_context_saved(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock_irq(unit); - s_time_t now = NOW(); - LIST_HEAD(were_parked); - - BUG_ON( !is_idle_unit(unit) && - svc->rqd != c2rqd(ops, sched_unit_master(unit))); - ASSERT(is_idle_unit(unit) || - svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - /* This unit is now eligible to be put on the runqueue again */ - __clear_bit(__CSFLAG_scheduled, &svc->flags); - - if ( unlikely(has_cap(svc) && svc->budget > 0) ) - unit_return_budget(svc, &were_parked); - - /* If someone wants it on the runqueue, put it there. */ - /* - * NB: We can get rid of CSFLAG_scheduled by checking for - * vc->is_running and unit_on_runq(svc) here. 
However, - * since we're accessing the flags cacheline anyway, - * it seems a bit pointless; especially as we have plenty of - * bits free. - */ - if ( __test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) - && likely(unit_runnable(unit)) ) - { - ASSERT(!unit_on_runq(svc)); - - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - } - else if ( !is_idle_unit(unit) ) - update_load(ops, svc->rqd, svc, -1, now); - - unit_schedule_unlock_irq(lock, unit); - - unpark_parked_units(ops, &were_parked); -} - -#define MAX_LOAD (STIME_MAX) -static struct sched_resource * -csched2_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - struct csched2_private *prv = csched2_priv(ops); - int i, min_rqi = -1, min_s_rqi = -1; - unsigned int new_cpu, cpu = sched_unit_master(unit); - struct csched2_unit *svc = csched2_unit(unit); - s_time_t min_avgload = MAX_LOAD, min_s_avgload = MAX_LOAD; - bool has_soft; - - ASSERT(!cpumask_empty(&prv->active_queues)); - - SCHED_STAT_CRANK(pick_resource); - - /* Locking: - * - Runqueue lock of vc->processor is already locked - * - Need to grab prv lock to make sure active runqueues don't - * change - * - Need to grab locks for other runqueues while checking - * avgload - * Locking constraint is: - * - Lock prv before runqueue locks - * - Trylock between runqueue locks (no ordering) - * - * Since one of the runqueue locks is already held, we can't - * just grab the prv lock. Instead, we'll have to trylock, and - * do something else reasonable if we fail. - */ - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - if ( !read_trylock(&prv->lock) ) - { - /* We may be here because someone requested us to migrate. */ - __clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); - new_cpu = get_fallback_cpu(svc); - /* - * Tracing of runq and its load won't be accurate, since we could - * not get the lock, but at least we will output the chosen pcpu. - */ - goto out; - } - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - /* - * First check to see if we're here because someone else suggested a place - * for us to move. - */ - if ( __test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) - { - if ( unlikely(svc->migrate_rqd->id < 0) ) - { - printk(XENLOG_WARNING "%s: target runqueue disappeared!\n", - __func__); - } - else if ( cpumask_intersects(cpumask_scratch_cpu(cpu), - &svc->migrate_rqd->active) ) - { - /* - * If we've been asked to move to migrate_rqd, we should just do - * that, which we actually do by returning one cpu from that runq. - * There is no need to take care of soft affinity, as that will - * happen in runq_tickle(). - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &svc->migrate_rqd->active); - new_cpu = cpumask_cycle(svc->migrate_rqd->pick_bias, - cpumask_scratch_cpu(cpu)); - - svc->migrate_rqd->pick_bias = new_cpu; - goto out_up; - } - /* Fall-through to normal cpu pick */ - } - - /* - * What we want is: - * - if we have soft affinity, the runqueue with the lowest average - * load, among the ones that contain cpus in our soft affinity; this - * represents the best runq on which we would want to run. - * - the runqueue with the lowest average load among the ones that - * contains cpus in our hard affinity; this represent the best runq - * on which we can run. - * - * Find both runqueues in one pass. 
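A condensed, illustrative sketch of that one-pass search follows; rq_load(), rq_has_hard() and rq_has_soft() are hypothetical helpers standing in for the b_avgload reads and the cpumask intersections done by the real code below:

    #include <limits.h>

    /* Hypothetical helpers: per-runqueue load and affinity tests. */
    extern long long rq_load(int rqi);
    extern int rq_has_hard(int rqi);
    extern int rq_has_soft(int rqi);

    static int pick_runqueue(int nr_runqueues, int has_soft)
    {
        long long min_avgload = LLONG_MAX, min_s_avgload = LLONG_MAX;
        int i, min_rqi = -1, min_s_rqi = -1;

        for ( i = 0; i < nr_runqueues; i++ )
        {
            long long load;

            if ( !rq_has_hard(i) )
                continue;                /* the unit cannot run here at all */

            load = rq_load(i);

            if ( load < min_avgload )
            {
                min_avgload = load;      /* best runqueue we *can* use */
                min_rqi = i;
            }
            if ( has_soft && rq_has_soft(i) && load < min_s_avgload )
            {
                min_s_avgload = load;    /* best runqueue we would *like* to use */
                min_s_rqi = i;
            }
        }

        /* Prefer the soft-affinity winner, fall back to the hard-affinity one. */
        return min_s_rqi != -1 ? min_s_rqi : min_rqi;
    }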
- */ - has_soft = has_soft_affinity(unit); - for_each_cpu(i, &prv->active_queues) - { - struct csched2_runqueue_data *rqd; - s_time_t rqd_avgload = MAX_LOAD; - - rqd = prv->rqd + i; - - /* - * If none of the cpus of this runqueue is in svc's hard-affinity, - * skip the runqueue. - * - * Note that, in case svc's hard-affinity has changed, this is the - * first time when we see such change, so it is indeed possible - * that we end up skipping svc's current runqueue. - */ - if ( !cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active) ) - continue; - - /* - * If checking a different runqueue, grab the lock, read the avg, - * and then release the lock. - * - * If on our own runqueue, don't grab or release the lock; - * but subtract our own load from the runqueue load to simulate - * impartiality. - */ - if ( rqd == svc->rqd ) - { - rqd_avgload = max_t(s_time_t, rqd->b_avgload - svc->avgload, 0); - } - else if ( spin_trylock(&rqd->lock) ) - { - rqd_avgload = rqd->b_avgload; - spin_unlock(&rqd->lock); - } - - /* - * if svc has a soft-affinity, and some cpus of rqd are part of it, - * see if we need to update the "soft-affinity minimum". - */ - if ( has_soft && - rqd_avgload < min_s_avgload ) - { - cpumask_t mask; - - cpumask_and(&mask, cpumask_scratch_cpu(cpu), &rqd->active); - if ( cpumask_intersects(&mask, unit->cpu_soft_affinity) ) - { - min_s_avgload = rqd_avgload; - min_s_rqi = i; - } - } - /* In any case, keep the "hard-affinity minimum" updated too. */ - if ( rqd_avgload < min_avgload ) - { - min_avgload = rqd_avgload; - min_rqi = i; - } - } - - if ( has_soft && min_s_rqi != -1 ) - { - /* - * We have soft affinity, and we have a candidate runq, so go for it. - * - * Note that, to obtain the soft-affinity mask, we "just" put what we - * have in cpumask_scratch in && with unit->cpu_soft_affinity. This is - * ok because: - * - we know that unit->cpu_hard_affinity and ->cpu_soft_affinity have - * a non-empty intersection (because has_soft is true); - * - we have unit->cpu_hard_affinity & cpupool_domain_master_cpumask() - * already in cpumask_scratch, we do save a lot doing like this. - * - * It's kind of like open coding affinity_balance_cpumask() but, in - * this specific case, calling that would mean a lot of (unnecessary) - * cpumask operations. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - unit->cpu_soft_affinity); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->rqd[min_s_rqi].active); - } - else if ( min_rqi != -1 ) - { - /* - * Either we don't have soft-affinity, or we do, but we did not find - * any suitable runq. But we did find one when considering hard - * affinity, so go for it. - * - * cpumask_scratch already has unit->cpu_hard_affinity & - * cpupool_domain_master_cpumask() in it, so it's enough that we filter - * with the cpus of the runq. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->rqd[min_rqi].active); - } - else - { - /* - * We didn't find anyone at all (most likely because of spinlock - * contention). 
- */ - new_cpu = get_fallback_cpu(svc); - min_rqi = c2r(new_cpu); - min_avgload = prv->rqd[min_rqi].b_avgload; - goto out_up; - } - - new_cpu = cpumask_cycle(prv->rqd[min_rqi].pick_bias, - cpumask_scratch_cpu(cpu)); - prv->rqd[min_rqi].pick_bias = new_cpu; - BUG_ON(new_cpu >= nr_cpu_ids); - - out_up: - read_unlock(&prv->lock); - out: - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t b_avgload; - unsigned unit:16, dom:16; - unsigned rq_id:16, new_cpu:16; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.rq_id = min_rqi; - d.b_avgload = min_avgload; - d.new_cpu = new_cpu; - __trace_var(TRC_CSCHED2_PICKED_CPU, 1, - sizeof(d), - (unsigned char *)&d); - } - - return get_sched_res(new_cpu); -} - -/* Working state of the load-balancing algorithm */ -typedef struct { - /* NB: Modified by consider() */ - s_time_t load_delta; - struct csched2_unit * best_push_svc, *best_pull_svc; - /* NB: Read by consider() */ - struct csched2_runqueue_data *lrqd; - struct csched2_runqueue_data *orqd; -} balance_state_t; - -static void consider(balance_state_t *st, - struct csched2_unit *push_svc, - struct csched2_unit *pull_svc) -{ - s_time_t l_load, o_load, delta; - - l_load = st->lrqd->b_avgload; - o_load = st->orqd->b_avgload; - if ( push_svc ) - { - /* What happens to the load on both if we push? */ - l_load -= push_svc->avgload; - o_load += push_svc->avgload; - } - if ( pull_svc ) - { - /* What happens to the load on both if we pull? */ - l_load += pull_svc->avgload; - o_load -= pull_svc->avgload; - } - - delta = l_load - o_load; - if ( delta < 0 ) - delta = -delta; - - if ( delta < st->load_delta ) - { - st->load_delta = delta; - st->best_push_svc=push_svc; - st->best_pull_svc=pull_svc; - } -} - - -static void migrate(const struct scheduler *ops, - struct csched2_unit *svc, - struct csched2_runqueue_data *trqd, - s_time_t now) -{ - struct sched_unit *unit = svc->unit; - int cpu = sched_unit_master(unit); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned rqi:16, trqi:16; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.rqi = svc->rqd->id; - d.trqi = trqd->id; - __trace_var(TRC_CSCHED2_MIGRATE, 1, - sizeof(d), - (unsigned char *)&d); - } - - if ( svc->flags & CSFLAG_scheduled ) - { - /* It's running; mark it to migrate. */ - svc->migrate_rqd = trqd; - sched_set_pause_flags(unit, _VPF_migrating); - __set_bit(__CSFLAG_runq_migrate_request, &svc->flags); - SCHED_STAT_CRANK(migrate_requested); - tickle_cpu(cpu, svc->rqd); - } - else - { - int on_runq = 0; - /* It's not running; just move it */ - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(ops, svc->rqd, NULL, -1, now); - on_runq = 1; - } - _runq_deassign(svc); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &trqd->active); - sched_set_res(unit, - get_sched_res(cpumask_cycle(trqd->pick_bias, - cpumask_scratch_cpu(cpu)))); - trqd->pick_bias = sched_unit_master(unit); - ASSERT(sched_unit_master(unit) < nr_cpu_ids); - - _runq_assign(svc, trqd); - if ( on_runq ) - { - update_load(ops, svc->rqd, NULL, 1, now); - runq_insert(ops, svc); - runq_tickle(ops, svc, now); - SCHED_STAT_CRANK(migrate_on_runq); - } - else - SCHED_STAT_CRANK(migrate_no_runq); - } -} - -/* - * It makes sense considering migrating svc to rqd, if: - * - svc is not already flagged to migrate, - * - if svc is allowed to run on at least one of the pcpus of rqd. 
- */ -static bool unit_is_migrateable(struct csched2_unit *svc, - struct csched2_runqueue_data *rqd) -{ - struct sched_unit *unit = svc->unit; - int cpu = sched_unit_master(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - return !(svc->flags & CSFLAG_runq_migrate_request) && - cpumask_intersects(cpumask_scratch_cpu(cpu), &rqd->active); -} - -static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) -{ - struct csched2_private *prv = csched2_priv(ops); - int i, max_delta_rqi; - struct list_head *push_iter, *pull_iter; - bool inner_load_updated = 0; - - balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; - - /* - * Basic algorithm: Push, pull, or swap. - * - Find the runqueue with the furthest load distance - * - Find a pair that makes the difference the least (where one - * on either side may be empty). - */ - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - st.lrqd = c2rqd(ops, cpu); - - update_runq_load(ops, st.lrqd, 0, now); - -retry: - max_delta_rqi = -1; - if ( !read_trylock(&prv->lock) ) - return; - - st.load_delta = 0; - - for_each_cpu(i, &prv->active_queues) - { - s_time_t delta; - - st.orqd = prv->rqd + i; - - if ( st.orqd == st.lrqd - || !spin_trylock(&st.orqd->lock) ) - continue; - - update_runq_load(ops, st.orqd, 0, now); - - delta = st.lrqd->b_avgload - st.orqd->b_avgload; - if ( delta < 0 ) - delta = -delta; - - if ( delta > st.load_delta ) - { - st.load_delta = delta; - max_delta_rqi = i; - } - - spin_unlock(&st.orqd->lock); - } - - /* Minimize holding the private scheduler lock. */ - read_unlock(&prv->lock); - if ( max_delta_rqi == -1 ) - goto out; - - { - s_time_t load_max; - int cpus_max; - - - load_max = st.lrqd->b_avgload; - if ( st.orqd->b_avgload > load_max ) - load_max = st.orqd->b_avgload; - - cpus_max = st.lrqd->nr_cpus; - i = st.orqd->nr_cpus; - if ( i > cpus_max ) - cpus_max = i; - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned lrq_id:16, orq_id:16; - unsigned load_delta; - } d; - d.lrq_id = st.lrqd->id; - d.orq_id = st.orqd->id; - d.load_delta = st.load_delta; - __trace_var(TRC_CSCHED2_LOAD_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* - * If we're under 100% capacaty, only shift if load difference - * is > 1. otherwise, shift if under 12.5% - */ - if ( load_max < ((s_time_t)cpus_max << prv->load_precision_shift) ) - { - if ( st.load_delta < (1ULL << (prv->load_precision_shift + - opt_underload_balance_tolerance)) ) - goto out; - } - else - if ( st.load_delta < (1ULL << (prv->load_precision_shift + - opt_overload_balance_tolerance)) ) - goto out; - } - - /* Try to grab the other runqueue lock; if it's been taken in the - * meantime, try the process over again. This can't deadlock - * because if it doesn't get any other rqd locks, it will simply - * give up and return. 
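As an aside, the underload/overload decision made just above boils down to comparing the load delta against a P-scaled threshold; the sketch below uses hypothetical parameter names for the two boot-time tolerances:

    /*
     * Illustrative only. Loads are P-scaled, so 1 << prec_shift represents
     * a load of 1.0; the tolerances are extra shift amounts (e.g. 0 -> 1.0,
     * -3 -> 0.125), and prec_shift + tolerance is assumed non-negative.
     */
    static int worth_balancing(long long load_delta, long long load_max,
                               int cpus_max, int prec_shift,
                               int underload_tol, int overload_tol)
    {
        int tol = (load_max < ((long long)cpus_max << prec_shift))
                  ? underload_tol : overload_tol;

        return load_delta >= (1LL << (prec_shift + tol));
    }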
*/ - st.orqd = prv->rqd + max_delta_rqi; - if ( !spin_trylock(&st.orqd->lock) ) - goto retry; - - /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ - if ( unlikely(st.orqd->id < 0) ) - goto out_up; - - if ( unlikely(tb_init_done) ) - { - struct { - uint64_t lb_avgload, ob_avgload; - unsigned lrq_id:16, orq_id:16; - } d; - d.lrq_id = st.lrqd->id; - d.lb_avgload = st.lrqd->b_avgload; - d.orq_id = st.orqd->id; - d.ob_avgload = st.orqd->b_avgload; - __trace_var(TRC_CSCHED2_LOAD_BALANCE, 1, - sizeof(d), - (unsigned char *)&d); - } - - SCHED_STAT_CRANK(acct_load_balance); - - /* Look for "swap" which gives the best load average - * FIXME: O(n^2)! */ - - /* Reuse load delta (as we're trying to minimize it) */ - list_for_each( push_iter, &st.lrqd->svc ) - { - struct csched2_unit * push_svc = list_entry(push_iter, struct csched2_unit, rqd_elem); - - update_svc_load(ops, push_svc, 0, now); - - if ( !unit_is_migrateable(push_svc, st.orqd) ) - continue; - - list_for_each( pull_iter, &st.orqd->svc ) - { - struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); - - if ( !inner_load_updated ) - update_svc_load(ops, pull_svc, 0, now); - - if ( !unit_is_migrateable(pull_svc, st.lrqd) ) - continue; - - consider(&st, push_svc, pull_svc); - } - - inner_load_updated = 1; - - /* Consider push only */ - consider(&st, push_svc, NULL); - } - - list_for_each( pull_iter, &st.orqd->svc ) - { - struct csched2_unit * pull_svc = list_entry(pull_iter, struct csched2_unit, rqd_elem); - - if ( !unit_is_migrateable(pull_svc, st.lrqd) ) - continue; - - /* Consider pull only */ - consider(&st, NULL, pull_svc); - } - - /* OK, now we have some candidates; do the moving */ - if ( st.best_push_svc ) - migrate(ops, st.best_push_svc, st.orqd, now); - if ( st.best_pull_svc ) - migrate(ops, st.best_pull_svc, st.lrqd, now); - - out_up: - spin_unlock(&st.orqd->lock); - out: - return; -} - -static void -csched2_unit_migrate( - const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu) -{ - struct domain *d = unit->domain; - struct csched2_unit * const svc = csched2_unit(unit); - struct csched2_runqueue_data *trqd; - s_time_t now = NOW(); - - /* - * Being passed a target pCPU which is outside of our cpupool is only - * valid if we are shutting down (or doing ACPI suspend), and we are - * moving everyone to BSP, no matter whether or not BSP is inside our - * cpupool. - * - * And since there indeed is the chance that it is not part of it, all - * we must do is remove _and_ unassign the unit from any runqueue, as - * well as updating v->processor with the target, so that the suspend - * process can continue. - * - * It will then be during resume that a new, meaningful, value for - * v->processor will be chosen, and during actual domain unpause that - * the unit will be assigned to and added to the proper runqueue. - */ - if ( unlikely(!cpumask_test_cpu(new_cpu, cpupool_domain_master_cpumask(d))) ) - { - ASSERT(system_state == SYS_STATE_suspend); - if ( unit_on_runq(svc) ) - { - runq_remove(svc); - update_load(ops, svc->rqd, NULL, -1, now); - } - _runq_deassign(svc); - sched_set_res(unit, get_sched_res(new_cpu)); - return; - } - - /* If here, new_cpu must be a valid Credit2 pCPU, and in our affinity. */ - ASSERT(cpumask_test_cpu(new_cpu, &csched2_priv(ops)->initialized)); - ASSERT(cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity)); - - trqd = c2rqd(ops, new_cpu); - - /* - * Do the actual movement toward new_cpu, and update vc->processor. 
- * If we are changing runqueue, migrate() takes care of everything. - * If we are not changing runqueue, we need to update vc->processor - * here. In fact, if, for instance, we are here because the unit's - * hard affinity changed, we don't want to risk leaving vc->processor - * pointing to a pcpu where we can't run any longer. - */ - if ( trqd != svc->rqd ) - migrate(ops, svc, trqd, now); - else - sched_set_res(unit, get_sched_res(new_cpu)); -} - -static int -csched2_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct csched2_dom * const sdom = csched2_dom(d); - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - struct sched_unit *unit; - int rc = 0; - - /* - * Locking: - * - we must take the private lock for accessing the weights of the - * units of d, and/or the cap; - * - in the putinfo case, we also need the runqueue lock(s), for - * updating the max waight of the runqueue(s). - * If changing the cap, we also need the budget_lock, for updating - * the value of the domain budget pool (and the runqueue lock, - * for adjusting the parameters and rescheduling any unit that is - * running at the time of the change). - */ - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - read_lock_irqsave(&prv->lock, flags); - op->u.credit2.weight = sdom->weight; - op->u.credit2.cap = sdom->cap; - read_unlock_irqrestore(&prv->lock, flags); - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - write_lock_irqsave(&prv->lock, flags); - /* Weight */ - if ( op->u.credit2.weight != 0 ) - { - int old_weight; - - old_weight = sdom->weight; - - sdom->weight = op->u.credit2.weight; - - /* Update weights for units, and max_weight for runqueues on which they reside */ - for_each_sched_unit ( d, unit ) - { - struct csched2_unit *svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock(unit); - - ASSERT(svc->rqd == c2rqd(ops, sched_unit_master(unit))); - - svc->weight = sdom->weight; - update_max_weight(svc->rqd, svc->weight, old_weight); - - unit_schedule_unlock(lock, unit); - } - } - /* Cap */ - if ( op->u.credit2.cap != 0 ) - { - struct csched2_unit *svc; - spinlock_t *lock; - - /* Cap is only valid if it's below 100 * nr_of_units */ - if ( op->u.credit2.cap > 100 * sdom->nr_units ) - { - rc = -EINVAL; - write_unlock_irqrestore(&prv->lock, flags); - break; - } - - spin_lock(&sdom->budget_lock); - sdom->tot_budget = (CSCHED2_BDGT_REPL_PERIOD * op->u.credit2.cap); - sdom->tot_budget /= 100; - spin_unlock(&sdom->budget_lock); - - /* - * When trying to get some budget and run, each unit will grab - * from the pool 1/N (with N = nr of units of the domain) of - * the total budget. Roughly speaking, this means each unit will - * have at least one chance to run during every period. - */ - for_each_sched_unit ( d, unit ) - { - svc = csched2_unit(unit); - lock = unit_schedule_lock(unit); - /* - * Too small quotas would in theory cause a lot of overhead, - * which then won't happen because, in csched2_runtime(), - * CSCHED2_MIN_TIMER is what would be used anyway. - */ - svc->budget_quota = max(sdom->tot_budget / sdom->nr_units, - CSCHED2_MIN_TIMER); - unit_schedule_unlock(lock, unit); - } - - if ( sdom->cap == 0 ) - { - /* - * We give to the domain the budget to which it is entitled, - * and queue its first replenishment event. - * - * Since cap is currently disabled for this domain, we - * know no unit is messing with the domain's budget, and - * the replenishment timer is still off. 
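
Editor's aside (not part of the patch): the cap-to-budget arithmetic above, with numbers. BDGT_REPL_PERIOD and MIN_TIMER below are invented stand-ins for CSCHED2_BDGT_REPL_PERIOD and CSCHED2_MIN_TIMER; the point is that a cap of C% over a replenishment period P yields C/100 * P of budget, split evenly across the domain's units but never below the minimum timer.

#include <stdio.h>
#include <stdint.h>

#define BDGT_REPL_PERIOD  10000000LL   /* 10 ms, in ns; illustrative */
#define MIN_TIMER           500000LL   /* 0.5 ms, in ns; illustrative */

int main(void)
{
    unsigned int cap = 150;            /* "150% of one CPU" for the domain */
    unsigned int nr_units = 4;

    /* Total budget handed to the domain every replenishment period. */
    int64_t tot_budget = BDGT_REPL_PERIOD * cap / 100;

    /* Each unit draws 1/N of it, but never less than the minimum timer. */
    int64_t quota = tot_budget / nr_units;
    if ( quota < MIN_TIMER )
        quota = MIN_TIMER;

    printf("tot_budget = %lld ns, per-unit quota = %lld ns\n",
           (long long)tot_budget, (long long)quota);
    return 0;
}
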
- * For these reasons, it is safe to do the following without - * taking the budget_lock. - */ - sdom->budget = sdom->tot_budget; - sdom->next_repl = NOW() + CSCHED2_BDGT_REPL_PERIOD; - set_timer(&sdom->repl_timer, sdom->next_repl); - - /* - * Now, let's enable budget accounting for all the units. - * For making sure that they will start to honour the domain's - * cap, we set their budget to 0. - * This way, as soon as they will try to run, they will have - * to get some budget. - * - * For the units that are already running, we trigger the - * scheduler on their pCPU. When, as a consequence of this, - * csched2_schedule() will run, it will figure out there is - * no budget, and the unit will try to get some (and be parked, - * if there's none, and we'll switch to someone else). - */ - for_each_sched_unit ( d, unit ) - { - svc = csched2_unit(unit); - lock = unit_schedule_lock(unit); - if ( unit->is_running ) - { - unsigned int cpu = sched_unit_master(unit); - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - - ASSERT(curr_on_cpu(cpu) == unit); - - /* - * We are triggering a reschedule on the unit's - * pCPU. That will run burn_credits() and, since - * the unit is capped now, it would charge all the - * execution time of this last round as budget as - * well. That will make the unit budget go negative, - * potentially by a large amount, and it's unfair. - * - * To avoid that, call burn_credit() here, to do the - * accounting of this current running instance now, - * with budgetting still disabled. This does not - * prevent some small amount of budget being charged - * to the unit (i.e., the amount of time it runs from - * now, to when scheduling happens). The budget will - * also go below 0, but a lot less than how it would - * if we don't do this. - */ - burn_credits(rqd, svc, NOW()); - __cpumask_set_cpu(cpu, &rqd->tickled); - ASSERT(!cpumask_test_cpu(cpu, &rqd->smt_idle)); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - } - svc->budget = 0; - unit_schedule_unlock(lock, unit); - } - } - - sdom->cap = op->u.credit2.cap; - } - else if ( sdom->cap != 0 ) - { - LIST_HEAD(parked); - - stop_timer(&sdom->repl_timer); - - /* Disable budget accounting for all the units. */ - for_each_sched_unit ( d, unit ) - { - struct csched2_unit *svc = csched2_unit(unit); - spinlock_t *lock = unit_schedule_lock(unit); - - svc->budget = STIME_MAX; - svc->budget_quota = 0; - - unit_schedule_unlock(lock, unit); - } - sdom->cap = 0; - /* - * We are disabling the cap for this domain, which may have - * units waiting for a replenishment, so we unpark them all. - * Note that, since we have already disabled budget accounting - * for all the units of the domain, no currently running unit - * will be added to the parked units list any longer. - */ - spin_lock(&sdom->budget_lock); - list_splice_init(&sdom->parked_units, &parked); - spin_unlock(&sdom->budget_lock); - - unpark_parked_units(ops, &parked); - } - write_unlock_irqrestore(&prv->lock, flags); - break; - default: - rc = -EINVAL; - break; - } - - - return rc; -} - -static void -csched2_aff_cntl(const struct scheduler *ops, struct sched_unit *unit, - const cpumask_t *hard, const cpumask_t *soft) -{ - struct csched2_unit *svc = csched2_unit(unit); - - if ( !hard ) - return; - - /* Are we becoming exclusively pinned? 
*/ - if ( cpumask_weight(hard) == 1 ) - __set_bit(__CSFLAG_pinned, &svc->flags); - else - __clear_bit(__CSFLAG_pinned, &svc->flags); -} - -static int csched2_sys_cntl(const struct scheduler *ops, - struct xen_sysctl_scheduler_op *sc) -{ - struct xen_sysctl_credit2_schedule *params = &sc->u.sched_credit2; - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - - switch (sc->cmd ) - { - case XEN_SYSCTL_SCHEDOP_putinfo: - if ( params->ratelimit_us && - (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || - params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN )) - return -EINVAL; - - write_lock_irqsave(&prv->lock, flags); - if ( !prv->ratelimit_us && params->ratelimit_us ) - printk(XENLOG_INFO "Enabling context switch rate limiting\n"); - else if ( prv->ratelimit_us && !params->ratelimit_us ) - printk(XENLOG_INFO "Disabling context switch rate limiting\n"); - prv->ratelimit_us = params->ratelimit_us; - write_unlock_irqrestore(&prv->lock, flags); - - /* FALLTHRU */ - case XEN_SYSCTL_SCHEDOP_getinfo: - params->ratelimit_us = prv->ratelimit_us; - break; - } - - return 0; -} - -static void * -csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - struct csched2_private *prv = csched2_priv(ops); - struct csched2_dom *sdom; - unsigned long flags; - - sdom = xzalloc(struct csched2_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - /* Initialize credit, cap and weight */ - INIT_LIST_HEAD(&sdom->sdom_elem); - sdom->dom = dom; - sdom->weight = CSCHED2_DEFAULT_WEIGHT; - sdom->cap = 0U; - sdom->nr_units = 0; - - init_timer(&sdom->repl_timer, replenish_domain_budget, sdom, - cpumask_any(cpupool_domain_master_cpumask(dom))); - spin_lock_init(&sdom->budget_lock); - INIT_LIST_HEAD(&sdom->parked_units); - - write_lock_irqsave(&prv->lock, flags); - - list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom); - - write_unlock_irqrestore(&prv->lock, flags); - - return sdom; -} - -static void -csched2_free_domdata(const struct scheduler *ops, void *data) -{ - struct csched2_dom *sdom = data; - struct csched2_private *prv = csched2_priv(ops); - - if ( sdom ) - { - unsigned long flags; - - kill_timer(&sdom->repl_timer); - - write_lock_irqsave(&prv->lock, flags); - list_del_init(&sdom->sdom_elem); - write_unlock_irqrestore(&prv->lock, flags); - - xfree(sdom); - } -} - -static void -csched2_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit *svc = unit->priv; - struct csched2_dom * const sdom = svc->sdom; - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - ASSERT(list_empty(&svc->runq_elem)); - - /* csched2_res_pick() expects the pcpu lock to be held */ - lock = unit_schedule_lock_irq(unit); - - sched_set_res(unit, csched2_res_pick(ops, unit)); - - spin_unlock_irq(lock); - - lock = unit_schedule_lock_irq(unit); - - /* Add unit to runqueue of initial processor */ - runq_assign(ops, unit); - - unit_schedule_unlock_irq(lock, unit); - - sdom->nr_units++; - - SCHED_STAT_CRANK(unit_insert); - - CSCHED2_UNIT_CHECK(unit); -} - -static void -csched2_free_udata(const struct scheduler *ops, void *priv) -{ - struct csched2_unit *svc = priv; - - xfree(svc); -} - -static void -csched2_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - ASSERT(list_empty(&svc->runq_elem)); - - SCHED_STAT_CRANK(unit_remove); - - /* Remove from runqueue */ - lock = unit_schedule_lock_irq(unit); - - runq_deassign(ops, unit); - - 
unit_schedule_unlock_irq(lock, unit); - - svc->sdom->nr_units--; -} - -/* How long should we let this unit run for? */ -static s_time_t -csched2_runtime(const struct scheduler *ops, int cpu, - struct csched2_unit *snext, s_time_t now) -{ - s_time_t time, min_time; - int rt_credit; /* Proposed runtime measured in credits */ - struct csched2_runqueue_data *rqd = c2rqd(ops, cpu); - struct list_head *runq = &rqd->runq; - struct csched2_private *prv = csched2_priv(ops); - - /* - * If we're idle, just stay so. Others (or external events) - * will poke us when necessary. - */ - if ( is_idle_unit(snext->unit) ) - return -1; - - /* General algorithm: - * 1) Run until snext's credit will be 0. - * 2) But if someone is waiting, run until snext's credit is equal - * to his. - * 3) But, if we are capped, never run more than our budget. - * 4) And never run longer than MAX_TIMER or shorter than MIN_TIMER or - * the ratelimit time. - */ - - /* Calculate mintime */ - min_time = CSCHED2_MIN_TIMER; - if ( prv->ratelimit_us ) - { - s_time_t ratelimit_min = MICROSECS(prv->ratelimit_us); - if ( snext->unit->is_running ) - ratelimit_min = snext->unit->state_entry_time + - MICROSECS(prv->ratelimit_us) - now; - if ( ratelimit_min > min_time ) - min_time = ratelimit_min; - } - - /* 1) Run until snext's credit will be 0. */ - rt_credit = snext->credit; - - /* - * 2) If there's someone waiting whose credit is positive, - * run until your credit ~= his. - */ - if ( ! list_empty(runq) ) - { - struct csched2_unit *swait = runq_elem(runq->next); - - if ( ! is_idle_unit(swait->unit) - && swait->credit > 0 ) - { - rt_credit = snext->credit - swait->credit; - } - } - - /* - * The next guy on the runqueue may actually have a higher credit, - * if we've tried to avoid migrating him from a different cpu. - * Setting time=0 will ensure the minimum timeslice is chosen. - * - * FIXME: See if we can eliminate this conversion if we know time - * will be outside (MIN,MAX). Probably requires pre-calculating - * credit values of MIN,MAX per unit, since each unit burns credit - * at a different rate. - */ - if ( rt_credit > 0 ) - time = c2t(rqd, rt_credit, snext); - else - time = 0; - - /* - * 3) But, if capped, never run more than our budget. - */ - if ( has_cap(snext) ) - time = snext->budget < time ? snext->budget : time; - - /* - * 4) And never run longer than MAX_TIMER or less than MIN_TIMER or - * the rate_limit time. - */ - if ( time < min_time ) - { - time = min_time; - SCHED_STAT_CRANK(runtime_min_timer); - } - else if (time > CSCHED2_MAX_TIMER) - { - time = CSCHED2_MAX_TIMER; - SCHED_STAT_CRANK(runtime_max_timer); - } - - return time; -} - -/* - * Find a candidate. - */ -static struct csched2_unit * -runq_candidate(struct csched2_runqueue_data *rqd, - struct csched2_unit *scurr, - int cpu, s_time_t now, - unsigned int *skipped) -{ - struct list_head *iter, *temp; - struct sched_resource *sr = get_sched_res(cpu); - struct csched2_unit *snext = NULL; - struct csched2_private *prv = csched2_priv(sr->scheduler); - bool yield = false, soft_aff_preempt = false; - - *skipped = 0; - - if ( unlikely(is_idle_unit(scurr->unit)) ) - { - snext = scurr; - goto check_runq; - } - - yield = __test_and_clear_bit(__CSFLAG_unit_yield, &scurr->flags); - - /* - * Return the current unit if it has executed for less than ratelimit. - * Adjuststment for the selected unit's credit and decision - * for how long it will run will be taken in csched2_runtime. - * - * Note that, if scurr is yielding, we don't let rate limiting kick in. 
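
Editor's aside (not part of the patch): the four numbered steps of csched2_runtime() above, reduced to one standalone function. Credits are assumed to have already been converted to time, a negative budget means "not capped", and every constant is invented.

#include <stdio.h>

typedef long long s_time_t;

#define MIN_TIMER    500000LL   /* illustrative */
#define MAX_TIMER  10000000LL   /* illustrative */

static s_time_t runtime(s_time_t my_credit, s_time_t next_waiter_credit,
                        s_time_t budget, s_time_t ratelimit_left)
{
    s_time_t time = my_credit;          /* 1) run until our credit hits 0 */
    s_time_t min_time = MIN_TIMER;

    /* 2) If someone waits with positive credit, run only until we're even. */
    if ( next_waiter_credit > 0 )
        time = my_credit - next_waiter_credit;
    if ( time < 0 )
        time = 0;

    /* 3) If capped, never run past the remaining budget. */
    if ( budget >= 0 && budget < time )
        time = budget;

    /* 4) Clamp between the ratelimit/MIN_TIMER floor and MAX_TIMER. */
    if ( ratelimit_left > min_time )
        min_time = ratelimit_left;
    if ( time < min_time )
        time = min_time;
    else if ( time > MAX_TIMER )
        time = MAX_TIMER;

    return time;
}

int main(void)
{
    printf("%lld\n", runtime(8000000, 3000000, -1, 0));  /* 5000000       */
    printf("%lld\n", runtime(2000000, 3000000, -1, 0));  /* MIN_TIMER     */
    return 0;
}
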
- * In fact, it may be the case that scurr is about to spin, and there's - * no point forcing it to do so until rate limiting expires. - */ - if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) && - (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) ) - { - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned runtime; - } d; - d.dom = scurr->unit->domain->domain_id; - d.unit = scurr->unit->unit_id; - d.runtime = now - scurr->unit->state_entry_time; - __trace_var(TRC_CSCHED2_RATELIMIT, 1, - sizeof(d), - (unsigned char *)&d); - } - return scurr; - } - - /* If scurr has a soft-affinity, let's check whether cpu is part of it */ - if ( has_soft_affinity(scurr->unit) ) - { - affinity_balance_cpumask(scurr->unit, BALANCE_SOFT_AFFINITY, - cpumask_scratch); - if ( unlikely(!cpumask_test_cpu(cpu, cpumask_scratch)) ) - { - cpumask_t *online = cpupool_domain_master_cpumask(scurr->unit->domain); - - /* Ok, is any of the pcpus in scurr soft-affinity idle? */ - cpumask_and(cpumask_scratch, cpumask_scratch, &rqd->idle); - cpumask_andnot(cpumask_scratch, cpumask_scratch, &rqd->tickled); - soft_aff_preempt = cpumask_intersects(cpumask_scratch, online); - } - } - - /* - * If scurr is runnable, and this cpu is in its soft-affinity, default to - * it. We also default to it, even if cpu is not in its soft-affinity, if - * there aren't any idle and not tickled cpu in its soft-affinity. In - * fact, we don't want to risk leaving scurr in the runq and this cpu idle - * only because scurr is running outside of its soft-affinity. - * - * On the other hand, if cpu is not in scurr's soft-affinity, and there - * looks to be better options, go for them. That happens by defaulting to - * idle here, which means scurr will be preempted, put back in runq, and - * one of those idle and not tickled cpus from its soft-affinity will be - * tickled to pick it up. - * - * Finally, if scurr does not have a valid soft-affinity, we also let it - * continue to run here (in fact, soft_aff_preempt will still be false, - * in this case). - * - * Of course, we also default to idle also if scurr is not runnable. - */ - if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt ) - snext = scurr; - else - snext = csched2_unit(sched_idle_unit(cpu)); - - check_runq: - list_for_each_safe( iter, temp, &rqd->runq ) - { - struct csched2_unit * svc = list_entry(iter, struct csched2_unit, runq_elem); - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - __trace_var(TRC_CSCHED2_RUNQ_CAND_CHECK, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* Only consider units that are allowed to run on this processor. */ - if ( !cpumask_test_cpu(cpu, svc->unit->cpu_hard_affinity) ) - { - (*skipped)++; - continue; - } - - /* - * If an unit is meant to be picked up by another processor, and such - * processor has not scheduled yet, leave it in the runqueue for him. - */ - if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && - cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) - { - (*skipped)++; - SCHED_STAT_CRANK(deferred_to_tickled_cpu); - continue; - } - - /* - * If this is on a different processor, don't pull it unless - * its credit is at least CSCHED2_MIGRATE_RESIST higher. 
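
Editor's aside (not part of the patch): the migration-resistance test that follows, in isolation. The constant is invented; the idea is that a unit queued on another CPU is only pulled here if its credit beats the local candidate's by at least that margin, so cache-warm units tend to stay where they are.

#include <stdio.h>
#include <stdbool.h>

#define MIGRATE_RESIST 500          /* invented value, in credits */

static bool worth_pulling(int remote_credit, int local_credit)
{
    return remote_credit >= local_credit + MIGRATE_RESIST;
}

int main(void)
{
    printf("%d\n", worth_pulling(1200, 1000));   /* 0: keep it remote    */
    printf("%d\n", worth_pulling(1600, 1000));   /* 1: gap is big enough */
    return 0;
}
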
- */ - if ( sched_unit_master(svc->unit) != cpu - && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) - { - (*skipped)++; - SCHED_STAT_CRANK(migrate_resisted); - continue; - } - - /* - * If the one in the runqueue has more credit than current (or idle, - * if current is not runnable), or if current is yielding, and also - * if the one in runqueue either is not capped, or is capped but has - * some budget, then choose it. - */ - if ( (yield || svc->credit > snext->credit) && - (!has_cap(svc) || unit_grab_budget(svc)) && - unit_runnable_state(svc->unit) ) - snext = svc; - - /* In any case, if we got this far, break. */ - break; - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned unit:16, dom:16; - unsigned tickled_cpu, skipped; - int credit; - } d; - d.dom = snext->unit->domain->domain_id; - d.unit = snext->unit->unit_id; - d.credit = snext->credit; - d.tickled_cpu = snext->tickled_cpu; - d.skipped = *skipped; - __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, - sizeof(d), - (unsigned char *)&d); - } - - if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) - SCHED_STAT_CRANK(tickled_cpu_overridden); - - /* - * If snext is from a capped domain, it must have budget (or it - * wouldn't have been in the runq). If it is not, it'd be STIME_MAX, - * which still is >= 0. - */ - ASSERT(snext->budget >= 0); - - return snext; -} - -/* - * This function is in the critical path. It is designed to be simple and - * fast for the common case. - */ -static void csched2_schedule( - const struct scheduler *ops, struct sched_unit *currunit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct csched2_runqueue_data *rqd; - struct csched2_unit * const scurr = csched2_unit(currunit); - struct csched2_unit *snext = NULL; - unsigned int skipped_units = 0; - bool tickled; - bool migrated = false; - - SCHED_STAT_CRANK(schedule); - CSCHED2_UNIT_CHECK(currunit); - - BUG_ON(!cpumask_test_cpu(sched_cpu, &csched2_priv(ops)->initialized)); - - rqd = c2rqd(ops, sched_cpu); - BUG_ON(!cpumask_test_cpu(sched_cpu, &rqd->active)); - - ASSERT(spin_is_locked(get_sched_res(sched_cpu)->schedule_lock)); - - BUG_ON(!is_idle_unit(currunit) && scurr->rqd != rqd); - - /* Clear "tickled" bit now that we've been scheduled */ - tickled = cpumask_test_cpu(sched_cpu, &rqd->tickled); - if ( tickled ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->tickled); - cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); - smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); - } - - if ( unlikely(tb_init_done) ) - { - struct { - unsigned cpu:16, rq_id:16; - unsigned tasklet:8, idle:8, smt_idle:8, tickled:8; - } d; - d.cpu = cur_cpu; - d.rq_id = c2r(sched_cpu); - d.tasklet = tasklet_work_scheduled; - d.idle = is_idle_unit(currunit); - d.smt_idle = cpumask_test_cpu(sched_cpu, &rqd->smt_idle); - d.tickled = tickled; - __trace_var(TRC_CSCHED2_SCHEDULE, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* Update credits (and budget, if necessary). */ - burn_credits(rqd, scurr, now); - - /* - * Below 0, means that we are capped and we have overrun our budget. - * Let's try to get some more but, if we fail (e.g., because of the - * other running units), we will be parked. - */ - if ( unlikely(scurr->budget <= 0) ) - unit_grab_budget(scurr); - - /* - * Select next runnable local UNIT (ie top of local runq). 
- * - * If the current unit is runnable, and has higher credit than - * the next guy on the queue (or there is noone else), we want to - * run him again. - * - * If there's tasklet work to do, we want to chose the idle unit - * for this processor, and mark the current for delayed runqueue - * add. - * - * If the current unit is runnable, and there's another runnable - * candidate, we want to mark current for delayed runqueue add, - * and remove the next guy from the queue. - * - * If the current unit is not runnable, we want to chose the idle - * unit for this processor. - */ - if ( tasklet_work_scheduled ) - { - __clear_bit(__CSFLAG_unit_yield, &scurr->flags); - trace_var(TRC_CSCHED2_SCHED_TASKLET, 1, 0, NULL); - snext = csched2_unit(sched_idle_unit(sched_cpu)); - } - else - snext = runq_candidate(rqd, scurr, sched_cpu, now, &skipped_units); - - /* If switching from a non-idle runnable unit, put it - * back on the runqueue. */ - if ( snext != scurr - && !is_idle_unit(currunit) - && unit_runnable(currunit) ) - __set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); - - /* Accounting for non-idle tasks */ - if ( !is_idle_unit(snext->unit) ) - { - /* If switching, remove this from the runqueue and mark it scheduled */ - if ( snext != scurr ) - { - ASSERT(snext->rqd == rqd); - ASSERT(!snext->unit->is_running); - - runq_remove(snext); - __set_bit(__CSFLAG_scheduled, &snext->flags); - } - - /* Clear the idle mask if necessary */ - if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->idle); - smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); - } - - /* - * The reset condition is "has a scheduler epoch come to an end?". - * The way this is enforced is checking whether the unit at the top - * of the runqueue has negative credits. This means the epochs have - * variable length, as in one epoch expores when: - * 1) the unit at the top of the runqueue has executed for - * around 10 ms (with default parameters); - * 2) no other unit with higher credits wants to run. - * - * Here, where we want to check for reset, we need to make sure the - * proper unit is being used. In fact, runqueue_candidate() may have - * not returned the first unit in the runqueue, for various reasons - * (e.g., affinity). Only trigger a reset when it does. - */ - if ( skipped_units == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) - { - reset_credit(ops, sched_cpu, now, snext); - balance_load(ops, sched_cpu, now); - } - - snext->start_time = now; - snext->tickled_cpu = -1; - - /* Safe because lock for old processor is held */ - if ( sched_unit_master(snext->unit) != sched_cpu ) - { - snext->credit += CSCHED2_MIGRATE_COMPENSATION; - sched_set_res(snext->unit, get_sched_res(sched_cpu)); - SCHED_STAT_CRANK(migrated); - migrated = true; - } - } - else - { - /* - * Update the idle mask if necessary. Note that, if we're scheduling - * idle in order to carry on some tasklet work, we want to play busy! - */ - if ( tasklet_work_scheduled ) - { - if ( cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_clear_cpu(sched_cpu, &rqd->idle); - smt_idle_mask_clear(sched_cpu, &rqd->smt_idle); - } - } - else if ( !cpumask_test_cpu(sched_cpu, &rqd->idle) ) - { - __cpumask_set_cpu(sched_cpu, &rqd->idle); - cpumask_andnot(cpumask_scratch, &rqd->idle, &rqd->tickled); - smt_idle_mask_set(sched_cpu, cpumask_scratch, &rqd->smt_idle); - } - /* Make sure avgload gets updated periodically even - * if there's no activity */ - update_load(ops, rqd, NULL, 0, now); - } - - /* - * Return task to run next... 
- */ - currunit->next_time = csched2_runtime(ops, sched_cpu, snext, now); - currunit->next_task = snext->unit; - snext->unit->migrated = migrated; - - CSCHED2_UNIT_CHECK(currunit->next_task); -} - -static void -csched2_dump_unit(struct csched2_private *prv, struct csched2_unit *svc) -{ - printk("[%i.%i] flags=%x cpu=%i", - svc->unit->domain->domain_id, - svc->unit->unit_id, - svc->flags, - sched_unit_master(svc->unit)); - - printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); - - if ( has_cap(svc) ) - printk(" budget=%"PRI_stime"(%"PRI_stime")", - svc->budget, svc->budget_quota); - - printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload, - (svc->avgload * 100) >> prv->load_precision_shift); - - printk("\n"); -} - -static inline void -dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct csched2_private *prv = csched2_priv(ops); - struct csched2_unit *svc; - - printk("CPU[%02d] runq=%d, sibling={%*pbl}, core={%*pbl}\n", - cpu, c2r(cpu), - CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - - /* current UNIT (nothing to say if that's the idle unit) */ - svc = csched2_unit(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - printk("\trun: "); - csched2_dump_unit(prv, svc); - } -} - -static void -csched2_dump(const struct scheduler *ops) -{ - struct list_head *iter_sdom; - struct csched2_private *prv = csched2_priv(ops); - unsigned long flags; - unsigned int i, j, loop; - - /* - * We need the private scheduler lock as we access global - * scheduler data and (below) the list of active domains. - */ - read_lock_irqsave(&prv->lock, flags); - - printk("Active queues: %d\n" - "\tdefault-weight = %d\n", - cpumask_weight(&prv->active_queues), - CSCHED2_DEFAULT_WEIGHT); - for_each_cpu(i, &prv->active_queues) - { - s_time_t fraction; - - fraction = (prv->rqd[i].avgload * 100) >> prv->load_precision_shift; - - printk("Runqueue %d:\n" - "\tncpus = %u\n" - "\tcpus = %*pbl\n" - "\tmax_weight = %u\n" - "\tpick_bias = %u\n" - "\tinstload = %d\n" - "\taveload = %"PRI_stime" (~%"PRI_stime"%%)\n", - i, - prv->rqd[i].nr_cpus, - CPUMASK_PR(&prv->rqd[i].active), - prv->rqd[i].max_weight, - prv->rqd[i].pick_bias, - prv->rqd[i].load, - prv->rqd[i].avgload, - fraction); - - printk("\tidlers: %*pb\n" - "\ttickled: %*pb\n" - "\tfully idle cores: %*pb\n", - CPUMASK_PR(&prv->rqd[i].idle), - CPUMASK_PR(&prv->rqd[i].tickled), - CPUMASK_PR(&prv->rqd[i].smt_idle)); - } - - printk("Domain info:\n"); - loop = 0; - list_for_each( iter_sdom, &prv->sdom ) - { - struct csched2_dom *sdom; - struct sched_unit *unit; - - sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem); - - printk("\tDomain: %d w %d c %u v %d\n", - sdom->dom->domain_id, - sdom->weight, - sdom->cap, - sdom->nr_units); - - for_each_sched_unit ( sdom->dom, unit ) - { - struct csched2_unit * const svc = csched2_unit(unit); - spinlock_t *lock; - - lock = unit_schedule_lock(unit); - - printk("\t%3d: ", ++loop); - csched2_dump_unit(prv, svc); - - unit_schedule_unlock(lock, unit); - } - } - - for_each_cpu(i, &prv->active_queues) - { - struct csched2_runqueue_data *rqd = prv->rqd + i; - struct list_head *iter, *runq = &rqd->runq; - int loop = 0; - - /* We need the lock to scan the runqueue. 
*/ - spin_lock(&rqd->lock); - - printk("Runqueue %d:\n", i); - - for_each_cpu(j, &rqd->active) - dump_pcpu(ops, j); - - printk("RUNQ:\n"); - list_for_each( iter, runq ) - { - struct csched2_unit *svc = runq_elem(iter); - - if ( svc ) - { - printk("\t%3d: ", loop++); - csched2_dump_unit(prv, svc); - } - } - spin_unlock(&rqd->lock); - } - - read_unlock_irqrestore(&prv->lock, flags); -} - -static void * -csched2_alloc_pdata(const struct scheduler *ops, int cpu) -{ - struct csched2_pcpu *spc; - - spc = xzalloc(struct csched2_pcpu); - if ( spc == NULL ) - return ERR_PTR(-ENOMEM); - - /* Not in any runqueue yet */ - spc->runq_id = -1; - - return spc; -} - -/* Returns the ID of the runqueue the cpu is assigned to. */ -static unsigned -init_pdata(struct csched2_private *prv, struct csched2_pcpu *spc, - unsigned int cpu) -{ - struct csched2_runqueue_data *rqd; - unsigned int rcpu; - - ASSERT(rw_is_write_locked(&prv->lock)); - ASSERT(!cpumask_test_cpu(cpu, &prv->initialized)); - /* CPU data needs to be allocated, but still uninitialized. */ - ASSERT(spc && spc->runq_id == -1); - - /* Figure out which runqueue to put it in */ - spc->runq_id = cpu_to_runqueue(prv, cpu); - - rqd = prv->rqd + spc->runq_id; - - printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id); - if ( ! cpumask_test_cpu(spc->runq_id, &prv->active_queues) ) - { - printk(XENLOG_INFO " First cpu on runqueue, activating\n"); - activate_runqueue(prv, spc->runq_id); - } - - __cpumask_set_cpu(cpu, &spc->sibling_mask); - - if ( rqd->nr_cpus > 0 ) - for_each_cpu ( rcpu, per_cpu(cpu_sibling_mask, cpu) ) - if ( cpumask_test_cpu(rcpu, &rqd->active) ) - { - __cpumask_set_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); - __cpumask_set_cpu(rcpu, &spc->sibling_mask); - } - - __cpumask_set_cpu(cpu, &rqd->idle); - __cpumask_set_cpu(cpu, &rqd->active); - __cpumask_set_cpu(cpu, &prv->initialized); - __cpumask_set_cpu(cpu, &rqd->smt_idle); - - rqd->nr_cpus++; - ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); - - if ( rqd->nr_cpus == 1 ) - rqd->pick_bias = cpu; - - return spc->runq_id; -} - -static void -csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct csched2_private *prv = csched2_priv(ops); - spinlock_t *old_lock; - unsigned long flags; - unsigned rqi; - - write_lock_irqsave(&prv->lock, flags); - old_lock = pcpu_schedule_lock(cpu); - - rqi = init_pdata(prv, pdata, cpu); - /* Move the scheduler lock to the new runq lock. */ - get_sched_res(cpu)->schedule_lock = &prv->rqd[rqi].lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ - spin_unlock(old_lock); - write_unlock_irqrestore(&prv->lock, flags); -} - -/* Change the scheduler of cpu to us (Credit2). */ -static spinlock_t * -csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct csched2_private *prv = csched2_priv(new_ops); - struct csched2_unit *svc = vdata; - unsigned rqi; - - ASSERT(pdata && svc && is_idle_unit(svc->unit)); - - /* - * We own one runqueue lock already (from schedule_cpu_switch()). This - * looks like it violates this scheduler's locking rules, but it does - * not, as what we own is the lock of another scheduler, that hence has - * no particular (ordering) relationship with our private global lock. - * And owning exactly that one (the lock of the old scheduler of this - * cpu) is what is necessary to prevent races. 
- */ - ASSERT(!local_irq_is_enabled()); - write_lock(&prv->lock); - - sched_idle_unit(cpu)->priv = vdata; - - rqi = init_pdata(prv, pdata, cpu); - - /* - * Now that we know what runqueue we'll go in, double check what's said - * above: the lock we already hold is not the one of this runqueue of - * this scheduler, and so it's safe to have taken it /before/ our - * private global lock. - */ - ASSERT(get_sched_res(cpu)->schedule_lock != &prv->rqd[rqi].lock); - - write_unlock(&prv->lock); - - return &prv->rqd[rqi].lock; -} - -static void -csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - unsigned long flags; - struct csched2_private *prv = csched2_priv(ops); - struct csched2_runqueue_data *rqd; - struct csched2_pcpu *spc = pcpu; - unsigned int rcpu; - - write_lock_irqsave(&prv->lock, flags); - - /* - * alloc_pdata is not implemented, so pcpu must be NULL. On the other - * hand, init_pdata must have been called for this pCPU. - */ - /* - * Scheduler specific data for this pCPU must still be there and and be - * valid. In fact, if we are here: - * 1. alloc_pdata must have been called for this cpu, and free_pdata - * must not have been called on it before us, - * 2. init_pdata must have been called on this cpu, and deinit_pdata - * (us!) must not have been called on it already. - */ - ASSERT(spc && spc->runq_id != -1); - ASSERT(cpumask_test_cpu(cpu, &prv->initialized)); - - /* Find the old runqueue and remove this cpu from it */ - rqd = prv->rqd + spc->runq_id; - - /* No need to save IRQs here, they're already disabled */ - spin_lock(&rqd->lock); - - printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id); - - __cpumask_clear_cpu(cpu, &rqd->idle); - __cpumask_clear_cpu(cpu, &rqd->smt_idle); - __cpumask_clear_cpu(cpu, &rqd->active); - - for_each_cpu ( rcpu, &rqd->active ) - __cpumask_clear_cpu(cpu, &csched2_pcpu(rcpu)->sibling_mask); - - rqd->nr_cpus--; - ASSERT(cpumask_weight(&rqd->active) == rqd->nr_cpus); - - if ( rqd->nr_cpus == 0 ) - { - printk(XENLOG_INFO " No cpus left on runqueue, disabling\n"); - deactivate_runqueue(prv, spc->runq_id); - } - else if ( rqd->pick_bias == cpu ) - rqd->pick_bias = cpumask_first(&rqd->active); - - spc->runq_id = -1; - - spin_unlock(&rqd->lock); - - __cpumask_clear_cpu(cpu, &prv->initialized); - - write_unlock_irqrestore(&prv->lock, flags); - - return; -} - -static void -csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct csched2_pcpu *spc = pcpu; - - /* - * pcpu either points to a valid struct csched2_pcpu, or is NULL (if - * CPU bringup failed, and we're beeing called from CPU_UP_CANCELLED). - * xfree() does not really mind, but we want to be sure that either - * init_pdata has never been called, or deinit_pdata has been called - * already. 
- */ - ASSERT(!pcpu || spc->runq_id == -1); - ASSERT(!cpumask_test_cpu(cpu, &csched2_priv(ops)->initialized)); - - xfree(pcpu); -} - -static int __init -csched2_global_init(void) -{ - if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN ) - { - printk("WARNING: %s: opt_load_precision_shift %u below min %d, resetting\n", - __func__, opt_load_precision_shift, LOADAVG_PRECISION_SHIFT_MIN); - opt_load_precision_shift = LOADAVG_PRECISION_SHIFT_MIN; - } - - if ( opt_load_window_shift <= LOADAVG_GRANULARITY_SHIFT ) - { - printk("WARNING: %s: opt_load_window_shift %u too short, resetting\n", - __func__, opt_load_window_shift); - opt_load_window_shift = LOADAVG_WINDOW_SHIFT; - } - - if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER ) - { - printk("WARNING: %s: opt_cap_period %u too small, resetting\n", - __func__, opt_cap_period); - opt_cap_period = 10; /* ms */ - } - - return 0; -} - -static int -csched2_init(struct scheduler *ops) -{ - int i; - struct csched2_private *prv; - - printk("Initializing Credit2 scheduler\n"); - - printk(XENLOG_INFO " load_precision_shift: %d\n" - XENLOG_INFO " load_window_shift: %d\n" - XENLOG_INFO " underload_balance_tolerance: %d\n" - XENLOG_INFO " overload_balance_tolerance: %d\n" - XENLOG_INFO " runqueues arrangement: %s\n" - XENLOG_INFO " cap enforcement granularity: %dms\n", - opt_load_precision_shift, - opt_load_window_shift, - opt_underload_balance_tolerance, - opt_overload_balance_tolerance, - opt_runqueue_str[opt_runqueue], - opt_cap_period); - - printk(XENLOG_INFO "load tracking window length %llu ns\n", - 1ULL << opt_load_window_shift); - - /* - * Basically no CPU information is available at this point; just - * set up basic structures, and a callback when the CPU info is - * available. - */ - - prv = xzalloc(struct csched2_private); - if ( prv == NULL ) - return -ENOMEM; - ops->sched_data = prv; - - rwlock_init(&prv->lock); - INIT_LIST_HEAD(&prv->sdom); - - /* Allocate all runqueues and mark them as un-initialized */ - prv->rqd = xzalloc_array(struct csched2_runqueue_data, nr_cpu_ids); - if ( !prv->rqd ) - { - xfree(prv); - return -ENOMEM; - } - for ( i = 0; i < nr_cpu_ids; i++ ) - prv->rqd[i].id = -1; - - /* initialize ratelimit */ - prv->ratelimit_us = sched_ratelimit_us; - - prv->load_precision_shift = opt_load_precision_shift; - prv->load_window_shift = opt_load_window_shift - LOADAVG_GRANULARITY_SHIFT; - ASSERT(opt_load_window_shift > 0); - - return 0; -} - -static void -csched2_deinit(struct scheduler *ops) -{ - struct csched2_private *prv; - - prv = csched2_priv(ops); - ops->sched_data = NULL; - if ( prv ) - xfree(prv->rqd); - xfree(prv); -} - -static const struct scheduler sched_credit2_def = { - .name = "SMP Credit Scheduler rev2", - .opt_name = "credit2", - .sched_id = XEN_SCHEDULER_CREDIT2, - .sched_data = NULL, - - .global_init = csched2_global_init, - - .insert_unit = csched2_unit_insert, - .remove_unit = csched2_unit_remove, - - .sleep = csched2_unit_sleep, - .wake = csched2_unit_wake, - .yield = csched2_unit_yield, - - .adjust = csched2_dom_cntl, - .adjust_affinity= csched2_aff_cntl, - .adjust_global = csched2_sys_cntl, - - .pick_resource = csched2_res_pick, - .migrate = csched2_unit_migrate, - .do_schedule = csched2_schedule, - .context_saved = csched2_context_saved, - - .dump_settings = csched2_dump, - .init = csched2_init, - .deinit = csched2_deinit, - .alloc_udata = csched2_alloc_udata, - .free_udata = csched2_free_udata, - .alloc_pdata = csched2_alloc_pdata, - .init_pdata = csched2_init_pdata, - .deinit_pdata = 
csched2_deinit_pdata, - .free_pdata = csched2_free_pdata, - .switch_sched = csched2_switch_sched, - .alloc_domdata = csched2_alloc_domdata, - .free_domdata = csched2_free_domdata, -}; - -REGISTER_SCHEDULER(sched_credit2_def); diff --git a/xen/common/sched_null.c b/xen/common/sched_null.c deleted file mode 100644 index 3f3418c9b1..0000000000 --- a/xen/common/sched_null.c +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * xen/common/sched_null.c - * - * Copyright (c) 2017, Dario Faggioli, Citrix Ltd - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; If not, see . - */ - -/* - * The 'null' scheduler always choose to run, on each pCPU, either nothing - * (i.e., the pCPU stays idle) or always the same unit. - * - * It is aimed at supporting static scenarios, where there always are - * less units than pCPUs (and the units don't need to move among pCPUs - * for any reason) with the least possible overhead. - * - * Typical usecase are embedded applications, but also HPC, especially - * if the scheduler is used inside a cpupool. - */ - -#include -#include -#include -#include - -/* - * null tracing events. Check include/public/trace.h for more details. - */ -#define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1) -#define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2) -#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3) -#define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4) -#define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5) -#define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6) - -/* - * Locking: - * - Scheduler-lock (a.k.a. runqueue lock): - * + is per-pCPU; - * + serializes assignment and deassignment of units to a pCPU. - * - Private data lock (a.k.a. private scheduler lock): - * + is scheduler-wide; - * + serializes accesses to the list of domains in this scheduler. - * - Waitqueue lock: - * + is scheduler-wide; - * + serialize accesses to the list of units waiting to be assigned - * to pCPUs. - * - * Ordering is: private lock, runqueue lock, waitqueue lock. Or, OTOH, - * waitqueue lock nests inside runqueue lock which nests inside private - * lock. More specifically: - * + if we need both runqueue and private locks, we must acquire the - * private lock for first; - * + if we need both runqueue and waitqueue locks, we must acquire - * the runqueue lock for first; - * + if we need both private and waitqueue locks, we must acquire - * the private lock for first; - * + if we already own a runqueue lock, we must never acquire - * the private lock; - * + if we already own the waitqueue lock, we must never acquire - * the runqueue lock or the private lock. 
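
Editor's aside (not part of the patch): the lock-ordering rules above, restated with plain pthread mutexes, since the pattern is generic. Any path that needs more than one of the three locks takes them in the fixed order private -> runqueue -> waitqueue, so no two paths can ever hold them in opposite orders and deadlock is impossible by construction.

#include <pthread.h>

static pthread_mutex_t private_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t runqueue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t waitq_lock    = PTHREAD_MUTEX_INITIALIZER;

/* Needs the domain list and a runqueue: private first, then runqueue. */
static void path_a(void)
{
    pthread_mutex_lock(&private_lock);
    pthread_mutex_lock(&runqueue_lock);
    /* ... work ... */
    pthread_mutex_unlock(&runqueue_lock);
    pthread_mutex_unlock(&private_lock);
}

/* Needs a runqueue and the waitqueue: runqueue first, then waitqueue. */
static void path_b(void)
{
    pthread_mutex_lock(&runqueue_lock);
    pthread_mutex_lock(&waitq_lock);
    /* ... work ... */
    pthread_mutex_unlock(&waitq_lock);
    pthread_mutex_unlock(&runqueue_lock);
}

int main(void)
{
    path_a();
    path_b();
    return 0;
}
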
- */ - -/* - * System-wide private data - */ -struct null_private { - spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */ - struct list_head ndom; /* Domains of this scheduler */ - struct list_head waitq; /* units not assigned to any pCPU */ - spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */ - cpumask_t cpus_free; /* CPUs without a unit associated to them */ -}; - -/* - * Physical CPU - */ -struct null_pcpu { - struct sched_unit *unit; -}; -DEFINE_PER_CPU(struct null_pcpu, npc); - -/* - * Schedule unit - */ -struct null_unit { - struct list_head waitq_elem; - struct sched_unit *unit; -}; - -/* - * Domain - */ -struct null_dom { - struct list_head ndom_elem; - struct domain *dom; -}; - -/* - * Accessor helpers functions - */ -static inline struct null_private *null_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct null_unit *null_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline bool unit_check_affinity(struct sched_unit *unit, - unsigned int cpu, - unsigned int balance_step) -{ - affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - cpupool_domain_master_cpumask(unit->domain)); - - return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu)); -} - -static int null_init(struct scheduler *ops) -{ - struct null_private *prv; - - printk("Initializing null scheduler\n" - "WARNING: This is experimental software in development.\n" - "Use at your own risk.\n"); - - prv = xzalloc(struct null_private); - if ( prv == NULL ) - return -ENOMEM; - - spin_lock_init(&prv->lock); - spin_lock_init(&prv->waitq_lock); - INIT_LIST_HEAD(&prv->ndom); - INIT_LIST_HEAD(&prv->waitq); - - ops->sched_data = prv; - - return 0; -} - -static void null_deinit(struct scheduler *ops) -{ - xfree(ops->sched_data); - ops->sched_data = NULL; -} - -static void init_pdata(struct null_private *prv, unsigned int cpu) -{ - /* Mark the pCPU as free, and with no unit assigned */ - cpumask_set_cpu(cpu, &prv->cpus_free); - per_cpu(npc, cpu).unit = NULL; -} - -static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct null_private *prv = null_priv(ops); - - /* alloc_pdata is not implemented, so we want this to be NULL. 
*/ - ASSERT(!pdata); - - init_pdata(prv, cpu); -} - -static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - struct null_private *prv = null_priv(ops); - - /* alloc_pdata not implemented, so this must have stayed NULL */ - ASSERT(!pcpu); - - cpumask_clear_cpu(cpu, &prv->cpus_free); - per_cpu(npc, cpu).unit = NULL; -} - -static void *null_alloc_udata(const struct scheduler *ops, - struct sched_unit *unit, void *dd) -{ - struct null_unit *nvc; - - nvc = xzalloc(struct null_unit); - if ( nvc == NULL ) - return NULL; - - INIT_LIST_HEAD(&nvc->waitq_elem); - nvc->unit = unit; - - SCHED_STAT_CRANK(unit_alloc); - - return nvc; -} - -static void null_free_udata(const struct scheduler *ops, void *priv) -{ - struct null_unit *nvc = priv; - - xfree(nvc); -} - -static void * null_alloc_domdata(const struct scheduler *ops, - struct domain *d) -{ - struct null_private *prv = null_priv(ops); - struct null_dom *ndom; - unsigned long flags; - - ndom = xzalloc(struct null_dom); - if ( ndom == NULL ) - return ERR_PTR(-ENOMEM); - - ndom->dom = d; - - spin_lock_irqsave(&prv->lock, flags); - list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom); - spin_unlock_irqrestore(&prv->lock, flags); - - return ndom; -} - -static void null_free_domdata(const struct scheduler *ops, void *data) -{ - struct null_dom *ndom = data; - struct null_private *prv = null_priv(ops); - - if ( ndom ) - { - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - list_del_init(&ndom->ndom_elem); - spin_unlock_irqrestore(&prv->lock, flags); - - xfree(ndom); - } -} - -/* - * unit to pCPU assignment and placement. This _only_ happens: - * - on insert, - * - on migrate. - * - * Insert occurs when a unit joins this scheduler for the first time - * (e.g., when the domain it's part of is moved to the scheduler's - * cpupool). - * - * Migration may be necessary if a pCPU (with a unit assigned to it) - * is removed from the scheduler's cpupool. - * - * So this is not part of any hot path. - */ -static struct sched_resource * -pick_res(struct null_private *prv, const struct sched_unit *unit) -{ - unsigned int bs; - unsigned int cpu = sched_unit_master(unit), new_cpu; - cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain); - - ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock)); - - for_each_affinity_balance_step( bs ) - { - if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) ) - continue; - - affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus); - - /* - * If our processor is free, or we are assigned to it, and it is also - * still valid and part of our affinity, just go for it. - * (Note that we may call unit_check_affinity(), but we deliberately - * don't, so we get to keep in the scratch cpumask what we have just - * put in it.) - */ - if ( likely((per_cpu(npc, cpu).unit == NULL || - per_cpu(npc, cpu).unit == unit) - && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) ) - { - new_cpu = cpu; - goto out; - } - - /* If not, just go for a free pCPU, within our affinity, if any */ - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->cpus_free); - new_cpu = cpumask_first(cpumask_scratch_cpu(cpu)); - - if ( likely(new_cpu != nr_cpu_ids) ) - goto out; - } - - /* - * If we didn't find any free pCPU, just pick any valid pcpu, even if - * it has another unit assigned. 
This will happen during shutdown and - * suspend/resume, but it may also happen during "normal operation", if - * all the pCPUs are busy. - * - * In fact, there must always be something sane in v->processor, or - * unit_schedule_lock() and friends won't work. This is not a problem, - * as we will actually assign the unit to the pCPU we return from here, - * only if the pCPU is free. - */ - cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity); - new_cpu = cpumask_any(cpumask_scratch_cpu(cpu)); - - out: - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t new_cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.new_cpu = new_cpu; - __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d); - } - - return get_sched_res(new_cpu); -} - -static void unit_assign(struct null_private *prv, struct sched_unit *unit, - unsigned int cpu) -{ - ASSERT(is_unit_online(unit)); - - per_cpu(npc, cpu).unit = unit; - sched_set_res(unit, get_sched_res(cpu)); - cpumask_clear_cpu(cpu, &prv->cpus_free); - - dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = cpu; - __trace_var(TRC_SNULL_UNIT_ASSIGN, 1, sizeof(d), &d); - } -} - -/* Returns true if a cpu was tickled */ -static bool unit_deassign(struct null_private *prv, struct sched_unit *unit) -{ - unsigned int bs; - unsigned int cpu = sched_unit_master(unit); - struct null_unit *wvc; - - ASSERT(list_empty(&null_unit(unit)->waitq_elem)); - ASSERT(per_cpu(npc, cpu).unit == unit); - ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free)); - - per_cpu(npc, cpu).unit = NULL; - cpumask_set_cpu(cpu, &prv->cpus_free); - - dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain, - unit->unit_id); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint32_t cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = cpu; - __trace_var(TRC_SNULL_UNIT_DEASSIGN, 1, sizeof(d), &d); - } - - spin_lock(&prv->waitq_lock); - - /* - * If unit is assigned to a pCPU, let's see if there is someone waiting, - * suitable to be assigned to it (prioritizing units that have - * soft-affinity with cpu). - */ - for_each_affinity_balance_step( bs ) - { - list_for_each_entry( wvc, &prv->waitq, waitq_elem ) - { - if ( bs == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(wvc->unit) ) - continue; - - if ( unit_check_affinity(wvc->unit, cpu, bs) ) - { - list_del_init(&wvc->waitq_elem); - unit_assign(prv, wvc->unit, cpu); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - spin_unlock(&prv->waitq_lock); - return true; - } - } - } - spin_unlock(&prv->waitq_lock); - - return false; -} - -/* Change the scheduler of cpu to us (null). */ -static spinlock_t *null_switch_sched(struct scheduler *new_ops, - unsigned int cpu, - void *pdata, void *vdata) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct null_private *prv = null_priv(new_ops); - struct null_unit *nvc = vdata; - - ASSERT(nvc && is_idle_unit(nvc->unit)); - - sched_idle_unit(cpu)->priv = vdata; - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). It actually may or may not be the 'right' - * one for this cpu, but that is ok for preventing races. 
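
Editor's aside (not part of the patch): the three-step fallback of pick_res() above (stay put if possible, else a free CPU within our affinity, else any CPU within our affinity, even if busy), with cpumasks reduced to plain bitmasks. All names are invented and GCC-style builtins are assumed.

#include <stdio.h>
#include <stdint.h>

/* Returns the chosen CPU index; affinity is assumed to be non-empty. */
static int pick_cpu(int cur_cpu, uint64_t affinity, uint64_t free_cpus,
                    int cur_is_ours_or_free)
{
    uint64_t candidates;

    /* 1) Stay where we are, if that CPU is usable and in our affinity. */
    if ( cur_is_ours_or_free && (affinity & (1ULL << cur_cpu)) )
        return cur_cpu;

    /* 2) Otherwise, any free CPU within our affinity. */
    candidates = affinity & free_cpus;
    if ( candidates )
        return __builtin_ctzll(candidates);

    /* 3) Last resort: any CPU in our affinity, even if it has a unit. */
    return __builtin_ctzll(affinity);
}

int main(void)
{
    /* Affinity {1,2,3}, CPUs 2 and 3 free, currently on busy CPU 1. */
    printf("picked CPU %d\n", pick_cpu(1, 0x0e, 0x0c, 0));   /* -> 2 */
    return 0;
}
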
- */ - ASSERT(!local_irq_is_enabled()); - - init_pdata(prv, cpu); - - return &sr->_lock; -} - -static void null_unit_insert(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - unsigned int cpu; - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - - lock = unit_schedule_lock_irq(unit); - - if ( unlikely(!is_unit_online(unit)) ) - { - unit_schedule_unlock_irq(lock, unit); - return; - } - - retry: - sched_set_res(unit, pick_res(prv, unit)); - cpu = sched_unit_master(unit); - - spin_unlock(lock); - - lock = unit_schedule_lock(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - - /* If the pCPU is free, we assign unit to it */ - if ( likely(per_cpu(npc, cpu).unit == NULL) ) - { - /* - * Insert is followed by vcpu_wake(), so there's no need to poke - * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that. - */ - unit_assign(prv, unit, cpu); - } - else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) ) - { - /* - * If the pCPU is not free (e.g., because we raced with another - * insert or a migrate), but there are other free pCPUs, we can - * try to pick again. - */ - goto retry; - } - else - { - /* - * If the pCPU is not free, and there aren't any (valid) others, - * we have no alternatives than to go into the waitqueue. - */ - spin_lock(&prv->waitq_lock); - list_add_tail(&nvc->waitq_elem, &prv->waitq); - dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n", - unit->domain, unit->unit_id); - spin_unlock(&prv->waitq_lock); - } - spin_unlock_irq(lock); - - SCHED_STAT_CRANK(unit_insert); -} - -static void null_unit_remove(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - spinlock_t *lock; - - ASSERT(!is_idle_unit(unit)); - - lock = unit_schedule_lock_irq(unit); - - /* If offline, the unit shouldn't be assigned, nor in the waitqueue */ - if ( unlikely(!is_unit_online(unit)) ) - { - ASSERT(per_cpu(npc, sched_unit_master(unit)).unit != unit); - ASSERT(list_empty(&nvc->waitq_elem)); - goto out; - } - - /* If unit is in waitqueue, just get it out of there and bail */ - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - - goto out; - } - - unit_deassign(prv, unit); - - out: - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_remove); -} - -static void null_unit_wake(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - unsigned int cpu = sched_unit_master(unit); - - ASSERT(!is_idle_unit(unit)); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - /* Not exactly "on runq", but close enough for reusing the counter */ - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - if ( likely(per_cpu(npc, cpu).unit == unit) ) - { - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - return; - } - - /* - * If a unit is neither on a pCPU nor in the waitqueue, it means it was - * offline, and that it is now coming back being online. 
If we're lucky, - * and its previous resource is free (and affinities match), we can just - * assign the unit to it (we own the proper lock already) and be done. - */ - if ( per_cpu(npc, cpu).unit == NULL && - unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) ) - { - if ( !has_soft_affinity(unit) || - unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) ) - { - unit_assign(prv, unit, cpu); - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - return; - } - } - - /* - * If the resource is not free (or affinities do not match) we need - * to assign unit to some other one, but we can't do it here, as: - * - we don't own the proper lock, - * - we can't change v->processor under vcpu_wake()'s feet. - * So we add it to the waitqueue, and tickle all the free CPUs (if any) - * on which unit can run. The first one that schedules will pick it up. - */ - spin_lock(&prv->waitq_lock); - list_add_tail(&nvc->waitq_elem, &prv->waitq); - spin_unlock(&prv->waitq_lock); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(unit->domain)); - cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), - &prv->cpus_free); - - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n", - unit->domain->domain_id, unit->unit_id); - else - cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ); -} - -static void null_unit_sleep(const struct scheduler *ops, - struct sched_unit *unit) -{ - struct null_private *prv = null_priv(ops); - unsigned int cpu = sched_unit_master(unit); - bool tickled = false; - - ASSERT(!is_idle_unit(unit)); - - /* - * Check if the unit is in the process of being offlined. If yes, - * we need to remove it from either its pCPU or the waitqueue. - */ - if ( unlikely(!is_unit_online(unit)) ) - { - struct null_unit *nvc = null_unit(unit); - - if ( unlikely(!list_empty(&nvc->waitq_elem)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - } - else if ( per_cpu(npc, cpu).unit == unit ) - tickled = unit_deassign(prv, unit); - } - - /* If unit is not assigned to a pCPU, or is not running, no need to bother */ - if ( likely(!tickled && curr_on_cpu(cpu) == unit) ) - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - - SCHED_STAT_CRANK(unit_sleep); -} - -static struct sched_resource * -null_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - ASSERT(!is_idle_unit(unit)); - return pick_res(null_priv(ops), unit); -} - -static void null_unit_migrate(const struct scheduler *ops, - struct sched_unit *unit, unsigned int new_cpu) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc = null_unit(unit); - - ASSERT(!is_idle_unit(unit)); - - if ( sched_unit_master(unit) == new_cpu ) - return; - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t unit, dom; - uint16_t cpu, new_cpu; - } d; - d.dom = unit->domain->domain_id; - d.unit = unit->unit_id; - d.cpu = sched_unit_master(unit); - d.new_cpu = new_cpu; - __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d); - } - - /* - * If unit is assigned to a pCPU, then such pCPU becomes free, and we - * should look in the waitqueue if anyone else can be assigned to it. 
- */ - if ( likely(per_cpu(npc, sched_unit_master(unit)).unit == unit) ) - { - unit_deassign(prv, unit); - SCHED_STAT_CRANK(migrate_running); - } - else if ( !list_empty(&nvc->waitq_elem) ) - SCHED_STAT_CRANK(migrate_on_runq); - - SCHED_STAT_CRANK(migrated); - - /* - * If a unit is (going) offline, we want it to be neither assigned - * to a pCPU, nor in the waitqueue. - * - * If it was on a cpu, we've removed it from there above. If it is - * in the waitqueue, we remove it from there now. And then we bail. - */ - if ( unlikely(!is_unit_online(unit)) ) - { - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - goto out; - } - - /* - * Let's now consider new_cpu, which is where unit is being sent. It can be - * either free, or have a unit already assigned to it. - * - * In the former case we should assign unit to it, and try to get it to run, - * if possible, according to affinity. - * - * In latter, all we can do is to park unit in the waitqueue. - */ - if ( per_cpu(npc, new_cpu).unit == NULL && - unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) ) - { - /* unit might have been in the waitqueue, so remove it */ - spin_lock(&prv->waitq_lock); - list_del_init(&nvc->waitq_elem); - spin_unlock(&prv->waitq_lock); - - unit_assign(prv, unit, new_cpu); - } - else - { - /* Put unit in the waitqueue, if it wasn't there already */ - spin_lock(&prv->waitq_lock); - if ( list_empty(&nvc->waitq_elem) ) - { - list_add_tail(&nvc->waitq_elem, &prv->waitq); - dprintk(XENLOG_G_WARNING, - "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain, - unit->unit_id); - } - spin_unlock(&prv->waitq_lock); - } - - /* - * Whatever all the above, we always at least override v->processor. - * This is especially important for shutdown or suspend/resume paths, - * when it is important to let our caller (cpu_disable_scheduler()) - * know that the migration did happen, to the best of our possibilities, - * at least. In case of suspend, any temporary inconsistency caused - * by this, will be fixed-up during resume. - */ - out: - sched_set_res(unit, get_sched_res(new_cpu)); -} - -#ifndef NDEBUG -static inline void null_unit_check(struct sched_unit *unit) -{ - struct null_unit * const nvc = null_unit(unit); - struct null_dom * const ndom = unit->domain->sched_priv; - - BUG_ON(nvc->unit != unit); - - if ( ndom ) - BUG_ON(is_idle_unit(unit)); - else - BUG_ON(!is_idle_unit(unit)); - - SCHED_STAT_CRANK(unit_check); -} -#define NULL_UNIT_CHECK(unit) (null_unit_check(unit)) -#else -#define NULL_UNIT_CHECK(unit) -#endif - - -/* - * The most simple scheduling function of all times! We either return: - * - the unit assigned to the pCPU, if there's one and it can run; - * - the idle unit, otherwise. 
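
Editor's aside (not part of the patch): the whole decision just described, as a few lines of standalone C. The null scheduler returns the one unit assigned to the pCPU when it can run, and the idle unit otherwise (including when there is tasklet work to do); all structures here are simplified.

#include <stdio.h>
#include <stdbool.h>

struct unit { const char *name; bool runnable; };

static struct unit idle = { "idle", true };

/* The unit statically assigned to this pCPU, or NULL if the pCPU is free. */
static struct unit *assigned;

static struct unit *null_pick_next(bool tasklet_work)
{
    if ( tasklet_work || assigned == NULL || !assigned->runnable )
        return &idle;
    return assigned;            /* always the same unit, zero search cost */
}

int main(void)
{
    struct unit guest = { "d1v0", true };

    printf("%s\n", null_pick_next(false)->name);   /* idle: pCPU is free  */
    assigned = &guest;
    printf("%s\n", null_pick_next(false)->name);   /* d1v0                */
    printf("%s\n", null_pick_next(true)->name);    /* idle: tasklet work  */
    return 0;
}
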
- */ -static void null_schedule(const struct scheduler *ops, struct sched_unit *prev, - s_time_t now, bool tasklet_work_scheduled) -{ - unsigned int bs; - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct null_private *prv = null_priv(ops); - struct null_unit *wvc; - - SCHED_STAT_CRANK(schedule); - NULL_UNIT_CHECK(current->sched_unit); - - if ( unlikely(tb_init_done) ) - { - struct { - uint16_t tasklet, cpu; - int16_t unit, dom; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - if ( per_cpu(npc, sched_cpu).unit == NULL ) - { - d.unit = d.dom = -1; - } - else - { - d.unit = per_cpu(npc, sched_cpu).unit->unit_id; - d.dom = per_cpu(npc, sched_cpu).unit->domain->domain_id; - } - __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d); - } - - if ( tasklet_work_scheduled ) - { - trace_var(TRC_SNULL_TASKLET, 1, 0, NULL); - prev->next_task = sched_idle_unit(sched_cpu); - } - else - prev->next_task = per_cpu(npc, sched_cpu).unit; - prev->next_time = -1; - - /* - * We may be new in the cpupool, or just coming back online. In which - * case, there may be units in the waitqueue that we can assign to us - * and run. - */ - if ( unlikely(prev->next_task == NULL) ) - { - bool unit_found; - - spin_lock(&prv->waitq_lock); - - if ( list_empty(&prv->waitq) ) - goto unlock; - - /* - * We scan the waitqueue twice, for prioritizing units that have - * soft-affinity with cpu. This may look like something expensive to - * do here in null_schedule(), but it's actually fine, because we do - * it only in cases where a pcpu has no unit associated (e.g., as - * said above, the cpu has just joined a cpupool). - */ - unit_found = false; - for_each_affinity_balance_step( bs ) - { - list_for_each_entry( wvc, &prv->waitq, waitq_elem ) - { - if ( bs == BALANCE_SOFT_AFFINITY && - !has_soft_affinity(wvc->unit) ) - continue; - - if ( unit_check_affinity(wvc->unit, sched_cpu, bs) ) - { - spinlock_t *lock; - - unit_found = true; - - /* - * If the unit in the waitqueue has just come up online, - * we risk racing with vcpu_wake(). To avoid this, sync - * on the spinlock that vcpu_wake() holds, but only with - * trylock, to avoid deadlock). - */ - lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit)); - - /* - * We know the vcpu's lock is not this resource's lock. In - * fact, if it were, since this cpu is free, vcpu_wake() - * would have assigned the unit to here directly. - */ - ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock); - - if ( lock ) { - unit_assign(prv, wvc->unit, sched_cpu); - list_del_init(&wvc->waitq_elem); - prev->next_task = wvc->unit; - spin_unlock(lock); - goto unlock; - } - } - } - } - /* - * If we did find a unit with suitable affinity in the waitqueue, but - * we could not pick it up (due to lock contention), and hence we are - * still free, plan for another try. In fact, we don't want such unit - * to be stuck in the waitqueue, when there are free cpus where it - * could run. 
- */ - if ( unlikely( unit_found && prev->next_task == NULL && - !list_empty(&prv->waitq)) ) - cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ); - unlock: - spin_unlock(&prv->waitq_lock); - - if ( prev->next_task == NULL && - !cpumask_test_cpu(sched_cpu, &prv->cpus_free) ) - cpumask_set_cpu(sched_cpu, &prv->cpus_free); - } - - if ( unlikely(prev->next_task == NULL || - !unit_runnable_state(prev->next_task)) ) - prev->next_task = sched_idle_unit(sched_cpu); - - NULL_UNIT_CHECK(prev->next_task); - - prev->next_task->migrated = false; -} - -static inline void dump_unit(struct null_private *prv, struct null_unit *nvc) -{ - printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id, - nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ? - sched_unit_master(nvc->unit) : -1); -} - -static void null_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct null_private *prv = null_priv(ops); - struct null_unit *nvc; - spinlock_t *lock; - unsigned long flags; - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}", - cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)), - CPUMASK_PR(per_cpu(cpu_core_mask, cpu))); - if ( per_cpu(npc, cpu).unit != NULL ) - printk(", unit=%pdv%d", per_cpu(npc, cpu).unit->domain, - per_cpu(npc, cpu).unit->unit_id); - printk("\n"); - - /* current unit (nothing to say if that's the idle unit) */ - nvc = null_unit(curr_on_cpu(cpu)); - if ( nvc && !is_idle_unit(nvc->unit) ) - { - printk("\trun: "); - dump_unit(prv, nvc); - printk("\n"); - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); -} - -static void null_dump(const struct scheduler *ops) -{ - struct null_private *prv = null_priv(ops); - struct list_head *iter; - unsigned long flags; - unsigned int loop; - - spin_lock_irqsave(&prv->lock, flags); - - printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free)); - - printk("Domain info:\n"); - loop = 0; - list_for_each( iter, &prv->ndom ) - { - struct null_dom *ndom; - struct sched_unit *unit; - - ndom = list_entry(iter, struct null_dom, ndom_elem); - - printk("\tDomain: %d\n", ndom->dom->domain_id); - for_each_sched_unit( ndom->dom, unit ) - { - struct null_unit * const nvc = null_unit(unit); - spinlock_t *lock; - - lock = unit_schedule_lock(unit); - - printk("\t%3d: ", ++loop); - dump_unit(prv, nvc); - printk("\n"); - - unit_schedule_unlock(lock, unit); - } - } - - printk("Waitqueue: "); - loop = 0; - spin_lock(&prv->waitq_lock); - list_for_each( iter, &prv->waitq ) - { - struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem); - - if ( loop++ != 0 ) - printk(", "); - if ( loop % 24 == 0 ) - printk("\n\t"); - printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id); - } - printk("\n"); - spin_unlock(&prv->waitq_lock); - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static const struct scheduler sched_null_def = { - .name = "null Scheduler", - .opt_name = "null", - .sched_id = XEN_SCHEDULER_NULL, - .sched_data = NULL, - - .init = null_init, - .deinit = null_deinit, - .init_pdata = null_init_pdata, - .switch_sched = null_switch_sched, - .deinit_pdata = null_deinit_pdata, - - .alloc_udata = null_alloc_udata, - .free_udata = null_free_udata, - .alloc_domdata = null_alloc_domdata, - .free_domdata = null_free_domdata, - - .insert_unit = null_unit_insert, - .remove_unit = null_unit_remove, - - .wake = null_unit_wake, - .sleep = null_unit_sleep, - .pick_resource = null_res_pick, - .migrate = null_unit_migrate, - .do_schedule = null_schedule, - - .dump_cpu_state = null_dump_pcpu, - .dump_settings = null_dump, -}; - 
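The null scheduler code above boils down to a strict one-to-one unit-to-pCPU assignment with a waitqueue for units that cannot be placed immediately. A minimal standalone sketch of that placement policy follows; all names in it (assigned, waitq, null_assign, affine) are illustrative and are not part of the code being moved.

    /* Sketch of the null scheduler's placement idea: one unit per pCPU,
     * spill everything else into a waitqueue. Illustrative names only. */
    #include <stdbool.h>
    #include <stddef.h>

    #define NR_CPUS    8
    #define WAITQ_MAX 64

    struct unit;                                /* opaque here                */
    static struct unit *assigned[NR_CPUS];      /* one unit per pCPU, or NULL */
    static struct unit *waitq[WAITQ_MAX];       /* units with no free pCPU    */
    static unsigned int waitq_len;

    /* Returns the pCPU the unit was pinned to, or -1 if it had to wait. */
    static int null_assign(struct unit *u,
                           bool (*affine)(const struct unit *, int cpu))
    {
        for ( int cpu = 0; cpu < NR_CPUS; cpu++ )
            if ( assigned[cpu] == NULL && affine(u, cpu) )
            {
                assigned[cpu] = u;     /* unit now owns this pCPU outright */
                return cpu;
            }

        if ( waitq_len < WAITQ_MAX )
            waitq[waitq_len++] = u;    /* parked until some pCPU frees up  */
        return -1;
    }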
-REGISTER_SCHEDULER(sched_null_def); diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c deleted file mode 100644 index c40a7e4990..0000000000 --- a/xen/common/sched_rt.c +++ /dev/null @@ -1,1571 +0,0 @@ -/***************************************************************************** - * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen - * EDF scheduling is a real-time scheduling algorithm used in embedded field. - * - * by Sisu Xi, 2013, Washington University in Saint Louis - * Meng Xu, 2014-2016, University of Pennsylvania - * - * Conversion toward event driven model by Tianyang Chen - * and Dagaen Golomb, 2016, University of Pennsylvania - * - * based on the code of credit Scheduler - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * TODO: - * - * Migration compensation and resist like credit2 to better use cache; - * Lock Holder Problem, using yield? - * Self switch problem: UNITs of the same domain may preempt each other; - */ - -/* - * Design: - * - * This scheduler follows the Preemptive Global Earliest Deadline First (EDF) - * theory in real-time field. - * At any scheduling point, the UNIT with earlier deadline has higher priority. - * The scheduler always picks highest priority UNIT to run on a feasible PCPU. - * A PCPU is feasible if the UNIT can run on this PCPU and (the PCPU is idle or - * has a lower-priority UNIT running on it.) - * - * Each UNIT has a dedicated period, budget and a extratime flag - * The deadline of an UNIT is at the end of each period; - * An UNIT has its budget replenished at the beginning of each period; - * While scheduled, an UNIT burns its budget. - * The UNIT needs to finish its budget before its deadline in each period; - * The UNIT discards its unused budget at the end of each period. - * When an UNIT runs out of budget in a period, if its extratime flag is set, - * the UNIT increases its priority_level by 1 and refills its budget; otherwise, - * it has to wait until next period. - * - * Each UNIT is implemented as a deferable server. - * When an UNIT has a task running on it, its budget is continuously burned; - * When an UNIT has no task but with budget left, its budget is preserved. - * - * Queue scheme: - * A global runqueue and a global depletedqueue for each CPU pool. - * The runqueue holds all runnable UNITs with budget, - * sorted by priority_level and deadline; - * The depletedqueue holds all UNITs without budget, unsorted; - * - * Note: cpumask and cpupool is supported. - */ - -/* - * Locking: - * A global system lock is used to protect the RunQ and DepletedQ. - * The global lock is referenced by sched_res->schedule_lock - * from all physical cpus. - * - * The lock is already grabbed when calling wake/sleep/schedule/ functions - * in schedule.c - * - * The functions involes RunQ and needs to grab locks are: - * unit_insert, unit_remove, context_saved, runq_insert - */ - - -/* - * Default parameters: - * Period and budget in default is 10 and 4 ms, respectively - */ -#define RTDS_DEFAULT_PERIOD (MICROSECS(10000)) -#define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) - -/* - * Max period: max delta of time type, because period is added to the time - * an unit activates, so this must not overflow. - * Min period: 10 us, considering the scheduling overhead (when period is - * too low, scheduling is invoked too frequently, causing high overhead). 
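The design comment above describes each RTDS unit as a deferrable server with a period, a budget, a deadline at the end of the current period, and an extratime mode that bumps priority_level instead of blocking when the budget runs out. A compact sketch of that bookkeeping, using made-up names and plain nanosecond integers rather than the scheduler's real types:

    /* Sketch of the period/budget/deadline bookkeeping described above.
     * Illustrative only; not the in-tree rt_unit handling. */
    #include <stdbool.h>
    #include <stdint.h>

    struct rtds_like {
        int64_t period, budget;          /* static parameters, in ns        */
        int64_t cur_deadline, cur_budget;
        unsigned int priority_level;     /* grows while running extratime   */
        bool extratime;
    };

    /* Start a new period once 'now' has reached the current deadline. */
    static void replenish(struct rtds_like *u, int64_t now)
    {
        while ( u->cur_deadline <= now )     /* catch up, period by period  */
            u->cur_deadline += u->period;
        u->cur_budget = u->budget;           /* full budget again           */
        u->priority_level = 0;               /* back to real-time priority  */
    }

    /* Account 'ran' ns of execution; returns true if the unit depleted. */
    static bool burn(struct rtds_like *u, int64_t ran)
    {
        u->cur_budget -= ran;
        if ( u->cur_budget > 0 )
            return false;
        if ( u->extratime )
        {
            u->priority_level++;             /* demoted, but still runnable */
            u->cur_budget = u->budget;
            return false;
        }
        u->cur_budget = 0;
        return true;                         /* wait for replenishment      */
    }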
- */ -#define RTDS_MAX_PERIOD (STIME_DELTA_MAX) -#define RTDS_MIN_PERIOD (MICROSECS(10)) - -/* - * Min budget: 10 us, considering the scheduling overhead (when budget is - * consumed too fast, scheduling is invoked too frequently, causing - * high overhead). - */ -#define RTDS_MIN_BUDGET (MICROSECS(10)) - -/* - * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding - * the next deadline, performing addition could be faster if the difference - * between cur_deadline and now is small. If the difference is bigger than - * 1024 * period, use multiplication. - */ -#define UPDATE_LIMIT_SHIFT 10 - -/* - * Flags - */ -/* - * RTDS_scheduled: Is this unit either running on, or context-switching off, - * a physical cpu? - * + Accessed only with global lock held. - * + Set when chosen as next in rt_schedule(). - * + Cleared after context switch has been saved in rt_context_saved() - * + Checked in unit_wake to see if we can add to the Runqueue, or if we should - * set RTDS_delayed_runq_add - * + Checked to be false in runq_insert. - */ -#define __RTDS_scheduled 1 -#define RTDS_scheduled (1<<__RTDS_scheduled) -/* - * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ - * once it's done being context switching out? - * + Set when scheduling out in rt_schedule() if prev is runable - * + Set in rt_unit_wake if it finds RTDS_scheduled set - * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ - * and clears the bit. - */ -#define __RTDS_delayed_runq_add 2 -#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) - -/* - * RTDS_depleted: Does this vcp run out of budget? - * This flag is - * + set in burn_budget() if an unit has zero budget left; - * + cleared and checked in the repenishment handler, - * for the units that are being replenished. - */ -#define __RTDS_depleted 3 -#define RTDS_depleted (1<<__RTDS_depleted) - -/* - * RTDS_extratime: Can the unit run in the time that is - * not part of any real-time reservation, and would therefore - * be otherwise left idle? - */ -#define __RTDS_extratime 4 -#define RTDS_extratime (1<<__RTDS_extratime) - -/* - * rt tracing events ("only" 512 available!). Check - * include/public/trace.h for more details. - */ -#define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1) -#define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2) -#define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3) -#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4) -#define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5) -#define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6) - -static void repl_timer_handler(void *data); - -/* - * System-wide private data, include global RunQueue/DepletedQ - * Global lock is referenced by sched_res->schedule_lock from all - * physical cpus. 
It can be grabbed via unit_schedule_lock_irq() - */ -struct rt_private { - spinlock_t lock; /* the global coarse-grained lock */ - struct list_head sdom; /* list of availalbe domains, used for dump */ - - struct list_head runq; /* ordered list of runnable units */ - struct list_head depletedq; /* unordered list of depleted units */ - - struct timer repl_timer; /* replenishment timer */ - struct list_head replq; /* ordered list of units that need replenishment */ - - cpumask_t tickled; /* cpus been tickled */ -}; - -/* - * Virtual CPU - */ -struct rt_unit { - struct list_head q_elem; /* on the runq/depletedq list */ - struct list_head replq_elem; /* on the replenishment events list */ - - /* UNIT parameters, in nanoseconds */ - s_time_t period; - s_time_t budget; - - /* UNIT current information in nanosecond */ - s_time_t cur_budget; /* current budget */ - s_time_t last_start; /* last start time */ - s_time_t cur_deadline; /* current deadline for EDF */ - - /* Up-pointers */ - struct rt_dom *sdom; - struct sched_unit *unit; - - unsigned priority_level; - - unsigned flags; /* mark __RTDS_scheduled, etc.. */ -}; - -/* - * Domain - */ -struct rt_dom { - struct list_head sdom_elem; /* link list on rt_priv */ - struct domain *dom; /* pointer to upper domain */ -}; - -/* - * Useful inline functions - */ -static inline struct rt_private *rt_priv(const struct scheduler *ops) -{ - return ops->sched_data; -} - -static inline struct rt_unit *rt_unit(const struct sched_unit *unit) -{ - return unit->priv; -} - -static inline struct list_head *rt_runq(const struct scheduler *ops) -{ - return &rt_priv(ops)->runq; -} - -static inline struct list_head *rt_depletedq(const struct scheduler *ops) -{ - return &rt_priv(ops)->depletedq; -} - -static inline struct list_head *rt_replq(const struct scheduler *ops) -{ - return &rt_priv(ops)->replq; -} - -static inline bool has_extratime(const struct rt_unit *svc) -{ - return svc->flags & RTDS_extratime; -} - -/* - * Helper functions for manipulating the runqueue, the depleted queue, - * and the replenishment events queue. - */ -static int -unit_on_q(const struct rt_unit *svc) -{ - return !list_empty(&svc->q_elem); -} - -static struct rt_unit * -q_elem(struct list_head *elem) -{ - return list_entry(elem, struct rt_unit, q_elem); -} - -static struct rt_unit * -replq_elem(struct list_head *elem) -{ - return list_entry(elem, struct rt_unit, replq_elem); -} - -static int -unit_on_replq(const struct rt_unit *svc) -{ - return !list_empty(&svc->replq_elem); -} - -/* - * If v1 priority >= v2 priority, return value > 0 - * Otherwise, return value < 0 - */ -static s_time_t -compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2) -{ - int prio = v2->priority_level - v1->priority_level; - - if ( prio == 0 ) - return v2->cur_deadline - v1->cur_deadline; - - return prio; -} - -/* - * Debug related code, dump unit/cpu information - */ -static void -rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc) -{ - cpumask_t *cpupool_mask, *mask; - - ASSERT(svc != NULL); - /* idle unit */ - if( svc->sdom == NULL ) - { - printk("\n"); - return; - } - - /* - * We can't just use 'cpumask_scratch' because the dumping can - * happen from a pCPU outside of this scheduler's cpupool, and - * hence it's not right to use its pCPU's scratch mask. - * On the other hand, it is safe to use sched_unit_master(svc->unit)'s - * own scratch space, since we hold the runqueue lock. 
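compare_unit_priority() above encodes the EDF ordering: a lower priority_level always wins, and ties are broken by the earlier current deadline, with the sign of the result telling the caller which unit is more urgent. The same rule in isolation, with invented names:

    /* Sketch of the EDF ordering used by compare_unit_priority() above.
     * Standalone illustration; types and names are not the Xen ones. */
    #include <stdint.h>

    struct edf_key {
        unsigned int priority_level;   /* 0 = real-time, higher = extratime */
        int64_t deadline;
    };

    /* > 0 when a is more urgent than b, < 0 when less urgent, 0 on a tie. */
    static int64_t edf_compare(const struct edf_key *a, const struct edf_key *b)
    {
        if ( a->priority_level != b->priority_level )
            return (int64_t)b->priority_level - (int64_t)a->priority_level;
        return b->deadline - a->deadline;    /* earlier deadline wins       */
    }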
- */ - mask = cpumask_scratch_cpu(sched_unit_master(svc->unit)); - - cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain); - cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity); - printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime")," - " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n" - " \t\t priority_level=%d has_extratime=%d\n" - " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n", - svc->unit->domain->domain_id, - svc->unit->unit_id, - sched_unit_master(svc->unit), - svc->period, - svc->budget, - svc->cur_budget, - svc->cur_deadline, - svc->last_start, - svc->priority_level, - has_extratime(svc), - unit_on_q(svc), - unit_runnable(svc->unit), - svc->flags, CPUMASK_PR(mask)); -} - -static void -rt_dump_pcpu(const struct scheduler *ops, int cpu) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - printk("CPU[%02d]\n", cpu); - /* current UNIT (nothing to say if that's the idle unit). */ - svc = rt_unit(curr_on_cpu(cpu)); - if ( svc && !is_idle_unit(svc->unit) ) - { - rt_dump_unit(ops, svc); - } - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void -rt_dump(const struct scheduler *ops) -{ - struct list_head *runq, *depletedq, *replq, *iter; - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - struct rt_dom *sdom; - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - - if ( list_empty(&prv->sdom) ) - goto out; - - runq = rt_runq(ops); - depletedq = rt_depletedq(ops); - replq = rt_replq(ops); - - printk("Global RunQueue info:\n"); - list_for_each ( iter, runq ) - { - svc = q_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Global DepletedQueue info:\n"); - list_for_each ( iter, depletedq ) - { - svc = q_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Global Replenishment Events info:\n"); - list_for_each ( iter, replq ) - { - svc = replq_elem(iter); - rt_dump_unit(ops, svc); - } - - printk("Domain info:\n"); - list_for_each ( iter, &prv->sdom ) - { - struct sched_unit *unit; - - sdom = list_entry(iter, struct rt_dom, sdom_elem); - printk("\tdomain: %d\n", sdom->dom->domain_id); - - for_each_sched_unit ( sdom->dom, unit ) - { - svc = rt_unit(unit); - rt_dump_unit(ops, svc); - } - } - - out: - spin_unlock_irqrestore(&prv->lock, flags); -} - -/* - * update deadline and budget when now >= cur_deadline - * it needs to be updated to the deadline of the current period - */ -static void -rt_update_deadline(s_time_t now, struct rt_unit *svc) -{ - ASSERT(now >= svc->cur_deadline); - ASSERT(svc->period != 0); - - if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now ) - { - do - svc->cur_deadline += svc->period; - while ( svc->cur_deadline <= now ); - } - else - { - long count = ((now - svc->cur_deadline) / svc->period) + 1; - svc->cur_deadline += count * svc->period; - } - - /* - * svc may be scheduled to run immediately after it misses deadline - * Then rt_update_deadline is called before rt_schedule, which - * should only deduct the time spent in current period from the budget - */ - svc->last_start = now; - svc->cur_budget = svc->budget; - svc->priority_level = 0; - - /* TRACE */ - { - struct __packed { - unsigned unit:16, dom:16; - unsigned priority_level; - uint64_t cur_deadline, cur_budget; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.priority_level = svc->priority_level; - d.cur_deadline = (uint64_t) svc->cur_deadline; - d.cur_budget = (uint64_t) svc->cur_budget; 
- trace_var(TRC_RTDS_BUDGET_REPLENISH, 1, - sizeof(d), - (unsigned char *) &d); - } - - return; -} - -/* - * Helpers for removing and inserting an unit in a queue - * that is being kept ordered by the units' deadlines (as EDF - * mandates). - * - * For callers' convenience, the unit removing helper returns - * true if the unit removed was the one at the front of the - * queue; similarly, the inserting helper returns true if the - * inserted ended at the front of the queue (i.e., in both - * cases, if the unit with the earliest deadline is what we - * are dealing with). - */ -static inline bool -deadline_queue_remove(struct list_head *queue, struct list_head *elem) -{ - int pos = 0; - - if ( queue->next != elem ) - pos = 1; - - list_del_init(elem); - return !pos; -} - -static inline bool -deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *), - struct rt_unit *svc, struct list_head *elem, - struct list_head *queue) -{ - struct list_head *iter; - int pos = 0; - - list_for_each ( iter, queue ) - { - struct rt_unit * iter_svc = (*qelem)(iter); - if ( compare_unit_priority(svc, iter_svc) > 0 ) - break; - pos++; - } - list_add_tail(elem, iter); - return !pos; -} -#define deadline_runq_insert(...) \ - deadline_queue_insert(&q_elem, ##__VA_ARGS__) -#define deadline_replq_insert(...) \ - deadline_queue_insert(&replq_elem, ##__VA_ARGS__) - -static inline void -q_remove(struct rt_unit *svc) -{ - ASSERT( unit_on_q(svc) ); - list_del_init(&svc->q_elem); -} - -static inline void -replq_remove(const struct scheduler *ops, struct rt_unit *svc) -{ - struct rt_private *prv = rt_priv(ops); - struct list_head *replq = rt_replq(ops); - - ASSERT( unit_on_replq(svc) ); - - if ( deadline_queue_remove(replq, &svc->replq_elem) ) - { - /* - * The replenishment timer needs to be set to fire when a - * replenishment for the unit at the front of the replenishment - * queue is due. If it is such unit that we just removed, we may - * need to reprogram the timer. - */ - if ( !list_empty(replq) ) - { - struct rt_unit *svc_next = replq_elem(replq->next); - set_timer(&prv->repl_timer, svc_next->cur_deadline); - } - else - stop_timer(&prv->repl_timer); - } -} - -/* - * Insert svc with budget in RunQ according to EDF: - * units with smaller deadlines go first. - * Insert svc without budget in DepletedQ unsorted; - */ -static void -runq_insert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct rt_private *prv = rt_priv(ops); - struct list_head *runq = rt_runq(ops); - - ASSERT( spin_is_locked(&prv->lock) ); - ASSERT( !unit_on_q(svc) ); - ASSERT( unit_on_replq(svc) ); - - /* add svc to runq if svc still has budget or its extratime is set */ - if ( svc->cur_budget > 0 || - has_extratime(svc) ) - deadline_runq_insert(svc, &svc->q_elem, runq); - else - list_add(&svc->q_elem, &prv->depletedq); -} - -static void -replq_insert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct list_head *replq = rt_replq(ops); - struct rt_private *prv = rt_priv(ops); - - ASSERT( !unit_on_replq(svc) ); - - /* - * The timer may be re-programmed if svc is inserted - * at the front of the event list. - */ - if ( deadline_replq_insert(svc, &svc->replq_elem, replq) ) - set_timer(&prv->repl_timer, svc->cur_deadline); -} - -/* - * Removes and re-inserts an event to the replenishment queue. - * The aim is to update its position inside the queue, as its - * deadline (and hence its replenishment time) could have - * changed. 
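The deadline_queue_remove()/deadline_queue_insert() helpers above deliberately report whether the front of the queue changed, because that is the only case in which the replenishment timer needs to be reprogrammed. A self-contained sketch of that convention on a singly linked list (names invented for illustration):

    /* Sketch: deadline-ordered insert that tells the caller whether the
     * earliest deadline changed, i.e. whether a one-shot timer must be
     * rearmed. Not the Xen list_head code. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stddef.h>

    struct dl_item {
        int64_t deadline;
        struct dl_item *next;
    };

    static bool dl_insert(struct dl_item **head, struct dl_item *it)
    {
        struct dl_item **pp = head;

        while ( *pp && (*pp)->deadline <= it->deadline )
            pp = &(*pp)->next;             /* keep FIFO order on ties      */
        it->next = *pp;
        *pp = it;
        return pp == head;                 /* new head => rearm the timer  */
    }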
- */ -static void -replq_reinsert(const struct scheduler *ops, struct rt_unit *svc) -{ - struct list_head *replq = rt_replq(ops); - struct rt_unit *rearm_svc = svc; - bool_t rearm = 0; - - ASSERT( unit_on_replq(svc) ); - - /* - * If svc was at the front of the replenishment queue, we certainly - * need to re-program the timer, and we want to use the deadline of - * the unit which is now at the front of the queue (which may still - * be svc or not). - * - * We may also need to re-program, if svc has been put at the front - * of the replenishment queue when being re-inserted. - */ - if ( deadline_queue_remove(replq, &svc->replq_elem) ) - { - deadline_replq_insert(svc, &svc->replq_elem, replq); - rearm_svc = replq_elem(replq->next); - rearm = 1; - } - else - rearm = deadline_replq_insert(svc, &svc->replq_elem, replq); - - if ( rearm ) - set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline); -} - -/* - * Pick a valid resource for the unit vc - * Valid resource of an unit is intesection of unit's affinity - * and available resources - */ -static struct sched_resource * -rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - cpumask_t cpus; - cpumask_t *online; - int cpu; - - online = cpupool_domain_master_cpumask(unit->domain); - cpumask_and(&cpus, online, unit->cpu_hard_affinity); - - cpu = cpumask_test_cpu(sched_unit_master(unit), &cpus) - ? sched_unit_master(unit) - : cpumask_cycle(sched_unit_master(unit), &cpus); - ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); - - return get_sched_res(cpu); -} - -/* - * Init/Free related code - */ -static int -rt_init(struct scheduler *ops) -{ - int rc = -ENOMEM; - struct rt_private *prv = xzalloc(struct rt_private); - - printk("Initializing RTDS scheduler\n" - "WARNING: This is experimental software in development.\n" - "Use at your own risk.\n"); - - if ( prv == NULL ) - goto err; - - spin_lock_init(&prv->lock); - INIT_LIST_HEAD(&prv->sdom); - INIT_LIST_HEAD(&prv->runq); - INIT_LIST_HEAD(&prv->depletedq); - INIT_LIST_HEAD(&prv->replq); - - ops->sched_data = prv; - rc = 0; - - err: - if ( rc ) - xfree(prv); - - return rc; -} - -static void -rt_deinit(struct scheduler *ops) -{ - struct rt_private *prv = rt_priv(ops); - - ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid || - prv->repl_timer.status == TIMER_STATUS_killed); - - ops->sched_data = NULL; - xfree(prv); -} - -/* - * Point per_cpu spinlock to the global system lock; - * All cpu have same global system lock - */ -static void -rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu) -{ - struct rt_private *prv = rt_priv(ops); - spinlock_t *old_lock; - unsigned long flags; - - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - /* - * TIMER_STATUS_invalid means we are the first cpu that sees the timer - * allocated but not initialized, and so it's up to us to initialize it. - */ - if ( prv->repl_timer.status == TIMER_STATUS_invalid ) - { - init_timer(&prv->repl_timer, repl_timer_handler, (void *)ops, cpu); - dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); - } - - /* Move the scheduler lock to our global runqueue lock. */ - get_sched_res(cpu)->schedule_lock = &prv->lock; - - /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */ - spin_unlock_irqrestore(old_lock, flags); -} - -/* Change the scheduler of cpu to us (RTDS). 
*/ -static spinlock_t * -rt_switch_sched(struct scheduler *new_ops, unsigned int cpu, - void *pdata, void *vdata) -{ - struct rt_private *prv = rt_priv(new_ops); - struct rt_unit *svc = vdata; - - ASSERT(!pdata && svc && is_idle_unit(svc->unit)); - - /* - * We are holding the runqueue lock already (it's been taken in - * schedule_cpu_switch()). It's actually the runqueue lock of - * another scheduler, but that is how things need to be, for - * preventing races. - */ - ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock); - - /* - * If we are the absolute first cpu being switched toward this - * scheduler (in which case we'll see TIMER_STATUS_invalid), or the - * first one that is added back to the cpupool that had all its cpus - * removed (in which case we'll see TIMER_STATUS_killed), it's our - * job to (re)initialize the timer. - */ - if ( prv->repl_timer.status == TIMER_STATUS_invalid || - prv->repl_timer.status == TIMER_STATUS_killed ) - { - init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu); - dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu); - } - - sched_idle_unit(cpu)->priv = vdata; - - return &prv->lock; -} - -static void -rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu) -{ - unsigned long flags; - struct rt_private *prv = rt_priv(ops); - - spin_lock_irqsave(&prv->lock, flags); - - if ( prv->repl_timer.cpu == cpu ) - { - cpumask_t *online = get_sched_res(cpu)->cpupool->res_valid; - unsigned int new_cpu = cpumask_cycle(cpu, online); - - /* - * Make sure the timer run on one of the cpus that are still available - * to this scheduler. If there aren't any left, it means it's the time - * to just kill it. - */ - if ( new_cpu >= nr_cpu_ids ) - { - kill_timer(&prv->repl_timer); - dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu); - } - else - { - migrate_timer(&prv->repl_timer, new_cpu); - } - } - - spin_unlock_irqrestore(&prv->lock, flags); -} - -static void * -rt_alloc_domdata(const struct scheduler *ops, struct domain *dom) -{ - unsigned long flags; - struct rt_dom *sdom; - struct rt_private * prv = rt_priv(ops); - - sdom = xzalloc(struct rt_dom); - if ( sdom == NULL ) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&sdom->sdom_elem); - sdom->dom = dom; - - /* spinlock here to insert the dom */ - spin_lock_irqsave(&prv->lock, flags); - list_add_tail(&sdom->sdom_elem, &(prv->sdom)); - spin_unlock_irqrestore(&prv->lock, flags); - - return sdom; -} - -static void -rt_free_domdata(const struct scheduler *ops, void *data) -{ - struct rt_dom *sdom = data; - struct rt_private *prv = rt_priv(ops); - - if ( sdom ) - { - unsigned long flags; - - spin_lock_irqsave(&prv->lock, flags); - list_del_init(&sdom->sdom_elem); - spin_unlock_irqrestore(&prv->lock, flags); - - xfree(sdom); - } -} - -static void * -rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd) -{ - struct rt_unit *svc; - - /* Allocate per-UNIT info */ - svc = xzalloc(struct rt_unit); - if ( svc == NULL ) - return NULL; - - INIT_LIST_HEAD(&svc->q_elem); - INIT_LIST_HEAD(&svc->replq_elem); - svc->flags = 0U; - svc->sdom = dd; - svc->unit = unit; - svc->last_start = 0; - - __set_bit(__RTDS_extratime, &svc->flags); - svc->priority_level = 0; - svc->period = RTDS_DEFAULT_PERIOD; - if ( !is_idle_unit(unit) ) - svc->budget = RTDS_DEFAULT_BUDGET; - - SCHED_STAT_CRANK(unit_alloc); - - return svc; -} - -static void -rt_free_udata(const struct scheduler *ops, void *priv) -{ - struct rt_unit *svc = priv; - - xfree(svc); -} - -/* - * It is called in 
sched_move_domain() and sched_init_vcpu - * in schedule.c. - * When move a domain to a new cpupool. - * It inserts units of moving domain to the scheduler's RunQ in - * dest. cpupool. - */ -static void -rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit *svc = rt_unit(unit); - s_time_t now; - spinlock_t *lock; - - BUG_ON( is_idle_unit(unit) ); - - /* This is safe because unit isn't yet being scheduled */ - sched_set_res(unit, rt_res_pick(ops, unit)); - - lock = unit_schedule_lock_irq(unit); - - now = NOW(); - if ( now >= svc->cur_deadline ) - rt_update_deadline(now, svc); - - if ( !unit_on_q(svc) && unit_runnable(unit) ) - { - replq_insert(ops, svc); - - if ( !unit->is_running ) - runq_insert(ops, svc); - } - unit_schedule_unlock_irq(lock, unit); - - SCHED_STAT_CRANK(unit_insert); -} - -/* - * Remove rt_unit svc from the old scheduler in source cpupool. - */ -static void -rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - struct rt_dom * const sdom = svc->sdom; - spinlock_t *lock; - - SCHED_STAT_CRANK(unit_remove); - - BUG_ON( sdom == NULL ); - - lock = unit_schedule_lock_irq(unit); - if ( unit_on_q(svc) ) - q_remove(svc); - - if ( unit_on_replq(svc) ) - replq_remove(ops,svc); - - unit_schedule_unlock_irq(lock, unit); -} - -/* - * Burn budget in nanosecond granularity - */ -static void -burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now) -{ - s_time_t delta; - - /* don't burn budget for idle UNIT */ - if ( is_idle_unit(svc->unit) ) - return; - - /* burn at nanoseconds level */ - delta = now - svc->last_start; - /* - * delta < 0 only happens in nested virtualization; - * TODO: how should we handle delta < 0 in a better way? - */ - if ( delta < 0 ) - { - printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n", - __func__, delta); - svc->last_start = now; - return; - } - - svc->cur_budget -= delta; - svc->last_start = now; - - if ( svc->cur_budget <= 0 ) - { - if ( has_extratime(svc) ) - { - svc->priority_level++; - svc->cur_budget = svc->budget; - } - else - { - svc->cur_budget = 0; - __set_bit(__RTDS_depleted, &svc->flags); - } - } - - /* TRACE */ - { - struct __packed { - unsigned unit:16, dom:16; - uint64_t cur_budget; - int delta; - unsigned priority_level; - bool has_extratime; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.cur_budget = (uint64_t) svc->cur_budget; - d.delta = delta; - d.priority_level = svc->priority_level; - d.has_extratime = svc->flags & RTDS_extratime; - trace_var(TRC_RTDS_BUDGET_BURN, 1, - sizeof(d), - (unsigned char *) &d); - } -} - -/* - * RunQ is sorted. Pick first one within cpumask. 
If no one, return NULL - * lock is grabbed before calling this function - */ -static struct rt_unit * -runq_pick(const struct scheduler *ops, const cpumask_t *mask) -{ - struct list_head *runq = rt_runq(ops); - struct list_head *iter; - struct rt_unit *svc = NULL; - struct rt_unit *iter_svc = NULL; - cpumask_t cpu_common; - cpumask_t *online; - - list_for_each ( iter, runq ) - { - iter_svc = q_elem(iter); - - /* mask cpu_hard_affinity & cpupool & mask */ - online = cpupool_domain_master_cpumask(iter_svc->unit->domain); - cpumask_and(&cpu_common, online, iter_svc->unit->cpu_hard_affinity); - cpumask_and(&cpu_common, mask, &cpu_common); - if ( cpumask_empty(&cpu_common) ) - continue; - - ASSERT( iter_svc->cur_budget > 0 ); - - svc = iter_svc; - break; - } - - /* TRACE */ - { - if( svc != NULL ) - { - struct __packed { - unsigned unit:16, dom:16; - uint64_t cur_deadline, cur_budget; - } d; - d.dom = svc->unit->domain->domain_id; - d.unit = svc->unit->unit_id; - d.cur_deadline = (uint64_t) svc->cur_deadline; - d.cur_budget = (uint64_t) svc->cur_budget; - trace_var(TRC_RTDS_RUNQ_PICK, 1, - sizeof(d), - (unsigned char *) &d); - } - } - - return svc; -} - -/* - * schedule function for rt scheduler. - * The lock is already grabbed in schedule.c, no need to lock here - */ -static void -rt_schedule(const struct scheduler *ops, struct sched_unit *currunit, - s_time_t now, bool tasklet_work_scheduled) -{ - const unsigned int cur_cpu = smp_processor_id(); - const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu); - struct rt_private *prv = rt_priv(ops); - struct rt_unit *const scurr = rt_unit(currunit); - struct rt_unit *snext = NULL; - bool migrated = false; - - /* TRACE */ - { - struct __packed { - unsigned cpu:16, tasklet:8, tickled:4, idle:4; - } d; - d.cpu = cur_cpu; - d.tasklet = tasklet_work_scheduled; - d.tickled = cpumask_test_cpu(sched_cpu, &prv->tickled); - d.idle = is_idle_unit(currunit); - trace_var(TRC_RTDS_SCHEDULE, 1, - sizeof(d), - (unsigned char *)&d); - } - - /* clear ticked bit now that we've been scheduled */ - cpumask_clear_cpu(sched_cpu, &prv->tickled); - - /* burn_budget would return for IDLE UNIT */ - burn_budget(ops, scurr, now); - - if ( tasklet_work_scheduled ) - { - trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - else - { - snext = runq_pick(ops, cpumask_of(sched_cpu)); - - if ( snext == NULL ) - snext = rt_unit(sched_idle_unit(sched_cpu)); - else if ( !unit_runnable_state(snext->unit) ) - { - q_remove(snext); - snext = rt_unit(sched_idle_unit(sched_cpu)); - } - - /* if scurr has higher priority and budget, still pick scurr */ - if ( !is_idle_unit(currunit) && - unit_runnable_state(currunit) && - scurr->cur_budget > 0 && - ( is_idle_unit(snext->unit) || - compare_unit_priority(scurr, snext) > 0 ) ) - snext = scurr; - } - - if ( snext != scurr && - !is_idle_unit(currunit) && - unit_runnable(currunit) ) - __set_bit(__RTDS_delayed_runq_add, &scurr->flags); - - snext->last_start = now; - currunit->next_time = -1; /* if an idle unit is picked */ - if ( !is_idle_unit(snext->unit) ) - { - if ( snext != scurr ) - { - q_remove(snext); - __set_bit(__RTDS_scheduled, &snext->flags); - } - if ( sched_unit_master(snext->unit) != sched_cpu ) - { - sched_set_res(snext->unit, get_sched_res(sched_cpu)); - migrated = true; - } - /* Invoke the scheduler next time. 
*/ - currunit->next_time = snext->cur_budget; - } - currunit->next_task = snext->unit; - snext->unit->migrated = migrated; -} - -/* - * Remove UNIT from RunQ - * The lock is already grabbed in schedule.c, no need to lock here - */ -static void -rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - - BUG_ON( is_idle_unit(unit) ); - SCHED_STAT_CRANK(unit_sleep); - - if ( curr_on_cpu(sched_unit_master(unit)) == unit ) - cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ); - else if ( unit_on_q(svc) ) - { - q_remove(svc); - replq_remove(ops, svc); - } - else if ( svc->flags & RTDS_delayed_runq_add ) - __clear_bit(__RTDS_delayed_runq_add, &svc->flags); -} - -/* - * Pick a cpu where to run an unit, - * possibly kicking out the unit running there - * Called by wake() and context_saved() - * We have a running candidate here, the kick logic is: - * Among all the cpus that are within the cpu affinity - * 1) if there are any idle CPUs, kick one. - For cache benefit, we check new->cpu as first - * 2) now all pcpus are busy; - * among all the running units, pick lowest priority one - * if snext has higher priority, kick it. - * - * TODO: - * 1) what if these two units belongs to the same domain? - * replace an unit belonging to the same domain introduces more overhead - * - * lock is grabbed before calling this function - */ -static void -runq_tickle(const struct scheduler *ops, struct rt_unit *new) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */ - struct rt_unit *iter_svc; - struct sched_unit *iter_unit; - int cpu = 0, cpu_to_tickle = 0; - cpumask_t not_tickled; - cpumask_t *online; - - if ( new == NULL || is_idle_unit(new->unit) ) - return; - - online = cpupool_domain_master_cpumask(new->unit->domain); - cpumask_and(¬_tickled, online, new->unit->cpu_hard_affinity); - cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled); - - /* - * 1) If there are any idle CPUs, kick one. - * For cache benefit,we first search new->cpu. - * The same loop also find the one with lowest priority. - */ - cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), ¬_tickled); - while ( cpu!= nr_cpu_ids ) - { - iter_unit = curr_on_cpu(cpu); - if ( is_idle_unit(iter_unit) ) - { - SCHED_STAT_CRANK(tickled_idle_cpu); - cpu_to_tickle = cpu; - goto out; - } - iter_svc = rt_unit(iter_unit); - if ( latest_deadline_unit == NULL || - compare_unit_priority(iter_svc, latest_deadline_unit) < 0 ) - latest_deadline_unit = iter_svc; - - cpumask_clear_cpu(cpu, ¬_tickled); - cpu = cpumask_cycle(cpu, ¬_tickled); - } - - /* 2) candicate has higher priority, kick out lowest priority unit */ - if ( latest_deadline_unit != NULL && - compare_unit_priority(latest_deadline_unit, new) < 0 ) - { - SCHED_STAT_CRANK(tickled_busy_cpu); - cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit); - goto out; - } - - /* didn't tickle any cpu */ - SCHED_STAT_CRANK(tickled_no_cpu); - return; - out: - /* TRACE */ - { - struct { - unsigned cpu:16, pad:16; - } d; - d.cpu = cpu_to_tickle; - d.pad = 0; - trace_var(TRC_RTDS_TICKLE, 1, - sizeof(d), - (unsigned char *)&d); - } - - cpumask_set_cpu(cpu_to_tickle, &prv->tickled); - cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ); - return; -} - -/* - * Should always wake up runnable unit, put it back to RunQ. - * Check priority to raise interrupt - * The lock is already grabbed in schedule.c, no need to lock here - * TODO: what if these two units belongs to the same domain? 
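runq_tickle() above implements a two-step kick policy: first look for an idle, not-yet-tickled pCPU within the unit's affinity (starting from the unit's own pCPU for cache locality), and only if all of them are busy preempt the pCPU running the least urgent unit, provided the waking unit beats it. The same decision in reduced form, with invented types:

    /* Sketch of the tickle decision described above. 'running_prio' is a
     * stand-in for the EDF urgency of whatever a pCPU currently runs;
     * every name here is illustrative. */
    #include <stdbool.h>
    #include <stdint.h>

    struct pcpu_view {
        bool idle;
        int64_t running_prio;      /* larger == more urgent                */
    };

    /* Returns the pCPU to kick, or -1 if nothing should be preempted. */
    static int pick_cpu_to_tickle(const struct pcpu_view *cpus, int nr_cpus,
                                  int preferred, int64_t new_prio)
    {
        int lowest = -1;

        for ( int i = 0; i < nr_cpus; i++ )
        {
            int cpu = (preferred + i) % nr_cpus;   /* own pCPU first       */

            if ( cpus[cpu].idle )
                return cpu;                        /* idle pCPU: done      */
            if ( lowest < 0 ||
                 cpus[cpu].running_prio < cpus[lowest].running_prio )
                lowest = cpu;
        }

        /* Everybody is busy: preempt the least urgent pCPU, if we beat it. */
        return ( lowest >= 0 && new_prio > cpus[lowest].running_prio )
               ? lowest : -1;
    }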
- */ -static void -rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit * const svc = rt_unit(unit); - s_time_t now; - bool_t missed; - - BUG_ON( is_idle_unit(unit) ); - - if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) ) - { - SCHED_STAT_CRANK(unit_wake_running); - return; - } - - /* on RunQ/DepletedQ, just update info is ok */ - if ( unlikely(unit_on_q(svc)) ) - { - SCHED_STAT_CRANK(unit_wake_onrunq); - return; - } - - if ( likely(unit_runnable(unit)) ) - SCHED_STAT_CRANK(unit_wake_runnable); - else - SCHED_STAT_CRANK(unit_wake_not_runnable); - - /* - * If a deadline passed while svc was asleep/blocked, we need new - * scheduling parameters (a new deadline and full budget). - */ - now = NOW(); - - missed = ( now >= svc->cur_deadline ); - if ( missed ) - rt_update_deadline(now, svc); - - /* - * If context hasn't been saved for this unit yet, we can't put it on - * the run-queue/depleted-queue. Instead, we set the appropriate flag, - * the unit will be put back on queue after the context has been saved - * (in rt_context_save()). - */ - if ( unlikely(svc->flags & RTDS_scheduled) ) - { - __set_bit(__RTDS_delayed_runq_add, &svc->flags); - /* - * The unit is waking up already, and we didn't even had the time to - * remove its next replenishment event from the replenishment queue - * when it blocked! No big deal. If we did not miss the deadline in - * the meantime, let's just leave it there. If we did, let's remove it - * and queue a new one (to occur at our new deadline). - */ - if ( missed ) - replq_reinsert(ops, svc); - return; - } - - /* Replenishment event got cancelled when we blocked. Add it back. */ - replq_insert(ops, svc); - /* insert svc to runq/depletedq because svc is not in queue now */ - runq_insert(ops, svc); - - runq_tickle(ops, svc); -} - -/* - * scurr has finished context switch, insert it back to the RunQ, - * and then pick the highest priority unit from runq to run - */ -static void -rt_context_saved(const struct scheduler *ops, struct sched_unit *unit) -{ - struct rt_unit *svc = rt_unit(unit); - spinlock_t *lock = unit_schedule_lock_irq(unit); - - __clear_bit(__RTDS_scheduled, &svc->flags); - /* not insert idle unit to runq */ - if ( is_idle_unit(unit) ) - goto out; - - if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && - likely(unit_runnable(unit)) ) - { - runq_insert(ops, svc); - runq_tickle(ops, svc); - } - else - replq_remove(ops, svc); - -out: - unit_schedule_unlock_irq(lock, unit); -} - -/* - * set/get each unit info of each domain - */ -static int -rt_dom_cntl( - const struct scheduler *ops, - struct domain *d, - struct xen_domctl_scheduler_op *op) -{ - struct rt_private *prv = rt_priv(ops); - struct rt_unit *svc; - struct sched_unit *unit; - unsigned long flags; - int rc = 0; - struct xen_domctl_schedparam_vcpu local_sched; - s_time_t period, budget; - uint32_t index = 0; - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_getinfo: - /* Return the default parameters. 
*/ - op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1); - op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1); - break; - case XEN_DOMCTL_SCHEDOP_putinfo: - if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 ) - { - rc = -EINVAL; - break; - } - spin_lock_irqsave(&prv->lock, flags); - for_each_sched_unit ( d, unit ) - { - svc = rt_unit(unit); - svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */ - svc->budget = MICROSECS(op->u.rtds.budget); - } - spin_unlock_irqrestore(&prv->lock, flags); - break; - case XEN_DOMCTL_SCHEDOP_getvcpuinfo: - case XEN_DOMCTL_SCHEDOP_putvcpuinfo: - while ( index < op->u.v.nr_vcpus ) - { - if ( copy_from_guest_offset(&local_sched, - op->u.v.vcpus, index, 1) ) - { - rc = -EFAULT; - break; - } - if ( local_sched.vcpuid >= d->max_vcpus || - d->vcpu[local_sched.vcpuid] == NULL ) - { - rc = -EINVAL; - break; - } - - if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo ) - { - spin_lock_irqsave(&prv->lock, flags); - svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); - local_sched.u.rtds.budget = svc->budget / MICROSECS(1); - local_sched.u.rtds.period = svc->period / MICROSECS(1); - if ( has_extratime(svc) ) - local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra; - else - local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra; - spin_unlock_irqrestore(&prv->lock, flags); - - if ( copy_to_guest_offset(op->u.v.vcpus, index, - &local_sched, 1) ) - { - rc = -EFAULT; - break; - } - } - else - { - period = MICROSECS(local_sched.u.rtds.period); - budget = MICROSECS(local_sched.u.rtds.budget); - if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET || - budget > period || period < RTDS_MIN_PERIOD ) - { - rc = -EINVAL; - break; - } - - spin_lock_irqsave(&prv->lock, flags); - svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit); - svc->period = period; - svc->budget = budget; - if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra ) - __set_bit(__RTDS_extratime, &svc->flags); - else - __clear_bit(__RTDS_extratime, &svc->flags); - spin_unlock_irqrestore(&prv->lock, flags); - } - /* Process a most 64 vCPUs without checking for preemptions. */ - if ( (++index > 63) && hypercall_preempt_check() ) - break; - } - if ( !rc ) - /* notify upper caller how many units have been processed. */ - op->u.v.nr_vcpus = index; - break; - } - - return rc; -} - -/* - * The replenishment timer handler picks units - * from the replq and does the actual replenishment. - */ -static void repl_timer_handler(void *data){ - s_time_t now; - struct scheduler *ops = data; - struct rt_private *prv = rt_priv(ops); - struct list_head *replq = rt_replq(ops); - struct list_head *runq = rt_runq(ops); - struct list_head *iter, *tmp; - struct rt_unit *svc; - LIST_HEAD(tmp_replq); - - spin_lock_irq(&prv->lock); - - now = NOW(); - - /* - * Do the replenishment and move replenished units - * to the temporary list to tickle. - * If svc is on run queue, we need to put it at - * the correct place since its deadline changes. - */ - list_for_each_safe ( iter, tmp, replq ) - { - svc = replq_elem(iter); - - if ( now < svc->cur_deadline ) - break; - - list_del(&svc->replq_elem); - rt_update_deadline(now, svc); - list_add(&svc->replq_elem, &tmp_replq); - - if ( unit_on_q(svc) ) - { - q_remove(svc); - runq_insert(ops, svc); - } - } - - /* - * Iterate through the list of updated units. - * If an updated unit is running, tickle the head of the - * runqueue if it has a higher priority. - * If an updated unit was depleted and on the runqueue, tickle it. 
- * Finally, reinsert the units back to replenishement events list. - */ - list_for_each_safe ( iter, tmp, &tmp_replq ) - { - svc = replq_elem(iter); - - if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit && - !list_empty(runq) ) - { - struct rt_unit *next_on_runq = q_elem(runq->next); - - if ( compare_unit_priority(svc, next_on_runq) < 0 ) - runq_tickle(ops, next_on_runq); - } - else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) && - unit_on_q(svc) ) - runq_tickle(ops, svc); - - list_del(&svc->replq_elem); - deadline_replq_insert(svc, &svc->replq_elem, replq); - } - - /* - * If there are units left in the replenishment event list, - * set the next replenishment to happen at the deadline of - * the one in the front. - */ - if ( !list_empty(replq) ) - set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline); - - spin_unlock_irq(&prv->lock); -} - -static const struct scheduler sched_rtds_def = { - .name = "SMP RTDS Scheduler", - .opt_name = "rtds", - .sched_id = XEN_SCHEDULER_RTDS, - .sched_data = NULL, - - .dump_cpu_state = rt_dump_pcpu, - .dump_settings = rt_dump, - .init = rt_init, - .deinit = rt_deinit, - .init_pdata = rt_init_pdata, - .switch_sched = rt_switch_sched, - .deinit_pdata = rt_deinit_pdata, - .alloc_domdata = rt_alloc_domdata, - .free_domdata = rt_free_domdata, - .alloc_udata = rt_alloc_udata, - .free_udata = rt_free_udata, - .insert_unit = rt_unit_insert, - .remove_unit = rt_unit_remove, - - .adjust = rt_dom_cntl, - - .pick_resource = rt_res_pick, - .do_schedule = rt_schedule, - .sleep = rt_unit_sleep, - .wake = rt_unit_wake, - .context_saved = rt_context_saved, -}; - -REGISTER_SCHEDULER(sched_rtds_def); diff --git a/xen/common/schedule.c b/xen/common/schedule.c deleted file mode 100644 index 54a07ff9e8..0000000000 --- a/xen/common/schedule.c +++ /dev/null @@ -1,3144 +0,0 @@ -/**************************************************************************** - * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge - * (C) 2002-2003 University of Cambridge - * (C) 2004 - Mark Williamson - Intel Research Cambridge - **************************************************************************** - * - * File: common/schedule.c - * Author: Rolf Neugebauer & Keir Fraser - * Updated for generic API by Mark Williamson - * - * Description: Generic CPU scheduling code - * implements support functionality for the Xen scheduler API. - * - */ - -#ifndef COMPAT -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_XEN_GUEST -#include -#else -#define pv_shim false -#endif - -/* opt_sched: scheduler - default to configured value */ -static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT; -string_param("sched", opt_sched); - -/* if sched_smt_power_savings is set, - * scheduler will give preferrence to partially idle package compared to - * the full idle package, when picking pCPU to schedule vCPU. - */ -bool_t sched_smt_power_savings = 0; -boolean_param("sched_smt_power_savings", sched_smt_power_savings); - -/* Default scheduling rate limit: 1ms - * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined - * */ -int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; -integer_param("sched_ratelimit_us", sched_ratelimit_us); - -/* Number of vcpus per struct sched_unit. 
*/ -bool __read_mostly sched_disable_smt_switching; -cpumask_t sched_res_mask; - -/* Common lock for free cpus. */ -static DEFINE_SPINLOCK(sched_free_cpu_lock); - -/* Various timer handlers. */ -static void s_timer_fn(void *unused); -static void vcpu_periodic_timer_fn(void *data); -static void vcpu_singleshot_timer_fn(void *data); -static void poll_timer_fn(void *data); - -/* This is global for now so that private implementations can reach it */ -DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res); -static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx); -DEFINE_RCU_READ_LOCK(sched_res_rculock); - -/* Scratch space for cpumasks. */ -DEFINE_PER_CPU(cpumask_t, cpumask_scratch); - -/* How many urgent vcpus. */ -DEFINE_PER_CPU(atomic_t, sched_urgent_count); - -extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[]; -#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array) -#define schedulers __start_schedulers_array - -static struct scheduler __read_mostly ops; - -static bool scheduler_active; - -static void sched_set_affinity( - struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft); - -static struct sched_resource * -sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit) -{ - return unit->res; -} - -static void * -sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, - void *dd) -{ - /* Any non-NULL pointer is fine here. */ - return ZERO_BLOCK_PTR; -} - -static void -sched_idle_free_udata(const struct scheduler *ops, void *priv) -{ -} - -static void sched_idle_schedule( - const struct scheduler *ops, struct sched_unit *unit, s_time_t now, - bool tasklet_work_scheduled) -{ - const unsigned int cpu = smp_processor_id(); - - unit->next_time = -1; - unit->next_task = sched_idle_unit(cpu); -} - -static struct scheduler sched_idle_ops = { - .name = "Idle Scheduler", - .opt_name = "idle", - .sched_data = NULL, - - .pick_resource = sched_idle_res_pick, - .do_schedule = sched_idle_schedule, - - .alloc_udata = sched_idle_alloc_udata, - .free_udata = sched_idle_free_udata, -}; - -static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit, - unsigned int cpu) -{ - unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu); - const struct domain *d = unit->domain; - - return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL; -} - -static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit, - unsigned int cpu) -{ - struct vcpu *v = unit2vcpu_cpu(unit, cpu); - - return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu]; -} - -static inline struct scheduler *dom_scheduler(const struct domain *d) -{ - if ( likely(d->cpupool != NULL) ) - return d->cpupool->sched; - - /* - * If d->cpupool is NULL, this is the idle domain. This is special - * because the idle domain does not really belong to any cpupool, and, - * hence, does not really have a scheduler. - * - * This is (should be!) only called like this for allocating the idle - * vCPUs for the first time, during boot, in which case what we want - * is the default scheduler that has been, choosen at boot. - */ - ASSERT(is_idle_domain(d)); - return &ops; -} - -static inline struct scheduler *unit_scheduler(const struct sched_unit *unit) -{ - struct domain *d = unit->domain; - - if ( likely(d->cpupool != NULL) ) - return d->cpupool->sched; - - /* - * If d->cpupool is NULL, this is a unit of the idle domain. 
And this - * case is special because the idle domain does not really belong to - * a cpupool and, hence, doesn't really have a scheduler). In fact, its - * units (may) run on pCPUs which are in different pools, with different - * schedulers. - * - * What we want, in this case, is the scheduler of the pCPU where this - * particular idle unit is running. And, since unit->res never changes - * for idle units, it is safe to use it, with no locks, to figure that out. - */ - - ASSERT(is_idle_domain(d)); - return unit->res->scheduler; -} - -static inline struct scheduler *vcpu_scheduler(const struct vcpu *v) -{ - return unit_scheduler(v->sched_unit); -} -#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain) - -static inline void trace_runstate_change(struct vcpu *v, int new_state) -{ - struct { uint32_t vcpu:16, domain:16; } d; - uint32_t event; - - if ( likely(!tb_init_done) ) - return; - - d.vcpu = v->vcpu_id; - d.domain = v->domain->domain_id; - - event = TRC_SCHED_RUNSTATE_CHANGE; - event |= ( v->runstate.state & 0x3 ) << 8; - event |= ( new_state & 0x3 ) << 4; - - __trace_var(event, 1/*tsc*/, sizeof(d), &d); -} - -static inline void trace_continue_running(struct vcpu *v) -{ - struct { uint32_t vcpu:16, domain:16; } d; - - if ( likely(!tb_init_done) ) - return; - - d.vcpu = v->vcpu_id; - d.domain = v->domain->domain_id; - - __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); -} - -static inline void vcpu_urgent_count_update(struct vcpu *v) -{ - if ( is_idle_vcpu(v) ) - return; - - if ( unlikely(v->is_urgent) ) - { - if ( !(v->pause_flags & VPF_blocked) || - !test_bit(v->vcpu_id, v->domain->poll_mask) ) - { - v->is_urgent = 0; - atomic_dec(&per_cpu(sched_urgent_count, v->processor)); - } - } - else - { - if ( unlikely(v->pause_flags & VPF_blocked) && - unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) ) - { - v->is_urgent = 1; - atomic_inc(&per_cpu(sched_urgent_count, v->processor)); - } - } -} - -static inline void vcpu_runstate_change( - struct vcpu *v, int new_state, s_time_t new_entry_time) -{ - s_time_t delta; - struct sched_unit *unit = v->sched_unit; - - ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); - if ( v->runstate.state == new_state ) - return; - - vcpu_urgent_count_update(v); - - trace_runstate_change(v, new_state); - - if ( !is_idle_vcpu(v) ) - { - unit->runstate_cnt[v->runstate.state]--; - unit->runstate_cnt[new_state]++; - } - - delta = new_entry_time - v->runstate.state_entry_time; - if ( delta > 0 ) - { - v->runstate.time[v->runstate.state] += delta; - v->runstate.state_entry_time = new_entry_time; - } - - v->runstate.state = new_state; -} - -void sched_guest_idle(void (*idle) (void), unsigned int cpu) -{ - /* - * Another vcpu of the unit is active in guest context while this one is - * idle. In case of a scheduling event we don't want to have high latencies - * due to a cpu needing to wake up from deep C state for joining the - * rendezvous, so avoid those deep C states by incrementing the urgent - * count of the cpu. - */ - atomic_inc(&per_cpu(sched_urgent_count, cpu)); - idle(); - atomic_dec(&per_cpu(sched_urgent_count, cpu)); -} - -void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) -{ - spinlock_t *lock; - s_time_t delta; - - rcu_read_lock(&sched_res_rculock); - - lock = likely(v == current) ? 
NULL : unit_schedule_lock_irq(v->sched_unit); - memcpy(runstate, &v->runstate, sizeof(*runstate)); - delta = NOW() - runstate->state_entry_time; - if ( delta > 0 ) - runstate->time[runstate->state] += delta; - - if ( unlikely(lock != NULL) ) - unit_schedule_unlock_irq(lock, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); -} - -uint64_t get_cpu_idle_time(unsigned int cpu) -{ - struct vcpu_runstate_info state = { 0 }; - struct vcpu *v = idle_vcpu[cpu]; - - if ( cpu_online(cpu) && v ) - vcpu_runstate_get(v, &state); - - return state.time[RUNSTATE_running]; -} - -/* - * If locks are different, take the one with the lower address first. - * This avoids dead- or live-locks when this code is running on both - * cpus at the same time. - */ -static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, - unsigned long *flags) -{ - if ( lock1 == lock2 ) - { - spin_lock_irqsave(lock1, *flags); - } - else if ( lock1 < lock2 ) - { - spin_lock_irqsave(lock1, *flags); - spin_lock(lock2); - } - else - { - spin_lock_irqsave(lock2, *flags); - spin_lock(lock1); - } -} - -static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, - unsigned long flags) -{ - if ( lock1 != lock2 ) - spin_unlock(lock2); - spin_unlock_irqrestore(lock1, flags); -} - -static void sched_free_unit_mem(struct sched_unit *unit) -{ - struct sched_unit *prev_unit; - struct domain *d = unit->domain; - - if ( d->sched_unit_list == unit ) - d->sched_unit_list = unit->next_in_list; - else - { - for_each_sched_unit ( d, prev_unit ) - { - if ( prev_unit->next_in_list == unit ) - { - prev_unit->next_in_list = unit->next_in_list; - break; - } - } - } - - free_cpumask_var(unit->cpu_hard_affinity); - free_cpumask_var(unit->cpu_hard_affinity_saved); - free_cpumask_var(unit->cpu_soft_affinity); - - xfree(unit); -} - -static void sched_free_unit(struct sched_unit *unit, struct vcpu *v) -{ - struct vcpu *vunit; - unsigned int cnt = 0; - - /* Don't count to be released vcpu, might be not in vcpu list yet. */ - for_each_sched_unit_vcpu ( unit, vunit ) - if ( vunit != v ) - cnt++; - - v->sched_unit = NULL; - unit->runstate_cnt[v->runstate.state]--; - - if ( unit->vcpu_list == v ) - unit->vcpu_list = v->next_in_list; - - if ( !cnt ) - sched_free_unit_mem(unit); -} - -static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v) -{ - v->sched_unit = unit; - - /* All but idle vcpus are allocated with sequential vcpu_id. */ - if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id ) - { - unit->vcpu_list = v; - /* - * unit_id is always the same as lowest vcpu_id of unit. - * This is used for stopping for_each_sched_unit_vcpu() loop and in - * order to support cpupools with different granularities. 
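sched_spin_lock_double() above avoids dead- and live-locks between two runqueue locks by always taking the lock with the lower address first, and taking the lock only once when both pointers are the same. The rule shown standalone, with plain pthread mutexes purely for illustration:

    /* Sketch of the address-ordered double locking used above. pthread
     * mutexes stand in for Xen's spinlocks; illustration only. */
    #include <pthread.h>

    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);        /* same lock: take it only once */
        else if ( a < b )
        {
            pthread_mutex_lock(a);        /* lower address always first   */
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a != b )
            pthread_mutex_unlock(b);
        pthread_mutex_unlock(a);
    }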
- */ - unit->unit_id = v->vcpu_id; - } - unit->runstate_cnt[v->runstate.state]++; -} - -static struct sched_unit *sched_alloc_unit_mem(void) -{ - struct sched_unit *unit; - - unit = xzalloc(struct sched_unit); - if ( !unit ) - return NULL; - - if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) || - !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) || - !zalloc_cpumask_var(&unit->cpu_soft_affinity) ) - { - sched_free_unit_mem(unit); - unit = NULL; - } - - return unit; -} - -static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d) -{ - struct sched_unit **prev_unit; - - unit->domain = d; - - for ( prev_unit = &d->sched_unit_list; *prev_unit; - prev_unit = &(*prev_unit)->next_in_list ) - if ( (*prev_unit)->next_in_list && - (*prev_unit)->next_in_list->unit_id > unit->unit_id ) - break; - - unit->next_in_list = *prev_unit; - *prev_unit = unit; -} - -static struct sched_unit *sched_alloc_unit(struct vcpu *v) -{ - struct sched_unit *unit; - struct domain *d = v->domain; - unsigned int gran = cpupool_get_granularity(d->cpupool); - - for_each_sched_unit ( d, unit ) - if ( unit->unit_id / gran == v->vcpu_id / gran ) - break; - - if ( unit ) - { - sched_unit_add_vcpu(unit, v); - return unit; - } - - if ( (unit = sched_alloc_unit_mem()) == NULL ) - return NULL; - - sched_unit_add_vcpu(unit, v); - sched_domain_insert_unit(unit, d); - - return unit; -} - -static unsigned int sched_select_initial_cpu(const struct vcpu *v) -{ - const struct domain *d = v->domain; - nodeid_t node; - spinlock_t *lock; - unsigned long flags; - unsigned int cpu_ret, cpu = smp_processor_id(); - cpumask_t *cpus = cpumask_scratch_cpu(cpu); - - lock = pcpu_schedule_lock_irqsave(cpu, &flags); - cpumask_clear(cpus); - for_each_node_mask ( node, d->node_affinity ) - cpumask_or(cpus, cpus, &node_to_cpumask(node)); - cpumask_and(cpus, cpus, d->cpupool->cpu_valid); - if ( cpumask_empty(cpus) ) - cpumask_copy(cpus, d->cpupool->cpu_valid); - - if ( v->vcpu_id == 0 ) - cpu_ret = cpumask_first(cpus); - else - { - /* We can rely on previous vcpu being available. */ - ASSERT(!is_idle_domain(d)); - - cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus); - } - - pcpu_schedule_unlock_irqrestore(lock, flags, cpu); - - return cpu_ret; -} - -int sched_init_vcpu(struct vcpu *v) -{ - struct domain *d = v->domain; - struct sched_unit *unit; - unsigned int processor; - - if ( (unit = sched_alloc_unit(v)) == NULL ) - return 1; - - if ( is_idle_domain(d) ) - processor = v->vcpu_id; - else - processor = sched_select_initial_cpu(v); - - /* Initialise the per-vcpu timers. */ - spin_lock_init(&v->periodic_timer_lock); - init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor); - init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor); - init_timer(&v->poll_timer, poll_timer_fn, v, processor); - - /* If this is not the first vcpu of the unit we are done. */ - if ( unit->priv != NULL ) - { - v->processor = processor; - return 0; - } - - rcu_read_lock(&sched_res_rculock); - - /* The first vcpu of an unit can be set via sched_set_res(). */ - sched_set_res(unit, get_sched_res(processor)); - - unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv); - if ( unit->priv == NULL ) - { - sched_free_unit(unit, v); - rcu_read_unlock(&sched_res_rculock); - return 1; - } - - /* - * Initialize affinity settings. The idler, and potentially - * domain-0 VCPUs, are pinned onto their respective physical CPUs. 
- */ - if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) ) - sched_set_affinity(unit, cpumask_of(processor), &cpumask_all); - else - sched_set_affinity(unit, &cpumask_all, &cpumask_all); - - /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */ - if ( is_idle_domain(d) ) - { - get_sched_res(v->processor)->curr = unit; - get_sched_res(v->processor)->sched_unit_idle = unit; - v->is_running = 1; - unit->is_running = true; - unit->state_entry_time = NOW(); - } - else - { - sched_insert_unit(dom_scheduler(d), unit); - } - - rcu_read_unlock(&sched_res_rculock); - - return 0; -} - -static void vcpu_move_irqs(struct vcpu *v) -{ - arch_move_irqs(v); - evtchn_move_pirqs(v); -} - -static void sched_move_irqs(const struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - vcpu_move_irqs(v); -} - -int sched_move_domain(struct domain *d, struct cpupool *c) -{ - struct vcpu *v; - struct sched_unit *unit; - unsigned int new_p, unit_idx; - void **unit_priv; - void *domdata; - void *unitdata; - struct scheduler *old_ops; - void *old_domdata; - unsigned int gran = cpupool_get_granularity(c); - int ret = 0; - - for_each_vcpu ( d, v ) - { - if ( v->affinity_broken ) - return -EBUSY; - } - - rcu_read_lock(&sched_res_rculock); - - domdata = sched_alloc_domdata(c->sched, d); - if ( IS_ERR(domdata) ) - { - ret = PTR_ERR(domdata); - goto out; - } - - unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran)); - if ( unit_priv == NULL ) - { - sched_free_domdata(c->sched, domdata); - ret = -ENOMEM; - goto out; - } - - unit_idx = 0; - for_each_sched_unit ( d, unit ) - { - unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata); - if ( unit_priv[unit_idx] == NULL ) - { - for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ ) - sched_free_udata(c->sched, unit_priv[unit_idx]); - xfree(unit_priv); - sched_free_domdata(c->sched, domdata); - ret = -ENOMEM; - goto out; - } - unit_idx++; - } - - domain_pause(d); - - old_ops = dom_scheduler(d); - old_domdata = d->sched_priv; - - for_each_sched_unit ( d, unit ) - { - sched_remove_unit(old_ops, unit); - } - - d->cpupool = c; - d->sched_priv = domdata; - - new_p = cpumask_first(c->cpu_valid); - unit_idx = 0; - for_each_sched_unit ( d, unit ) - { - spinlock_t *lock; - unsigned int unit_p = new_p; - - unitdata = unit->priv; - - for_each_sched_unit_vcpu ( unit, v ) - { - migrate_timer(&v->periodic_timer, new_p); - migrate_timer(&v->singleshot_timer, new_p); - migrate_timer(&v->poll_timer, new_p); - new_p = cpumask_cycle(new_p, c->cpu_valid); - } - - lock = unit_schedule_lock_irq(unit); - - sched_set_affinity(unit, &cpumask_all, &cpumask_all); - - sched_set_res(unit, get_sched_res(unit_p)); - /* - * With v->processor modified we must not - * - make any further changes assuming we hold the scheduler lock, - * - use unit_schedule_unlock_irq(). 
- */ - spin_unlock_irq(lock); - - unit->priv = unit_priv[unit_idx]; - if ( !d->is_dying ) - sched_move_irqs(unit); - - sched_insert_unit(c->sched, unit); - - sched_free_udata(old_ops, unitdata); - - unit_idx++; - } - - domain_update_node_affinity(d); - - domain_unpause(d); - - sched_free_domdata(old_ops, old_domdata); - - xfree(unit_priv); - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -void sched_destroy_vcpu(struct vcpu *v) -{ - struct sched_unit *unit = v->sched_unit; - - kill_timer(&v->periodic_timer); - kill_timer(&v->singleshot_timer); - kill_timer(&v->poll_timer); - if ( test_and_clear_bool(v->is_urgent) ) - atomic_dec(&per_cpu(sched_urgent_count, v->processor)); - /* - * Vcpus are being destroyed top-down. So being the first vcpu of an unit - * is the same as being the only one. - */ - if ( unit->vcpu_list == v ) - { - rcu_read_lock(&sched_res_rculock); - - sched_remove_unit(vcpu_scheduler(v), unit); - sched_free_udata(vcpu_scheduler(v), unit->priv); - sched_free_unit(unit, v); - - rcu_read_unlock(&sched_res_rculock); - } -} - -int sched_init_domain(struct domain *d, int poolid) -{ - void *sdom; - int ret; - - ASSERT(d->cpupool == NULL); - ASSERT(d->domain_id < DOMID_FIRST_RESERVED); - - if ( (ret = cpupool_add_domain(d, poolid)) ) - return ret; - - SCHED_STAT_CRANK(dom_init); - TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id); - - rcu_read_lock(&sched_res_rculock); - - sdom = sched_alloc_domdata(dom_scheduler(d), d); - - rcu_read_unlock(&sched_res_rculock); - - if ( IS_ERR(sdom) ) - return PTR_ERR(sdom); - - d->sched_priv = sdom; - - return 0; -} - -void sched_destroy_domain(struct domain *d) -{ - ASSERT(d->domain_id < DOMID_FIRST_RESERVED); - - if ( d->cpupool ) - { - SCHED_STAT_CRANK(dom_destroy); - TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); - - rcu_read_lock(&sched_res_rculock); - - sched_free_domdata(dom_scheduler(d), d->sched_priv); - d->sched_priv = NULL; - - rcu_read_unlock(&sched_res_rculock); - - cpupool_rm_domain(d); - } -} - -static void vcpu_sleep_nosync_locked(struct vcpu *v) -{ - struct sched_unit *unit = v->sched_unit; - - ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock)); - - if ( likely(!vcpu_runnable(v)) ) - { - if ( v->runstate.state == RUNSTATE_runnable ) - vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - - /* Only put unit to sleep in case all vcpus are not runnable. 
*/ - if ( likely(!unit_runnable(unit)) ) - sched_sleep(unit_scheduler(unit), unit); - else if ( unit_running(unit) > 1 && v->is_running && - !v->force_context_switch ) - { - v->force_context_switch = true; - cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); - } - } -} - -void vcpu_sleep_nosync(struct vcpu *v) -{ - unsigned long flags; - spinlock_t *lock; - - TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irqsave(v->sched_unit, &flags); - - vcpu_sleep_nosync_locked(v); - - unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); -} - -void vcpu_sleep_sync(struct vcpu *v) -{ - vcpu_sleep_nosync(v); - - while ( !vcpu_runnable(v) && v->is_running ) - cpu_relax(); - - sync_vcpu_execstate(v); -} - -void vcpu_wake(struct vcpu *v) -{ - unsigned long flags; - spinlock_t *lock; - struct sched_unit *unit = v->sched_unit; - - TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irqsave(unit, &flags); - - if ( likely(vcpu_runnable(v)) ) - { - if ( v->runstate.state >= RUNSTATE_blocked ) - vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); - /* - * Call sched_wake() unconditionally, even if unit is running already. - * We might have not been de-scheduled after vcpu_sleep_nosync_locked() - * and are now to be woken up again. - */ - sched_wake(unit_scheduler(unit), unit); - if ( unit->is_running && !v->is_running && !v->force_context_switch ) - { - v->force_context_switch = true; - cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ); - } - } - else if ( !(v->pause_flags & VPF_blocked) ) - { - if ( v->runstate.state == RUNSTATE_blocked ) - vcpu_runstate_change(v, RUNSTATE_offline, NOW()); - } - - unit_schedule_unlock_irqrestore(lock, flags, unit); - - rcu_read_unlock(&sched_res_rculock); -} - -void vcpu_unblock(struct vcpu *v) -{ - if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) - return; - - /* Polling period ends when a VCPU is unblocked. */ - if ( unlikely(v->poll_evtchn != 0) ) - { - v->poll_evtchn = 0; - /* - * We *must* re-clear _VPF_blocked to avoid racing other wakeups of - * this VCPU (and it then going back to sleep on poll_mask). - * Test-and-clear is idiomatic and ensures clear_bit not reordered. - */ - if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) - clear_bit(_VPF_blocked, &v->pause_flags); - } - - vcpu_wake(v); -} - -/* - * Do the actual movement of an unit from old to new CPU. Locks for *both* - * CPUs needs to have been taken already when calling this! - */ -static void sched_unit_move_locked(struct sched_unit *unit, - unsigned int new_cpu) -{ - unsigned int old_cpu = unit->res->master_cpu; - struct vcpu *v; - - rcu_read_lock(&sched_res_rculock); - - /* - * Transfer urgency status to new CPU before switching CPUs, as - * once the switch occurs, v->is_urgent is no longer protected by - * the per-CPU scheduler lock we are holding. - */ - for_each_sched_unit_vcpu ( unit, v ) - { - if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) - { - atomic_inc(&per_cpu(sched_urgent_count, new_cpu)); - atomic_dec(&per_cpu(sched_urgent_count, old_cpu)); - } - } - - /* - * Actual CPU switch to new CPU. This is safe because the lock - * pointer can't change while the current lock is held. 
- */ - sched_migrate(unit_scheduler(unit), unit, new_cpu); - - rcu_read_unlock(&sched_res_rculock); -} - -/* - * Initiating migration - * - * In order to migrate, we need the unit in question to have stopped - * running and have called sched_sleep() (to take it off any - * runqueues, for instance); and if it is currently running, it needs - * to be scheduled out. Finally, we need to hold the scheduling locks - * for both the processor we're migrating from, and the processor - * we're migrating to. - * - * In order to avoid deadlock while satisfying the final requirement, - * we must release any scheduling lock we hold, then try to grab both - * locks we want, then double-check to make sure that what we started - * to do hasn't been changed in the mean time. - * - * These steps are encapsulated in the following two functions; they - * should be called like this: - * - * lock = unit_schedule_lock_irq(unit); - * sched_unit_migrate_start(unit); - * unit_schedule_unlock_irq(lock, unit) - * sched_unit_migrate_finish(unit); - * - * sched_unit_migrate_finish() will do the work now if it can, or simply - * return if it can't (because unit is still running); in that case - * sched_unit_migrate_finish() will be called by unit_context_saved(). - */ -static void sched_unit_migrate_start(struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - { - set_bit(_VPF_migrating, &v->pause_flags); - vcpu_sleep_nosync_locked(v); - } -} - -static void sched_unit_migrate_finish(struct sched_unit *unit) -{ - unsigned long flags; - unsigned int old_cpu, new_cpu; - spinlock_t *old_lock, *new_lock; - bool_t pick_called = 0; - struct vcpu *v; - - /* - * If the unit is currently running, this will be handled by - * unit_context_saved(); and in any case, if the bit is cleared, then - * someone else has already done the work so we don't need to. - */ - if ( unit->is_running ) - return; - for_each_sched_unit_vcpu ( unit, v ) - if ( !test_bit(_VPF_migrating, &v->pause_flags) ) - return; - - old_cpu = new_cpu = unit->res->master_cpu; - for ( ; ; ) - { - /* - * We need another iteration if the pre-calculated lock addresses - * are not correct any longer after evaluating old and new cpu holding - * the locks. - */ - old_lock = get_sched_res(old_cpu)->schedule_lock; - new_lock = get_sched_res(new_cpu)->schedule_lock; - - sched_spin_lock_double(old_lock, new_lock, &flags); - - old_cpu = unit->res->master_cpu; - if ( old_lock == get_sched_res(old_cpu)->schedule_lock ) - { - /* - * If we selected a CPU on the previosu iteration, check if it - * remains suitable for running this vCPU. - */ - if ( pick_called && - (new_lock == get_sched_res(new_cpu)->schedule_lock) && - cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) && - cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) - break; - - /* Select a new CPU. */ - new_cpu = sched_pick_resource(unit_scheduler(unit), - unit)->master_cpu; - if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) && - cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) ) - break; - pick_called = 1; - } - else - { - /* - * We do not hold the scheduler lock appropriate for this vCPU. - * Thus we cannot select a new CPU on this iteration. Try again. - */ - pick_called = 0; - } - - sched_spin_unlock_double(old_lock, new_lock, flags); - } - - /* - * NB. Check of v->running happens /after/ setting migration flag - * because they both happen in (different) spinlock regions, and those - * regions are strictly serialised. 
- */ - if ( unit->is_running ) - { - sched_spin_unlock_double(old_lock, new_lock, flags); - return; - } - for_each_sched_unit_vcpu ( unit, v ) - { - if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) - { - sched_spin_unlock_double(old_lock, new_lock, flags); - return; - } - } - - sched_unit_move_locked(unit, new_cpu); - - sched_spin_unlock_double(old_lock, new_lock, flags); - - if ( old_cpu != new_cpu ) - { - /* Vcpus are moved to other pcpus, commit their states to memory. */ - for_each_sched_unit_vcpu ( unit, v ) - sync_vcpu_execstate(v); - sched_move_irqs(unit); - } - - /* Wake on new CPU. */ - for_each_sched_unit_vcpu ( unit, v ) - vcpu_wake(v); -} - -static bool sched_check_affinity_broken(const struct sched_unit *unit) -{ - const struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - if ( v->affinity_broken ) - return true; - - return false; -} - -static void sched_reset_affinity_broken(struct sched_unit *unit) -{ - struct vcpu *v; - - for_each_sched_unit_vcpu ( unit, v ) - v->affinity_broken = false; -} - -void restore_vcpu_affinity(struct domain *d) -{ - unsigned int cpu = smp_processor_id(); - struct sched_unit *unit; - - ASSERT(system_state == SYS_STATE_resume); - - rcu_read_lock(&sched_res_rculock); - - for_each_sched_unit ( d, unit ) - { - spinlock_t *lock; - unsigned int old_cpu = sched_unit_master(unit); - struct sched_resource *res; - - ASSERT(!unit_runnable(unit)); - - /* - * Re-assign the initial processor as after resume we have no - * guarantee the old processor has come back to life again. - * - * Therefore, here, before actually unpausing the domains, we should - * set v->processor of each of their vCPUs to something that will - * make sense for the scheduler of the cpupool in which they are in. - */ - lock = unit_schedule_lock_irq(unit); - - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - { - if ( sched_check_affinity_broken(unit) ) - { - sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); - sched_reset_affinity_broken(unit); - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - } - - if ( cpumask_empty(cpumask_scratch_cpu(cpu)) ) - { - /* Affinity settings of one vcpu are for the complete unit. */ - printk(XENLOG_DEBUG "Breaking affinity for %pv\n", - unit->vcpu_list); - sched_set_affinity(unit, &cpumask_all, NULL); - cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity, - cpupool_domain_master_cpumask(d)); - } - } - - res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu))); - sched_set_res(unit, res); - - spin_unlock_irq(lock); - - /* v->processor might have changed, so reacquire the lock. */ - lock = unit_schedule_lock_irq(unit); - res = sched_pick_resource(unit_scheduler(unit), unit); - sched_set_res(unit, res); - spin_unlock_irq(lock); - - if ( old_cpu != sched_unit_master(unit) ) - sched_move_irqs(unit); - } - - rcu_read_unlock(&sched_res_rculock); - - domain_update_node_affinity(d); -} - -/* - * This function is used by cpu_hotplug code via cpu notifier chain - * and from cpupools to switch schedulers on a cpu. - * Caller must get domlist_read_lock. 
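- * A typical caller therefore looks like sched_rm_cpu() further down:
- *
- *     rcu_read_lock(&domlist_read_lock);
- *     rc = cpu_disable_scheduler(cpu);
- *     rcu_read_unlock(&domlist_read_lock);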
- */ -int cpu_disable_scheduler(unsigned int cpu) -{ - struct domain *d; - struct cpupool *c; - cpumask_t online_affinity; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - c = get_sched_res(cpu)->cpupool; - if ( c == NULL ) - goto out; - - for_each_domain_in_cpupool ( d, c ) - { - struct sched_unit *unit; - - for_each_sched_unit ( d, unit ) - { - unsigned long flags; - spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags); - - cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid); - if ( cpumask_empty(&online_affinity) && - cpumask_test_cpu(cpu, unit->cpu_hard_affinity) ) - { - if ( sched_check_affinity_broken(unit) ) - { - /* The unit is temporarily pinned, can't move it. */ - unit_schedule_unlock_irqrestore(lock, flags, unit); - ret = -EADDRINUSE; - break; - } - - printk(XENLOG_DEBUG "Breaking affinity for %pv\n", - unit->vcpu_list); - - sched_set_affinity(unit, &cpumask_all, NULL); - } - - if ( unit->res != get_sched_res(cpu) ) - { - /* The unit is not on this cpu, so we can move on. */ - unit_schedule_unlock_irqrestore(lock, flags, unit); - continue; - } - - /* If it is on this cpu, we must send it away. - * We are doing some cpupool manipulations: - * * we want to call the scheduler, and let it re-evaluation - * the placement of the vcpu, taking into account the new - * cpupool configuration; - * * the scheduler will always find a suitable solution, or - * things would have failed before getting in here. - */ - sched_unit_migrate_start(unit); - unit_schedule_unlock_irqrestore(lock, flags, unit); - sched_unit_migrate_finish(unit); - - /* - * The only caveat, in this case, is that if a vcpu active in - * the hypervisor isn't migratable. In this case, the caller - * should try again after releasing and reaquiring all locks. - */ - if ( unit->res == get_sched_res(cpu) ) - ret = -EAGAIN; - } - } - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -static int cpu_disable_scheduler_check(unsigned int cpu) -{ - struct domain *d; - struct vcpu *v; - struct cpupool *c; - - c = get_sched_res(cpu)->cpupool; - if ( c == NULL ) - return 0; - - for_each_domain_in_cpupool ( d, c ) - for_each_vcpu ( d, v ) - if ( v->affinity_broken ) - return -EADDRINUSE; - - return 0; -} - -/* - * In general, this must be called with the scheduler lock held, because the - * adjust_affinity hook may want to modify the vCPU state. However, when the - * vCPU is being initialized (either for dom0 or domU) there is no risk of - * races, and it's fine to not take the look (we're talking about - * sched_setup_dom0_vcpus() an sched_init_vcpu()). 
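- * The locked pattern, as used e.g. by vcpu_set_affinity(), is roughly:
- *
- *     lock = unit_schedule_lock_irq(unit);
- *     sched_set_affinity(unit, hard, soft);
- *     unit_schedule_unlock_irq(lock, unit);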
- */ -static void sched_set_affinity( - struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft) -{ - rcu_read_lock(&sched_res_rculock); - sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft); - rcu_read_unlock(&sched_res_rculock); - - if ( hard ) - cpumask_copy(unit->cpu_hard_affinity, hard); - if ( soft ) - cpumask_copy(unit->cpu_soft_affinity, soft); - - unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity, - unit->cpu_soft_affinity) && - cpumask_intersects(unit->cpu_soft_affinity, - unit->cpu_hard_affinity); -} - -static int vcpu_set_affinity( - struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which) -{ - struct sched_unit *unit = v->sched_unit; - spinlock_t *lock; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(unit); - - if ( v->affinity_broken ) - ret = -EBUSY; - else - { - /* - * Tell the scheduler we changes something about affinity, - * and ask to re-evaluate vcpu placement. - */ - if ( which == unit->cpu_hard_affinity ) - { - sched_set_affinity(unit, affinity, NULL); - } - else - { - ASSERT(which == unit->cpu_soft_affinity); - sched_set_affinity(unit, NULL, affinity); - } - sched_unit_migrate_start(unit); - } - - unit_schedule_unlock_irq(lock, unit); - - domain_update_node_affinity(v->domain); - - sched_unit_migrate_finish(unit); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity) -{ - cpumask_t online_affinity; - cpumask_t *online; - - online = VCPU2ONLINE(v); - cpumask_and(&online_affinity, affinity, online); - if ( cpumask_empty(&online_affinity) ) - return -EINVAL; - - return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity); -} - -int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity) -{ - return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity); -} - -/* Block the currently-executing domain until a pertinent event occurs. */ -void vcpu_block(void) -{ - struct vcpu *v = current; - - set_bit(_VPF_blocked, &v->pause_flags); - - arch_vcpu_block(v); - - /* Check for events /after/ blocking: avoids wakeup waiting race. */ - if ( local_events_need_delivery() ) - { - clear_bit(_VPF_blocked, &v->pause_flags); - } - else - { - TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - } -} - -static void vcpu_block_enable_events(void) -{ - local_event_delivery_enable(); - vcpu_block(); -} - -static long do_poll(struct sched_poll *sched_poll) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - evtchn_port_t port = 0; - long rc; - unsigned int i; - - /* Fairly arbitrary limit. */ - if ( sched_poll->nr_ports > 128 ) - return -EINVAL; - - if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) - return -EFAULT; - - set_bit(_VPF_blocked, &v->pause_flags); - v->poll_evtchn = -1; - set_bit(v->vcpu_id, d->poll_mask); - - arch_vcpu_block(v); - -#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ - /* Check for events /after/ setting flags: avoids wakeup waiting race. */ - smp_mb(); - - /* - * Someone may have seen we are blocked but not that we are polling, or - * vice versa. We are certainly being woken, so clean up and bail. Beyond - * this point others can be guaranteed to clean up for us if they wake us. 
- */ - rc = 0; - if ( (v->poll_evtchn == 0) || - !test_bit(_VPF_blocked, &v->pause_flags) || - !test_bit(v->vcpu_id, d->poll_mask) ) - goto out; -#endif - - rc = 0; - if ( local_events_need_delivery() ) - goto out; - - for ( i = 0; i < sched_poll->nr_ports; i++ ) - { - rc = -EFAULT; - if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) - goto out; - - rc = -EINVAL; - if ( port >= d->max_evtchns ) - goto out; - - rc = 0; - if ( evtchn_port_is_pending(d, port) ) - goto out; - } - - if ( sched_poll->nr_ports == 1 ) - v->poll_evtchn = port; - - if ( sched_poll->timeout != 0 ) - set_timer(&v->poll_timer, sched_poll->timeout); - - TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - - return 0; - - out: - v->poll_evtchn = 0; - clear_bit(v->vcpu_id, d->poll_mask); - clear_bit(_VPF_blocked, &v->pause_flags); - return rc; -} - -/* Voluntarily yield the processor for this allocation. */ -long vcpu_yield(void) -{ - struct vcpu * v=current; - spinlock_t *lock; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(v->sched_unit); - sched_yield(vcpu_scheduler(v), v->sched_unit); - unit_schedule_unlock_irq(lock, v->sched_unit); - - rcu_read_unlock(&sched_res_rculock); - - SCHED_STAT_CRANK(vcpu_yield); - - TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); - raise_softirq(SCHEDULE_SOFTIRQ); - return 0; -} - -static void domain_watchdog_timeout(void *data) -{ - struct domain *d = data; - - if ( d->is_shutting_down || d->is_dying ) - return; - - printk("Watchdog timer fired for domain %u\n", d->domain_id); - domain_shutdown(d, SHUTDOWN_watchdog); -} - -static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) -{ - if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) - return -EINVAL; - - spin_lock(&d->watchdog_lock); - - if ( id == 0 ) - { - for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) - { - if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) - continue; - set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); - break; - } - spin_unlock(&d->watchdog_lock); - return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1; - } - - id -= 1; - if ( !test_bit(id, &d->watchdog_inuse_map) ) - { - spin_unlock(&d->watchdog_lock); - return -EINVAL; - } - - if ( timeout == 0 ) - { - stop_timer(&d->watchdog_timer[id]); - clear_bit(id, &d->watchdog_inuse_map); - } - else - { - set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); - } - - spin_unlock(&d->watchdog_lock); - return 0; -} - -void watchdog_domain_init(struct domain *d) -{ - unsigned int i; - - spin_lock_init(&d->watchdog_lock); - - d->watchdog_inuse_map = 0; - - for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) - init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); -} - -void watchdog_domain_destroy(struct domain *d) -{ - unsigned int i; - - for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) - kill_timer(&d->watchdog_timer[i]); -} - -/* - * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if - * cpu is NR_CPUS). - * Temporary pinning can be done due to two reasons, which may be nested: - * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case - * of a conflict (e.g. in case cpupool doesn't include requested CPU, or - * another conflicting temporary pinning is already in effect. - * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the - * CPU it is just running on. Can't fail if used properly. 
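- * Pinning and restoring therefore come in pairs, roughly:
- *
- *     vcpu_temporary_affinity(v, smp_processor_id(), VCPU_AFFINITY_WAIT);
- *     ...
- *     vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_WAIT);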
- */ -int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason) -{ - struct sched_unit *unit = v->sched_unit; - spinlock_t *lock; - int ret = -EINVAL; - bool migrate; - - rcu_read_lock(&sched_res_rculock); - - lock = unit_schedule_lock_irq(unit); - - if ( cpu == NR_CPUS ) - { - if ( v->affinity_broken & reason ) - { - ret = 0; - v->affinity_broken &= ~reason; - } - if ( !ret && !sched_check_affinity_broken(unit) ) - sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL); - } - else if ( cpu < nr_cpu_ids ) - { - if ( (v->affinity_broken & reason) || - (sched_check_affinity_broken(unit) && v->processor != cpu) ) - ret = -EBUSY; - else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) ) - { - if ( !sched_check_affinity_broken(unit) ) - { - cpumask_copy(unit->cpu_hard_affinity_saved, - unit->cpu_hard_affinity); - sched_set_affinity(unit, cpumask_of(cpu), NULL); - } - v->affinity_broken |= reason; - ret = 0; - } - } - - migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity); - if ( migrate ) - sched_unit_migrate_start(unit); - - unit_schedule_unlock_irq(lock, unit); - - if ( migrate ) - sched_unit_migrate_finish(unit); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -typedef long ret_t; - -#endif /* !COMPAT */ - -ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) -{ - ret_t ret = 0; - - switch ( cmd ) - { - case SCHEDOP_yield: - { - ret = vcpu_yield(); - break; - } - - case SCHEDOP_block: - { - vcpu_block_enable_events(); - break; - } - - case SCHEDOP_shutdown: - { - struct sched_shutdown sched_shutdown; - - ret = -EFAULT; - if ( copy_from_guest(&sched_shutdown, arg, 1) ) - break; - - TRACE_3D(TRC_SCHED_SHUTDOWN, - current->domain->domain_id, current->vcpu_id, - sched_shutdown.reason); - ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason); - - break; - } - - case SCHEDOP_shutdown_code: - { - struct sched_shutdown sched_shutdown; - struct domain *d = current->domain; - - ret = -EFAULT; - if ( copy_from_guest(&sched_shutdown, arg, 1) ) - break; - - TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, - d->domain_id, current->vcpu_id, sched_shutdown.reason); - - spin_lock(&d->shutdown_lock); - if ( d->shutdown_code == SHUTDOWN_CODE_INVALID ) - d->shutdown_code = (u8)sched_shutdown.reason; - spin_unlock(&d->shutdown_lock); - - ret = 0; - break; - } - - case SCHEDOP_poll: - { - struct sched_poll sched_poll; - - ret = -EFAULT; - if ( copy_from_guest(&sched_poll, arg, 1) ) - break; - - ret = do_poll(&sched_poll); - - break; - } - - case SCHEDOP_remote_shutdown: - { - struct domain *d; - struct sched_remote_shutdown sched_remote_shutdown; - - ret = -EFAULT; - if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) - break; - - ret = -ESRCH; - d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); - if ( d == NULL ) - break; - - ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); - if ( likely(!ret) ) - domain_shutdown(d, sched_remote_shutdown.reason); - - rcu_unlock_domain(d); - - break; - } - - case SCHEDOP_watchdog: - { - struct sched_watchdog sched_watchdog; - - ret = -EFAULT; - if ( copy_from_guest(&sched_watchdog, arg, 1) ) - break; - - ret = domain_watchdog( - current->domain, sched_watchdog.id, sched_watchdog.timeout); - break; - } - - case SCHEDOP_pin_override: - { - struct sched_pin_override sched_pin_override; - unsigned int cpu; - - ret = -EPERM; - if ( !is_hardware_domain(current->domain) ) - break; - - ret = -EFAULT; - if ( copy_from_guest(&sched_pin_override, arg, 1) ) - break; - - ret = -EINVAL; - if ( 
sched_pin_override.pcpu >= NR_CPUS ) - break; - - cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu; - ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE); - - break; - } - - default: - ret = -ENOSYS; - } - - return ret; -} - -#ifndef COMPAT - -/* Per-vcpu oneshot-timer hypercall. */ -long do_set_timer_op(s_time_t timeout) -{ - struct vcpu *v = current; - s_time_t offset = timeout - NOW(); - - if ( timeout == 0 ) - { - stop_timer(&v->singleshot_timer); - } - else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ - unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) - { - /* - * Linux workaround: occasionally we will see timeouts a long way in - * the future due to wrapping in Linux's jiffy time handling. We check - * for timeouts wrapped negative, and for positive timeouts more than - * about 13 days in the future (2^50ns). The correct fix is to trigger - * an interrupt immediately (since Linux in fact has pending work to - * do in this situation). However, older guests also set a long timeout - * when they have *no* pending timers at all: setting an immediate - * timeout in this case can burn a lot of CPU. We therefore go for a - * reasonable middleground of triggering a timer event in 100ms. - */ - gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n", - timeout); - set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); - } - else - { - migrate_timer(&v->singleshot_timer, smp_processor_id()); - set_timer(&v->singleshot_timer, timeout); - } - - return 0; -} - -/* sched_id - fetch ID of current scheduler */ -int sched_id(void) -{ - return ops.sched_id; -} - -/* Adjust scheduling parameter for a given domain. */ -long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) -{ - long ret; - - ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); - if ( ret ) - return ret; - - if ( op->sched_id != dom_scheduler(d)->sched_id ) - return -EINVAL; - - switch ( op->cmd ) - { - case XEN_DOMCTL_SCHEDOP_putinfo: - case XEN_DOMCTL_SCHEDOP_getinfo: - case XEN_DOMCTL_SCHEDOP_putvcpuinfo: - case XEN_DOMCTL_SCHEDOP_getvcpuinfo: - break; - default: - return -EINVAL; - } - - /* NB: the pluggable scheduler code needs to take care - * of locking by itself. */ - rcu_read_lock(&sched_res_rculock); - - if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 ) - TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); - - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -long sched_adjust_global(struct xen_sysctl_scheduler_op *op) -{ - struct cpupool *pool; - int rc; - - rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); - if ( rc ) - return rc; - - if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) && - (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) ) - return -EINVAL; - - pool = cpupool_get_by_id(op->cpupool_id); - if ( pool == NULL ) - return -ESRCH; - - rcu_read_lock(&sched_res_rculock); - - rc = ((op->sched_id == pool->sched->sched_id) - ? 
sched_adjust_cpupool(pool->sched, op) : -EINVAL); - - rcu_read_unlock(&sched_res_rculock); - - cpupool_put(pool); - - return rc; -} - -static void vcpu_periodic_timer_work_locked(struct vcpu *v) -{ - s_time_t now; - s_time_t periodic_next_event; - - now = NOW(); - periodic_next_event = v->periodic_last_event + v->periodic_period; - - if ( now >= periodic_next_event ) - { - send_timer_event(v); - v->periodic_last_event = now; - periodic_next_event = now + v->periodic_period; - } - - migrate_timer(&v->periodic_timer, v->processor); - set_timer(&v->periodic_timer, periodic_next_event); -} - -static void vcpu_periodic_timer_work(struct vcpu *v) -{ - if ( v->periodic_period == 0 ) - return; - - spin_lock(&v->periodic_timer_lock); - if ( v->periodic_period ) - vcpu_periodic_timer_work_locked(v); - spin_unlock(&v->periodic_timer_lock); -} - -/* - * Set the periodic timer of a vcpu. - */ -void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value) -{ - spin_lock(&v->periodic_timer_lock); - - stop_timer(&v->periodic_timer); - - v->periodic_period = value; - if ( value ) - vcpu_periodic_timer_work_locked(v); - - spin_unlock(&v->periodic_timer_lock); -} - -static void sched_switch_units(struct sched_resource *sr, - struct sched_unit *next, struct sched_unit *prev, - s_time_t now) -{ - unsigned int cpu; - - ASSERT(unit_running(prev)); - - if ( prev != next ) - { - sr->curr = next; - sr->prev = prev; - - TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, - prev->unit_id, now - prev->state_entry_time); - TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, - next->unit_id, - (next->vcpu_list->runstate.state == RUNSTATE_runnable) ? - (now - next->state_entry_time) : 0, prev->next_time); - TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id, - next->domain->domain_id, next->unit_id); - - ASSERT(!unit_running(next)); - - /* - * NB. Don't add any trace records from here until the actual context - * switch, else lost_records resume will not work properly. - */ - - ASSERT(!next->is_running); - next->is_running = true; - next->state_entry_time = now; - - if ( is_idle_unit(prev) ) - { - prev->runstate_cnt[RUNSTATE_running] = 0; - prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity; - } - if ( is_idle_unit(next) ) - { - next->runstate_cnt[RUNSTATE_running] = sr->granularity; - next->runstate_cnt[RUNSTATE_runnable] = 0; - } - } - - for_each_cpu ( cpu, sr->cpus ) - { - struct vcpu *vprev = get_cpu_current(cpu); - struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu); - - if ( vprev != vnext || vprev->runstate.state != vnext->new_state ) - { - vcpu_runstate_change(vprev, - ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked : - (vcpu_runnable(vprev) ? 
RUNSTATE_runnable : RUNSTATE_offline)), - now); - vcpu_runstate_change(vnext, vnext->new_state, now); - } - - vnext->is_running = 1; - - if ( is_idle_vcpu(vnext) ) - vnext->sched_unit = next; - } -} - -static bool sched_tasklet_check_cpu(unsigned int cpu) -{ - unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu); - - switch ( *tasklet_work ) - { - case TASKLET_enqueued: - set_bit(_TASKLET_scheduled, tasklet_work); - /* fallthrough */ - case TASKLET_enqueued|TASKLET_scheduled: - return true; - break; - case TASKLET_scheduled: - clear_bit(_TASKLET_scheduled, tasklet_work); - /* fallthrough */ - case 0: - /* return false; */ - break; - default: - BUG(); - } - - return false; -} - -static bool sched_tasklet_check(unsigned int cpu) -{ - bool tasklet_work_scheduled = false; - const cpumask_t *mask = get_sched_res(cpu)->cpus; - unsigned int cpu_iter; - - for_each_cpu ( cpu_iter, mask ) - if ( sched_tasklet_check_cpu(cpu_iter) ) - tasklet_work_scheduled = true; - - return tasklet_work_scheduled; -} - -static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now, - unsigned int cpu) -{ - struct sched_resource *sr = get_sched_res(cpu); - struct scheduler *sched = sr->scheduler; - struct sched_unit *next; - - /* get policy-specific decision on scheduling... */ - sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu)); - - next = prev->next_task; - - if ( prev->next_time >= 0 ) /* -ve means no limit */ - set_timer(&sr->s_timer, now + prev->next_time); - - sched_switch_units(sr, next, prev, now); - - return next; -} - -static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext) -{ - /* Clear running flag /after/ writing context to memory. */ - smp_wmb(); - - if ( vprev != vnext ) - vprev->is_running = 0; -} - -static void unit_context_saved(struct sched_resource *sr) -{ - struct sched_unit *unit = sr->prev; - - if ( !unit ) - return; - - unit->is_running = false; - unit->state_entry_time = NOW(); - sr->prev = NULL; - - /* Check for migration request /after/ clearing running flag. */ - smp_mb(); - - sched_context_saved(unit_scheduler(unit), unit); - - /* Idle never migrates and idle vcpus might belong to other units. */ - if ( !is_idle_unit(unit) ) - sched_unit_migrate_finish(unit); -} - -/* - * Rendezvous on end of context switch. - * As no lock is protecting this rendezvous function we need to use atomic - * access functions on the counter. - * The counter will be 0 in case no rendezvous is needed. For the rendezvous - * case it is initialised to the number of cpus to rendezvous plus 1. Each - * member entering decrements the counter. The last one will decrement it to - * 1 and perform the final needed action in that case (call of - * unit_context_saved()), and then set the counter to zero. The other members - * will wait until the counter becomes zero until they proceed. - */ -void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext) -{ - struct sched_unit *next = vnext->sched_unit; - struct sched_resource *sr; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(smp_processor_id()); - - if ( atomic_read(&next->rendezvous_out_cnt) ) - { - int cnt = atomic_dec_return(&next->rendezvous_out_cnt); - - vcpu_context_saved(vprev, vnext); - - /* Call unit_context_saved() before releasing other waiters. 
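-      * E.g. for two cpus the counter starts at 3: the first cpu through
-      * here decrements it to 2 and spins, the second decrements it to 1,
-      * calls unit_context_saved() and then zeroes the counter, releasing
-      * the first.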
*/ - if ( cnt == 1 ) - { - unit_context_saved(sr); - atomic_set(&next->rendezvous_out_cnt, 0); - } - else - while ( atomic_read(&next->rendezvous_out_cnt) ) - cpu_relax(); - } - else - { - vcpu_context_saved(vprev, vnext); - if ( sr->granularity == 1 ) - unit_context_saved(sr); - } - - if ( is_idle_vcpu(vprev) && vprev != vnext ) - vprev->sched_unit = sr->sched_unit_idle; - - rcu_read_unlock(&sched_res_rculock); -} - -static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext, - bool reset_idle_unit, s_time_t now) -{ - if ( unlikely(vprev == vnext) ) - { - TRACE_4D(TRC_SCHED_SWITCH_INFCONT, - vnext->domain->domain_id, vnext->sched_unit->unit_id, - now - vprev->runstate.state_entry_time, - vprev->sched_unit->next_time); - sched_context_switched(vprev, vnext); - - /* - * We are switching from a non-idle to an idle unit. - * A vcpu of the idle unit might have been running before due to - * the guest vcpu being blocked. We must adjust the unit of the idle - * vcpu which might have been set to the guest's one. - */ - if ( reset_idle_unit ) - vnext->sched_unit = - get_sched_res(smp_processor_id())->sched_unit_idle; - - rcu_read_unlock(&sched_res_rculock); - - trace_continue_running(vnext); - return continue_running(vprev); - } - - SCHED_STAT_CRANK(sched_ctx); - - stop_timer(&vprev->periodic_timer); - - if ( vnext->sched_unit->migrated ) - vcpu_move_irqs(vnext); - - vcpu_periodic_timer_work(vnext); - - rcu_read_unlock(&sched_res_rculock); - - context_switch(vprev, vnext); -} - -/* - * Force a context switch of a single vcpu of an unit. - * Might be called either if a vcpu of an already running unit is woken up - * or if a vcpu of a running unit is put asleep with other vcpus of the same - * unit still running. - * Returns either NULL if v is already in the correct state or the vcpu to - * run next. - */ -static struct vcpu *sched_force_context_switch(struct vcpu *vprev, - struct vcpu *v, - unsigned int cpu, s_time_t now) -{ - v->force_context_switch = false; - - if ( vcpu_runnable(v) == v->is_running ) - return NULL; - - if ( vcpu_runnable(v) ) - { - if ( is_idle_vcpu(vprev) ) - { - vcpu_runstate_change(vprev, RUNSTATE_runnable, now); - vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; - } - vcpu_runstate_change(v, RUNSTATE_running, now); - } - else - { - /* Make sure not to switch last vcpu of an unit away. */ - if ( unit_running(v->sched_unit) == 1 ) - return NULL; - - v->new_state = vcpu_runstate_blocked(v); - vcpu_runstate_change(v, v->new_state, now); - v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu); - if ( v != vprev ) - { - if ( is_idle_vcpu(vprev) ) - { - vcpu_runstate_change(vprev, RUNSTATE_runnable, now); - vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle; - } - else - { - v->sched_unit = vprev->sched_unit; - vcpu_runstate_change(v, RUNSTATE_running, now); - } - } - } - - /* This vcpu will be switched to. */ - v->is_running = true; - - /* Make sure not to loose another slave call. */ - raise_softirq(SCHED_SLAVE_SOFTIRQ); - - return v; -} - -/* - * Rendezvous before taking a scheduling decision. - * Called with schedule lock held, so all accesses to the rendezvous counter - * can be normal ones (no atomic accesses needed). - * The counter is initialized to the number of cpus to rendezvous initially. - * Each cpu entering will decrement the counter. In case the counter becomes - * zero do_schedule() is called and the rendezvous counter for leaving - * context_switch() is set. 
All other members will wait until the counter is - * becoming zero, dropping the schedule lock in between. - */ -static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev, - spinlock_t **lock, int cpu, - s_time_t now) -{ - struct sched_unit *next; - struct vcpu *v; - unsigned int gran = get_sched_res(cpu)->granularity; - - if ( !--prev->rendezvous_in_cnt ) - { - next = do_schedule(prev, now, cpu); - atomic_set(&next->rendezvous_out_cnt, gran + 1); - return next; - } - - v = unit2vcpu_cpu(prev, cpu); - while ( prev->rendezvous_in_cnt ) - { - if ( v && v->force_context_switch ) - { - struct vcpu *vprev = current; - - v = sched_force_context_switch(vprev, v, cpu, now); - - if ( v ) - { - /* We'll come back another time, so adjust rendezvous_in_cnt. */ - prev->rendezvous_in_cnt++; - atomic_set(&prev->rendezvous_out_cnt, 0); - - pcpu_schedule_unlock_irq(*lock, cpu); - - sched_context_switch(vprev, v, false, now); - - return NULL; /* ARM only. */ - } - - v = unit2vcpu_cpu(prev, cpu); - } - /* - * Coming from idle might need to do tasklet work. - * In order to avoid deadlocks we can't do that here, but have to - * continue the idle loop. - * Undo the rendezvous_in_cnt decrement and schedule another call of - * sched_slave(). - */ - if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) ) - { - struct vcpu *vprev = current; - - prev->rendezvous_in_cnt++; - atomic_set(&prev->rendezvous_out_cnt, 0); - - pcpu_schedule_unlock_irq(*lock, cpu); - - raise_softirq(SCHED_SLAVE_SOFTIRQ); - sched_context_switch(vprev, vprev, false, now); - - return NULL; /* ARM only. */ - } - - pcpu_schedule_unlock_irq(*lock, cpu); - - cpu_relax(); - - *lock = pcpu_schedule_lock_irq(cpu); - - if ( unlikely(!scheduler_active) ) - { - ASSERT(is_idle_unit(prev)); - atomic_set(&prev->next_task->rendezvous_out_cnt, 0); - prev->rendezvous_in_cnt = 0; - } - } - - return prev->next_task; -} - -static void sched_slave(void) -{ - struct vcpu *v, *vprev = current; - struct sched_unit *prev = vprev->sched_unit, *next; - s_time_t now; - spinlock_t *lock; - bool do_softirq = false; - unsigned int cpu = smp_processor_id(); - - ASSERT_NOT_IN_ATOMIC(); - - rcu_read_lock(&sched_res_rculock); - - lock = pcpu_schedule_lock_irq(cpu); - - now = NOW(); - - v = unit2vcpu_cpu(prev, cpu); - if ( v && v->force_context_switch ) - { - v = sched_force_context_switch(vprev, v, cpu, now); - - if ( v ) - { - pcpu_schedule_unlock_irq(lock, cpu); - - sched_context_switch(vprev, v, false, now); - - return; - } - - do_softirq = true; - } - - if ( !prev->rendezvous_in_cnt ) - { - pcpu_schedule_unlock_irq(lock, cpu); - - rcu_read_unlock(&sched_res_rculock); - - /* Check for failed forced context switch. */ - if ( do_softirq ) - raise_softirq(SCHEDULE_SOFTIRQ); - - return; - } - - stop_timer(&get_sched_res(cpu)->s_timer); - - next = sched_wait_rendezvous_in(prev, &lock, cpu, now); - if ( !next ) - return; - - pcpu_schedule_unlock_irq(lock, cpu); - - sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), - is_idle_unit(next) && !is_idle_unit(prev), now); -} - -/* - * The main function - * - deschedule the current domain (scheduler independent). - * - pick a new domain (scheduler dependent). 
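- * Roughly, each invocation on a cpu does:
- *
- *     lock = pcpu_schedule_lock_irq(cpu);
- *     stop_timer(&sr->s_timer);
- *     next = do_schedule(prev, now, cpu);   (or via sched_wait_rendezvous_in())
- *     pcpu_schedule_unlock_irq(lock, cpu);
- *     sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu), ..., now);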
- */ -static void schedule(void) -{ - struct vcpu *vnext, *vprev = current; - struct sched_unit *prev = vprev->sched_unit, *next = NULL; - s_time_t now; - struct sched_resource *sr; - spinlock_t *lock; - int cpu = smp_processor_id(); - unsigned int gran; - - ASSERT_NOT_IN_ATOMIC(); - - SCHED_STAT_CRANK(sched_run); - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - gran = sr->granularity; - - lock = pcpu_schedule_lock_irq(cpu); - - if ( prev->rendezvous_in_cnt ) - { - /* - * We have a race: sched_slave() should be called, so raise a softirq - * in order to re-enter schedule() later and call sched_slave() now. - */ - pcpu_schedule_unlock_irq(lock, cpu); - - rcu_read_unlock(&sched_res_rculock); - - raise_softirq(SCHEDULE_SOFTIRQ); - return sched_slave(); - } - - stop_timer(&sr->s_timer); - - now = NOW(); - - if ( gran > 1 ) - { - cpumask_t mask; - - prev->rendezvous_in_cnt = gran; - cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu)); - cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ); - next = sched_wait_rendezvous_in(prev, &lock, cpu, now); - if ( !next ) - return; - } - else - { - prev->rendezvous_in_cnt = 0; - next = do_schedule(prev, now, cpu); - atomic_set(&next->rendezvous_out_cnt, 0); - } - - pcpu_schedule_unlock_irq(lock, cpu); - - vnext = sched_unit2vcpu_cpu(next, cpu); - sched_context_switch(vprev, vnext, - !is_idle_unit(prev) && is_idle_unit(next), now); -} - -/* The scheduler timer: force a run through the scheduler */ -static void s_timer_fn(void *unused) -{ - raise_softirq(SCHEDULE_SOFTIRQ); - SCHED_STAT_CRANK(sched_irq); -} - -/* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ -static void vcpu_periodic_timer_fn(void *data) -{ - struct vcpu *v = data; - vcpu_periodic_timer_work(v); -} - -/* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ -static void vcpu_singleshot_timer_fn(void *data) -{ - struct vcpu *v = data; - send_timer_event(v); -} - -/* SCHEDOP_poll timeout callback. */ -static void poll_timer_fn(void *data) -{ - struct vcpu *v = data; - - if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) - vcpu_unblock(v); -} - -static struct sched_resource *sched_alloc_res(void) -{ - struct sched_resource *sr; - - sr = xzalloc(struct sched_resource); - if ( sr == NULL ) - return NULL; - if ( !zalloc_cpumask_var(&sr->cpus) ) - { - xfree(sr); - return NULL; - } - return sr; -} - -static int cpu_schedule_up(unsigned int cpu) -{ - struct sched_resource *sr; - - sr = sched_alloc_res(); - if ( sr == NULL ) - return -ENOMEM; - - sr->master_cpu = cpu; - cpumask_copy(sr->cpus, cpumask_of(cpu)); - set_sched_res(cpu, sr); - - sr->scheduler = &sched_idle_ops; - spin_lock_init(&sr->_lock); - sr->schedule_lock = &sched_free_cpu_lock; - init_timer(&sr->s_timer, s_timer_fn, NULL, cpu); - atomic_set(&per_cpu(sched_urgent_count, cpu), 0); - - /* We start with cpu granularity. */ - sr->granularity = 1; - - cpumask_set_cpu(cpu, &sched_res_mask); - - /* Boot CPU is dealt with later in scheduler_init(). */ - if ( cpu == 0 ) - return 0; - - if ( idle_vcpu[cpu] == NULL ) - vcpu_create(idle_vcpu[0]->domain, cpu); - else - idle_vcpu[cpu]->sched_unit->res = sr; - - if ( idle_vcpu[cpu] == NULL ) - return -ENOMEM; - - idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; - - /* - * No need to allocate any scheduler data, as cpus coming online are - * free initially and the idle scheduler doesn't need any data areas - * allocated. 
- */ - - sr->curr = idle_vcpu[cpu]->sched_unit; - sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit; - - sr->sched_priv = NULL; - - return 0; -} - -static void sched_res_free(struct rcu_head *head) -{ - struct sched_resource *sr = container_of(head, struct sched_resource, rcu); - - free_cpumask_var(sr->cpus); - if ( sr->sched_unit_idle ) - sched_free_unit_mem(sr->sched_unit_idle); - xfree(sr); -} - -static void cpu_schedule_down(unsigned int cpu) -{ - struct sched_resource *sr; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - - kill_timer(&sr->s_timer); - - cpumask_clear_cpu(cpu, &sched_res_mask); - set_sched_res(cpu, NULL); - - /* Keep idle unit. */ - sr->sched_unit_idle = NULL; - call_rcu(&sr->rcu, sched_res_free); - - rcu_read_unlock(&sched_res_rculock); -} - -void sched_rm_cpu(unsigned int cpu) -{ - int rc; - - rcu_read_lock(&domlist_read_lock); - rc = cpu_disable_scheduler(cpu); - BUG_ON(rc); - rcu_read_unlock(&domlist_read_lock); - cpu_schedule_down(cpu); -} - -static int cpu_schedule_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int rc = 0; - - /* - * All scheduler related suspend/resume handling needed is done in - * cpupool.c. - */ - if ( system_state > SYS_STATE_active ) - return NOTIFY_DONE; - - rcu_read_lock(&sched_res_rculock); - - /* - * From the scheduler perspective, bringing up a pCPU requires - * allocating and initializing the per-pCPU scheduler specific data, - * as well as "registering" this pCPU to the scheduler (which may - * involve modifying some scheduler wide data structures). - * As new pCPUs always start as "free" cpus with the minimal idle - * scheduler being in charge, we don't need any of that. - * - * On the other hand, at teardown, we need to reverse what has been done - * during initialization, and then free the per-pCPU specific data. A - * pCPU brought down is not forced through "free" cpus, so here we need to - * use the appropriate hooks. - * - * This happens by calling the deinit_pdata and free_pdata hooks, in this - * order. If no per-pCPU memory was allocated, there is no need to - * provide an implementation of free_pdata. deinit_pdata may, however, - * be necessary/useful in this case too (e.g., it can undo something done - * on scheduler wide data structure during init_pdata). Both deinit_pdata - * and free_pdata are called during CPU_DEAD. - * - * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED. - */ - switch ( action ) - { - case CPU_UP_PREPARE: - rc = cpu_schedule_up(cpu); - break; - case CPU_DOWN_PREPARE: - rcu_read_lock(&domlist_read_lock); - rc = cpu_disable_scheduler_check(cpu); - rcu_read_unlock(&domlist_read_lock); - break; - case CPU_DEAD: - sched_rm_cpu(cpu); - break; - case CPU_UP_CANCELED: - cpu_schedule_down(cpu); - break; - default: - break; - } - - rcu_read_unlock(&sched_res_rculock); - - return !rc ? 
NOTIFY_DONE : notifier_from_errno(rc); -} - -static struct notifier_block cpu_schedule_nfb = { - .notifier_call = cpu_schedule_callback -}; - -const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu) -{ - const cpumask_t *mask; - - switch ( opt ) - { - case SCHED_GRAN_cpu: - mask = cpumask_of(cpu); - break; - case SCHED_GRAN_core: - mask = per_cpu(cpu_sibling_mask, cpu); - break; - case SCHED_GRAN_socket: - mask = per_cpu(cpu_core_mask, cpu); - break; - default: - ASSERT_UNREACHABLE(); - return NULL; - } - - return mask; -} - -static void schedule_dummy(void) -{ - sched_tasklet_check_cpu(smp_processor_id()); -} - -void scheduler_disable(void) -{ - scheduler_active = false; - open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy); - open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy); -} - -void scheduler_enable(void) -{ - open_softirq(SCHEDULE_SOFTIRQ, schedule); - open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave); - scheduler_active = true; -} - -/* Initialise the data structures. */ -void __init scheduler_init(void) -{ - struct domain *idle_domain; - int i; - - scheduler_enable(); - - for ( i = 0; i < NUM_SCHEDULERS; i++) - { -#define sched_test_func(f) \ - if ( !schedulers[i]->f ) \ - { \ - printk("scheduler %s misses .%s, dropped\n", \ - schedulers[i]->opt_name, #f); \ - schedulers[i] = NULL; \ - } - - sched_test_func(init); - sched_test_func(deinit); - sched_test_func(pick_resource); - sched_test_func(alloc_udata); - sched_test_func(free_udata); - sched_test_func(switch_sched); - sched_test_func(do_schedule); - -#undef sched_test_func - - if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) - { - printk("scheduler %s failed initialization, dropped\n", - schedulers[i]->opt_name); - schedulers[i] = NULL; - } - - if ( schedulers[i] && !ops.name && - !strcmp(schedulers[i]->opt_name, opt_sched) ) - ops = *schedulers[i]; - } - - if ( !ops.name ) - { - printk("Could not find scheduler: %s\n", opt_sched); - for ( i = 0; i < NUM_SCHEDULERS; i++ ) - if ( schedulers[i] && - !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) ) - { - ops = *schedulers[i]; - break; - } - BUG_ON(!ops.name); - printk("Using '%s' (%s)\n", ops.name, ops.opt_name); - } - - if ( cpu_schedule_up(0) ) - BUG(); - register_cpu_notifier(&cpu_schedule_nfb); - - printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - if ( sched_init(&ops) ) - panic("scheduler returned error on init\n"); - - if ( sched_ratelimit_us && - (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX - || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) - { - printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" - " Resetting to default %u\n", - XEN_SYSCTL_SCHED_RATELIMIT_MIN, - XEN_SYSCTL_SCHED_RATELIMIT_MAX, - SCHED_DEFAULT_RATELIMIT_US); - sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; - } - - idle_domain = domain_create(DOMID_IDLE, NULL, false); - BUG_ON(IS_ERR(idle_domain)); - BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu)); - idle_domain->vcpu = idle_vcpu; - idle_domain->max_vcpus = nr_cpu_ids; - if ( vcpu_create(idle_domain, 0) == NULL ) - BUG(); - - rcu_read_lock(&sched_res_rculock); - - get_sched_res(0)->curr = idle_vcpu[0]->sched_unit; - get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit; - - rcu_read_unlock(&sched_res_rculock); -} - -/* - * Move a pCPU from free cpus (running the idle scheduler) to a cpupool - * using any "real" scheduler. - * The cpu is still marked as "free" and not yet valid for its cpupool. 
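- * Simplified, the switch below boils down to:
- *
- *     ppriv = sched_alloc_pdata(new_ops, cpu);
- *     vpriv = sched_alloc_udata(new_ops, idle->sched_unit, idle->domain->sched_priv);
- *     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
- *     new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
- *     sr->schedule_lock = new_lock;
- *     spin_unlock_irqrestore(old_lock, flags);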
- */ -int schedule_cpu_add(unsigned int cpu, struct cpupool *c) -{ - struct vcpu *idle; - void *ppriv, *vpriv; - struct scheduler *new_ops = c->sched; - struct sched_resource *sr; - spinlock_t *old_lock, *new_lock; - unsigned long flags; - int ret = 0; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid)); - ASSERT(get_sched_res(cpu)->cpupool == NULL); - - /* - * To setup the cpu for the new scheduler we need: - * - a valid instance of per-CPU scheduler specific data, as it is - * allocated by sched_alloc_pdata(). Note that we do not want to - * initialize it yet (i.e., we are not calling sched_init_pdata()). - * That will be done by the target scheduler, in sched_switch_sched(), - * in proper ordering and with locking. - * - a valid instance of per-vCPU scheduler specific data, for the idle - * vCPU of cpu. That is what the target scheduler will use for the - * sched_priv field of the per-vCPU info of the idle domain. - */ - idle = idle_vcpu[cpu]; - ppriv = sched_alloc_pdata(new_ops, cpu); - if ( IS_ERR(ppriv) ) - { - ret = PTR_ERR(ppriv); - goto out; - } - - vpriv = sched_alloc_udata(new_ops, idle->sched_unit, - idle->domain->sched_priv); - if ( vpriv == NULL ) - { - sched_free_pdata(new_ops, ppriv, cpu); - ret = -ENOMEM; - goto out; - } - - /* - * The actual switch, including the rerouting of the scheduler lock to - * whatever new_ops prefers, needs to happen in one critical section, - * protected by old_ops' lock, or races are possible. - * It is, in fact, the lock of the idle scheduler that we are taking. - * But that is ok as anyone trying to schedule on this cpu will spin until - * when we release that lock (bottom of this function). When he'll get the - * lock --thanks to the loop inside *_schedule_lock() functions-- he'll - * notice that the lock itself changed, and retry acquiring the new one - * (which will be the correct, remapped one, at that point). - */ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - if ( cpupool_get_granularity(c) > 1 ) - { - const cpumask_t *mask; - unsigned int cpu_iter, idx = 0; - struct sched_unit *old_unit, *master_unit; - struct sched_resource *sr_old; - - /* - * We need to merge multiple idle_vcpu units and sched_resource structs - * into one. As the free cpus all share the same lock we are fine doing - * that now. The worst which could happen would be someone waiting for - * the lock, thus dereferencing sched_res->schedule_lock. This is the - * reason we are freeing struct sched_res via call_rcu() to avoid the - * lock pointer suddenly disappearing. - */ - mask = sched_get_opt_cpumask(c->gran, cpu); - master_unit = idle_vcpu[cpu]->sched_unit; - - for_each_cpu ( cpu_iter, mask ) - { - if ( idx ) - cpumask_clear_cpu(cpu_iter, &sched_res_mask); - - per_cpu(sched_res_idx, cpu_iter) = idx++; - - if ( cpu == cpu_iter ) - continue; - - old_unit = idle_vcpu[cpu_iter]->sched_unit; - sr_old = get_sched_res(cpu_iter); - kill_timer(&sr_old->s_timer); - idle_vcpu[cpu_iter]->sched_unit = master_unit; - master_unit->runstate_cnt[RUNSTATE_running]++; - set_sched_res(cpu_iter, sr); - cpumask_set_cpu(cpu_iter, sr->cpus); - - call_rcu(&sr_old->rcu, sched_res_free); - } - } - - new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv); - - sr->scheduler = new_ops; - sr->sched_priv = ppriv; - - /* - * Reroute the lock to the per pCPU lock as /last/ thing. 
In fact, - * if it is free (and it can be) we want that anyone that manages - * taking it, finds all the initializations we've done above in place. - */ - smp_wmb(); - sr->schedule_lock = new_lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */ - spin_unlock_irqrestore(old_lock, flags); - - sr->granularity = cpupool_get_granularity(c); - sr->cpupool = c; - /* The cpu is added to a pool, trigger it to go pick up some work */ - cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); - -out: - rcu_read_unlock(&sched_res_rculock); - - return ret; -} - -/* - * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops - * (the idle scheduler). - * The cpu is already marked as "free" and not valid any longer for its - * cpupool. - */ -int schedule_cpu_rm(unsigned int cpu) -{ - void *ppriv_old, *vpriv_old; - struct sched_resource *sr, **sr_new = NULL; - struct sched_unit *unit; - struct scheduler *old_ops; - spinlock_t *old_lock; - unsigned long flags; - int idx, ret = -ENOMEM; - unsigned int cpu_iter; - - rcu_read_lock(&sched_res_rculock); - - sr = get_sched_res(cpu); - old_ops = sr->scheduler; - - if ( sr->granularity > 1 ) - { - sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); - if ( !sr_new ) - goto out; - for ( idx = 0; idx < sr->granularity - 1; idx++ ) - { - sr_new[idx] = sched_alloc_res(); - if ( sr_new[idx] ) - { - sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); - if ( !sr_new[idx]->sched_unit_idle ) - { - sched_res_free(&sr_new[idx]->rcu); - sr_new[idx] = NULL; - } - } - if ( !sr_new[idx] ) - { - for ( idx--; idx >= 0; idx-- ) - sched_res_free(&sr_new[idx]->rcu); - goto out; - } - sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; - sr_new[idx]->scheduler = &sched_idle_ops; - sr_new[idx]->granularity = 1; - - /* We want the lock not to change when replacing the resource. */ - sr_new[idx]->schedule_lock = sr->schedule_lock; - } - } - - ret = 0; - ASSERT(sr->cpupool != NULL); - ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); - ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); - - /* See comment in schedule_cpu_add() regarding lock switching. */ - old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); - - vpriv_old = idle_vcpu[cpu]->sched_unit->priv; - ppriv_old = sr->sched_priv; - - idx = 0; - for_each_cpu ( cpu_iter, sr->cpus ) - { - per_cpu(sched_res_idx, cpu_iter) = 0; - if ( cpu_iter == cpu ) - { - idle_vcpu[cpu_iter]->sched_unit->priv = NULL; - } - else - { - /* Initialize unit. */ - unit = sr_new[idx]->sched_unit_idle; - unit->res = sr_new[idx]; - unit->is_running = true; - sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); - sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); - - /* Adjust cpu masks of resources (old and new). */ - cpumask_clear_cpu(cpu_iter, sr->cpus); - cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); - - /* Init timer. */ - init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); - - /* Last resource initializations and insert resource pointer. */ - sr_new[idx]->master_cpu = cpu_iter; - set_sched_res(cpu_iter, sr_new[idx]); - - /* Last action: set the new lock pointer. */ - smp_mb(); - sr_new[idx]->schedule_lock = &sched_free_cpu_lock; - - idx++; - } - } - sr->scheduler = &sched_idle_ops; - sr->sched_priv = NULL; - - smp_mb(); - sr->schedule_lock = &sched_free_cpu_lock; - - /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! 
*/ - spin_unlock_irqrestore(old_lock, flags); - - sched_deinit_pdata(old_ops, ppriv_old, cpu); - - sched_free_udata(old_ops, vpriv_old); - sched_free_pdata(old_ops, ppriv_old, cpu); - - sr->granularity = 1; - sr->cpupool = NULL; - -out: - rcu_read_unlock(&sched_res_rculock); - xfree(sr_new); - - return ret; -} - -struct scheduler *scheduler_get_default(void) -{ - return &ops; -} - -struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) -{ - int i; - struct scheduler *sched; - - for ( i = 0; i < NUM_SCHEDULERS; i++ ) - if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) - goto found; - *perr = -ENOENT; - return NULL; - - found: - *perr = -ENOMEM; - if ( (sched = xmalloc(struct scheduler)) == NULL ) - return NULL; - memcpy(sched, schedulers[i], sizeof(*sched)); - if ( (*perr = sched_init(sched)) != 0 ) - { - xfree(sched); - sched = NULL; - } - - return sched; -} - -void scheduler_free(struct scheduler *sched) -{ - BUG_ON(sched == &ops); - sched_deinit(sched); - xfree(sched); -} - -void schedule_dump(struct cpupool *c) -{ - unsigned int i; - struct scheduler *sched; - cpumask_t *cpus; - - /* Locking, if necessary, must be handled withing each scheduler */ - - rcu_read_lock(&sched_res_rculock); - - if ( c != NULL ) - { - sched = c->sched; - cpus = c->cpu_valid; - printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); - sched_dump_settings(sched); - } - else - { - sched = &ops; - cpus = &cpupool_free_cpus; - } - - if ( sched->dump_cpu_state != NULL ) - { - printk("CPUs info:\n"); - for_each_cpu (i, cpus) - sched_dump_cpu_state(sched, i); - } - - rcu_read_unlock(&sched_res_rculock); -} - -void sched_tick_suspend(void) -{ - rcu_idle_enter(smp_processor_id()); - rcu_idle_timer_start(); -} - -void sched_tick_resume(void) -{ - rcu_idle_timer_stop(); - rcu_idle_exit(smp_processor_id()); -} - -void wait(void) -{ - schedule(); -} - -#ifdef CONFIG_X86 -void __init sched_setup_dom0_vcpus(struct domain *d) -{ - unsigned int i; - struct sched_unit *unit; - - for ( i = 1; i < d->max_vcpus; i++ ) - vcpu_create(d, i); - - /* - * PV-shim: vcpus are pinned 1:1. - * Initially only 1 cpu is online, others will be dealt with when - * onlining them. This avoids pinning a vcpu to a not yet online cpu here. - */ - if ( pv_shim ) - sched_set_affinity(d->vcpu[0]->sched_unit, - cpumask_of(0), cpumask_of(0)); - else - { - for_each_sched_unit ( d, unit ) - { - if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed ) - sched_set_affinity(unit, &dom0_cpus, NULL); - sched_set_affinity(unit, NULL, &dom0_cpus); - } - } - - domain_update_node_affinity(d); -} -#endif - -#ifdef CONFIG_COMPAT -#include "compat/schedule.c" -#endif - -#endif /* !COMPAT */ - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */