alistair23-linux/drivers/xen/manage.c
Dongli Zhang 5e25f5db6a xen/time: do not decrease steal time after live migration on xen
After guest live migration on xen, steal time in /proc/stat
(cpustat[CPUTIME_STEAL]) might decrease because steal returned by
xen_steal_lock() might be less than this_rq()->prev_steal_time which is
derived from previous return value of xen_steal_clock().

For instance, steal time of each vcpu is 335 before live migration.

cpu  198 0 368 200064 1962 0 0 1340 0 0
cpu0 38 0 81 50063 492 0 0 335 0 0
cpu1 65 0 97 49763 634 0 0 335 0 0
cpu2 38 0 81 50098 462 0 0 335 0 0
cpu3 56 0 107 50138 374 0 0 335 0 0

After live migration, steal time is reduced to 312.

cpu  200 0 370 200330 1971 0 0 1248 0 0
cpu0 38 0 82 50123 500 0 0 312 0 0
cpu1 65 0 97 49832 634 0 0 312 0 0
cpu2 39 0 82 50167 462 0 0 312 0 0
cpu3 56 0 107 50207 374 0 0 312 0 0

Since runstate times are cumulative and cleared during xen live migration
by xen hypervisor, the idea of this patch is to accumulate runstate times
to global percpu variables before live migration suspend. Once guest VM is
resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new
runstate times and previously accumulated times stored in global percpu
variables.

Comment above HYPERVISOR_suspend() has been removed as it is inaccurate:
the call can return an error code (e.g., possibly -EPERM in the future).

Similar and more severe issue would impact prior linux 4.8-4.10 as
discussed by Michael Las at
https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest,
which would overflow steal time and lead to 100% st usage in top command
for linux 4.8-4.10. A backport of this patch would fix that issue.

[boris: added linux/slab.h to driver/xen/time.c, slightly reformatted
        commit message]

References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
2017-11-02 16:49:41 -04:00

375 lines
7.8 KiB
C

/*
* Handle extern requests for shutdown, reboot and sysrq
*/
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <linux/stop_machine.h>
#include <linux/freezer.h>
#include <linux/syscore_ops.h>
#include <linux/export.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/hvc-console.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
enum shutdown_state {
SHUTDOWN_INVALID = -1,
SHUTDOWN_POWEROFF = 0,
SHUTDOWN_SUSPEND = 2,
/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
report a crash, not be instructed to crash!
HALT is the same as POWEROFF, as far as we're concerned. The tools use
the distinction when we return the reason code to them. */
SHUTDOWN_HALT = 4,
};
/* Ignore multiple shutdown requests. */
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
struct suspend_info {
int cancelled;
};
static RAW_NOTIFIER_HEAD(xen_resume_notifier);
void xen_resume_notifier_register(struct notifier_block *nb)
{
raw_notifier_chain_register(&xen_resume_notifier, nb);
}
EXPORT_SYMBOL_GPL(xen_resume_notifier_register);
void xen_resume_notifier_unregister(struct notifier_block *nb)
{
raw_notifier_chain_unregister(&xen_resume_notifier, nb);
}
EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister);
#ifdef CONFIG_HIBERNATE_CALLBACKS
static int xen_suspend(void *data)
{
struct suspend_info *si = data;
int err;
BUG_ON(!irqs_disabled());
err = syscore_suspend();
if (err) {
pr_err("%s: system core suspend failed: %d\n", __func__, err);
return err;
}
gnttab_suspend();
xen_manage_runstate_time(-1);
xen_arch_pre_suspend();
si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
? virt_to_gfn(xen_start_info)
: 0);
xen_arch_post_suspend(si->cancelled);
xen_manage_runstate_time(si->cancelled ? 1 : 0);
gnttab_resume();
if (!si->cancelled) {
xen_irq_resume();
xen_timer_resume();
}
syscore_resume();
return 0;
}
static void do_suspend(void)
{
int err;
struct suspend_info si;
shutting_down = SHUTDOWN_SUSPEND;
err = freeze_processes();
if (err) {
pr_err("%s: freeze processes failed %d\n", __func__, err);
goto out;
}
err = freeze_kernel_threads();
if (err) {
pr_err("%s: freeze kernel threads failed %d\n", __func__, err);
goto out_thaw;
}
err = dpm_suspend_start(PMSG_FREEZE);
if (err) {
pr_err("%s: dpm_suspend_start %d\n", __func__, err);
goto out_thaw;
}
printk(KERN_DEBUG "suspending xenstore...\n");
xs_suspend();
err = dpm_suspend_end(PMSG_FREEZE);
if (err) {
pr_err("dpm_suspend_end failed: %d\n", err);
si.cancelled = 0;
goto out_resume;
}
xen_arch_suspend();
si.cancelled = 1;
err = stop_machine(xen_suspend, &si, cpumask_of(0));
/* Resume console as early as possible. */
if (!si.cancelled)
xen_console_resume();
raw_notifier_call_chain(&xen_resume_notifier, 0, NULL);
dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
if (err) {
pr_err("failed to start xen_suspend: %d\n", err);
si.cancelled = 1;
}
xen_arch_resume();
out_resume:
if (!si.cancelled)
xs_resume();
else
xs_suspend_cancel();
dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
out_thaw:
thaw_processes();
out:
shutting_down = SHUTDOWN_INVALID;
}
#endif /* CONFIG_HIBERNATE_CALLBACKS */
struct shutdown_handler {
#define SHUTDOWN_CMD_SIZE 11
const char command[SHUTDOWN_CMD_SIZE];
bool flag;
void (*cb)(void);
};
static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused)
{
switch (code) {
case SYS_DOWN:
case SYS_HALT:
case SYS_POWER_OFF:
shutting_down = SHUTDOWN_POWEROFF;
default:
break;
}
return NOTIFY_DONE;
}
static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
orderly_poweroff(false);
break;
default:
/* Don't do it when we are halting/rebooting. */
pr_info("Ignoring Xen toolstack shutdown.\n");
break;
}
}
static void do_reboot(void)
{
shutting_down = SHUTDOWN_POWEROFF; /* ? */
ctrl_alt_del();
}
static struct shutdown_handler shutdown_handlers[] = {
{ "poweroff", true, do_poweroff },
{ "halt", false, do_poweroff },
{ "reboot", true, do_reboot },
#ifdef CONFIG_HIBERNATE_CALLBACKS
{ "suspend", true, do_suspend },
#endif
};
static void shutdown_handler(struct xenbus_watch *watch,
const char *path, const char *token)
{
char *str;
struct xenbus_transaction xbt;
int err;
int idx;
if (shutting_down != SHUTDOWN_INVALID)
return;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
/* Ignore read errors and empty reads. */
if (XENBUS_IS_ERR_READ(str)) {
xenbus_transaction_end(xbt, 1);
return;
}
for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) {
if (strcmp(str, shutdown_handlers[idx].command) == 0)
break;
}
/* Only acknowledge commands which we are prepared to handle. */
if (idx < ARRAY_SIZE(shutdown_handlers))
xenbus_write(xbt, "control", "shutdown", "");
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN) {
kfree(str);
goto again;
}
if (idx < ARRAY_SIZE(shutdown_handlers)) {
shutdown_handlers[idx].cb();
} else {
pr_info("Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID;
}
kfree(str);
}
#ifdef CONFIG_MAGIC_SYSRQ
static void sysrq_handler(struct xenbus_watch *watch, const char *path,
const char *token)
{
char sysrq_key = '\0';
struct xenbus_transaction xbt;
int err;
again:
err = xenbus_transaction_start(&xbt);
if (err)
return;
err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key);
if (err < 0) {
/*
* The Xenstore watch fires directly after registering it and
* after a suspend/resume cycle. So ENOENT is no error but
* might happen in those cases.
*/
if (err != -ENOENT)
pr_err("Error %d reading sysrq code in control/sysrq\n",
err);
xenbus_transaction_end(xbt, 1);
return;
}
if (sysrq_key != '\0')
xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)
goto again;
if (sysrq_key != '\0')
handle_sysrq(sysrq_key);
}
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
#endif
static struct xenbus_watch shutdown_watch = {
.node = "control/shutdown",
.callback = shutdown_handler
};
static struct notifier_block xen_reboot_nb = {
.notifier_call = poweroff_nb,
};
static int setup_shutdown_watcher(void)
{
int err;
int idx;
#define FEATURE_PATH_SIZE (SHUTDOWN_CMD_SIZE + sizeof("feature-"))
char node[FEATURE_PATH_SIZE];
err = register_xenbus_watch(&shutdown_watch);
if (err) {
pr_err("Failed to set shutdown watcher\n");
return err;
}
#ifdef CONFIG_MAGIC_SYSRQ
err = register_xenbus_watch(&sysrq_watch);
if (err) {
pr_err("Failed to set sysrq watcher\n");
return err;
}
#endif
for (idx = 0; idx < ARRAY_SIZE(shutdown_handlers); idx++) {
if (!shutdown_handlers[idx].flag)
continue;
snprintf(node, FEATURE_PATH_SIZE, "feature-%s",
shutdown_handlers[idx].command);
xenbus_printf(XBT_NIL, "control", node, "%u", 1);
}
return 0;
}
static int shutdown_event(struct notifier_block *notifier,
unsigned long event,
void *data)
{
setup_shutdown_watcher();
return NOTIFY_DONE;
}
int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
};
if (!xen_domain())
return -ENODEV;
register_xenstore_notifier(&xenstore_notifier);
register_reboot_notifier(&xen_reboot_nb);
return 0;
}
EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
subsys_initcall(xen_setup_shutdown_event);