alistair23-linux/kernel/crash_core.c
David Hildenbrand e04b742f74 kexec: export PG_offline to VMCOREINFO
Right now, pages inflated as part of a balloon driver will be dumped by
dump tools like makedumpfile.  While XEN is able to check in the crash
kernel whether a certain pfn is actuall backed by memory in the
hypervisor (see xen_oldmem_pfn_is_ram) and optimize this case, dumps of
other balloon inflated memory will essentially result in zero pages
getting allocated by the hypervisor and the dump getting filled with
this data.

The allocation and reading of zero pages can directly be avoided if a
dumping tool could know which pages only contain stale information not
to be dumped.

We now have PG_offline which can be (and already is by virtio-balloon)
used for marking pages as logically offline.  Follow up patches will
make use of this flag also in other balloon implementations.

Let's export PG_offline via PAGE_OFFLINE_MAPCOUNT_VALUE, so makedumpfile
can directly skip pages that are logically offline and the content
therefore stale.

Please note that this is also helpful for a problem we were seeing under
Hyper-V: Dumping logically offline memory (pages kept fake offline while
onlining a section via online_page_callback) would under some condicions
result in a kernel panic when dumping them.

Link: http://lkml.kernel.org/r/20181119101616.8901-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Lianbo Jiang <lijiang@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Kazuhito Hagio <k-hagio@ab.jp.nec.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Christian Hansen <chansen3@cisco.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Juergen Gross <jgross@suse.com>
Cc: Julien Freche <jfreche@vmware.com>
Cc: Kairui Song <kasong@redhat.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Miles Chen <miles.chen@mediatek.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Pankaj gupta <pagupta@redhat.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xavier Deguillard <xdeguillard@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-05 21:07:14 -08:00

478 lines
12 KiB
C

/*
* crash.c - kernel crash support code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
#include <linux/crash_core.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/sections.h>
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
u32 *vmcoreinfo_note;
/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
static unsigned char *vmcoreinfo_data_safecopy;
/*
* parsing the "crashkernel" commandline
*
* this code is intended to be called from architecture specific code
*/
/*
* This function parses command lines in the format
*
* crashkernel=ramsize-range:size[,...][@offset]
*
* The function returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_mem(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
char *cur = cmdline, *tmp;
/* for each entry of the comma-separated list */
do {
unsigned long long start, end = ULLONG_MAX, size;
/* get the start of the range */
start = memparse(cur, &tmp);
if (cur == tmp) {
pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (*cur != '-') {
pr_warn("crashkernel: '-' expected\n");
return -EINVAL;
}
cur++;
/* if no ':' is here, than we read the end */
if (*cur != ':') {
end = memparse(cur, &tmp);
if (cur == tmp) {
pr_warn("crashkernel: Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (end <= start) {
pr_warn("crashkernel: end <= start\n");
return -EINVAL;
}
}
if (*cur != ':') {
pr_warn("crashkernel: ':' expected\n");
return -EINVAL;
}
cur++;
size = memparse(cur, &tmp);
if (cur == tmp) {
pr_warn("Memory value expected\n");
return -EINVAL;
}
cur = tmp;
if (size >= system_ram) {
pr_warn("crashkernel: invalid size\n");
return -EINVAL;
}
/* match ? */
if (system_ram >= start && system_ram < end) {
*crash_size = size;
break;
}
} while (*cur++ == ',');
if (*crash_size > 0) {
while (*cur && *cur != ' ' && *cur != '@')
cur++;
if (*cur == '@') {
cur++;
*crash_base = memparse(cur, &tmp);
if (cur == tmp) {
pr_warn("Memory value expected after '@'\n");
return -EINVAL;
}
}
} else
pr_info("crashkernel size resulted in zero bytes\n");
return 0;
}
/*
* That function parses "simple" (old) crashkernel command lines like
*
* crashkernel=size[@offset]
*
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_simple(char *cmdline,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
char *cur = cmdline;
*crash_size = memparse(cmdline, &cur);
if (cmdline == cur) {
pr_warn("crashkernel: memory value expected\n");
return -EINVAL;
}
if (*cur == '@')
*crash_base = memparse(cur+1, &cur);
else if (*cur != ' ' && *cur != '\0') {
pr_warn("crashkernel: unrecognized char: %c\n", *cur);
return -EINVAL;
}
return 0;
}
#define SUFFIX_HIGH 0
#define SUFFIX_LOW 1
#define SUFFIX_NULL 2
static __initdata char *suffix_tbl[] = {
[SUFFIX_HIGH] = ",high",
[SUFFIX_LOW] = ",low",
[SUFFIX_NULL] = NULL,
};
/*
* That function parses "suffix" crashkernel command lines like
*
* crashkernel=size,[high|low]
*
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_suffix(char *cmdline,
unsigned long long *crash_size,
const char *suffix)
{
char *cur = cmdline;
*crash_size = memparse(cmdline, &cur);
if (cmdline == cur) {
pr_warn("crashkernel: memory value expected\n");
return -EINVAL;
}
/* check with suffix */
if (strncmp(cur, suffix, strlen(suffix))) {
pr_warn("crashkernel: unrecognized char: %c\n", *cur);
return -EINVAL;
}
cur += strlen(suffix);
if (*cur != ' ' && *cur != '\0') {
pr_warn("crashkernel: unrecognized char: %c\n", *cur);
return -EINVAL;
}
return 0;
}
static __init char *get_last_crashkernel(char *cmdline,
const char *name,
const char *suffix)
{
char *p = cmdline, *ck_cmdline = NULL;
/* find crashkernel and use the last one if there are more */
p = strstr(p, name);
while (p) {
char *end_p = strchr(p, ' ');
char *q;
if (!end_p)
end_p = p + strlen(p);
if (!suffix) {
int i;
/* skip the one with any known suffix */
for (i = 0; suffix_tbl[i]; i++) {
q = end_p - strlen(suffix_tbl[i]);
if (!strncmp(q, suffix_tbl[i],
strlen(suffix_tbl[i])))
goto next;
}
ck_cmdline = p;
} else {
q = end_p - strlen(suffix);
if (!strncmp(q, suffix, strlen(suffix)))
ck_cmdline = p;
}
next:
p = strstr(p+1, name);
}
if (!ck_cmdline)
return NULL;
return ck_cmdline;
}
static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
const char *name,
const char *suffix)
{
char *first_colon, *first_space;
char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
ck_cmdline += strlen(name);
if (suffix)
return parse_crashkernel_suffix(ck_cmdline, crash_size,
suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
*/
first_colon = strchr(ck_cmdline, ':');
first_space = strchr(ck_cmdline, ' ');
if (first_colon && (!first_space || first_colon < first_space))
return parse_crashkernel_mem(ck_cmdline, system_ram,
crash_size, crash_base);
return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
}
/*
* That function is the entry point for command line parsing and should be
* called from the arch-specific code.
*/
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel=", NULL);
}
int __init parse_crashkernel_high(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}
int __init parse_crashkernel_low(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
struct elf_note *note = (struct elf_note *)buf;
note->n_namesz = strlen(name) + 1;
note->n_descsz = data_len;
note->n_type = type;
buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
memcpy(buf, name, note->n_namesz);
buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word));
memcpy(buf, data, data_len);
buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word));
return buf;
}
void final_note(Elf_Word *buf)
{
memset(buf, 0, sizeof(struct elf_note));
}
static void update_vmcoreinfo_note(void)
{
u32 *buf = vmcoreinfo_note;
if (!vmcoreinfo_size)
return;
buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
vmcoreinfo_size);
final_note(buf);
}
void crash_update_vmcoreinfo_safecopy(void *ptr)
{
if (ptr)
memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
vmcoreinfo_data_safecopy = ptr;
}
void crash_save_vmcoreinfo(void)
{
if (!vmcoreinfo_note)
return;
/* Use the safe copy to generate vmcoreinfo note if have */
if (vmcoreinfo_data_safecopy)
vmcoreinfo_data = vmcoreinfo_data_safecopy;
vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds());
update_vmcoreinfo_note();
}
void vmcoreinfo_append_str(const char *fmt, ...)
{
va_list args;
char buf[0x50];
size_t r;
va_start(args, fmt);
r = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
}
/*
* provide an empty default implementation here -- architecture
* code may override this
*/
void __weak arch_crash_save_vmcoreinfo(void)
{}
phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
return __pa(vmcoreinfo_note);
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
if (!vmcoreinfo_data) {
pr_warn("Memory allocation for vmcoreinfo_data failed\n");
return -ENOMEM;
}
vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
GFP_KERNEL | __GFP_ZERO);
if (!vmcoreinfo_note) {
free_page((unsigned long)vmcoreinfo_data);
vmcoreinfo_data = NULL;
pr_warn("Memory allocation for vmcoreinfo_note failed\n");
return -ENOMEM;
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
#ifdef CONFIG_MMU
VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
#endif
VMCOREINFO_STRUCT_SIZE(page);
VMCOREINFO_STRUCT_SIZE(pglist_data);
VMCOREINFO_STRUCT_SIZE(zone);
VMCOREINFO_STRUCT_SIZE(free_area);
VMCOREINFO_STRUCT_SIZE(list_head);
VMCOREINFO_SIZE(nodemask_t);
VMCOREINFO_OFFSET(page, flags);
VMCOREINFO_OFFSET(page, _refcount);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(page, compound_dtor);
VMCOREINFO_OFFSET(page, compound_order);
VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
VMCOREINFO_OFFSET(pglist_data, node_mem_map);
#endif
VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
VMCOREINFO_OFFSET(pglist_data, node_id);
VMCOREINFO_OFFSET(zone, free_area);
VMCOREINFO_OFFSET(zone, vm_stat);
VMCOREINFO_OFFSET(zone, spanned_pages);
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
VMCOREINFO_NUMBER(PG_swapbacked);
VMCOREINFO_NUMBER(PG_slab);
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
VMCOREINFO_NUMBER(PG_head_mask);
#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
return 0;
}
subsys_initcall(crash_save_vmcoreinfo_init);