From 287eeb5e02bfd9ddcb881f47400510b5cda686d1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH 01/94] [PATCH] x86-64: Update defconfig Signed-off-by: Andi Kleen --- arch/x86_64/defconfig | 45 ++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69584c295305..293a4a4c609e 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-rc3 -# Fri Jan 5 11:54:41 2007 +# Linux kernel version: 2.6.20-git8 +# Tue Feb 13 11:25:16 2007 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y +CONFIG_ZONE_DMA=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 @@ -201,13 +203,14 @@ CONFIG_ACPI=y CONFIG_ACPI_SLEEP=y CONFIG_ACPI_SLEEP_PROC_FS=y CONFIG_ACPI_SLEEP_PROC_SLEEP=y +CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y @@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y CONFIG_PCIEPORTBUS=y CONFIG_PCIEAER=y CONFIG_PCI_MSI=y -# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set @@ -398,6 +400,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set # @@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_TASK_IOCTL is not set # @@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y # CONFIG_BLK_DEV_JMICRON is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_BLK_DEV_TC86C001 is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y # Serial ATA (prod) and Parallel ATA (experimental) drivers # CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set CONFIG_SATA_AHCI=y CONFIG_SATA_SVW=y CONFIG_ATA_PIIX=y @@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y # CONFIG_SATA_ULI is not set CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set @@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_HPT3X2N is not set # CONFIG_PATA_HPT3X3 is not set # CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set @@ -682,9 +691,7 @@ CONFIG_IEEE1394=y # Subsystem Options # # 
CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set -# CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers @@ -706,6 +713,11 @@ CONFIG_IEEE1394_RAWIO=y # # CONFIG_I2O is not set +# +# Macintosh device drivers +# +# CONFIG_MAC_EMUMOUSEBTN is not set + # # Network device support # @@ -774,6 +786,7 @@ CONFIG_8139TOO=y # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set # CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set # # Ethernet (1000 Mbit) @@ -795,11 +808,13 @@ CONFIG_E1000=y CONFIG_TIGON3=y CONFIG_BNX2=y # CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set # # Ethernet (10000 Mbit) # # CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set @@ -1115,6 +1130,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y @@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y # HID Devices # CONFIG_HID=y +# CONFIG_HID_DEBUG is not set # # USB support @@ -1142,10 +1159,8 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set +# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_BIG_ENDIAN is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set +# CONFIG_USB_GTCO is not set # # USB Imaging devices @@ -1312,6 +1330,10 @@ CONFIG_USB_MON=y # DMA Devices # +# +# Auxiliary Display support +# + # # Virtualization # @@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set @@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set @@ -1560,4 +1582,5 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y -CONFIG_IOMAP_COPY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y From ee55c0be30429d7c3e61fa26c7f7e323c80e14f0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH 02/94] [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 51 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index bb0c376b62b3..5ae1e0bc8fd7 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-rc3 -# Fri Jan 5 11:54:46 2007 +# Linux kernel version: 2.6.20-git8 +# Tue 
Feb 13 11:25:18 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -10,6 +10,7 @@ CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y CONFIG_MMU=y +CONFIG_ZONE_DMA=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_GENERIC_BUG=y @@ -139,7 +140,6 @@ CONFIG_MPENTIUMIII=y # CONFIG_MVIAC3_2 is not set CONFIG_X86_GENERIC=y CONFIG_X86_CMPXCHG=y -CONFIG_X86_XADD=y CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_RWSEM_XCHGADD_ALGORITHM=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set @@ -198,6 +198,7 @@ CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -211,6 +212,7 @@ CONFIG_HZ_250=y CONFIG_HZ=250 # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set +CONFIG_PHYSICAL_START=0x100000 # CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x100000 # CONFIG_HOTPLUG_CPU is not set @@ -229,13 +231,14 @@ CONFIG_PM_SYSFS_DEPRECATED=y # ACPI (Advanced Configuration and Power Interface) Support # CONFIG_ACPI=y +CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set @@ -306,7 +309,6 @@ CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y # CONFIG_PCIEPORTBUS is not set CONFIG_PCI_MSI=y -# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set CONFIG_ISA_DMA_API=y @@ -347,6 +349,7 @@ CONFIG_UNIX=y CONFIG_XFRM=y # CONFIG_XFRM_USER is not set # CONFIG_XFRM_SUB_POLICY is not set +# CONFIG_XFRM_MIGRATE is not set # CONFIG_NET_KEY is not set CONFIG_INET=y CONFIG_IP_MULTICAST=y @@ -446,6 +449,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set # @@ -466,8 +470,7 @@ CONFIG_FW_LOADER=y # # Plug and Play support # -CONFIG_PNP=y -CONFIG_PNPACPI=y +# CONFIG_PNP is not set # # Block devices @@ -515,6 +518,7 @@ CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_TASK_IOCTL is not set # @@ -547,6 +551,7 @@ CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_JMICRON is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -557,6 +562,7 @@ CONFIG_BLK_DEV_PIIX=y # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_BLK_DEV_TC86C001 is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -655,6 +661,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # Serial ATA (prod) and Parallel ATA (experimental) drivers # CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set CONFIG_SATA_AHCI=y CONFIG_SATA_SVW=y CONFIG_ATA_PIIX=y @@ -670,6 +677,7 @@ CONFIG_SATA_SIL=y # CONFIG_SATA_ULI is not set CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set @@ -687,6 +695,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_HPT3X2N is not set # CONFIG_PATA_HPT3X3 is not set # CONFIG_PATA_IT821X is not set +# 
CONFIG_PATA_IT8213 is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set @@ -739,9 +748,7 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set -# CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers @@ -766,6 +773,11 @@ CONFIG_IEEE1394_RAWIO=y # # CONFIG_I2O is not set +# +# Macintosh device drivers +# +# CONFIG_MAC_EMUMOUSEBTN is not set + # # Network device support # @@ -833,6 +845,7 @@ CONFIG_8139TOO=y # CONFIG_SUNDANCE is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set # # Ethernet (1000 Mbit) @@ -855,11 +868,13 @@ CONFIG_SKY2=y CONFIG_TIGON3=y CONFIG_BNX2=y # CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set # # Ethernet (10000 Mbit) # # CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set @@ -1090,6 +1105,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y @@ -1103,6 +1119,7 @@ CONFIG_SOUND_ICH=y # HID Devices # CONFIG_HID=y +# CONFIG_HID_DEBUG is not set # # USB support @@ -1117,10 +1134,8 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1130,9 +1145,11 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set +# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_BIG_ENDIAN is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -1183,6 +1200,7 @@ CONFIG_USB_HID=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set +# CONFIG_USB_GTCO is not set # # USB Imaging devices @@ -1287,6 +1305,10 @@ CONFIG_USB_MON=y # DMA Devices # +# +# Auxiliary Display support +# + # # Virtualization # @@ -1480,6 +1502,7 @@ CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set @@ -1488,7 +1511,6 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set @@ -1533,7 +1555,8 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y -CONFIG_IOMAP_COPY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y From 0812a579c92fefa57506821fa08e90f47cb6dbdd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH 03/94] [PATCH] x86-64: Add __copy_from_user_nocache This does user copies in fs write() into the page cache with write combining. 
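[Editorial aside, not part of the patch: the copy loop added below is built on movnti, i.e. non-temporal stores that bypass the cache and drain through the write-combining buffers. A rough user-space analogue using the SSE2 intrinsic for movnti; the function name and the 8-byte alignment/length assumptions are ours, purely for illustration:

#include <emmintrin.h>	/* _mm_stream_si64 (x86-64 only), _mm_sfence */
#include <stddef.h>

/* Sketch of a cache-bypassing copy; assumes 8-byte-aligned buffers
   and len a multiple of 8. */
static void copy_nocache(void *dst, const void *src, size_t len)
{
	long long *d = dst;
	const long long *s = src;
	size_t i;

	for (i = 0; i < len / 8; i++)
		_mm_stream_si64(&d[i], s[i]);	/* movnti: store around the cache */
	_mm_sfence();	/* order the write-combined stores */
}

The kernel routine below must additionally recover from faults on user addresses mid-copy, which is what the exception table at the end of the file provides.]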
This pushes the destination out of the CPU's cache, but allows higher bandwidth in some cases. The theory is that the page cache data is usually not touched by the CPU again and it's better to not pollute the cache with it. Also it is a little faster. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/x8664_ksyms.c | 1 + arch/x86_64/lib/Makefile | 2 +- arch/x86_64/lib/copy_user_nocache.S | 217 ++++++++++++++++++++++++++++ include/asm-x86_64/uaccess.h | 14 ++ 4 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 arch/x86_64/lib/copy_user_nocache.S diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 6d77e4797a47..23a7da312f30 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index b78d4170fce2..8d5f835af481 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S new file mode 100644 index 000000000000..4620efb12f13 --- /dev/null +++ b/arch/x86_64/lib/copy_user_nocache.S @@ -0,0 +1,217 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +#define FIX_ALIGNMENT 1 + +#include <asm/current.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/cpufeature.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + * + * Input: + * rdi destination + * rsi source + * rdx count + * rcx zero flag: when 1, zero the rest of the destination on exception + * + * Output: + * eax uncopied bytes or 0 if successful.
+ */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx /* save zero flag */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + + xorl %eax,%eax /* zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movnti %r11,(%rdi) +.Ld2: movnti %r8,1*8(%rdi) +.Ld3: movnti %r9,2*8(%rdi) +.Ld4: movnti %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movnti %r11,4*8(%rdi) +.Ld6: movnti %r8,5*8(%rdi) +.Ld7: movnti %r9,6*8(%rdi) +.Ld8: movnti %r10,7*8(%rdi) + + dec %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movnti %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE %rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) /* zero flag set? 
*/ + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(__copy_user_nocache) + + diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index 8079e29c14fd..1981f70fcad1 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -367,4 +367,18 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) return copy_user_generic((__force void *)dst, src, size); } +#define ARCH_HAS_NOCACHE_UACCESS 1 +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); + +static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) +{ + might_sleep(); + return __copy_user_nocache(dst, (__force void *)src, size, 1); +} + +static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) +{ + return __copy_user_nocache(dst, (__force void *)src, size, 0); +} + #endif /* __X86_64_UACCESS_H */ From 076422d2af7e3d8e72c6e70843f6ea377714b082 Mon Sep 17 00:00:00 2001 From: Amul Shah Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH 04/94] [PATCH] x86-64: Allocate the NUMA hash function nodemap dynamically Remove the statically allocated memory-to-NUMA-node hash map in favor of a dynamically allocated, cache-aligned one. This patch has the nice side effect that it allows the hash map to grow on systems with large amounts of memory (256GB - 1TB) which nevertheless suffer from having a small PCI space tacked onto the boot node (somewhere between 192MB and 512MB on the ES7000).
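[Editorial aside, not part of the patch: the memnode map is a simple perfect hash; lookup is one shift plus one byte load, which is why its size and cache alignment matter. A miniature stand-alone illustration; every constant and the node layout below are invented:

#include <stdio.h>

static int shift = 24;			/* e.g. 16MB granularity */
static unsigned char map[512];		/* map[phys >> shift] == node id */

static int phys_to_nid(unsigned long addr)
{
	return map[addr >> shift];	/* one shift, one table load */
}

int main(void)
{
	unsigned long a;

	/* pretend node 0 owns [0, 4GB) and node 1 owns [4GB, 8GB) */
	for (a = 0; a < (8UL << 30); a += 1UL << shift)
		map[a >> shift] = (a < (4UL << 30)) ? 0 : 1;

	printf("1GB -> node %d, 5GB -> node %d\n",
	       phys_to_nid(1UL << 30), phys_to_nid(5UL << 30));
	return 0;
}

The patch keeps a small embedded table for the common case and falls back to a dynamically sized, cache-aligned allocation when the map outgrows it.]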
Signed-off-by: Amul Shah Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Rohit Seth Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 7 ++++ arch/x86_64/kernel/setup.c | 5 +++ arch/x86_64/mm/numa.c | 74 ++++++++++++++++++++++++++++++++----- include/asm-x86_64/e820.h | 1 + include/asm-x86_64/mmzone.h | 13 ++++--- 5 files changed, 85 insertions(+), 15 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 6fe191c58084..9d67955bbc31 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } +#ifdef CONFIG_NUMA + /* NUMA memory to node map */ + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { + *addrp = nodemap_addr + nodemap_size; + return 1; + } +#endif /* XXX ramdisk image here? */ return 0; } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 60477244d1a3..f330f8285499 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p) /* reserve ebda region */ if (ebda_addr) reserve_bootmem_generic(ebda_addr, ebda_size); +#ifdef CONFIG_NUMA + /* reserve nodemap region */ + if (nodemap_addr) + reserve_bootmem_generic(nodemap_addr, nodemap_size); +#endif #ifdef CONFIG_SMP /* diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606c..7d9c428f4094 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) { + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; + } + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, + nodemap_size); + if (nodemap_addr == -1UL) { + printk(KERN_ERR + "NUMA: Unable to allocate Memory to Node hash map\n"); + nodemap_addr = nodemap_size = 0; + return -1; + } + pad_addr = (nodemap_addr + pad) & ~pad; + memnodemap = phys_to_virt(pad_addr); + + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; +} + +/* + * The LSB of all start and end addresses in the node map is the value of the + * maximum possible shift. + */ +static int __init +extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +{ + int i; + unsigned long start, end; + unsigned long bitfield = 0, memtop = 0; + + for (i = 0; i < numnodes; i++) { + start = nodes[i].start; + end = nodes[i].end; + if (start >= end) + continue; + bitfield |= start | end; + if (end > memtop) + memtop = end; + } + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + int __init compute_hash_shift(struct bootnode *nodes, int numnodes) { - int shift = 20; - - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) - shift++; + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index fa2086774105..855fb4a454b6 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -56,6 +56,7 @@ extern void finish_e820_parsing(void); extern struct e820map e820; extern unsigned ebda_addr, ebda_size; +extern unsigned long nodemap_addr, nodemap_size; #endif/*!__ASSEMBLY__*/ #endif/*__E820_HEADER*/ diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index c38ebdf6f426..39ef106986eb 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -11,24 +11,25 @@ #include <asm/smp.h> -/* Should really switch to dynamic allocation at some point */ -#define NODEMAPSIZE 0x4fff - /* Simple perfect hash to map physical addresses to node numbers
*/ struct memnode { int shift; - u8 map[NODEMAPSIZE]; -} ____cacheline_aligned; + unsigned int mapsize; + u8 *map; + u8 embedded_map[64-16]; +} ____cacheline_aligned; /* total size = 64 bytes */ extern struct memnode memnode; #define memnode_shift memnode.shift #define memnodemap memnode.map +#define memnodemapsize memnode.mapsize extern struct pglist_data *node_data[]; static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) { unsigned nid; - VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE); + VIRTUAL_BUG_ON(!memnodemap); + VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize); nid = memnodemap[addr >> memnode_shift]; VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid; From 54413927f022292aeccadd268fbf1c0b42129945 Mon Sep 17 00:00:00 2001 From: Amul Shah Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 05/94] [PATCH] x86-64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix - Removed an extraneous debug message from allocate_cachealigned_memnodemap. - Changed extract_lsb_from_nodes to return 63 for the case where there was only one memory node. This prevents the creation of the dynamic hashmap. - Changed extract_lsb_from_nodes to use only the starting memory address of a node. On an ES7000, our nodes overlap the starting and ending address, meaning that we see nodes like 00000 - 10000 10000 - 20000 But other systems have nodes whose start and end addresses do not overlap. For example: 00000 - 0FFFF 10000 - 1FFFF In this case, using the ending address will result in an LSB much lower than what is possible: an LSB of 1 when in reality it should be 16. Cc: Andi Kleen Cc: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/mm/numa.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 7d9c428f4094..1ec16ea97519 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -78,11 +78,8 @@ static int __init allocate_cachealigned_memnodemap(void) unsigned long pad, pad_addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) { - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); + if (memnodemapsize <= 48) return 0; - } pad = L1_CACHE_BYTES - 1; pad_addr = 0x8000; @@ -110,7 +107,7 @@ static int __init extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) { - int i; + int i, nodes_used = 0; unsigned long start, end; unsigned long bitfield = 0, memtop = 0; @@ -119,11 +116,15 @@ extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) end = nodes[i].end; if (start >= end) continue; - bitfield |= start | end; + bitfield |= start; + nodes_used++; if (end > memtop) memtop = end; } - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); memnodemapsize = (memtop >> i)+1; return i; } From 464d1a78fbf8cf6c7fd970e7b3e2db50a320ce28 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 06/94] [PATCH] i386: Convert i386 PDA code to use %fs Convert the PDA code to use %fs rather than %gs as the segment for per-processor data. This is because some processors show a small but measurable performance gain for reloading a NULL segment selector (as %fs generally is in user-space) versus a non-NULL one (as %gs generally is).
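[Editorial aside, not part of the commit: the selector-reload cost is easy to probe from user space. A minimal sketch, assuming only that rdtsc is usable; it times reloading a scratch segment register (%es) with a NULL selector versus a valid one (borrowing the %ss selector):

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t time_reload(uint16_t sel)
{
	uint64_t t0 = rdtsc();
	int i;

	for (i = 0; i < 1000000; i++)
		__asm__ __volatile__("mov %0, %%es" : : "r"(sel));
	return (rdtsc() - t0) / 1000000;
}

int main(void)
{
	uint16_t ss;

	__asm__ __volatile__("mov %%ss, %0" : "=r"(ss));
	printf("NULL selector:      ~%llu cycles/reload\n",
	       (unsigned long long)time_reload(0));
	printf("valid selector %#x: ~%llu cycles/reload\n", ss,
	       (unsigned long long)time_reload(ss));
	return 0;
}

Loading a NULL selector skips the descriptor fetch from the GDT/LDT, which is where the saving comes from.]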
On modern processors the difference is very small, perhaps undetectable. Some old AMD "K6 3D+" processors are noticeably slower when %fs is used rather than %gs; I have no idea why this might be, but I think they're sufficiently rare that it doesn't matter much. This patch also fixes the math emulator, which had not been adjusted to match the changed struct pt_regs. [frederik.deweerdt@gmail.com: fixit with gdb] [mingo@elte.hu: Fix KVM too] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Ian Campbell Acked-by: Ingo Molnar Acked-by: Zachary Amsden Cc: Eric Dumazet Signed-off-by: Frederik Deweerdt Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 2 +- arch/i386/kernel/cpu/common.c | 14 +++++++------- arch/i386/kernel/entry.S | 32 +++++++++++++++---------------- arch/i386/kernel/head.S | 6 +++--- arch/i386/kernel/kprobes.c | 4 ++-- arch/i386/kernel/process.c | 24 +++++++++++------------ arch/i386/kernel/ptrace.c | 16 ++++++++-------- arch/i386/kernel/signal.c | 10 +++++----- arch/i386/kernel/traps.c | 7 ++++--- arch/i386/kernel/vm86.c | 33 ++++++++++++++++---------------- arch/i386/math-emu/get_address.c | 14 +++++--------- drivers/kvm/vmx.c | 12 ++++++------ include/asm-i386/elf.h | 4 ++-- include/asm-i386/mmu_context.h | 2 +- include/asm-i386/pda.h | 12 ++++++------ include/asm-i386/processor.h | 6 +++--- include/asm-i386/ptrace.h | 4 ++-- 17 files changed, 99 insertions(+), 103 deletions(-) diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1b2f3cd33270..c37535163bfc 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -72,7 +72,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_GS, pt_regs, xgs); + OFFSET(PT_FS, pt_regs, xfs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 8a8bbdaaf38a..dcbbd0a8bfc2 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -605,7 +605,7 @@ void __init early_cpu_init(void) struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xgs = __KERNEL_PDA; + regs->xfs = __KERNEL_PDA; return regs; } @@ -662,12 +662,12 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; -static inline void set_kernel_gs(void) +static inline void set_kernel_fs(void) { - /* Set %gs for this CPU's PDA. Memory clobber is to create a + /* Set %fs for this CPU's PDA. Memory clobber is to create a barrier with respect to any PDA operations, so the compiler doesn't move any before here. */ - asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } /* Initialize the CPU's GDT and PDA. The boot CPU does this for @@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu) the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); - set_kernel_gs(); + set_kernel_fs(); } /* Common CPU init for both boot and secondary CPUs */ @@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs. */ - asm volatile ("mov %0, %%fs" : : "r" (0)); + /* Clear %gs.
*/ + asm volatile ("mov %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5e47683fc63a..8c6a22a42d2e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -30,7 +30,7 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - %gs + * 24(%esp) - %fs * 28(%esp) - orig_eax * 2C(%esp) - %eip * 30(%esp) - %cs @@ -99,9 +99,9 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ - pushl %gs; \ + pushl %fs; \ CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET gs, 0;*/\ + /*CFI_REL_OFFSET fs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -133,7 +133,7 @@ VM_MASK = 0x00020000 movl %edx, %ds; \ movl %edx, %es; \ movl $(__KERNEL_PDA), %edx; \ - movl %edx, %gs + movl %edx, %fs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -166,9 +166,9 @@ VM_MASK = 0x00020000 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -3: popl %gs; \ +3: popl %fs; \ CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE gs;*/\ + /*CFI_RESTORE fs;*/\ .pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ jmp 1b; \ @@ -349,11 +349,11 @@ sysenter_past_esp: movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON -1: mov PT_GS(%esp), %gs +1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC .pushsection .fixup,"ax" -2: movl $0,PT_GS(%esp) +2: movl $0,PT_FS(%esp) jmp 1b .section __ex_table,"a" .align 4 @@ -550,7 +550,7 @@ syscall_badsys: #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %gs:PDA_cpu, %ebx; \ + movl %fs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ @@ -632,7 +632,7 @@ KPROBE_ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: - /* the function address is in %gs's slot on the stack */ + /* the function address is in %fs's slot on the stack */ pushl %es CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET es, 0*/ @@ -661,20 +661,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %gs + pushl %fs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET gs, 0*/ + /*CFI_REL_OFFSET fs, 0*/ movl $(__KERNEL_PDA), %ecx - movl %ecx, %gs + movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl PT_GS(%esp), %edi # get the function address + movl PT_FS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_GS(%esp) - /*CFI_REL_OFFSET gs, ES*/ + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index cb9abdfced9b..15336c8b5960 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -319,12 +319,12 @@ is386: movl $2,%ecx # set MP movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS and LDT - movl %eax,%fs + xorl %eax,%eax # Clear GS and LDT + movl %eax,%gs lldt %ax movl $(__KERNEL_PDA),%eax - mov %eax,%gs + mov %eax,%fs cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index af1d53344993..b85cfa3ce1dd 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -363,7 +363,7 @@ no_kprobe: " pushf\n" /* skip cs, eip, orig_eax */ " subl $12, %esp\n" - " pushl %gs\n" + " pushl %fs\n" " pushl %ds\n" " pushl %es\n" " pushl 
%eax\n" @@ -387,7 +387,7 @@ no_kprobe: " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* skip eip, orig_eax, es, ds, gs */ + /* skip eip, orig_eax, es, ds, fs */ " addl $20, %esp\n" " popf\n" " ret\n"); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index c641056233a6..23ae198dbbc3 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -308,8 +308,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x GS: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); + printk(" DS: %04x ES: %04x FS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -340,7 +340,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xgs = __KERNEL_PDA; + regs.xfs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -425,7 +425,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, p->thread.eip = (unsigned long) ret_from_fork; - savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -501,8 +501,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.eax = regs->eax; dump->regs.ds = regs->xds; dump->regs.es = regs->xes; - savesegment(fs,dump->regs.fs); - dump->regs.gs = regs->xgs; + dump->regs.fs = regs->xfs; + savesegment(gs,dump->regs.gs); dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; @@ -653,7 +653,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_esp0(tss, next); /* - * Save away %fs. No need to save %gs, as it was saved on the + * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this * before setting the new TLS descriptors avoids the situation @@ -662,7 +662,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - savesegment(fs, prev->fs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -670,12 +670,10 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_TLS(next, cpu); /* - * Restore %fs if needed. - * - * Glibc normally makes %fs be zero. 
+ * Restore %gs if needed (which is common) */ - if (unlikely(prev->fs | next->fs)) - loadsegment(fs, next->fs); + if (prev->gs | next->gs) + loadsegment(gs, next->gs); write_pda(pcurrent, next_p); diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index af8aabe85800..4a8f8a259723 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -89,14 +89,14 @@ static int putreg(struct task_struct *child, unsigned long regno, unsigned long value) { switch (regno >> 2) { - case FS: + case GS: if (value && (value & 3) != 3) return -EIO; - child->thread.fs = value; + child->thread.gs = value; return 0; case DS: case ES: - case GS: + case FS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -112,7 +112,7 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; put_stack_long(child, regno, value); return 0; @@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child, unsigned long retval = ~0UL; switch (regno >> 2) { - case FS: - retval = child->thread.fs; + case GS: + retval = child->thread.gs; break; case DS: case ES: - case GS: + case FS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; retval &= get_stack_long(child, regno); } diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 65d7620eaa09..8f4afcc7d2ab 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -128,8 +128,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - COPY_SEG(gs); - GET_SEG(fs); + GET_SEG(gs); + COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); COPY(edi); @@ -244,9 +244,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); - savesegment(fs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 0efad8aeb41a..4ec21037a361 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -291,10 +291,11 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = 1; unsigned long esp; - unsigned short ss; + unsigned short ss, gs; esp = (unsigned long) (®s->esp); savesegment(ss, ss); + savesegment(gs, gs); if (user_mode_vm(regs)) { in_kernel = 0; esp = regs->esp; @@ -313,8 +314,8 @@ void show_registers(struct pt_regs *regs) regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index 
be2f96e67f78..d1b8f2b7aea6 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xfs, so copy everything up to - (but not including) xgs, and then rest after xgs. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_to_user(&user->__null_gs, ®s->pt.xgs, + /* kernel_vm86_regs is missing xgs, so copy everything up to + (but not including) orig_eax, and then rest including orig_eax. */ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs)); + offsetof(struct kernel_vm86_regs, pt.orig_eax)); return ret; } @@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_from_user(®s->pt.xgs, &user->__null_gs, + /* copy eax-xfs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + /* copy orig_eax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs) + + offsetof(struct kernel_vm86_regs, pt.orig_eax) + extra); - return ret; } @@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) ret = KVM86->regs32; - loadsegment(fs, current->thread.saved_fs); - ret->xgs = current->thread.saved_gs; + ret->xfs = current->thread.saved_fs; + loadsegment(gs, current->thread.saved_gs); return ret; } @@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs.pt.xds = 0; info->regs.pt.xes = 0; - info->regs.pt.xgs = 0; + info->regs.pt.xfs = 0; -/* we are clearing fs later just before "jmp resume_userspace", +/* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ @@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - savesegment(fs, tsk->thread.saved_fs); - tsk->thread.saved_gs = info->regs32->xgs; + tsk->thread.saved_fs = info->regs32->xfs; + savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" - "mov %2, %%fs\n\t" + "mov %2, %%gs\n\t" "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c index 9819b705efa4..2e2c51a8bd3a 100644 --- a/arch/i386/math-emu/get_address.c +++ b/arch/i386/math-emu/get_address.c @@ -56,15 +56,14 @@ static int reg_offset_vm86[] = { #define VM86_REG_(x) (*(unsigned short *) \ (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) -/* These are dummy, fs and gs are not saved on the stack. */ -#define ___FS ___ds +/* This dummy, gs is not saved on the stack. 
*/ #define ___GS ___ds static int reg_offset_pm[] = { offsetof(struct info,___cs), offsetof(struct info,___ds), offsetof(struct info,___es), - offsetof(struct info,___FS), + offsetof(struct info,___fs), offsetof(struct info,___GS), offsetof(struct info,___ss), offsetof(struct info,___ds) @@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm, u_char segment, switch ( segment ) { - /* fs and gs aren't used by the kernel, so they still have their - user-space values. */ - case PREFIX_FS_-1: - /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ - savesegment(fs, addr->selector); - break; + /* gs isn't used by the kernel, so it still has its + user-space value. */ case PREFIX_GS_-1: + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ savesegment(gs, addr->selector); break; default: diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c index 1e640b899175..fd4e91734388 100644 --- a/drivers/kvm/vmx.c +++ b/drivers/kvm/vmx.c @@ -1879,12 +1879,6 @@ again: asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - kvm_run->exit_type = 0; if (fail) { kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; @@ -1907,6 +1901,12 @@ again: reload_tss(); } + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) + profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); + vcpu->launched = 1; kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index 369035dfe4b6..8d33c9bb7c1c 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -90,8 +90,8 @@ typedef struct user_fxsr_struct elf_fpxregset_t; pr_reg[6] = regs->eax; \ pr_reg[7] = regs->xds; \ pr_reg[8] = regs->xes; \ - savesegment(fs,pr_reg[9]); \ - pr_reg[10] = regs->xgs; \ + pr_reg[9] = regs->xfs; \ + savesegment(gs,pr_reg[10]); \ pr_reg[11] = regs->orig_eax; \ pr_reg[12] = regs->eip; \ pr_reg[13] = regs->xcs; \ diff --git a/include/asm-i386/mmu_context.h b/include/asm-i386/mmu_context.h index 68ff102d6f5e..e6aa30f8de5b 100644 --- a/include/asm-i386/mmu_context.h +++ b/include/asm-i386/mmu_context.h @@ -63,7 +63,7 @@ static inline void switch_mm(struct mm_struct *prev, } #define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs": :"r" (0)); + asm("movl %0,%%gs": :"r" (0)); #define activate_mm(prev, next) \ switch_mm((prev),(next),NULL) diff --git a/include/asm-i386/pda.h b/include/asm-i386/pda.h index 2ba2736aa109..b12d59a318b7 100644 --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -39,19 +39,19 @@ extern struct i386_pda _proxy_pda; if (0) { T__ tmp__; tmp__ = (val); } \ switch (sizeof(_proxy_pda.field)) { \ case 1: \ - asm(op "b %1,%%gs:%c2" \ + asm(op "b %1,%%fs:%c2" \ : "+m" (_proxy_pda.field) \ :"ri" ((T__)val), \ "i"(pda_offset(field))); \ break; \ case 2: \ - asm(op "w %1,%%gs:%c2" \ + asm(op "w %1,%%fs:%c2" \ : "+m" (_proxy_pda.field) \ :"ri" ((T__)val), \ "i"(pda_offset(field))); \ break; \ case 4: \ - asm(op "l %1,%%gs:%c2" \ + asm(op "l %1,%%fs:%c2" \ : "+m" (_proxy_pda.field) \ :"ri" ((T__)val), \ "i"(pda_offset(field))); \ @@ -65,19 +65,19 @@ extern struct i386_pda _proxy_pda; typeof(_proxy_pda.field) ret__; \ switch (sizeof(_proxy_pda.field)) { \ case 1: \ - asm(op "b %%gs:%c1,%0" \ + asm(op "b %%fs:%c1,%0" \ : "=r" (ret__) \ : "i" (pda_offset(field)), \ "m" (_proxy_pda.field)); \ break; \ case 2: \ - asm(op "w %%gs:%c1,%0" \ + asm(op "w %%fs:%c1,%0" 
\ : "=r" (ret__) \ : "i" (pda_offset(field)), \ "m" (_proxy_pda.field)); \ break; \ case 4: \ - asm(op "l %%gs:%c1,%0" \ + asm(op "l %%fs:%c1,%0" \ : "=r" (ret__) \ : "i" (pda_offset(field)), \ "m" (_proxy_pda.field)); \ diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 359f10b54f59..11bf899de8aa 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -424,7 +424,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .gs = __KERNEL_PDA, \ + .fs = __KERNEL_PDA, \ } /* @@ -442,8 +442,8 @@ struct thread_struct { } #define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%fs": :"r" (0)); \ - regs->xgs = 0; \ + __asm__("movl %0,%%gs": :"r" (0)); \ + regs->xfs = 0; \ set_fs(USER_DS); \ regs->xds = __USER_DS; \ regs->xes = __USER_DS; \ diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index bdbc894339b4..1646996c73da 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -16,8 +16,8 @@ struct pt_regs { long eax; int xds; int xes; - /* int xfs; */ - int xgs; + int xfs; + /* int xgs; */ long orig_eax; long eip; int xcs; From 2e188938ab2358034801938c2329b016ca135823 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 07/94] [PATCH] i386: Fix a typo in an IRQ handler name The "fasteoi" IRQ handler is named "fasteio" incorrectly. This is a fix. Signed-off-by: Maciej W. Rozycki Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index ba8d302a0b72..1711f4e1093f 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2310,7 +2310,7 @@ static inline void __init check_timer(void) disable_8259A_irq(0); set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, - "fasteio"); + "fasteoi"); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); From b78673944b22b662b270c8bba5c198f19e4ee4e1 Mon Sep 17 00:00:00 2001 From: Olivier Galibert Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 08/94] [PATCH] mmconfig: Share parts of mmconfig code between i386 and x86-64 i386 and x86-64 pci mmconfig code have a lot in common. So share what's shareable between the two. 
Signed-off-by: Olivier Galibert Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/pci/Makefile | 2 +- arch/i386/pci/mmconfig-shared.c | 86 +++++++++++++++++++++++++++++++++ arch/i386/pci/mmconfig.c | 74 ++-------------------------- arch/i386/pci/pci.h | 6 +++ arch/x86_64/pci/Makefile | 3 +- arch/x86_64/pci/mmconfig.c | 76 +++++------------------------ 6 files changed, 111 insertions(+), 136 deletions(-) create mode 100644 arch/i386/pci/mmconfig-shared.c diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile index 1594d2f55c8f..44650e03308b 100644 --- a/arch/i386/pci/Makefile +++ b/arch/i386/pci/Makefile @@ -1,7 +1,7 @@ obj-y := i386.o init.o obj-$(CONFIG_PCI_BIOS) += pcbios.o -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o pci-y := fixup.o diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c new file mode 100644 index 000000000000..998e04f6d68f --- /dev/null +++ b/arch/i386/pci/mmconfig-shared.c @@ -0,0 +1,86 @@ +/* + * mmconfig-shared.c - Low-level direct PCI config space access via + * MMCONFIG - common code between i386 and x86-64. + * + * This code does: + * - ACPI decoding and validation + * + * Per-architecture code takes care of the mappings and accesses + * themselves. + */ + +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/bitmap.h> +#include <asm/e820.h> + +#include "pci.h" + +/* aperture is up to 256MB but BIOS may reserve less */ +#define MMCONFIG_APER_MIN (2 * 1024*1024) +#define MMCONFIG_APER_MAX (256 * 1024*1024) + +/* Verify the first 16 busses. We assume that systems with more busses + get MCFG right. */ +#define PCI_MMCFG_MAX_CHECK_BUS 16 + +DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); + +/* K8 systems have some devices (typically in the builtin northbridge) + that are only accessible using type1 + Normally this can be expressed in the MCFG by not listing them + and assigning suitable _SEGs, but this isn't implemented in some BIOS. + Instead try to discover all devices on bus 0 that are unreachable using MM + and fallback for them. */ +static __init void unreachable_devices(void) +{ + int i, k; + /* Use the max bus number from ACPI here? */ + for (k = 0; k < PCI_MMCFG_MAX_CHECK_BUS; k++) { + for (i = 0; i < 32; i++) { + u32 val1, val2; + + pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); + if (val1 == 0xffffffff) + continue; + + raw_pci_ops->read(0, k, PCI_DEVFN(i, 0), 0, 4, &val2); + if (val1 != val2) { + set_bit(i + 32*k, pci_mmcfg_fallback_slots); + printk(KERN_NOTICE "PCI: No mmconfig possible" + " on device %02x:%02x\n", k, i); + } + } + } +} + +void __init pci_mmcfg_init(int type) +{ + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return; + + acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return; + + /* Only do this check when type 1 works.
If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && + !e820_all_mapped(pci_mmcfg_config[0].address, + pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, + E820_RESERVED)) { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not E820-reserved\n", + pci_mmcfg_config[0].address); + printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + return; + } + + if (pci_mmcfg_arch_init()) { + unreachable_devices(); + pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } +} diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 5700220dcf5f..97dcaaa0de0f 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -15,21 +15,13 @@ #include #include "pci.h" -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Assume systems with more busses have correct MCFG */ -#define MAX_CHECK_BUS 16 - #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) /* The base address of the last MMCONFIG device accessed */ static u32 mmcfg_last_accessed_device; static int mmcfg_last_accessed_cpu; -static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32); - /* * Functions for accessing PCI configuration space with MMCONFIG accesses */ @@ -38,8 +30,8 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) int cfg_num = -1; struct acpi_mcfg_allocation *cfg; - if (seg == 0 && bus < MAX_CHECK_BUS && - test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots)) + if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && + test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots)) return 0; while (1) { @@ -158,67 +150,9 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static __init void unreachable_devices(void) +int __init pci_mmcfg_arch_init(void) { - int i, k; - unsigned long flags; - - for (k = 0; k < MAX_CHECK_BUS; k++) { - for (i = 0; i < 32; i++) { - u32 val1; - u32 addr; - - pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - - /* Locking probably not needed, but safer */ - spin_lock_irqsave(&pci_config_lock, flags); - addr = get_base_addr(0, k, PCI_DEVFN(i, 0)); - if (addr != 0) - pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0)); - if (addr == 0 || - readl((u32 __iomem *)mmcfg_virt_addr) != val1) { - set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE - "PCI: No mmconfig possible on %x:%x\n", k, i); - } - spin_unlock_irqrestore(&pci_config_lock, flags); - } - } -} - -void __init pci_mmcfg_init(int type) -{ - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - /* Only do this check when type 1 works. 
If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n", - (unsigned long)pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } - printk(KERN_INFO "PCI: Using MMCONFIG\n"); raw_pci_ops = &pci_mmcfg; - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; - - unreachable_devices(); + return 1; } diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index a0a25180b61a..0270c80d99cc 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -94,3 +94,9 @@ extern void pci_pcbios_init(void); extern void pci_mmcfg_init(int type); extern void pcibios_sort(void); +/* pci-mmconfig.c */ + +#define PCI_MMCFG_MAX_CHECK_BUS 16 +extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); + +extern int pci_mmcfg_arch_init(void); diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index 149aba05a5b8..c9eddc8859c0 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -11,7 +11,7 @@ obj-y += fixup.o init.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_NUMA) += k8-bus.o @@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o i386-y += ../../i386/pci/i386.o init-y += ../../i386/pci/init.o early-y += ../../i386/pci/early.o +mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index faabb6e87f12..0847735bb31e 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -19,9 +19,7 @@ /* Verify the first 16 busses. We assume that systems with more busses get MCFG right. */ -#define MAX_CHECK_BUS 16 - -static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS); +#define PCI_MMCFG_MAX_CHECK_BUS 16 /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { @@ -63,8 +61,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; - if (seg == 0 && bus < MAX_CHECK_BUS && - test_bit(32*bus + PCI_SLOT(devfn), fallback_slots)) + if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && + test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) return NULL; addr = get_virt(seg, bus); if (!addr) @@ -135,63 +133,16 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static __init void unreachable_devices(void) -{ - int i, k; - /* Use the max bus number from ACPI here? 
*/ - for (k = 0; k < MAX_CHECK_BUS; k++) { - for (i = 0; i < 32; i++) { - u32 val1; - char __iomem *addr; - - pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); - if (addr == NULL|| readl(addr) != val1) { - set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE "PCI: No mmconfig possible" - " on device %02x:%02x\n", k, i); - } - } - } -} - -void __init pci_mmcfg_init(int type) +int __init pci_mmcfg_arch_init(void) { int i; - - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - /* Only do this check when type 1 works. If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n", - (unsigned long)pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } - - pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); + pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * + pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return; + return 0; } + for (i = 0; i < pci_mmcfg_config_num; ++i) { pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address, @@ -200,14 +151,11 @@ void __init pci_mmcfg_init(int type) printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", pci_mmcfg_config[i].pci_segment); - return; + return 0; } - printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n", - (unsigned long)pci_mmcfg_config[i].address); + printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n", + pci_mmcfg_config[i].address); } - - unreachable_devices(); - raw_pci_ops = &pci_mmcfg; - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + return 1; } From 5f027387bbdb5a4a4c1babd557fd976cd09d7495 Mon Sep 17 00:00:00 2001 From: Olivier Galibert Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 09/94] [PATCH] i386: Only call unreachable_devices() when type 1 is available. unreachable_devices compares between the results of pci configuration accesses through type1 and mmconfig, so it should be called only if type1 actually works in the first place. 
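To make the dependency concrete, this is the check the function performs per slot on the first 16 busses (condensed from the shared code above; bus and slot stand in for the loop counters):

	u32 val1, val2;

	pci_conf1_read(0, bus, PCI_DEVFN(slot, 0), 0, 4, &val1);    /* type 1 */
	raw_pci_ops->read(0, bus, PCI_DEVFN(slot, 0), 0, 4, &val2); /* mmconfig */
	if (val1 != val2)
		set_bit(slot + 32*bus, pci_mmcfg_fallback_slots);

When type 1 itself does not work, val1 carries no information, so the comparison cannot distinguish devices that mmconfig fails to reach from noise in the reference read.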
Signed-off-by: Olivier Galibert Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/pci/mmconfig-shared.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 998e04f6d68f..779c987acc5b 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -80,7 +80,8 @@ void __init pci_mmcfg_init(int type) } if (pci_mmcfg_arch_init()) { - unreachable_devices(); + if (type == 1) + unreachable_devices(); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; } } From 9358c693c5ac1afde28f24ac651f7903d32a850c Mon Sep 17 00:00:00 2001 From: Olivier Galibert Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 10/94] [PATCH] mmconfig: Detect and support the E7520 and the 945G/GZ/P/PL It seems that the only way to reliably support mmconfig in the presence of funky biosen is to detect the hostbridge and read where the window is mapped from its registers. Do that for the E7520 and the 945G/GZ/P/PL for a start. Signed-off-by: Olivier Galibert Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/pci/mmconfig-shared.c | 121 +++++++++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 2 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 779c987acc5b..d72f0439147c 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -3,6 +3,7 @@ * MMCONFIG - common code between i386 and x86-64. * * This code does: + * - known chipset handling * - ACPI decoding and validation * * Per-architecture code takes care of the mappings and accesses @@ -55,12 +56,128 @@ static __init void unreachable_devices(void) } } +static __init const char *pci_mmcfg_e7520(void) +{ + u32 win; + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); + + pci_mmcfg_config_num = 1; + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = (win & 0xf000) << 16; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = 255; + + return "Intel Corporation E7520 Memory Controller Hub"; +} + +static __init const char *pci_mmcfg_intel_945(void) +{ + u32 pciexbar, mask = 0, len = 0; + + pci_mmcfg_config_num = 1; + + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar); + + /* Enable bit */ + if (!(pciexbar & 1)) + pci_mmcfg_config_num = 0; + + /* Size bits */ + switch ((pciexbar >> 1) & 3) { + case 0: + mask = 0xf0000000U; + len = 0x10000000U; + break; + case 1: + mask = 0xf8000000U; + len = 0x08000000U; + break; + case 2: + mask = 0xfc000000U; + len = 0x04000000U; + break; + default: + pci_mmcfg_config_num = 0; + } + + /* Errata #2, things break when not aligned on a 256Mb boundary */ + /* Can only happen in 64M/128M mode */ + + if ((pciexbar & mask) & 0x0fffffffU) + pci_mmcfg_config_num = 0; + + if (pci_mmcfg_config_num) { + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = pciexbar & mask; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1; + } + + return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; +} + +struct pci_mmcfg_hostbridge_probe { + u32 vendor; + u32 device; + const char *(*probe)(void); +}; + +static __initdata struct 
pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] = { + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, +}; + +static int __init pci_mmcfg_check_hostbridge(void) +{ + u32 l; + u16 vendor, device; + int i; + const char *name; + + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l); + vendor = l & 0xffff; + device = (l >> 16) & 0xffff; + + pci_mmcfg_config_num = 0; + pci_mmcfg_config = NULL; + name = NULL; + + for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) + if ((pci_mmcfg_probes[i].vendor == PCI_ANY_ID || + pci_mmcfg_probes[i].vendor == vendor) && + (pci_mmcfg_probes[i].device == PCI_ANY_ID || + pci_mmcfg_probes[i].device == device)) + name = pci_mmcfg_probes[i].probe(); + + if (name) { + if (pci_mmcfg_config_num) + printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", name); + else + printk(KERN_INFO "PCI: Found %s without MMCONFIG support.\n", + name); + } + + return name != NULL; +} + void __init pci_mmcfg_init(int type) { + int known_bridge = 0; + if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + if (type == 1 && pci_mmcfg_check_hostbridge()) + known_bridge = 1; + + if (!known_bridge) + acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || @@ -69,7 +186,7 @@ void __init pci_mmcfg_init(int type) /* Only do this check when type 1 works. If it doesn't work assume we run on a Mac and always use MCFG */ - if (type == 1 && + if (type == 1 && !known_bridge && !e820_all_mapped(pci_mmcfg_config[0].address, pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, E820_RESERVED)) { From 6a0668fc41fa479df617151c2d4e297299a4ffe2 Mon Sep 17 00:00:00 2001 From: Olivier Galibert Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 11/94] [PATCH] mmconfig: Reserve resources but only when we're sure about them. Put back the resource reservation as per 4c6e052adfe285ede5884e4e8c4d33af33932c13 but use it *only* when the range(s) come from a chipset probe instead of the bios. 
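Each bus decodes 1MB of the MMCONFIG aperture (32 devices x 8 functions x 4KB of configuration space each), which is where the size computation in the new helper comes from. A condensed sketch of the per-entry reservation, using the field names from the patch below:

	num_buses = pci_mmcfg_config[i].end_bus_number -
		    pci_mmcfg_config[i].start_bus_number + 1;
	res->start = pci_mmcfg_config[i].address;
	res->end   = res->start + (num_buses << 20) - 1;	/* 1MB per bus */
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	insert_resource(&iomem_resource, res);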
Signed-off-by: Olivier Galibert Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/pci/mmconfig-shared.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index d72f0439147c..4757554b08cf 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -166,6 +166,37 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } +static __init void pci_mmcfg_insert_resources(void) +{ +#define PCI_MMCFG_RESOURCE_NAME_LEN 19 + int i; + struct resource *res; + char *names; + unsigned num_buses; + + res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), + pci_mmcfg_config_num, GFP_KERNEL); + + if (!res) { + printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); + return; + } + + names = (void *)&res[pci_mmcfg_config_num]; + for (i = 0; i < pci_mmcfg_config_num; i++, res++) { + num_buses = pci_mmcfg_config[i].end_bus_number - + pci_mmcfg_config[i].start_bus_number + 1; + res->name = names; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", + pci_mmcfg_config[i].pci_segment); + res->start = pci_mmcfg_config[i].address; + res->end = res->start + (num_buses << 20) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + names += PCI_MMCFG_RESOURCE_NAME_LEN; + } +} + void __init pci_mmcfg_init(int type) { int known_bridge = 0; @@ -199,6 +230,8 @@ void __init pci_mmcfg_init(int type) if (pci_mmcfg_arch_init()) { if (type == 1) unreachable_devices(); + if (known_bridge) + pci_mmcfg_insert_resources(); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; } } From faed197b7b44a6c4e6b81dd2db649fd452b0a7ef Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 12/94] [PATCH] mmconfig: Fix x86_64 ioremap base_address Current mmconfig has some problems of remapped range. a) In the case of broken MCFG tables on Asus etc., we need to remap 256M range, but currently only remap 1M. b) The base address always corresponds to bus number 0, but currently we are assuming it corresponds to start bus number. This patch fixes the above problems. (akpm: Arjan suggests that if the MCFG table is broken we just shouldn't use it, rather than try to work around things). Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen Cc: Arjan van de Ven Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/pci/mmconfig.c | 46 +++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 0847735bb31e..8e05449660fe 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -28,6 +28,39 @@ struct mmcfg_virt { }; static struct mmcfg_virt *pci_mmcfg_virt; +static inline int mcfg_broken(void) +{ + struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0]; + + /* Handle more broken MCFG tables on Asus etc. + They only contain a single entry for bus 0-0. Assume + this applies to all busses. 
*/ + if (pci_mmcfg_config_num == 1 && + cfg->pci_segment_group_number == 0 && + (cfg->start_bus_number | cfg->end_bus_number) == 0) + return 1; + return 0; +} + +static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg) +{ + void __iomem *addr; + u32 size; + + if (mcfg_broken()) + size = 256 << 20; + else + size = (cfg->end_bus_number + 1) << 20; + + addr = ioremap_nocache(cfg->base_address, size); + if (addr) { + printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n", + cfg->base_address, + cfg->base_address + size - 1); + } + return addr; +} + static char __iomem *get_virt(unsigned int seg, unsigned bus) { int cfg_num = -1; @@ -45,13 +78,7 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) return pci_mmcfg_virt[cfg_num].virt; } - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - cfg = &pci_mmcfg_config[0]; - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) + if (mcfg_broken()) return pci_mmcfg_virt[0].virt; /* Fall back to type 0 */ @@ -145,16 +172,13 @@ int __init pci_mmcfg_arch_init(void) for (i = 0; i < pci_mmcfg_config_num; ++i) { pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address, - MMCONFIG_APER_MAX); + pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); if (!pci_mmcfg_virt[i].virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", pci_mmcfg_config[i].pci_segment); return 0; } - printk(KERN_INFO "PCI: Using MMCONFIG at %Lx\n", - pci_mmcfg_config[i].address); } raw_pci_ops = &pci_mmcfg; return 1; From 44de0203fab205417b24322272c53ee0883c36e7 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 13/94] [PATCH] mmconfig: Reject a broken MCFG tables on Asus etc This rejects broken MCFG tables on Asus. When the table looks bogus just disable mmconfig Arjan and Andi suggested this. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig-shared.c | 24 +++++++++++++++- arch/i386/pci/mmconfig.c | 9 ------ arch/x86_64/pci/mmconfig.c | 50 +++++++++------------------------ 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 4757554b08cf..77de6de94f1f 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -197,6 +197,26 @@ static __init void pci_mmcfg_insert_resources(void) } } +static void __init pci_mmcfg_reject_broken(void) +{ + typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[0]; + + /* + * Handle more broken MCFG tables on Asus etc. + * They only contain a single entry for bus 0-0. + */ + if (pci_mmcfg_config_num == 1 && + cfg->pci_segment == 0 && + (cfg->start_bus_number | cfg->end_bus_number) == 0) { + kfree(pci_mmcfg_config); + pci_mmcfg_config = NULL; + pci_mmcfg_config_num = 0; + + printk(KERN_ERR "PCI: start and end of bus number is 0. 
" + "Rejected as broken MCFG."); + } +} + void __init pci_mmcfg_init(int type) { int known_bridge = 0; @@ -207,8 +227,10 @@ void __init pci_mmcfg_init(int type) if (type == 1 && pci_mmcfg_check_hostbridge()) known_bridge = 1; - if (!known_bridge) + if (!known_bridge) { acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + pci_mmcfg_reject_broken(); + } if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 97dcaaa0de0f..3325b79e651c 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -47,15 +47,6 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) return cfg->address; } - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - cfg = &pci_mmcfg_config[0]; - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) - return cfg->address; - /* Fall back to type 0 */ return 0; } diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 8e05449660fe..78e50b2c5cc5 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -28,39 +28,6 @@ struct mmcfg_virt { }; static struct mmcfg_virt *pci_mmcfg_virt; -static inline int mcfg_broken(void) -{ - struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[0]; - - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment_group_number == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) - return 1; - return 0; -} - -static void __iomem *mcfg_ioremap(struct acpi_mcfg_allocation *cfg) -{ - void __iomem *addr; - u32 size; - - if (mcfg_broken()) - size = 256 << 20; - else - size = (cfg->end_bus_number + 1) << 20; - - addr = ioremap_nocache(cfg->base_address, size); - if (addr) { - printk(KERN_INFO "PCI: Using MMCONFIG at %x - %x\n", - cfg->base_address, - cfg->base_address + size - 1); - } - return addr; -} - static char __iomem *get_virt(unsigned int seg, unsigned bus) { int cfg_num = -1; @@ -78,9 +45,6 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) return pci_mmcfg_virt[cfg_num].virt; } - if (mcfg_broken()) - return pci_mmcfg_virt[0].virt; - /* Fall back to type 0 */ return NULL; } @@ -160,6 +124,20 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; +static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) +{ + void __iomem *addr; + u32 size; + + size = (cfg->end_bus_number + 1) << 20; + addr = ioremap_nocache(cfg->address, size); + if (addr) { + printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", + cfg->address, cfg->address + size - 1); + } + return addr; +} + int __init pci_mmcfg_arch_init(void) { int i; From a4ec1b2c9fe9492c9ab30261b411d836527fe0b6 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 14/94] [PATCH] mmconfig: remove #define MMCONFIG_APER_XXX MMCONFIG_APER_XXX is unneeded in arch/x86_64/pci/mmconfig.c. 
Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen --- arch/x86_64/pci/mmconfig.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 78e50b2c5cc5..50512a8fc9e8 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -13,10 +13,6 @@ #include "pci.h" -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Verify the first 16 busses. We assume that systems with more busses get MCFG right. */ #define PCI_MMCFG_MAX_CHECK_BUS 16 From 429d512e532ec9c969aa6f66ddbc542f3a5fe4da Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 15/94] [PATCH] mmconfig: minor cleanup in mmconfig code This just cleans up. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig-shared.c | 52 ++++++++++++++------------------- arch/i386/pci/mmconfig.c | 13 +++------ arch/i386/pci/pci.h | 4 ++- arch/x86_64/pci/mmconfig.c | 16 +++------- 4 files changed, 33 insertions(+), 52 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 77de6de94f1f..4ea0852487a4 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -22,10 +22,6 @@ #define MMCONFIG_APER_MIN (2 * 1024*1024) #define MMCONFIG_APER_MAX (256 * 1024*1024) -/* Verify the first 16 busses. We assume that systems with more busses - get MCFG right. */ -#define PCI_MMCFG_MAX_CHECK_BUS 16 - DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); /* K8 systems have some devices (typically in the builtin northbridge) @@ -34,29 +30,30 @@ DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); and assigning suitable _SEGs, but this isn't implemented in some BIOS. Instead try to discover all devices on bus 0 that are unreachable using MM and fallback for them. */ -static __init void unreachable_devices(void) +static void __init unreachable_devices(void) { - int i, k; + int i, bus; /* Use the max bus number from ACPI here? 
*/ - for (k = 0; k < PCI_MMCFG_MAX_CHECK_BUS; k++) { + for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) { for (i = 0; i < 32; i++) { + unsigned int devfn = PCI_DEVFN(i, 0); u32 val1, val2; - pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); + pci_conf1_read(0, bus, devfn, 0, 4, &val1); if (val1 == 0xffffffff) continue; - raw_pci_ops->read(0, k, PCI_DEVFN(i, 0), 0, 4, &val2); + raw_pci_ops->read(0, bus, devfn, 0, 4, &val2); if (val1 != val2) { - set_bit(i + 32*k, pci_mmcfg_fallback_slots); + set_bit(i + 32 * bus, pci_mmcfg_fallback_slots); printk(KERN_NOTICE "PCI: No mmconfig possible" - " on device %02x:%02x\n", k, i); + " on device %02x:%02x\n", bus, i); } } } } -static __init const char *pci_mmcfg_e7520(void) +static const char __init *pci_mmcfg_e7520(void) { u32 win; pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); @@ -73,7 +70,7 @@ static __init const char *pci_mmcfg_e7520(void) return "Intel Corporation E7520 Memory Controller Hub"; } -static __init const char *pci_mmcfg_intel_945(void) +static const char __init *pci_mmcfg_intel_945(void) { u32 pciexbar, mask = 0, len = 0; @@ -128,7 +125,7 @@ struct pci_mmcfg_hostbridge_probe { const char *(*probe)(void); }; -static __initdata struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] = { +static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, }; @@ -148,25 +145,21 @@ static int __init pci_mmcfg_check_hostbridge(void) pci_mmcfg_config = NULL; name = NULL; - for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) - if ((pci_mmcfg_probes[i].vendor == PCI_ANY_ID || - pci_mmcfg_probes[i].vendor == vendor) && - (pci_mmcfg_probes[i].device == PCI_ANY_ID || - pci_mmcfg_probes[i].device == device)) + for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { + if (pci_mmcfg_probes[i].vendor == vendor && + pci_mmcfg_probes[i].device == device) name = pci_mmcfg_probes[i].probe(); + } if (name) { - if (pci_mmcfg_config_num) - printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", name); - else - printk(KERN_INFO "PCI: Found %s without MMCONFIG support.\n", - name); + printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n", + name, pci_mmcfg_config_num ? 
"with" : "without"); } return name != NULL; } -static __init void pci_mmcfg_insert_resources(void) +static void __init pci_mmcfg_insert_resources(void) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -176,7 +169,6 @@ static __init void pci_mmcfg_insert_resources(void) res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), pci_mmcfg_config_num, GFP_KERNEL); - if (!res) { printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); return; @@ -184,12 +176,12 @@ static __init void pci_mmcfg_insert_resources(void) names = (void *)&res[pci_mmcfg_config_num]; for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - num_buses = pci_mmcfg_config[i].end_bus_number - - pci_mmcfg_config[i].start_bus_number + 1; + struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; + num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; res->name = names; snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", - pci_mmcfg_config[i].pci_segment); - res->start = pci_mmcfg_config[i].address; + cfg->pci_segment); + res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 3325b79e651c..11be089efd7d 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -27,22 +27,17 @@ static int mmcfg_last_accessed_cpu; */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - int cfg_num = -1; struct acpi_mcfg_allocation *cfg; + int cfg_num; if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots)) return 0; - while (1) { - ++cfg_num; - if (cfg_num >= pci_mmcfg_config_num) { - break; - } + for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = &pci_mmcfg_config[cfg_num]; - if (cfg->pci_segment != seg) - continue; - if ((cfg->start_bus_number <= bus) && + if (cfg->pci_segment == seg && + (cfg->start_bus_number <= bus) && (cfg->end_bus_number >= bus)) return cfg->address; } diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index 0270c80d99cc..2ce3d44b09d4 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -96,7 +96,9 @@ extern void pcibios_sort(void); /* pci-mmconfig.c */ +/* Verify the first 16 busses. We assume that systems with more busses + get MCFG right. */ #define PCI_MMCFG_MAX_CHECK_BUS 16 extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); -extern int pci_mmcfg_arch_init(void); +extern int __init pci_mmcfg_arch_init(void); diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 50512a8fc9e8..918fc5b95a06 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -13,10 +13,6 @@ #include "pci.h" -/* Verify the first 16 busses. We assume that systems with more busses - get MCFG right. 
*/ -#define PCI_MMCFG_MAX_CHECK_BUS 16 - /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { struct acpi_mcfg_allocation *cfg; @@ -26,17 +22,13 @@ static struct mmcfg_virt *pci_mmcfg_virt; static char __iomem *get_virt(unsigned int seg, unsigned bus) { - int cfg_num = -1; struct acpi_mcfg_allocation *cfg; + int cfg_num; - while (1) { - ++cfg_num; - if (cfg_num >= pci_mmcfg_config_num) - break; + for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment != seg) - continue; - if ((cfg->start_bus_number <= bus) && + if (cfg->pci_segment == seg && + (cfg->start_bus_number <= bus) && (cfg->end_bus_number >= bus)) return pci_mmcfg_virt[cfg_num].virt; } From 56829d1982b6f1150553c049d372728b9eda5aec Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 16/94] [PATCH] mmconfig: fix unreachable_devices() Currently, unreachable_devices() compares value of mmconfig and value of conf1. But it doesn't check the device is reachable or not. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig-shared.c | 12 +++++++----- arch/i386/pci/mmconfig.c | 6 ++++++ arch/i386/pci/pci.h | 2 ++ arch/x86_64/pci/mmconfig.c | 6 ++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 4ea0852487a4..473db6257013 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -43,12 +43,14 @@ static void __init unreachable_devices(void) if (val1 == 0xffffffff) continue; - raw_pci_ops->read(0, bus, devfn, 0, 4, &val2); - if (val1 != val2) { - set_bit(i + 32 * bus, pci_mmcfg_fallback_slots); - printk(KERN_NOTICE "PCI: No mmconfig possible" - " on device %02x:%02x\n", bus, i); + if (pci_mmcfg_arch_reachable(0, bus, devfn)) { + raw_pci_ops->read(0, bus, devfn, 0, 4, &val2); + if (val1 == val2) + continue; } + set_bit(i + 32 * bus, pci_mmcfg_fallback_slots); + printk(KERN_NOTICE "PCI: No mmconfig possible on device" + " %02x:%02x\n", bus, i); } } } diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 11be089efd7d..bb1afd9e589d 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -136,6 +136,12 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; +int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn) +{ + return get_base_addr(seg, bus, devfn) != 0; +} + int __init pci_mmcfg_arch_init(void) { printk(KERN_INFO "PCI: Using MMCONFIG\n"); diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index 2ce3d44b09d4..e58bae2076ad 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -101,4 +101,6 @@ extern void pcibios_sort(void); #define PCI_MMCFG_MAX_CHECK_BUS 16 extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); +extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn); extern int __init pci_mmcfg_arch_init(void); diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 918fc5b95a06..65d82736987e 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -126,6 +126,12 @@ static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) return addr; } +int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn) +{ + return pci_dev_base(seg, bus, devfn) != NULL; +} + int __init pci_mmcfg_arch_init(void) { int i; From 
26054ed02bb20f5b2e02d92cb6f0be0e2b0196d5 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH 17/94] [PATCH] mmconfig: Move e820 check into pci_mmcfg_reject_broken() This is just cleanup. It moves to e820 check into pci_mmcfg_reject_broken(). Signed-off-by: OGAWA Hirofumi Signed-off-by: Andi Kleen --- arch/i386/pci/mmconfig-shared.c | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index 473db6257013..747d8c63b0c4 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -191,9 +191,16 @@ static void __init pci_mmcfg_insert_resources(void) } } -static void __init pci_mmcfg_reject_broken(void) +static void __init pci_mmcfg_reject_broken(int type) { - typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[0]; + typeof(pci_mmcfg_config[0]) *cfg; + + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return; + + cfg = &pci_mmcfg_config[0]; /* * Handle more broken MCFG tables on Asus etc. @@ -202,13 +209,29 @@ static void __init pci_mmcfg_reject_broken(void) if (pci_mmcfg_config_num == 1 && cfg->pci_segment == 0 && (cfg->start_bus_number | cfg->end_bus_number) == 0) { - kfree(pci_mmcfg_config); - pci_mmcfg_config = NULL; - pci_mmcfg_config_num = 0; - printk(KERN_ERR "PCI: start and end of bus number is 0. " - "Rejected as broken MCFG."); + "Rejected as broken MCFG.\n"); + goto reject; } + + /* + * Only do this check when type 1 works. If it doesn't work + * assume we run on a Mac and always use MCFG + */ + if (type == 1 && !e820_all_mapped(cfg->address, + cfg->address + MMCONFIG_APER_MIN, + E820_RESERVED)) { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" + " E820-reserved\n", cfg->address); + goto reject; + } + return; + +reject: + printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + kfree(pci_mmcfg_config); + pci_mmcfg_config = NULL; + pci_mmcfg_config_num = 0; } void __init pci_mmcfg_init(int type) @@ -223,7 +246,7 @@ void __init pci_mmcfg_init(int type) if (!known_bridge) { acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - pci_mmcfg_reject_broken(); + pci_mmcfg_reject_broken(type); } if ((pci_mmcfg_config_num == 0) || @@ -231,18 +254,6 @@ void __init pci_mmcfg_init(int type) (pci_mmcfg_config[0].address == 0)) return; - /* Only do this check when type 1 works. If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !known_bridge && - !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not E820-reserved\n", - pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } - if (pci_mmcfg_arch_init()) { if (type == 1) unreachable_devices(); From 5809f9d442e9dbb23859e2c37d8c47043f6b5cc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 18/94] [PATCH] x86-64: get rid of ARCH_HAVE_XTIME_LOCK ARCH_HAVE_XTIME_LOCK is used by x86_64 arch . This arch needs to place a read only copy of xtime_lock into vsyscall page. This read only copy is named __xtime_lock, and xtime_lock is defined in arch/x86_64/kernel/vmlinux.lds.S as an alias. So the declaration of xtime_lock in kernel/timer.c was guarded by ARCH_HAVE_XTIME_LOCK define, defined to true on x86_64. We can get same result with _attribute__((weak)) in the declaration. 
The linker should do the job. Signed-off-by: Eric Dumazet Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- include/asm-x86_64/vsyscall.h | 5 ----- include/linux/time.h | 2 +- kernel/timer.c | 4 +--- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h index 05cb8dd200de..0c7847165eae 100644 --- a/include/asm-x86_64/vsyscall.h +++ b/include/asm-x86_64/vsyscall.h @@ -56,11 +56,6 @@ extern struct vxtime_data vxtime; extern int vgetcpu_mode; extern struct timezone sys_tz; extern int sysctl_vsyscall; -extern seqlock_t xtime_lock; - -extern int sysctl_vsyscall; - -#define ARCH_HAVE_XTIME_LOCK 1 #endif /* __KERNEL__ */ diff --git a/include/linux/time.h b/include/linux/time.h index 55cee172d723..eceb1a59b078 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -90,7 +90,7 @@ static inline struct timespec timespec_sub(struct timespec lhs, extern struct timespec xtime; extern struct timespec wall_to_monotonic; -extern seqlock_t xtime_lock; +extern seqlock_t xtime_lock __attribute__((weak)); void timekeeping_init(void); diff --git a/kernel/timer.c b/kernel/timer.c index 8533c3796082..4902181e10e6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1162,11 +1162,9 @@ static inline void calc_load(unsigned long ticks) * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -#ifndef ARCH_HAVE_XTIME_LOCK -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); -#endif /* * This function runs timers and the timer-tq in bottom half context. From 5558870bfbcca10cfc7b13ab866687012ea3c9af Mon Sep 17 00:00:00 2001 From: Karsten Weiss Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 19/94] [PATCH] x86-64: improved iommu documentation - add SWIOTLB config help text - mention Documentation/x86_64/boot-options.txt in Documentation/kernel-parameters.txt - remove the duplication of the iommu kernel parameter documentation. - Better explanation of some of the iommu kernel parameter options. - "32MB< Signed-off-by: Andi Kleen Acked-by: Muli Ben-Yehuda Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/kernel-parameters.txt | 3 + Documentation/x86_64/boot-options.txt | 101 ++++++++++++++++++-------- arch/x86_64/Kconfig | 10 ++- arch/x86_64/kernel/pci-dma.c | 28 +------ 4 files changed, 86 insertions(+), 56 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index d25acd51e181..733a736bc6c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -104,6 +104,9 @@ loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme need or coordination with . +There are also arch-specific kernel-parameters not documented here. +See for example <Documentation/x86_64/boot-options.txt>. + Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will be entered as an environment variable, whereas its absence indicates that diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 5c86ed6f0448..0d653993f361 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -180,40 +180,81 @@ PCI pci=lastbus=NUMBER Scan upto NUMBER busses, no matter what the mptable says. 
pci=noacpi Don't use ACPI to set up PCI interrupt routing. -IOMMU +IOMMU (input/output memory management unit) - iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,noaperture][,calgary] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do SG merging. Implies force (experimental) - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - When off all DMA over >4GB is forced through an IOMMU or bounce - buffering. - nodac Forbid DMA >4GB - panic Always panic when IOMMU overflows - calgary Use the Calgary IOMMU if it is available + Currently four x86-64 PCI-DMA mapping implementations exist: - swiotlb=pages[,force] + 1. : use no hardware/software IOMMU at all + (e.g. because you have < 3 GB memory). + Kernel boot message: "PCI-DMA: Disabling IOMMU" - pages Prereserve that many 128K pages for the software IO bounce buffering. - force Force all IO through the software TLB. + 2. : AMD GART based hardware IOMMU. + Kernel boot message: "PCI-DMA: using GART IOMMU" - calgary=[64k,128k,256k,512k,1M,2M,4M,8M] - calgary=[translate_empty_slots] - calgary=[disable=] + 3. : Software IOMMU implementation. Used + e.g. if there is no hardware IOMMU in the system and it is need because + you have >3GB memory or told the kernel to us it (iommu=soft)) + Kernel boot message: "PCI-DMA: Using software bounce buffering + for IO (SWIOTLB)" + + 4. : IBM Calgary hardware IOMMU. Used in IBM + pSeries and xSeries servers. This hardware IOMMU supports DMA address + mapping with memory protection, etc. + Kernel boot message: "PCI-DMA: Using Calgary IOMMU" + + iommu=[][,noagp][,off][,force][,noforce][,leak[=] + [,memaper[=]][,merge][,forcesac][,fullflush][,nomerge] + [,noaperture][,calgary] + + General iommu options: + off Don't initialize and use any kind of IOMMU. + noforce Don't force hardware IOMMU usage when it is not needed. + (default). + force Force the use of the hardware IOMMU even when it is + not actually needed (e.g. because < 3 GB memory). + soft Use software bounce buffering (SWIOTLB) (default for + Intel machines). This can be used to prevent the usage + of an available hardware IOMMU. + + iommu options only relevant to the AMD GART hardware IOMMU: + Set the size of the remapping area in bytes. + allowed Overwrite iommu off workarounds for specific chipsets. + fullflush Flush IOMMU on each allocation (default). + nofullflush Don't use IOMMU fullflush. + leak Turn on simple iommu leak tracing (only when + CONFIG_IOMMU_LEAK is on). Default number of leak pages + is 20. + memaper[=] Allocate an own aperture over RAM with size 32MB<4GB. + DAC is used with 32-bit PCI to push a 64-bit address in + two cycles. When off all DMA over >4GB is forced through + an IOMMU or software bounce buffering. + nodac Forbid DAC mode, i.e. DMA >4GB. + panic Always panic when IOMMU overflows. 
+ calgary Use the Calgary IOMMU if it is available + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: + swiotlb=[,force] + Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. + + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: + + calgary=[64k,128k,256k,512k,1M,2M,4M,8M] + calgary=[translate_empty_slots] + calgary=[disable=] + panic Always panic when IOMMU overflows 64k,...,8M - Set the size of each PCI slot's translation table when using the Calgary IOMMU. This is the size of the translation diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 02dd39457bcf..a55382a1bb46 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -458,8 +458,8 @@ config IOMMU on systems with more than 3GB. This is usually needed for USB, sound, many IDE/SATA chipsets and some other devices. Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART - based IOMMU and a software bounce buffer based IOMMU used on Intel - systems and as fallback. + based hardware IOMMU and a software bounce buffer based IOMMU used + on Intel systems and as fallback. The code is only active when needed (enough memory and limited device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified too. @@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool + help + Support for software bounce buffers used on x86-64 systems + which don't have a hardware IOMMU (e.g. the current generation + of Intel's x86-64 CPUs). Using this PCI devices which can only + access 32-bits of memory can be used on systems with more than + 3 GB of memory. If unsure, say Y. config X86_MCE bool "Machine check support" if EMBEDDED diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 683b7a5c1ab3..651ccfb06697 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,biomerge] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do lazy merging. This may improve performance on some block devices. - Implies force (experimental) - biomerge Do merging at the BIO layer. This is more efficient than merge, - but should be only done with very big IOMMUs. Implies merge,force. - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - nodac Forbid DMA >4GB - panic Force panic when IOMMU overflows -*/ +/* + * See for the iommu kernel parameter + * documentation. 
+ */ __init int iommu_setup(char *p) { iommu_merge = 1; From 006e84ee3a54e393ec6bef2a9bc891dc5bde2843 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 20/94] [PATCH] x86-64: do not always end the stack trace with ULONG_MAX It makes more sense to end the stack trace with ULONG_MAX only if nr_entries < max_entries. Otherwise, we lose one entry in the long stack traces and cannot know whether the trace was complete or not. Signed-off-by: Catalin Marinas Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jan Beulich Signed-off-by: Andrew Morton --- arch/x86_64/kernel/stacktrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 6026b31d037e..65ac2c6b34a6 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr) trace->skip--; return; } - if (trace->nr_entries < trace->max_entries - 1) + if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = addr; } @@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = { void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { dump_trace(task, NULL, NULL, &save_stack_ops, trace); - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); From 90611fe923aa3ac7ffb9e5df45c83860b0f00227 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 21/94] [PATCH] i386: arch/i386/kernel/e820.c should #include Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index f391abcf7da9..9ded1e49119e 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_EFI int efi_enabled = 0; From c119ecce894120790903ef535dac3e105f3d6cde Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 22/94] [PATCH] MM: page allocation hooks for VMI backend The VMI backend uses explicit page type notification to track shadow page tables. The allocation of page table roots is especially tricky. We need to clone the root for non-PAE mode while it is protected under the pgd lock to correctly copy the shadow. We don't need to allocate pgds in PAE mode, (PDPs in Intel terminology) as they only have 4 entries, and are cached entirely by the processor, which makes shadowing them rather simple. For base page table level allocation, pmd_populate provides the exact hook point we need. Also, we need to allocate pages when splitting a large page, and we must release pages before returning the page to any free pool. Despite being required with these slightly odd semantics for VMI, Xen also uses these hooks to determine the exact moment when page tables are created or released. 
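The ordering contract, distilled from the hunks below: the allocation notification must reach the backend before a page is first used as a page table, and the release notification must come before the page goes back to any free pool. A condensed sketch of a pte page's lifetime under these hooks (simplified from the pmd_populate and __pte_free_tlb changes):

	paravirt_alloc_pt(page_to_pfn(pte));	/* backend may start shadowing */
	set_pmd(pmd, __pmd(_PAGE_TABLE +
		((unsigned long long)page_to_pfn(pte) << PAGE_SHIFT)));
	/* ... page serves as a page table ... */
	paravirt_release_pt(page_to_pfn(pte));	/* stop shadowing before freeing */
	tlb_remove_page(tlb, pte);

On native hardware all five hooks are nops (empty macros without CONFIG_PARAVIRT), so unvirtualized kernels are unaffected.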
AK: All nops for other architectures Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 6 ++++++ arch/i386/mm/init.c | 4 ++++ arch/i386/mm/pageattr.c | 2 ++ arch/i386/mm/pgtable.c | 24 ++++++++++++++++++++---- include/asm-i386/paravirt.h | 14 ++++++++++++++ include/asm-i386/pgalloc.h | 30 ++++++++++++++++++++++++++---- 6 files changed, 72 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index e55fd05da0f5..7329ec9fcc99 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -550,6 +550,12 @@ struct paravirt_ops paravirt_ops = { .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .alloc_pt = (void *)native_nop, + .alloc_pd = (void *)native_nop, + .alloc_pd_clone = (void *)native_nop, + .release_pt = (void *)native_nop, + .release_pd = (void *)native_nop, + .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index c5c5ea700cc7..ae436882af7a 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); if (pmd_table != pmd_offset(pud, 0)) @@ -82,6 +83,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) { if (pmd_none(*pmd)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -345,6 +347,8 @@ static void __init pagetable_init (void) /* Init entries of the first-level page table to the zero page */ for (i = 0; i < PTRS_PER_PGD; i++) set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#else + paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); #endif /* Enable PSE if available */ diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index e223b1d4981c..412ebbd8adb0 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,6 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); + paravirt_alloc_pt(page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? 
prot : ref_prot)); @@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { ClearPagePrivate(kpte_page); + paravirt_release_pt(page_to_pfn(kpte_page)); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index f349eaf450b0..b5f538f52272 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -248,9 +248,15 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); + if (PTRS_PER_PMD > 1) return; + /* must happen under lock */ + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); + pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } @@ -260,6 +266,7 @@ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -277,13 +284,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } return pgd; out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + for (i--; i >= 0; i--) { + pgd_t pgdent = pgd[i]; + void* pmd = (void *)__va(pgd_val(pgdent)-1); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + kmem_cache_free(pmd_cache, pmd); + } kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -294,8 +306,12 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pgd_t pgdent = pgd[i]; + void* pmd = (void *)__va(pgd_val(pgdent)-1); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + kmem_cache_free(pmd_cache, pmd); + } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 9f06265065f4..53da276a2ec2 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -127,6 +127,12 @@ struct paravirt_ops void (fastcall *flush_tlb_kernel)(void); void (fastcall *flush_tlb_single)(u32 addr); + void (fastcall *alloc_pt)(u32 pfn); + void (fastcall *alloc_pd)(u32 pfn); + void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); + void (fastcall *release_pt)(u32 pfn); + void (fastcall *release_pd)(u32 pfn); + void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); @@ -320,6 +326,14 @@ static inline unsigned long apic_read(unsigned long reg) #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() #define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr) +#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn) +#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn) + +#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn) +#define paravirt_alloc_pd_clone(pfn, clonepfn, 
start, count) \ + paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count) +#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn) + static inline void set_pte(pte_t *ptep, pte_t pteval) { paravirt_ops.set_pte(ptep, pteval); diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h index 4b1e61359f89..c8dc2d0141a7 100644 --- a/include/asm-i386/pgalloc.h +++ b/include/asm-i386/pgalloc.h @@ -5,13 +5,31 @@ #include #include /* for struct page */ -#define pmd_populate_kernel(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) +#ifdef CONFIG_PARAVIRT +#include +#else +#define paravirt_alloc_pt(pfn) do { } while (0) +#define paravirt_alloc_pd(pfn) do { } while (0) +#define paravirt_alloc_pd(pfn) do { } while (0) +#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) +#define paravirt_release_pt(pfn) do { } while (0) +#define paravirt_release_pd(pfn) do { } while (0) +#endif + +#define pmd_populate_kernel(mm, pmd, pte) \ +do { \ + paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT); \ + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ +} while (0) #define pmd_populate(mm, pmd, pte) \ +do { \ + paravirt_alloc_pt(page_to_pfn(pte)); \ set_pmd(pmd, __pmd(_PAGE_TABLE + \ ((unsigned long long)page_to_pfn(pte) << \ - (unsigned long long) PAGE_SHIFT))) + (unsigned long long) PAGE_SHIFT))); \ +} while (0) + /* * Allocate and free page tables. */ @@ -32,7 +50,11 @@ static inline void pte_free(struct page *pte) } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pte_free_tlb(tlb,pte) \ +do { \ + paravirt_release_pt(page_to_pfn(pte)); \ + tlb_remove_page((tlb),(pte)); \ +} while (0) #ifdef CONFIG_X86_PAE /* From 9226d125d94c7e4964dd41cc5e9ca2ff84091d01 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 23/94] [PATCH] i386: paravirt CPU hypercall batching mode The VMI ROM has a mode where hypercalls can be queued and batched. This turns out to be a significant win during context switch, but must be done at a specific point before side effects to CPU state are visible to subsequent instructions. This is similar to the MMU batching hooks already provided. The same hooks could be used by the Xen backend to implement a context switch multicall. To explain a bit more about lazy modes in the paravirt patches, basically, the idea is that only one of lazy CPU or MMU mode can be active at any given time. Lazy MMU mode is similar to this lazy CPU mode, and allows for batching of multiple PTE updates (say, inside a remap loop), but to avoid keeping some kind of state machine about when to flush cpu or mmu updates, we just allow one or the other to be active. Although there is no real reason a more comprehensive scheme could not be implemented, there is also no demonstrated need for this extra complexity. 
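A sketch of the intended pairing for the MMU flavor, e.g. around a remap loop (the loop here is illustrative, not taken from this patch):

	arch_enter_lazy_mmu_mode();
	for (addr = start; addr < end; addr += PAGE_SIZE, ptep++, pfn++)
		set_pte_at(mm, addr, ptep, pfn_pte(pfn, prot));
	arch_leave_lazy_mmu_mode();	/* flush the queued updates */

The CPU flavor follows the same pattern around context switch: context_switch() enters lazy CPU mode before the page table reload, and __switch_to() leaves it again, flushing any queued hypercalls, before the %gs restore and math_state_restore().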
Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 1 + arch/i386/kernel/process.c | 25 +++++++++++++++++-------- include/asm-generic/pgtable.h | 13 +++++++++++++ include/asm-i386/paravirt.h | 15 +++++++++++++++ kernel/sched.c | 7 +++++++ 5 files changed, 53 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 7329ec9fcc99..4dfdac4550dd 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -545,6 +545,7 @@ struct paravirt_ops paravirt_ops = { .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, #endif + .set_lazy_mode = (void *)native_nop, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 23ae198dbbc3..cfae587bf7d2 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -669,14 +669,6 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas */ load_TLS(next, cpu); - /* - * Restore %gs if needed (which is common) - */ - if (prev->gs | next->gs) - loadsegment(gs, next->gs); - - write_pda(pcurrent, next_p); - /* * Now maybe handle debug registers and/or IO bitmaps */ @@ -686,6 +678,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas disable_tsc(prev_p, next_p); + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_leave_lazy_cpu_mode(); + /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now @@ -693,6 +694,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (next_p->fpu_counter > 5) math_state_restore(); + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + + write_pda(pcurrent, next_p); + return prev_p; } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9d774d07d95b..00c23433b39f 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -182,6 +182,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define arch_leave_lazy_mmu_mode() do {} while (0) #endif +/* + * A facility to provide batching of the reload of page tables with the + * actual context switch code for paravirtualized guests. By convention, + * only one of the lazy modes (CPU, MMU) should be active at any given + * time, entry should never be nested, and entry and exits should always + * be paired. This is for sanity of maintaining and reasoning about the + * kernel code. + */ +#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE +#define arch_enter_lazy_cpu_mode() do {} while (0) +#define arch_leave_lazy_cpu_mode() do {} while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. 
Although no diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 53da276a2ec2..38e5164bd0e7 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -146,6 +146,8 @@ struct paravirt_ops void (fastcall *pmd_clear)(pmd_t *pmdp); #endif + void (fastcall *set_lazy_mode)(int mode); + /* These two are jmp to, not actually called. */ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); @@ -386,6 +388,19 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif +/* Lazy mode for batching updates / context switch */ +#define PARAVIRT_LAZY_NONE 0 +#define PARAVIRT_LAZY_MMU 1 +#define PARAVIRT_LAZY_CPU 2 + +#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE +#define arch_enter_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_CPU) +#define arch_leave_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE) + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#define arch_enter_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_MMU) +#define arch_leave_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE) + /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch { u8 *instr; /* original instructions */ diff --git a/kernel/sched.c b/kernel/sched.c index 08f86178aa34..0dc757246d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1853,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_enter_lazy_cpu_mode(); + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); From 8b15114434998a78aa50f8559d69c7a400cff267 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 24/94] [PATCH] i386: iOPL handling for paravirt guests I found a clever way to make the extra IOPL switching invisible to non-paravirt compiles - since kernel_rpl is statically defined to be zero there, and only non-zero rpl kernel have a problem restoring IOPL, as popf does not restore IOPL flags unless run at CPL-0. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/process.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index cfae587bf7d2..05be77413351 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -669,6 +669,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas */ load_TLS(next, cpu); + /* + * Restore IOPL if needed. In normal use, the flags restore + * in the switch assembly will handle this. But if the kernel + * is running virtualized at a non-zero CPL, the popf will + * not restore flags, so it must be done in a separate step. + */ + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + /* * Now maybe handle debug registers and/or IO bitmaps */ From ae5da273fe3352febd38658d8d34484cbcfb3423 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 25/94] [PATCH] i386: SMP boot hook for paravirt Add VMI SMP boot hook. 
We emulate a regular boot sequence and use the same APIC IPI initiation, we just poke magic values to load into the CPU state when the startup IPI is received, rather than having to jump through a real mode trampoline. This is all that was needed to get SMP to work. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 2 ++ arch/i386/kernel/smpboot.c | 7 +++++++ include/asm-i386/paravirt.h | 9 +++++++++ include/asm-i386/smp.h | 5 +++++ 4 files changed, 23 insertions(+) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 4dfdac4550dd..5bf81059a7e6 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -572,6 +572,8 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, + + .startup_ipi_hook = (void *)native_nop, }; /* diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8c6c8c52b95c..1908afa265b9 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -834,6 +834,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) else num_starts = 0; + /* + * Paravirt / VMI wants a startup IPI hook here to set up the + * target processor state. + */ + startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, + (unsigned long) stack_start.esp); + /* * Run STARTUP IPI loop. */ diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 38e5164bd0e7..6ccf36499b2a 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -151,6 +151,8 @@ struct paravirt_ops /* These two are jmp to, not actually called. */ void (fastcall *irq_enable_sysexit)(void); void (fastcall *iret)(void); + + void (fastcall *startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); }; /* Mark a paravirt probe function. */ @@ -323,6 +325,13 @@ static inline unsigned long apic_read(unsigned long reg) } #endif +#ifdef CONFIG_SMP +static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, + unsigned long start_esp) +{ + return paravirt_ops.startup_ipi_hook(phys_apicid, start_eip, start_esp); +} +#endif #define __flush_tlb() paravirt_ops.flush_tlb_user() #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 64fe624c02ca..6bf0033a301c 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -52,6 +52,11 @@ extern void cpu_exit_clear(void); extern void cpu_uninit(void); #endif +#ifndef CONFIG_PARAVIRT +#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ +do { } while (0) +#endif + /* * This function is needed by all SMP systems. It must _always_ be valid * from the initial startup. We map APIC_BASE very early in page_setup(), From 7ce0bcfd1667736f1293cff845139bbee53186de Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 26/94] [PATCH] i386: vMI backend for paravirt-ops Fairly straightforward implementation of VMI backend for paravirt-ops. 
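A note before the code: the recurring idiom in vmi.c below is calling entry points that the ROM publishes as fixed-size jump slots through typed function pointers (the call_vrom_func()/call_vrom_long_func() macros). A stand-alone sketch of that pattern, with every name and the fake ROM invented purely for illustration:

	#include <stdio.h>

	/* The real ROM slots are 8-byte jump stubs; a plain function stands in here. */
	typedef unsigned int (vrom_func_t)(void);

	struct fake_rom {
		vrom_func_t *vmi_init;		/* analogous to vrom_header.vmi_init */
	};

	#define call_vrom_func(rom, func)	(((rom)->func)())

	static unsigned int fake_init(void)
	{
		return 0;			/* 0 = initialized OK */
	}

	int main(void)
	{
		struct fake_rom rom = { .vmi_init = fake_init };
		printf("vmi_init returned %u\n", call_vrom_func(&rom, vmi_init));
		return 0;
	}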
[Adrian Bunk: some cleanups] Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/Kconfig | 9 + arch/i386/kernel/Makefile | 2 + arch/i386/kernel/head.S | 2 +- arch/i386/kernel/io_apic.c | 2 +- arch/i386/kernel/setup.c | 9 + arch/i386/kernel/smpboot.c | 4 + arch/i386/kernel/vmi.c | 904 +++++++++++++++++++++++++++++++++++++ arch/i386/mm/pgtable.c | 2 + include/asm-i386/timer.h | 1 + include/asm-i386/vmi.h | 262 +++++++++++ 10 files changed, 1195 insertions(+), 2 deletions(-) create mode 100644 arch/i386/kernel/vmi.c create mode 100644 include/asm-i386/vmi.h diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 63d5e841caf5..a3b3f6ee3642 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -203,6 +203,15 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +config VMI + bool "VMI Paravirt-ops support" + depends on PARAVIRT + default y + help + VMI provides a paravirtualized interface to multiple hypervisors + include VMware ESX server and Xen by connecting to a ROM module + provided by the hypervisor. + config ACPI_SRAT bool default y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 1e8988e558c5..9cfb58911f14 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,8 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_VMI) += vmi.o + # Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 15336c8b5960..6c7f71176977 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -360,7 +360,7 @@ check_x87: * cpu_gdt_table and boot_pda; for secondary CPUs, these will be * that CPU's GDT and PDA. */ -setup_pda: +ENTRY(setup_pda) /* get the PDA pointer */ movl start_pda, %eax diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 1711f4e1093f..e30ccedad0b9 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from_mpc(void) static void __init setup_ioapic_ids_from_mpc(void) { } #endif -static int no_timer_check __initdata; +int no_timer_check __initdata; static int __init notimercheck(char *s) { diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 4694ac980cd2..bd8c218d94af 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,14 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = setup_memory(); +#ifdef CONFIG_VMI + /* + * Must be after max_low_pfn is determined, and before kernel + * pagetables are setup. + */ + vmi_init(); +#endif + /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. Although the diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 1908afa265b9..42502d820e4f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -63,6 +63,7 @@ #include #include #include +#include /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -545,6 +546,9 @@ static void __cpuinit start_secondary(void *unused) * booting is too fragile that we want to limit the * things done here to the most necessary things. 
*/ +#ifdef CONFIG_VMI + vmi_bringup(); +#endif secondary_cpu_init(); preempt_disable(); smp_callin(); diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c new file mode 100644 index 000000000000..a94d64b10f75 --- /dev/null +++ b/arch/i386/kernel/vmi.c @@ -0,0 +1,904 @@ +/* + * VMI specific paravirt-ops implementation + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to zach@vmware.com + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Convenient for calling VMI functions indirectly in the ROM */ +typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); +typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); + +#define call_vrom_func(rom,func) \ + (((VROMFUNC *)(rom->func))()) + +#define call_vrom_long_func(rom,func,arg) \ + (((VROMLONGFUNC *)(rom->func)) (arg)) + +static struct vrom_header *vmi_rom; +static int license_gplok; +static int disable_nodelay; +static int disable_pge; +static int disable_pse; +static int disable_sep; +static int disable_tsc; +static int disable_mtrr; + +/* Cached VMI operations */ +struct { + void (*cpuid)(void /* non-c */); + void (*_set_ldt)(u32 selector); + void (*set_tr)(u32 selector); + void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*allocate_page)(u32, u32, u32, u32, u32); + void (*release_page)(u32, u32); + void (*set_pte)(pte_t, pte_t *, unsigned); + void (*update_pte)(pte_t *, unsigned); + void (*set_linear_mapping)(int, u32, u32, u32); + void (*flush_tlb)(int); + void (*set_initial_ap_state)(int, int); +} vmi_ops; + +/* XXX move this to alternative.h */ +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; + +/* + * VMI patching routines. 
+ */ +#define MNEM_CALL 0xe8 +#define MNEM_JMP 0xe9 +#define MNEM_RET 0xc3 + +static char irq_save_disable_callout[] = { + MNEM_CALL, 0, 0, 0, 0, + MNEM_CALL, 0, 0, 0, 0, + MNEM_RET +}; +#define IRQ_PATCH_INT_MASK 0 +#define IRQ_PATCH_DISABLE 5 + +static inline void patch_offset(unsigned char *eip, unsigned char *dest) +{ + *(unsigned long *)(eip+1) = dest-eip-5; +} + +static unsigned patch_internal(int call, unsigned len, void *insns) +{ + u64 reloc; + struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, call); + switch(rel->type) { + case VMI_RELOCATION_CALL_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_JUMP_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_JMP; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_NOP: + /* obliterate the whole thing */ + return 0; + + case VMI_RELOCATION_NONE: + /* leave native code in place */ + break; + + default: + BUG(); + } + return len; +} + +/* + * Apply patch if appropriate, return length of new instruction + * sequence. The callee does nop padding for us. + */ +static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + switch (type) { + case PARAVIRT_IRQ_DISABLE: + return patch_internal(VMI_CALL_DisableInterrupts, len, insns); + case PARAVIRT_IRQ_ENABLE: + return patch_internal(VMI_CALL_EnableInterrupts, len, insns); + case PARAVIRT_RESTORE_FLAGS: + return patch_internal(VMI_CALL_SetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS: + return patch_internal(VMI_CALL_GetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: + if (len >= 10) { + patch_internal(VMI_CALL_GetInterruptMask, len, insns); + patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); + return 10; + } else { + /* + * You bastards didn't leave enough room to + * patch save_flags_irq_disable inline. 
Patch + * to a helper + */ + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, irq_save_disable_callout); + return 5; + } + case PARAVIRT_INTERRUPT_RETURN: + return patch_internal(VMI_CALL_IRET, len, insns); + case PARAVIRT_STI_SYSEXIT: + return patch_internal(VMI_CALL_SYSEXIT, len, insns); + default: + break; + } + return len; +} + +/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ +static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int override = 0; + if (*eax == 1) + override = 1; + asm volatile ("call *%6" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + if (override) { + if (disable_pse) + *edx &= ~X86_FEATURE_PSE; + if (disable_pge) + *edx &= ~X86_FEATURE_PGE; + if (disable_sep) + *edx &= ~X86_FEATURE_SEP; + if (disable_tsc) + *edx &= ~X86_FEATURE_TSC; + if (disable_mtrr) + *edx &= ~X86_FEATURE_MTRR; + } +} + +static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) +{ + if (gdt[nr].a != new->a || gdt[nr].b != new->b) + write_gdt_entry(gdt, nr, new->a, new->b); +} + +static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); +} + +static void vmi_set_ldt(const void *addr, unsigned entries) +{ + unsigned cpu = smp_processor_id(); + u32 low, high; + + pack_descriptor(&low, &high, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); +} + +static void vmi_set_tr(void) +{ + vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); +} + +static void vmi_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); +} + +static void vmi_flush_tlb_user(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB); +} + +static void vmi_flush_tlb_kernel(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); +} + +/* Stub to do nothing at all; used for delays and unimplemented calls */ +static void vmi_nop(void) +{ +} + + +#ifdef CONFIG_DEBUG_PAGE_TYPE + +#ifdef CONFIG_X86_PAE +#define MAX_BOOT_PTS (2048+4+1) +#else +#define MAX_BOOT_PTS (1024+1) +#endif + +/* + * During boot, mem_map is not yet available in paging_init, so stash + * all the boot page allocations here. 
+ */ +static struct { + u32 pfn; + int type; +} boot_page_allocations[MAX_BOOT_PTS]; +static int num_boot_page_allocations; +static int boot_allocations_applied; + +void vmi_apply_boot_page_allocations(void) +{ + int i; + BUG_ON(!mem_map); + for (i = 0; i < num_boot_page_allocations; i++) { + struct page *page = pfn_to_page(boot_page_allocations[i].pfn); + page->type = boot_page_allocations[i].type; + page->type = boot_page_allocations[i].type & + ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + } + boot_allocations_applied = 1; +} + +static void record_page_type(u32 pfn, int type) +{ + BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); + boot_page_allocations[num_boot_page_allocations].pfn = pfn; + boot_page_allocations[num_boot_page_allocations].type = type; + num_boot_page_allocations++; +} + +static void check_zeroed_page(u32 pfn, int type, struct page *page) +{ + u32 *ptr; + int i; + int limit = PAGE_SIZE / sizeof(int); + + if (page_address(page)) + ptr = (u32 *)page_address(page); + else + ptr = (u32 *)__va(pfn << PAGE_SHIFT); + /* + * When cloning the root in non-PAE mode, only the userspace + * pdes need to be zeroed. + */ + if (type & VMI_PAGE_CLONE) + limit = USER_PTRS_PER_PGD; + for (i = 0; i < limit; i++) + BUG_ON(ptr[i]); +} + +/* + * We stash the page type into struct page so we can verify the page + * types are used properly. + */ +static void vmi_set_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - don't track */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + if (type != VMI_PAGE_NORMAL) + BUG_ON(page->type); + else + BUG_ON(page->type == VMI_PAGE_NORMAL); + page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (type & VMI_PAGE_ZEROED) + check_zeroed_page(pfn, type, page); + } else { + record_page_type(pfn, type); + } +} + +static void vmi_check_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - skip checks */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + BUG_ON((page->type ^ type) & VMI_PAGE_PAE); + BUG_ON(type == VMI_PAGE_NORMAL && page->type); + BUG_ON((type & page->type) == 0); + } +} +#else +#define vmi_set_page_type(p,t) do { } while (0) +#define vmi_check_page_type(p,t) do { } while (0) +#endif + +static void vmi_allocate_pt(u32 pfn) +{ + vmi_set_page_type(pfn, VMI_PAGE_L1); + vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); +} + +static void vmi_allocate_pd(u32 pfn) +{ + /* + * This call comes in very early, before mem_map is setup. + * It is called only for swapper_pg_dir, which already has + * data on it. + */ + vmi_set_page_type(pfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); +} + +static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +{ + vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); + vmi_check_page_type(clonepfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); +} + +static void vmi_release_pt(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L1); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +static void vmi_release_pd(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L2); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +/* + * Helper macros for MMU update flags. 
We can defer updates until a flush + * or page invalidation only if the update is to the current address space + * (otherwise, there is no flush). We must check against init_mm, since + * this could be a kernel update, which usually passes init_mm, although + * sometimes this check can be skipped if we know the particular function + * is only called on user mode PTEs. We could change the kernel to pass + * current->active_mm here, but in particular, I was unsure if changing + * mm/highmem.c to do this would still be correct on other architectures. + */ +#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ + (!mustbeuser && (mm) == &init_mm)) +#define vmi_flags_addr(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) +#define vmi_flags_addr_defer(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) + +static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pte(pte_t *ptep, pte_t pte) +{ + /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); + vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ +#ifdef CONFIG_X86_PAE + const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); +#else + const pte_t pte = { pmdval.pud.pgd.pgd }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); +#endif + vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); +} + +#ifdef CONFIG_X86_PAE + +static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + /* + * XXX This is called from set_pmd_pte, but at both PT + * and PD layers so the VMI_PAGE_PT flag is wrong. But + * it is only called for large page mapping changes, + * the Xen backend, doesn't support large pages, and the + * ESX backend doesn't depend on the flag. 
+ */ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); + vmi_ops.update_pte(ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); +} + +static void vmi_set_pud(pud_t *pudp, pud_t pudval) +{ + /* Um, eww */ + const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); + vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); +} + +static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +void vmi_pmd_clear(pmd_t *pmd) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); + vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); +} +#endif + +#ifdef CONFIG_SMP +struct vmi_ap_state ap; +extern void setup_pda(void); + +static void __init /* XXX cpu hotplug */ +vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, + unsigned long start_esp) +{ + /* Default everything to zero. This is fine for most GPRs. */ + memset(&ap, 0, sizeof(struct vmi_ap_state)); + + ap.gdtr_limit = GDT_SIZE - 1; + ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); + + ap.idtr_limit = IDT_ENTRIES * 8 - 1; + ap.idtr_base = (unsigned long) idt_table; + + ap.ldtr = 0; + + ap.cs = __KERNEL_CS; + ap.eip = (unsigned long) start_eip; + ap.ss = __KERNEL_DS; + ap.esp = (unsigned long) start_esp; + + ap.ds = __USER_DS; + ap.es = __USER_DS; + ap.fs = __KERNEL_PDA; + ap.gs = 0; + + ap.eflags = 0; + + setup_pda(); + +#ifdef CONFIG_X86_PAE + /* efer should match BSP efer. */ + if (cpu_has_nx) { + unsigned l, h; + rdmsr(MSR_EFER, l, h); + ap.efer = (unsigned long long) h << 32 | l; + } +#endif + + ap.cr3 = __pa(swapper_pg_dir); + /* Protected mode, paging, AM, WP, NE, MP. */ + ap.cr0 = 0x80050023; + ap.cr4 = mmu_cr4_features; + vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid); +} +#endif + +static inline int __init check_vmi_rom(struct vrom_header *rom) +{ + struct pci_header *pci; + struct pnp_header *pnp; + const char *manufacturer = "UNKNOWN"; + const char *product = "UNKNOWN"; + const char *license = "unspecified"; + + if (rom->rom_signature != 0xaa55) + return 0; + if (rom->vrom_signature != VMI_SIGNATURE) + return 0; + if (rom->api_version_maj != VMI_API_REV_MAJOR || + rom->api_version_min+1 < VMI_API_REV_MINOR+1) { + printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", + rom->api_version_maj, + rom->api_version_min); + return 0; + } + + /* + * Relying on the VMI_SIGNATURE field is not 100% safe, so check + * the PCI header and device type to make sure this is really a + * VMI device. + */ + if (!rom->pci_header_offs) { + printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); + return 0; + } + + pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); + if (pci->vendorID != PCI_VENDOR_ID_VMWARE || + pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { + /* Allow it to run... 
anyways, but warn */ + printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); + } + + if (rom->pnp_header_offs) { + pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); + if (pnp->manufacturer_offset) + manufacturer = (const char *)rom+pnp->manufacturer_offset; + if (pnp->product_offset) + product = (const char *)rom+pnp->product_offset; + } + + if (rom->license_offs) + license = (char *)rom+rom->license_offs; + + printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", + manufacturer, product, + rom->api_version_maj, rom->api_version_min, + pci->rom_version_maj, pci->rom_version_min); + + license_gplok = license_is_gpl_compatible(license); + if (!license_gplok) { + printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... " + "inlining disabled\n", + license); + add_taint(TAINT_PROPRIETARY_MODULE); + } + return 1; +} + +/* + * Probe for the VMI option ROM + */ +static inline int __init probe_vmi_rom(void) +{ + unsigned long base; + + /* VMI ROM is in option ROM area, check signature */ + for (base = 0xC0000; base < 0xE0000; base += 2048) { + struct vrom_header *romstart; + romstart = (struct vrom_header *)isa_bus_to_virt(base); + if (check_vmi_rom(romstart)) { + vmi_rom = romstart; + return 1; + } + } + return 0; +} + +/* + * VMI setup common to all processors + */ +void vmi_bringup(void) +{ + /* We must establish the lowmem mapping for MMU ops to work */ + if (vmi_rom) + vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); +} + +/* + * Return a pointer to the VMI function or a NOP stub + */ +static void *vmi_get_function(int vmicall) +{ + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); + BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); + if (rel->type == VMI_RELOCATION_CALL_REL) + return (void *)rel->eip; + else + return (void *)vmi_nop; +} + +/* + * Helper macro for making the VMI paravirt-ops fill code readable. + * For unimplemented operations, fall back to default. + */ +#define para_fill(opname, vmicall) \ +do { \ + reloc = call_vrom_long_func(vmi_rom, get_reloc, \ + VMI_CALL_##vmicall); \ + if (rel->type != VMI_RELOCATION_NONE) { \ + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \ + paravirt_ops.opname = (void *)rel->eip; \ + } \ +} while (0) + +/* + * Activate the VMI interface and switch into paravirtualized mode + */ +static inline int __init activate_vmi(void) +{ + short kernel_cs; + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + + if (call_vrom_func(vmi_rom, vmi_init) != 0) { + printk(KERN_ERR "VMI ROM failed to initialize!"); + return 0; + } + savesegment(cs, kernel_cs); + + paravirt_ops.paravirt_enabled = 1; + paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + + paravirt_ops.patch = vmi_patch; + paravirt_ops.name = "vmi"; + + /* + * Many of these operations are ABI compatible with VMI. + * This means we can fill in the paravirt-ops with direct + * pointers into the VMI ROM. If the calling convention for + * these operations changes, this code needs to be updated. + * + * Exceptions + * CPUID paravirt-op uses pointers, not the native ISA + * halt has no VMI equivalent; all VMI halts are "safe" + * no MSR support yet - just trap and emulate. 
VMI uses the + * same ABI as the native ISA, but Linux wants exceptions + * from bogus MSR read / write handled + * rdpmc is not yet used in Linux + */ + + /* CPUID is special, so very special */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.cpuid = (void *)rel->eip; + paravirt_ops.cpuid = vmi_cpuid; + } + + para_fill(clts, CLTS); + para_fill(get_debugreg, GetDR); + para_fill(set_debugreg, SetDR); + para_fill(read_cr0, GetCR0); + para_fill(read_cr2, GetCR2); + para_fill(read_cr3, GetCR3); + para_fill(read_cr4, GetCR4); + para_fill(write_cr0, SetCR0); + para_fill(write_cr2, SetCR2); + para_fill(write_cr3, SetCR3); + para_fill(write_cr4, SetCR4); + para_fill(save_fl, GetInterruptMask); + para_fill(restore_fl, SetInterruptMask); + para_fill(irq_disable, DisableInterrupts); + para_fill(irq_enable, EnableInterrupts); + /* irq_save_disable !!! sheer pain */ + patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], + (char *)paravirt_ops.save_fl); + patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], + (char *)paravirt_ops.irq_disable); + para_fill(safe_halt, Halt); + para_fill(wbinvd, WBINVD); + /* paravirt_ops.read_msr = vmi_rdmsr */ + /* paravirt_ops.write_msr = vmi_wrmsr */ + para_fill(read_tsc, RDTSC); + /* paravirt_ops.rdpmc = vmi_rdpmc */ + + /* TR interface doesn't pass TR value */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_tr = (void *)rel->eip; + paravirt_ops.load_tr_desc = vmi_set_tr; + } + + /* LDT is special, too */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops._set_ldt = (void *)rel->eip; + paravirt_ops.set_ldt = vmi_set_ldt; + } + + para_fill(load_gdt, SetGDT); + para_fill(load_idt, SetIDT); + para_fill(store_gdt, GetGDT); + para_fill(store_idt, GetIDT); + para_fill(store_tr, GetTR); + paravirt_ops.load_tls = vmi_load_tls; + para_fill(write_ldt_entry, WriteLDTEntry); + para_fill(write_gdt_entry, WriteGDTEntry); + para_fill(write_idt_entry, WriteIDTEntry); + reloc = call_vrom_long_func(vmi_rom, get_reloc, + VMI_CALL_UpdateKernelStack); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_kernel_stack = (void *)rel->eip; + paravirt_ops.load_esp0 = vmi_load_esp0; + } + + para_fill(set_iopl_mask, SetIOPLMask); + paravirt_ops.io_delay = (void *)vmi_nop; + if (!disable_nodelay) { + paravirt_ops.const_udelay = (void *)vmi_nop; + } + + para_fill(set_lazy_mode, SetLazyMode); + + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_ops.flush_tlb = (void *)rel->eip; + paravirt_ops.flush_tlb_user = vmi_flush_tlb_user; + paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel; + } + para_fill(flush_tlb_single, InvalPage); + + /* + * Until a standard flag format can be agreed on, we need to + * implement these as wrappers in Linux. Get the VMI ROM + * function pointers for the two backend calls. 
+ */ +#ifdef CONFIG_X86_PAE + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); +#else + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); +#endif + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); + vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); + vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); + + paravirt_ops.alloc_pt = vmi_allocate_pt; + paravirt_ops.alloc_pd = vmi_allocate_pd; + paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + paravirt_ops.release_pt = vmi_release_pt; + paravirt_ops.release_pd = vmi_release_pd; + paravirt_ops.set_pte = vmi_set_pte; + paravirt_ops.set_pte_at = vmi_set_pte_at; + paravirt_ops.set_pmd = vmi_set_pmd; + paravirt_ops.pte_update = vmi_update_pte; + paravirt_ops.pte_update_defer = vmi_update_pte_defer; +#ifdef CONFIG_X86_PAE + paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; + paravirt_ops.set_pte_present = vmi_set_pte_present; + paravirt_ops.set_pud = vmi_set_pud; + paravirt_ops.pte_clear = vmi_pte_clear; + paravirt_ops.pmd_clear = vmi_pmd_clear; +#endif + /* + * These MUST always be patched. Don't support indirect jumps + * through these operations, as the VMI interface may use either + * a jump or a call to get to these operations, depending on + * the backend. They are performance critical anyway, so requiring + * a patch is not a big problem. + */ + paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; + paravirt_ops.iret = (void *)0xbadbab0; + +#ifdef CONFIG_SMP + paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook; + vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead); + paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite); + paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite); +#endif + + /* + * Alternative instruction rewriting doesn't happen soon enough + * to convert VMI_IRET to a call instead of a jump; so we have + * to do this before IRQs get reenabled. Fortunately, it is + * idempotent. 
+ */ + apply_paravirt(__start_parainstructions, __stop_parainstructions); + + vmi_bringup(); + + return 1; +} + +#undef para_fill + +void __init vmi_init(void) +{ + unsigned long flags; + + if (!vmi_rom) + probe_vmi_rom(); + else + check_vmi_rom(vmi_rom); + + /* In case probing for or validating the ROM failed, basil */ + if (!vmi_rom) + return; + + reserve_top_address(-vmi_rom->virtual_top); + + local_irq_save(flags); + activate_vmi(); +#ifdef CONFIG_SMP + no_timer_check = 1; +#endif + local_irq_restore(flags & X86_EFLAGS_IF); +} + +static int __init parse_vmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "disable_nodelay")) + disable_nodelay = 1; + else if (!strcmp(arg, "disable_pge")) { + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + disable_pge = 1; + } else if (!strcmp(arg, "disable_pse")) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else if (!strcmp(arg, "disable_sep")) { + clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + disable_sep = 1; + } else if (!strcmp(arg, "disable_tsc")) { + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + disable_tsc = 1; + } else if (!strcmp(arg, "disable_mtrr")) { + clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + disable_mtrr = 1; + } + return 0; +} + +early_param("vmi", parse_vmi); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index b5f538f52272..fa0cfbd551e1 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) void reserve_top_address(unsigned long reserve) { BUG_ON(fixmaps > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); #ifdef CONFIG_COMPAT_VDSO BUG_ON(reserve != 0); #else diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index d0ebd05f8516..1ee64e34cd35 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -8,6 +8,7 @@ void setup_pit_timer(void); /* Modifiers for buggy PIT handling */ extern int pit_latch_buggy; extern int timer_ack; +extern int no_timer_check; extern int recalibrate_cpu_khz(void); #endif diff --git a/include/asm-i386/vmi.h b/include/asm-i386/vmi.h new file mode 100644 index 000000000000..43c89333037e --- /dev/null +++ b/include/asm-i386/vmi.h @@ -0,0 +1,262 @@ +/* + * VMI interface definition + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Maintained by: Zachary Amsden zach@vmware.com + * + */ +#include + +/* + *--------------------------------------------------------------------- + * + * VMI Option ROM API + * + *--------------------------------------------------------------------- + */ +#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */ + +#define PCI_VENDOR_ID_VMWARE 0x15AD +#define PCI_DEVICE_ID_VMWARE_VMI 0x0801 + +/* + * We use two version numbers for compatibility, with the major + * number signifying interface breakages, and the minor number + * interface extensions. + */ +#define VMI_API_REV_MAJOR 3 +#define VMI_API_REV_MINOR 0 + +#define VMI_CALL_CPUID 0 +#define VMI_CALL_WRMSR 1 +#define VMI_CALL_RDMSR 2 +#define VMI_CALL_SetGDT 3 +#define VMI_CALL_SetLDT 4 +#define VMI_CALL_SetIDT 5 +#define VMI_CALL_SetTR 6 +#define VMI_CALL_GetGDT 7 +#define VMI_CALL_GetLDT 8 +#define VMI_CALL_GetIDT 9 +#define VMI_CALL_GetTR 10 +#define VMI_CALL_WriteGDTEntry 11 +#define VMI_CALL_WriteLDTEntry 12 +#define VMI_CALL_WriteIDTEntry 13 +#define VMI_CALL_UpdateKernelStack 14 +#define VMI_CALL_SetCR0 15 +#define VMI_CALL_SetCR2 16 +#define VMI_CALL_SetCR3 17 +#define VMI_CALL_SetCR4 18 +#define VMI_CALL_GetCR0 19 +#define VMI_CALL_GetCR2 20 +#define VMI_CALL_GetCR3 21 +#define VMI_CALL_GetCR4 22 +#define VMI_CALL_WBINVD 23 +#define VMI_CALL_SetDR 24 +#define VMI_CALL_GetDR 25 +#define VMI_CALL_RDPMC 26 +#define VMI_CALL_RDTSC 27 +#define VMI_CALL_CLTS 28 +#define VMI_CALL_EnableInterrupts 29 +#define VMI_CALL_DisableInterrupts 30 +#define VMI_CALL_GetInterruptMask 31 +#define VMI_CALL_SetInterruptMask 32 +#define VMI_CALL_IRET 33 +#define VMI_CALL_SYSEXIT 34 +#define VMI_CALL_Halt 35 +#define VMI_CALL_Reboot 36 +#define VMI_CALL_Shutdown 37 +#define VMI_CALL_SetPxE 38 +#define VMI_CALL_SetPxELong 39 +#define VMI_CALL_UpdatePxE 40 +#define VMI_CALL_UpdatePxELong 41 +#define VMI_CALL_MachineToPhysical 42 +#define VMI_CALL_PhysicalToMachine 43 +#define VMI_CALL_AllocatePage 44 +#define VMI_CALL_ReleasePage 45 +#define VMI_CALL_InvalPage 46 +#define VMI_CALL_FlushTLB 47 +#define VMI_CALL_SetLinearMapping 48 + +#define VMI_CALL_SetIOPLMask 61 +#define VMI_CALL_SetInitialAPState 62 +#define VMI_CALL_APICWrite 63 +#define VMI_CALL_APICRead 64 +#define VMI_CALL_SetLazyMode 73 + +/* + *--------------------------------------------------------------------- + * + * MMU operation flags + * + *--------------------------------------------------------------------- + */ + +/* Flags used by VMI_{Allocate|Release}Page call */ +#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */ +#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */ +#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */ + + +/* Flags shared by Allocate|Release Page and PTE updates */ +#define VMI_PAGE_PT 0x01 +#define VMI_PAGE_PD 0x02 +#define VMI_PAGE_PDP 0x04 +#define VMI_PAGE_PML4 0x08 + +#define VMI_PAGE_NORMAL 0x00 /* for debugging */ + +/* Flags used by PTE updates */ +#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */ +#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */ +#define VMI_PAGE_VA_MASK 0xfffff000 + +#ifdef CONFIG_X86_PAE +#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED) +#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED) +#else +#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED) +#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED) +#endif + +/* Flags used by VMI_FlushTLB call */ +#define VMI_FLUSH_TLB 0x01 +#define VMI_FLUSH_GLOBAL 0x02 + +/* + 
*--------------------------------------------------------------------- + * + * VMI relocation definitions for ROM call get_reloc + * + *--------------------------------------------------------------------- + */ + +/* VMI Relocation types */ +#define VMI_RELOCATION_NONE 0 +#define VMI_RELOCATION_CALL_REL 1 +#define VMI_RELOCATION_JUMP_REL 2 +#define VMI_RELOCATION_NOP 3 + +#ifndef __ASSEMBLY__ +struct vmi_relocation_info { + unsigned char *eip; + unsigned char type; + unsigned char reserved[3]; +}; +#endif + + +/* + *--------------------------------------------------------------------- + * + * Generic ROM structures and definitions + * + *--------------------------------------------------------------------- + */ + +#ifndef __ASSEMBLY__ + +struct vrom_header { + u16 rom_signature; // option ROM signature + u8 rom_length; // ROM length in 512 byte chunks + u8 rom_entry[4]; // 16-bit code entry point + u8 rom_pad0; // 4-byte align pad + u32 vrom_signature; // VROM identification signature + u8 api_version_min;// Minor version of API + u8 api_version_maj;// Major version of API + u8 jump_slots; // Number of jump slots + u8 reserved1; // Reserved for expansion + u32 virtual_top; // Hypervisor virtual address start + u16 reserved2; // Reserved for expansion + u16 license_offs; // Offset to License string + u16 pci_header_offs;// Offset to PCI OPROM header + u16 pnp_header_offs;// Offset to PnP OPROM header + u32 rom_pad3; // PnP reserverd / VMI reserved + u8 reserved[96]; // Reserved for headers + char vmi_init[8]; // VMI_Init jump point + char get_reloc[8]; // VMI_GetRelocationInfo jump point +} __attribute__((packed)); + +struct pnp_header { + char sig[4]; + char rev; + char size; + short next; + short res; + long devID; + unsigned short manufacturer_offset; + unsigned short product_offset; +} __attribute__((packed)); + +struct pci_header { + char sig[4]; + short vendorID; + short deviceID; + short vpdData; + short size; + char rev; + char class; + char subclass; + char interface; + short chunks; + char rom_version_min; + char rom_version_maj; + char codetype; + char lastRom; + short reserved; +} __attribute__((packed)); + +/* Function prototypes for bootstrapping */ +extern void vmi_init(void); +extern void vmi_bringup(void); +extern void vmi_apply_boot_page_allocations(void); + +/* State needed to start an application processor in an SMP system. */ +struct vmi_ap_state { + u32 cr0; + u32 cr2; + u32 cr3; + u32 cr4; + + u64 efer; + + u32 eip; + u32 eflags; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u16 cs; + u16 ss; + u16 ds; + u16 es; + u16 fs; + u16 gs; + u16 ldtr; + + u16 gdtr_limit; + u32 gdtr_base; + u32 idtr_base; + u16 idtr_limit; +}; + +#endif From bbab4f3bb7f528d2b8ccb5de9ae5f6ff3fb29684 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 27/94] [PATCH] i386: vMI timer patches VMI timer code. It works by taking over the local APIC clock when APIC is configured, which requires a couple hooks into the APIC code. The backend timer code could be commonized into the timer infrastructure, but there are some pieces missing (stolen time, in particular), and the exact semantics of when to do accounting for NO_IDLE need to be shared between different hypervisors as well. So for now, VMI timer is a separate module. 
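A quick aside on the arithmetic vmi_time_init() performs below: the ROM reports a single cycle frequency, and the jiffy length, alarm period and cpu_khz are all derived from it by integer division. A self-contained sketch with an invented frequency (the kernel uses do_div() for the 64-bit divides; plain division is fine in user space):

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical 2.2 GHz, as GetCycleFrequency might report */
		unsigned long long cycles_per_sec = 2200000000ULL;
		unsigned int hz = 250;		/* kernel tick rate (CONFIG_HZ) */
		unsigned int alarm_hz = 100;	/* VMI alarm rate (CONFIG_VMI_ALARM_HZ) */

		unsigned long long cycles_per_jiffy = cycles_per_sec / hz;
		unsigned long long cycles_per_alarm = cycles_per_sec / alarm_hz;
		unsigned long long cpu_khz = cycles_per_sec / 1000;	/* cycles per msec */

		printf("jiffy = %llu cycles, alarm = %llu cycles, cpu_khz = %llu\n",
		       cycles_per_jiffy, cycles_per_alarm, cpu_khz);
		return 0;
	}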
[Adrian Bunk: cleanups] Subject: VMI timer patches Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Rusty Russell Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/Kconfig | 9 + arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/apic.c | 2 +- arch/i386/kernel/entry.S | 5 + arch/i386/kernel/paravirt.c | 2 + arch/i386/kernel/smpboot.c | 4 +- arch/i386/kernel/time.c | 4 +- arch/i386/kernel/tsc.c | 4 + arch/i386/kernel/vmi.c | 45 ++++ arch/i386/kernel/vmitime.c | 495 ++++++++++++++++++++++++++++++++++++ include/asm-i386/apic.h | 2 + include/asm-i386/paravirt.h | 12 + include/asm-i386/time.h | 1 + include/asm-i386/timer.h | 2 + include/asm-i386/vmi_time.h | 103 ++++++++ 15 files changed, 687 insertions(+), 5 deletions(-) create mode 100644 arch/i386/kernel/vmitime.c create mode 100644 include/asm-i386/vmi_time.h diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index a3b3f6ee3642..595fb771366e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1272,3 +1272,12 @@ config X86_TRAMPOLINE config KTIME_SCALAR bool default y + +config NO_IDLE_HZ + bool + depends on PARAVIRT + default y + help + Switches the regular HZ timer off when the system is going idle. + This helps a hypervisor detect that the Linux system is idle, + reducing the overhead of idle systems. diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 9cfb58911f14..97f1e961d684 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_VMI) += vmi.o +obj-$(CONFIG_VMI) += vmi.o vmitime.o # Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776d9be26af9..629c5ed94260 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -1395,7 +1395,7 @@ int __init APIC_init_uniprocessor (void) if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); #endif - setup_boot_APIC_clock(); + setup_boot_clock(); return 0; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 8c6a22a42d2e..d4b4ffc9eacb 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -626,6 +626,11 @@ ENTRY(name) \ /* The include is where all of the SMP etc. 
interrupts come from */ #include "entry_arch.h" +/* This alternate entry is needed because we hijack the apic LVTT */ +#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) +BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) +#endif + KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 5bf81059a7e6..2003733310dc 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -544,6 +544,8 @@ struct paravirt_ops paravirt_ops = { .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, + .setup_boot_clock = setup_boot_APIC_clock, + .setup_secondary_clock = setup_secondary_APIC_clock, #endif .set_lazy_mode = (void *)native_nop, diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 42502d820e4f..5a00b07e7194 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -554,7 +554,7 @@ static void __cpuinit start_secondary(void *unused) smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); - setup_secondary_APIC_clock(); + setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); enable_NMI_through_LVT0(NULL); @@ -1331,7 +1331,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) smpboot_setup_io_apic(); - setup_boot_APIC_clock(); + setup_boot_clock(); /* * Synchronize the TSC with the AP diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index c505b16c0990..9603ccaba997 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -232,6 +232,7 @@ EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +int no_sync_cmos_clock; static void sync_cmos_clock(unsigned long dummy) { @@ -275,7 +276,8 @@ static void sync_cmos_clock(unsigned long dummy) void notify_arch_cmos_timer(void) { - mod_timer(&sync_cmos_timer, jiffies + 1); + if (!no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } static long clock_cmos_diff; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 2cfc7b09b925..12fef14995a5 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -23,6 +23,7 @@ * an extra value to store the TSC freq */ unsigned int tsc_khz; +unsigned long long (*custom_sched_clock)(void); int tsc_disable; @@ -107,6 +108,9 @@ unsigned long long sched_clock(void) { unsigned long long this_offset; + if (unlikely(custom_sched_clock)) + return (*custom_sched_clock)(); + /* * in the NUMA case we dont use the TSC as they are not * synchronized across all CPUs. 
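The custom_sched_clock hook added to tsc.c above is a plain function-pointer override: NULL selects the native TSC path, and a paravirt backend that wants its own time accounting simply stores a routine there. A user-space sketch of the same pattern (vmi_clock() and its return value are invented):

	#include <stdio.h>

	static unsigned long long (*custom_sched_clock)(void);	/* NULL = native path */

	static unsigned long long vmi_clock(void)
	{
		return 123456789ULL;	/* stand-in for the "available" cycle counter */
	}

	static unsigned long long sched_clock_demo(void)
	{
		if (custom_sched_clock)
			return custom_sched_clock();
		return 0;		/* native TSC-based path, elided here */
	}

	int main(void)
	{
		printf("native:   %llu\n", sched_clock_demo());
		custom_sched_clock = vmi_clock;		/* as vmi.c does at activation */
		printf("override: %llu\n", sched_clock_demo());
		return 0;
	}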
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index a94d64b10f75..bb5a7abf949c 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -34,6 +34,7 @@ #include #include #include +#include /* Convenient for calling VMI functions indirectly in the ROM */ typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); @@ -67,6 +68,7 @@ struct { void (*set_linear_mapping)(int, u32, u32, u32); void (*flush_tlb)(int); void (*set_initial_ap_state)(int, int); + void (*halt)(void); } vmi_ops; /* XXX move this to alternative.h */ @@ -252,6 +254,19 @@ static void vmi_nop(void) { } +/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ +#ifdef CONFIG_NO_IDLE_HZ +static fastcall void vmi_safe_halt(void) +{ + int idle = vmi_stop_hz_timer(); + vmi_ops.halt(); + if (idle) { + local_irq_disable(); + vmi_account_time_restart_hz_timer(); + local_irq_enable(); + } +} +#endif #ifdef CONFIG_DEBUG_PAGE_TYPE @@ -727,7 +742,12 @@ static inline int __init activate_vmi(void) (char *)paravirt_ops.save_fl); patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], (char *)paravirt_ops.irq_disable); +#ifndef CONFIG_NO_IDLE_HZ para_fill(safe_halt, Halt); +#else + vmi_ops.halt = vmi_get_function(VMI_CALL_Halt); + paravirt_ops.safe_halt = vmi_safe_halt; +#endif para_fill(wbinvd, WBINVD); /* paravirt_ops.read_msr = vmi_rdmsr */ /* paravirt_ops.write_msr = vmi_wrmsr */ @@ -837,6 +857,31 @@ static inline int __init activate_vmi(void) paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite); #endif + /* + * Check for VMI timer functionality by probing for a cycle frequency method + */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; + vmi_timer_ops.get_cycle_counter = + vmi_get_function(VMI_CALL_GetCycleCounter); + vmi_timer_ops.get_wallclock = + vmi_get_function(VMI_CALL_GetWallclockTime); + vmi_timer_ops.wallclock_updated = + vmi_get_function(VMI_CALL_WallclockUpdated); + vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); + vmi_timer_ops.cancel_alarm = + vmi_get_function(VMI_CALL_CancelAlarm); + paravirt_ops.time_init = vmi_time_init; + paravirt_ops.get_wallclock = vmi_get_wallclock; + paravirt_ops.set_wallclock = vmi_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; + paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; +#endif + custom_sched_clock = vmi_sched_clock; + } + /* * Alternative instruction rewriting doesn't happen soon enough * to convert VMI_IRET to a call instead of a jump; so we have diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c new file mode 100644 index 000000000000..7c3033dbe5f5 --- /dev/null +++ b/arch/i386/kernel/vmitime.c @@ -0,0 +1,495 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to dhecht@vmware.com + * + */ + +/* + * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. + * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. + * See comments there for proper credits. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT +#else +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 +#endif + +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; + +#ifdef CONFIG_NO_IDLE_HZ + +/* /proc/sys/kernel/hz_timer state. */ +int sysctl_hz_timer; + +/* Some stats */ +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); +static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ +static int alarm_hz = CONFIG_VMI_ALARM_HZ; + +/* Cache of the value get_cycle_frequency / HZ. */ +static signed long long cycles_per_jiffy; + +/* Cache of the value get_cycle_frequency / alarm_hz. */ +static signed long long cycles_per_alarm; + +/* The number of cycles accounted for by the 'jiffies'/'xtime' count. + * Protected by xtime_lock. */ +static unsigned long long real_cycles_accounted_system; + +/* The number of cycles accounted for by update_process_times(), per cpu. */ +static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); + +/* The number of stolen cycles accounted, per cpu. */ +static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); + +/* Clock source. */ +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static cycle_t read_available_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +#if 0 +static cycle_t read_stolen_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); +} +#endif /* 0 */ + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .is_continuous = 1, +}; + + +/* Timer interrupt handler. 
*/ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); + +static struct irqaction vmi_timer_irq = { + vmi_timer_interrupt, + SA_INTERRUPT, + CPU_MASK_NONE, + "VMI-alarm", + NULL, + NULL +}; + +/* Alarm rate */ +static int __init vmi_timer_alarm_rate_setup(char* str) +{ + int alarm_rate; + if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { + alarm_hz = alarm_rate; + printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); + } + return 1; +} +__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); + + +/* Initialization */ +static void vmi_get_wallclock_ts(struct timespec *ts) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec units + ts->tv_nsec = do_div(wallclock, 1000000000); + ts->tv_sec = wallclock; +} + +static void update_xtime_from_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + do_settimeofday(&ts); +} + +unsigned long vmi_get_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + return ts.tv_sec; +} + +int vmi_set_wallclock(unsigned long now) +{ + return -1; +} + +unsigned long long vmi_sched_clock(void) +{ + return read_available_cycles(); +} + +void __init vmi_time_init(void) +{ + unsigned long long cycles_per_sec, cycles_per_msec; + + setup_irq(0, &vmi_timer_irq); +#ifdef CONFIG_X86_LOCAL_APIC + set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); +#endif + + no_sync_cmos_clock = 1; + + vmi_get_wallclock_ts(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + real_cycles_accounted_system = read_real_cycles(); + update_xtime_from_wallclock(); + per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); + + cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); + + cycles_per_jiffy = cycles_per_sec; + (void)do_div(cycles_per_jiffy, HZ); + cycles_per_alarm = cycles_per_sec; + (void)do_div(cycles_per_alarm, alarm_hz); + cycles_per_msec = cycles_per_sec; + (void)do_div(cycles_per_msec, 1000); + cpu_khz = cycles_per_msec; + + printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" + "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, + cycles_per_alarm); + + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + if (clocksource_register(&clocksource_vmi)) + printk(KERN_WARNING "Error registering VMITIME clocksource."); + + /* Disable PIT. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu + * reduce the latency calling update_process_times. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); +} + +#ifdef CONFIG_X86_LOCAL_APIC + +void __init vmi_timer_setup_boot_alarm(void) +{ + local_irq_disable(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); + local_irq_enable(); +} + +/* Initialize the time accounting variables for an AP on an SMP system. + * Also, set the local alarm for the AP. 
*/ +void __init vmi_timer_setup_secondary_alarm(void) +{ + int cpu = smp_processor_id(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); + + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); +} + +#endif + +/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ +static void vmi_account_real_cycles(unsigned long long cur_real_cycles) +{ + long long cycles_not_accounted; + + write_seqlock(&xtime_lock); + + cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; + while (cycles_not_accounted >= cycles_per_jiffy) { + /* systems wide jiffies and wallclock. */ + do_timer(1); + + cycles_not_accounted -= cycles_per_jiffy; + real_cycles_accounted_system += cycles_per_jiffy; + } + + if (vmi_timer_ops.wallclock_updated()) + update_xtime_from_wallclock(); + + write_sequnlock(&xtime_lock); +} + +/* Update per-cpu process times. */ +static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + /* Account time to the current process. This includes + * calling into the scheduler to decrement the timeslice + * and possibly reschedule.*/ + update_process_times(user_mode(regs)); + /* XXX handle /proc/profile multiplier. */ + profile_tick(CPU_PROFILING); + + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } +} + +#ifdef CONFIG_NO_IDLE_HZ +/* Update per-cpu idle times. Used when a no-hz halt is ended. */ +static void vmi_account_no_hz_idle_cycles(int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + unsigned long no_idle_hz_jiffies = 0; + + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + no_idle_hz_jiffies++; + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* Account time to the idle process. */ + account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); +} +#endif + +/* Update per-cpu stolen time. */ +static void vmi_account_stolen_cycles(int cpu, + unsigned long long cur_real_cycles, + unsigned long long cur_avail_cycles) +{ + long long stolen_cycles_not_accounted; + unsigned long stolen_jiffies = 0; + + if (cur_real_cycles < cur_avail_cycles) + return; + + stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - + per_cpu(stolen_cycles_accounted_cpu, cpu); + + while (stolen_cycles_not_accounted >= cycles_per_jiffy) { + stolen_jiffies++; + stolen_cycles_not_accounted -= cycles_per_jiffy; + per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* HACK: pass NULL to force time onto cpustat->steal. */ + account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); +} + +/* Body of either IRQ0 interrupt handler (UP no local-APIC) or + * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). 
*/ +static void vmi_local_timer_interrupt(int cpu) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu process times. */ + vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); +} + +#ifdef CONFIG_NO_IDLE_HZ + +/* Must be called only from idle loop, with interrupts disabled. */ +int vmi_stop_hz_timer(void) +{ + /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ + + unsigned long seq, next; + unsigned long long real_cycles_expiry; + int cpu = smp_processor_id(); + int idle; + + BUG_ON(!irqs_disabled()); + if (sysctl_hz_timer != 0) + return 0; + + cpu_set(cpu, nohz_cpu_mask); + smp_mb(); + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (next = next_timer_interrupt(), time_before_eq(next, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + next = jiffies; + idle = 0; + } else + idle = 1; + + /* Convert jiffies to the real cycle counter. */ + do { + seq = read_seqbegin(&xtime_lock); + real_cycles_expiry = real_cycles_accounted_system + + (long)(next - jiffies) * cycles_per_jiffy; + } while (read_seqretry(&xtime_lock, seq)); + + /* This cpu is going idle. Disable the periodic alarm. */ + if (idle) { + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + per_cpu(idle_start_jiffies, cpu) = jiffies; + } + + /* Set the real time alarm to expire at the next event. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, + real_cycles_expiry, 0); + + return idle; +} + +static void vmi_reenable_hz_timer(int cpu) +{ + /* For /proc/vmi/info idle_hz stat. */ + per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); + per_cpu(vmi_idle_no_hz_irqs, cpu)++; + + /* Don't bother explicitly cancelling the one-shot alarm -- at + * worse we will receive a spurious timer interrupt. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); + /* Indicate this cpu is no longer nohz idle. */ + cpu_clear(cpu, nohz_cpu_mask); +} + +/* Called from interrupt handlers when (local) HZ timer is disabled. */ +void vmi_account_time_restart_hz_timer(void) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + int cpu = smp_processor_id(); + + BUG_ON(!irqs_disabled()); + /* Account the time during which the HZ timer was disabled. */ + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu idle times. */ + vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); + /* Reenable the hz timer. */ + vmi_reenable_hz_timer(cpu); +} + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* UP (and no local-APIC) VMI-timer alarm interrupt handler. + * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after + * APIC setup and setup_boot_vmi_alarm() is called. 
*/ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + vmi_local_timer_interrupt(smp_processor_id()); + return IRQ_HANDLED; +} + +#ifdef CONFIG_X86_LOCAL_APIC + +/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. + * Also used in UP when CONFIG_X86_LOCAL_APIC. + * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ +void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat,cpu).apic_timer_irqs++; + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + vmi_local_timer_interrupt(cpu); + irq_exit(); + set_irq_regs(old_regs); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h index 41a44319905f..3a61206fd108 100644 --- a/include/asm-i386/apic.h +++ b/include/asm-i386/apic.h @@ -43,6 +43,8 @@ extern void generic_apic_probe(void); #define apic_write native_apic_write #define apic_write_atomic native_apic_write_atomic #define apic_read native_apic_read +#define setup_boot_clock setup_boot_APIC_clock +#define setup_secondary_clock setup_secondary_APIC_clock #endif static __inline fastcall void native_apic_write(unsigned long reg, diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 6ccf36499b2a..12ef95924da6 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -121,6 +121,8 @@ struct paravirt_ops void (fastcall *apic_write)(unsigned long reg, unsigned long v); void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); unsigned long (fastcall *apic_read)(unsigned long reg); + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); #endif void (fastcall *flush_tlb_user)(void); @@ -323,6 +325,16 @@ static inline unsigned long apic_read(unsigned long reg) { return paravirt_ops.apic_read(reg); } + +static inline void setup_boot_clock(void) +{ + paravirt_ops.setup_boot_clock(); +} + +static inline void setup_secondary_clock(void) +{ + paravirt_ops.setup_secondary_clock(); +} #endif #ifdef CONFIG_SMP diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h index ea8065af825a..571b4294dc2e 100644 --- a/include/asm-i386/time.h +++ b/include/asm-i386/time.h @@ -30,6 +30,7 @@ static inline int native_set_wallclock(unsigned long nowtime) #ifdef CONFIG_PARAVIRT #include +extern unsigned long long native_sched_clock(void); #else /* !CONFIG_PARAVIRT */ #define get_wallclock() native_get_wallclock() diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h index 1ee64e34cd35..4752c3a6a708 100644 --- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -9,6 +9,8 @@ void setup_pit_timer(void); extern int pit_latch_buggy; extern int timer_ack; extern int no_timer_check; +extern unsigned long long (*custom_sched_clock)(void); +extern int no_sync_cmos_clock; extern int recalibrate_cpu_khz(void); #endif diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h new file mode 100644 index 000000000000..c12931211007 --- /dev/null +++ b/include/asm-i386/vmi_time.h @@ -0,0 +1,103 @@ +/* + * VMI Time wrappers + * + * Copyright (C) 2006, VMware, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to dhecht@vmware.com + * + */ + +#ifndef __VMI_TIME_H +#define __VMI_TIME_H + +/* + * Raw VMI call indices for timer functions + */ +#define VMI_CALL_GetCycleFrequency 66 +#define VMI_CALL_GetCycleCounter 67 +#define VMI_CALL_SetAlarm 68 +#define VMI_CALL_CancelAlarm 69 +#define VMI_CALL_GetWallclockTime 70 +#define VMI_CALL_WallclockUpdated 71 + +/* Cached VMI timer operations */ +extern struct vmi_timer_ops { + u64 (*get_cycle_frequency)(void); + u64 (*get_cycle_counter)(int); + u64 (*get_wallclock)(void); + int (*wallclock_updated)(void); + void (*set_alarm)(u32 flags, u64 expiry, u64 period); + void (*cancel_alarm)(u32 flags); +} vmi_timer_ops; + +/* Prototypes */ +extern void __init vmi_time_init(void); +extern unsigned long vmi_get_wallclock(void); +extern int vmi_set_wallclock(unsigned long now); +extern unsigned long long vmi_sched_clock(void); + +#ifdef CONFIG_X86_LOCAL_APIC +extern void __init vmi_timer_setup_boot_alarm(void); +extern void __init vmi_timer_setup_secondary_alarm(void); +extern void apic_vmi_timer_interrupt(void); +#endif + +#ifdef CONFIG_NO_IDLE_HZ +extern int vmi_stop_hz_timer(void); +extern void vmi_account_time_restart_hz_timer(void); +#endif + +/* + * When run under a hypervisor, a vcpu is always in one of three states: + * running, halted, or ready. The vcpu is in the 'running' state if it + * is executing. When the vcpu executes the halt interface, the vcpu + * enters the 'halted' state and remains halted until there is some work + * pending for the vcpu (e.g. an alarm expires, host I/O completes on + * behalf of virtual I/O). At this point, the vcpu enters the 'ready' + * state (waiting for the hypervisor to reschedule it). Finally, at any + * time when the vcpu is not in the 'running' state nor the 'halted' + * state, it is in the 'ready' state. + * + * Real time is advances while the vcpu is 'running', 'ready', or + * 'halted'. Stolen time is the time in which the vcpu is in the + * 'ready' state. Available time is the remaining time -- the vcpu is + * either 'running' or 'halted'. + * + * All three views of time are accessible through the VMI cycle + * counters. + */ + +/* The cycle counters. 
*/ +#define VMI_CYCLES_REAL 0 +#define VMI_CYCLES_AVAILABLE 1 +#define VMI_CYCLES_STOLEN 2 + +/* The alarm interface 'flags' bits */ +#define VMI_ALARM_COUNTERS 2 + +#define VMI_ALARM_COUNTER_MASK 0x000000ff + +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 + +#define VMI_ALARM_IS_ONESHOT 0x00000000 +#define VMI_ALARM_IS_PERIODIC 0x00000100 + +#define CONFIG_VMI_ALARM_HZ 100 + +#endif From 7b3552024380f306a6c50d5105d18d9d4258fa4e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 28/94] [PATCH] i386: Profile pc badness Profile_pc was broken when using paravirtualization because the assumption the kernel was running at CPL 0 was violated, causing bad logic to read a random value off the stack. The only way to be in kernel lock functions is to be in kernel code, so validate that assumption explicitly by checking the CS value. We don't want to be fooled by BIOS / APM segments and try to read those stacks, so only match KERNEL_CS. I moved some stuff in segment.h to make it prettier. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/time.c | 10 ++++------ include/asm-i386/ptrace.h | 4 ++++ include/asm-i386/segment.h | 19 +++++++++++++------ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9603ccaba997..a4f67a6e6821 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); #else - unsigned long *sp; - if ((regs->xcs & 3) == 0) - sp = (unsigned long *)®s->esp; - else - sp = (unsigned long *)regs->esp; + unsigned long *sp = (unsigned long *)®s->esp; + /* Return address is either directly at stack pointer or above a saved eflags. Eflags has bits 22-31 zero, kernel addresses don't. */ diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index 1646996c73da..6002597b9e12 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -49,6 +49,10 @@ static inline int user_mode_vm(struct pt_regs *regs) { return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL; } +static inline int v8086_mode(struct pt_regs *regs) +{ + return (regs->eflags & VM_MASK); +} #define instruction_pointer(regs) ((regs)->eip) #define regs_return_value(regs) ((regs)->eax) diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h index 3c796af33776..065f10bfa487 100644 --- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -83,14 +83,8 @@ * The GDT has 32 entries */ #define GDT_ENTRIES 32 - #define GDT_SIZE (GDT_ENTRIES * 8) -/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ -#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) - /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 @@ -134,4 +128,17 @@ #ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif +/* + * Matching rules for certain types of segments. 
+ */ + +/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */ +#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8) + +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */ +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8) + +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) + #endif From ac3b6faff961dd52fde71fb199ec3cf68ba35052 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 29/94] [PATCH] i386: Kprobe rpl fix Kprobes bugfix for paravirt compatibility - RPL on the CS when inserting BPs must match running kernel. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen CC: Eric Biederman --- arch/i386/kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index b85cfa3ce1dd..b545bc746fce 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* fixup registers */ - regs->xcs = __KERNEL_CS; + regs->xcs = __KERNEL_CS | get_kernel_rpl(); regs->eip = trampoline_address; regs->orig_eax = 0xffffffff; From 90736e20e3805dd1ffff60e4750495944956cd44 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:21 +0100 Subject: [PATCH 30/94] [PATCH] i386: Vmi timer race Because timer code moves around, and we might eventually move our init to a late_time_init hook, save and restore IRQs around this code because it is definitely not interrupt safe. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/kernel/vmitime.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c index 7c3033dbe5f5..2e2d8dbcbd68 100644 --- a/arch/i386/kernel/vmitime.c +++ b/arch/i386/kernel/vmitime.c @@ -180,7 +180,9 @@ unsigned long long vmi_sched_clock(void) void __init vmi_time_init(void) { unsigned long long cycles_per_sec, cycles_per_msec; + unsigned long flags; + local_irq_save(flags); setup_irq(0, &vmi_timer_irq); #ifdef CONFIG_X86_LOCAL_APIC set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); @@ -224,6 +226,8 @@ void __init vmi_time_init(void) VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, cycles_per_alarm); + + local_irq_restore(flags); } #ifdef CONFIG_X86_LOCAL_APIC From 7c0b49f9d1d59b3638c884b346a92dcb4ea1560a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 31/94] [PATCH] i386: Paravirt debug defaults off Deliberate register clobber around performance critical inline code is great for testing, bad to leave on by default. Many people ship with DEBUG_KERNEL turned on, so stop making DEBUG_PARAVIRT default on. 
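To make "deliberate register clobber" concrete: one way to implement such a testing aid is to scribble on every register the calling convention allows a callee to trash, so callers that wrongly assume those registers survive a paravirt callout fail loudly under test instead of corrupting state intermittently in the field. The sketch below is a hypothetical user-space rendition (x86-64 registers, made-up DEBUG_CLOBBER_CALL_REGS macro), not the kernel's actual DEBUG_PARAVIRT implementation:

#include <stdio.h>

#ifdef __x86_64__
/* Hypothetical debug helper: trash the caller-clobbered registers. */
#define DEBUG_CLOBBER_CALL_REGS() \
	asm volatile("mov $0xdeadbeef, %%rax\n\t" \
		     "mov $0xdeadbeef, %%rcx\n\t" \
		     "mov $0xdeadbeef, %%rdx" \
		     ::: "rax", "rcx", "rdx")
#else
#define DEBUG_CLOBBER_CALL_REGS() do { } while (0)
#endif

static long counter;

static void irq_disable_hook(void)	/* stand-in for a paravirt callout */
{
	DEBUG_CLOBBER_CALL_REGS();
	counter++;
}

int main(void)
{
	irq_disable_hook();
	printf("hook ran, counter=%ld\n", counter);
	return 0;
}

Defaulting such an option to off is the right trade once it has flushed out the bugs: the extra clobbers defeat register allocation around every hook, which is exactly the performance-critical inline code the changelog mentions.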
Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/i386/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index f68cc6f215f8..458bc1611933 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -87,7 +87,7 @@ config DOUBLEFAULT config DEBUG_PARAVIRT bool "Enable some paravirtualization debugging" - default y + default n depends on PARAVIRT && DEBUG_KERNEL help Currently deliberately clobbers regs which are allowed to be From f8657e1b55901e6c227094258d1fa3642fa242bd Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 32/94] [PATCH] i386: move startup_32() in text.head section o Entry startup_32 was in .text section but it was accessing some init data too and it prompts MODPOST to generate compilation warnings. WARNING: vmlinux - Section mismatch: reference to .init.data:boot_params from .text between '_text' (at offset 0xc0100029) and 'startup_32_smp' WARNING: vmlinux - Section mismatch: reference to .init.data:boot_params from .text between '_text' (at offset 0xc0100037) and 'startup_32_smp' WARNING: vmlinux - Section mismatch: reference to .init.data:init_pg_tables_end from .text between '_text' (at offset 0xc0100099) and 'startup_32_smp' o Can't move startup_32 to .init.text as this entry point has to be at the start of bzImage. Hence moved startup_32 to a new section .text.head and instructed MODPOST to not to generate warnings if init data is being accessed from .text.head section. This code has been audited. o SMP boot up code (startup_32_smp) can go into .init.text if CPU hotplug is not supported. Otherwise it generates more warnings WARNING: vmlinux - Section mismatch: reference to .init.data:new_cpu_data from .text between 'checkCPUtype' (at offset 0xc0100126) and 'is486' WARNING: vmlinux - Section mismatch: reference to .init.data:new_cpu_data from .text between 'checkCPUtype' (at offset 0xc0100130) and 'is486' Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/head.S | 17 ++++++++++++++--- arch/i386/kernel/vmlinux.lds.S | 7 ++++++- scripts/mod/modpost.c | 10 +++++++++- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 6c7f71176977..734be5572eb9 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -53,6 +53,7 @@ * any particular GDT layout, because we load our own as soon as we * can. */ +.section .text.head,"ax",@progbits ENTRY(startup_32) #ifdef CONFIG_PARAVIRT @@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20); jb 10b movl %edi,(init_pg_tables_end - __PAGE_OFFSET) -#ifdef CONFIG_SMP xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f - /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but * we know the trampoline has already loaded the boot_gdt_table GDT * for us. 
+ * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later */ + +#ifdef CONFIG_HOTPLUG_CPU +.section .text,"ax",@progbits +#else +.section .init.text,"ax",@progbits +#endif + +#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -208,8 +218,8 @@ ENTRY(startup_32_smp) xorl %ebx,%ebx incl %ebx -3: #endif /* CONFIG_SMP */ +3: /* * Enable paging @@ -492,6 +502,7 @@ ignore_int: #endif iret +.section .text #ifdef CONFIG_PARAVIRT startup_paravirt: cld diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 5038a73d554e..ca51610955df 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -37,9 +37,14 @@ SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; + + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ + *(.text.head) + } :text = 0x9090 + /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ *(.text) SCHED_TEXT LOCK_TEXT diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2aa47623f5f8..569e68410d7a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -641,12 +641,20 @@ static int secref_whitelist(const char *modname, const char *tosec, if (f1 && f2) return 1; - /* Whitelist all references from .pci_fixup section if vmlinux */ + /* Whitelist all references from .pci_fixup section if vmlinux + * Whitelist all refereces from .text.head to .init.data if vmlinux + * Whitelist all refereces from .text.head to .init.text if vmlinux + */ if (is_vmlinux(modname)) { if ((strcmp(fromsec, ".pci_fixup") == 0) && (strcmp(tosec, ".init.text") == 0)) return 1; + if ((strcmp(fromsec, ".text.head") == 0) && + ((strcmp(tosec, ".init.data") == 0) || + (strcmp(tosec, ".init.text") == 0))) + return 1; + /* Check for pattern 3 */ for (s = pat3refsym; *s; s++) if (strcmp(refsymname, *s) == 0) From ee5bfa642a0d4b0f6ec6200bf96e5e647f93fcdb Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 33/94] [PATCH] generic: Break init() in two parts to avoid MODPOST warnings o init() is a non __init function in .text section but it calls many functions which are in .init.text section. Hence MODPOST generates lots of cross reference warnings on i386 if compiled with CONFIG_RELOCATABLE=y WARNING: vmlinux - Section mismatch: reference to .init.text:smp_prepare_cpus from .text between 'init' (at offset 0xc0101049) and 'rest_init' WARNING: vmlinux - Section mismatch: reference to .init.text:migration_init from .text between 'init' (at offset 0xc010104e) and 'rest_init' WARNING: vmlinux - Section mismatch: reference to .init.text:spawn_ksoftirqd from .text between 'init' (at offset 0xc0101053) and 'rest_init' o This patch breaks down init() in two parts. One part which can go in .init.text section and can be freed and other part which has to be non __init(init_post()). Now init() calls init_post() and init_post() does not call any functions present in .init sections. Hence getting rid of warnings. 
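The pattern generalizes: code that must survive free_initmem() has to be kept out of .init.text, and noinline is what stops the compiler from quietly pulling it back in. A toy user-space rendition is shown below; the section attribute mirrors what __init expands to in the kernel, though nothing is actually freed here:

#include <stdio.h>

#define __init   __attribute__((__section__(".init.text")))
#define noinline __attribute__((__noinline__))

/* Must stay in resident text: it keeps running after initmem is gone. */
static noinline int init_post(void)
{
	printf("init_post: resident text\n");
	return 0;
}

static int __init init(void)
{
	printf("init: discardable text\n");
	/* Without the noinline above, gcc could inline init_post()
	 * here, placing its code in .init.text and leaving the kernel
	 * executing freed memory after free_initmem(). */
	return init_post();
}

int main(void)
{
	return init();
}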
Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- init/main.c | 81 +++++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/init/main.c b/init/main.c index 4e9e92bb2b89..62ded9f3b393 100644 --- a/init/main.c +++ b/init/main.c @@ -727,7 +727,49 @@ static void run_init_process(char *init_filename) kernel_execve(init_filename, argv_init, envp_init); } -static int init(void * unused) +/* This is a non __init function. Force it to be noinline otherwise gcc + * makes it inline to init() and it becomes part of init.text section + */ +static int noinline init_post(void) +{ + free_initmem(); + unlock_kernel(); + mark_rodata_ro(); + system_state = SYSTEM_RUNNING; + numa_default_policy(); + + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + printk(KERN_WARNING "Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); + + if (ramdisk_execute_command) { + run_init_process(ramdisk_execute_command); + printk(KERN_WARNING "Failed to execute %s\n", + ramdisk_execute_command); + } + + /* + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. + */ + if (execute_command) { + run_init_process(execute_command); + printk(KERN_WARNING "Failed to execute %s. Attempting " + "defaults...\n", execute_command); + } + run_init_process("/sbin/init"); + run_init_process("/etc/init"); + run_init_process("/bin/init"); + run_init_process("/bin/sh"); + + panic("No init found. Try passing init= option to kernel."); +} + +static int __init init(void * unused) { lock_kernel(); /* @@ -775,39 +817,6 @@ static int init(void * unused) * we're essentially up and running. Get rid of the * initmem segments and start the user-mode stuff.. */ - free_initmem(); - unlock_kernel(); - mark_rodata_ro(); - system_state = SYSTEM_RUNNING; - numa_default_policy(); - - if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - printk(KERN_WARNING "Warning: unable to open an initial console.\n"); - - (void) sys_dup(0); - (void) sys_dup(0); - - if (ramdisk_execute_command) { - run_init_process(ramdisk_execute_command); - printk(KERN_WARNING "Failed to execute %s\n", - ramdisk_execute_command); - } - - /* - * We try each of these until one succeeds. - * - * The Bourne shell can be used instead of init if we are - * trying to recover a really broken machine. - */ - if (execute_command) { - run_init_process(execute_command); - printk(KERN_WARNING "Failed to execute %s. Attempting " - "defaults...\n", execute_command); - } - run_init_process("/sbin/init"); - run_init_process("/etc/init"); - run_init_process("/bin/init"); - run_init_process("/bin/sh"); - - panic("No init found. Try passing init= option to kernel."); + init_post(); + return 0; } From 86a978837ca739842317c4cf433de36aeb85ea3b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 34/94] [PATCH] i386: arch/i386/kernel/cpu/mcheck/mce.c should #include Every file should include the headers containing the prototypes for it's global functions. 
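The rule exists because a prototype the compiler never sees is a prototype it cannot check. A minimal sketch of what including the declaring header buys, with hypothetical function and header names:

#include <stdio.h>

/* What the owning header would provide: the prototype every caller
 * (and this file itself) compiles against. */
void thermal_init(int cpu);

/* Because that prototype is in scope here, any drift in the definition
 * below (say, changing the parameter to 'long') becomes a hard
 * "conflicting types" compile error in this file, instead of silent
 * breakage for callers built only against the header. */
void thermal_init(int cpu)
{
	printf("thermal_init on cpu %d\n", cpu);
}

int main(void)
{
	thermal_init(0);
	return 0;
}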
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index d555bec0db99..4f10c62d180c 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -12,6 +12,7 @@ #include #include +#include #include "mce.h" From 2ff2d3d74705d34ab71b21f54634fcf50d57bdd5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 35/94] [PATCH] i386: add idle notifier Add a notifier mechanism to the low level idle loop. You can register a callback function which gets invoked on entry and exit from the low level idle loop. The low level idle loop is defined as the polling loop, low-power call, or the mwait instruction. Interrupts processed by the idle thread are not considered part of the low level loop. The notifier can be used to measure precisely how much is spent in useless execution (or low power mode). The perfmon subsystem uses it to turn on/off monitoring. Signed-off-by: stephane eranian Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 4 +++ arch/i386/kernel/cpu/mcheck/p4.c | 2 ++ arch/i386/kernel/irq.c | 3 ++ arch/i386/kernel/process.c | 53 +++++++++++++++++++++++++++++++- arch/i386/kernel/smp.c | 2 ++ include/asm-i386/idle.h | 14 +++++++++ include/asm-i386/processor.h | 8 +++++ 7 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 include/asm-i386/idle.h diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 629c5ed94260..f4159e0a7ae9 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -1255,6 +1256,7 @@ fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ + exit_idle(); irq_enter(); smp_local_timer_interrupt(); irq_exit(); @@ -1305,6 +1307,7 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs) { unsigned long v; + exit_idle(); irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1329,6 +1332,7 @@ fastcall void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; + exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. 
-- REW */ v = apic_read(APIC_ESR); diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a46011..8359c19d3a23 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm fastcall void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); vendor_thermal_interrupt(regs); irq_exit(); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 3201d421090a..5785d84103a6 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -19,6 +19,8 @@ #include #include +#include + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -61,6 +63,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) union irq_ctx *curctx, *irqctx; u32 *isp; #endif + exit_idle(); if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 05be77413351..7845d480c293 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -48,6 +48,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -80,6 +81,42 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} + +static DEFINE_PER_CPU(volatile unsigned long, idle_state); + +void enter_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + __set_bit(0, &__get_cpu_var(idle_state)); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +void exit_idle(void) +{ + if (current->pid) + return; + __exit_idle(); +} + void disable_hlt(void) { hlt_counter++; @@ -130,6 +167,7 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { + local_irq_enable(); cpu_relax(); } @@ -189,7 +227,16 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. 
+ */ + local_irq_disable(); + enter_idle(); idle(); + __exit_idle(); } preempt_enable_no_resched(); schedule(); @@ -243,7 +290,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __sti_mwait(eax, ecx); + else + local_irq_enable(); + } else { + local_irq_enable(); } } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 5285aff8367f..ffc4f65c5189 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -23,6 +23,7 @@ #include #include +#include #include /* @@ -624,6 +625,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) /* * At this point the info structure may be out of scope unless wait==1 */ + exit_idle(); irq_enter(); (*func)(info); irq_exit(); diff --git a/include/asm-i386/idle.h b/include/asm-i386/idle.h new file mode 100644 index 000000000000..87ab93911199 --- /dev/null +++ b/include/asm-i386/idle.h @@ -0,0 +1,14 @@ +#ifndef _ASM_I386_IDLE_H +#define _ASM_I386_IDLE_H 1 + +#define IDLE_START 1 +#define IDLE_END 2 + +struct notifier_block; +void idle_notifier_register(struct notifier_block *n); +void idle_notifier_unregister(struct notifier_block *n); + +void exit_idle(void); +void enter_idle(void); + +#endif diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 11bf899de8aa..edfbe46a5e13 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -257,6 +257,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) : :"a" (eax), "c" (ecx)); } +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax,%ecx;" */ + asm volatile( + "sti; .byte 0x0f,0x01,0xc9;" + : :"a" (eax), "c" (ecx)); +} + extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); /* from system description table in BIOS. Mostly for MCA use, but From f9690982b8c2f9a2c65acdc113e758ec356676a3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 36/94] [PATCH] i386: improve sched_clock() on i686 Clean up sched_clock() on i686: it will use the TSC if available and falls back to jiffies only if the user asked for it to be disabled via notsc or the CPU calibration code didnt figure out the right cpu_khz. This generally makes the scheduler timestamps more finegrained, on all hardware. (the current scheduler is pretty resistant against asynchronous sched_clock() values on different CPUs, it will allow at most up to a jiffy of jitter.) Also simplify sched_clock()'s check for TSC availability: propagate the desire and ability to use the TSC into the tsc_disable flag, previously this flag only indicated whether the notsc option was passed. This makes the rare low-res sched_clock() codepath a single branch off a read-mostly flag. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/tsc.c | 22 ++++++++++++++-------- include/asm-i386/bugs.h | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 12fef14995a5..46f752a8bbf3 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -112,13 +112,10 @@ unsigned long long sched_clock(void) return (*custom_sched_clock)(); /* - * in the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. 
+ * Fall back to jiffies if there's no TSC available: */ -#ifndef CONFIG_NUMA - if (!cpu_khz || check_tsc_unstable()) -#endif - /* no locking but a rare wrong value is not a big deal */ + if (unlikely(tsc_disable)) + /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ @@ -198,13 +195,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); void __init tsc_init(void) { if (!cpu_has_tsc || tsc_disable) - return; + goto out_no_tsc; cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; if (!cpu_khz) - return; + goto out_no_tsc; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, @@ -212,6 +209,15 @@ void __init tsc_init(void) set_cyc2ns_scale(cpu_khz); use_tsc_delay(); + return; + +out_no_tsc: + /* + * Set the tsc_disable flag if there's no TSC support, this + * makes it a fast flag for the kernel to see whether it + * should be using the TSC. + */ + tsc_disable = 1; } #ifdef CONFIG_CPU_FREQ diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 38f1aebbbdb5..c90c7c499302 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -160,7 +160,7 @@ static void __init check_config(void) * If we configured ourselves for a TSC, we'd better have one! */ #ifdef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (!cpu_has_tsc && !tsc_disable) panic("Kernel compiled for Pentium+, requires TSC feature!"); #endif From 3b3d5e1db66cd66148b2cebd2c38aff2a8df03d6 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 37/94] [PATCH] i386: romsignature/checksum cleanup Use adding __init to romsignature() (it's only called from probe_roms() which is itself __init) as an excuse to submit a pedantic cleanup. Signed-off-by: Rene Herman Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/e820.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 9ded1e49119e..70f39560846a 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -157,21 +157,22 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static int romsignature(const unsigned char *x) +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) { unsigned short sig; - int ret = 0; - if (probe_kernel_address((const unsigned short *)x, sig) == 0) - ret = (sig == 0xaa55); - return ret; + + return probe_kernel_address((const unsigned short *)rom, sig) == 0 && + sig == ROMSIGNATURE; } static int __init romchecksum(unsigned char *rom, unsigned long length) { - unsigned char *p, sum = 0; + unsigned char sum; - for (p = rom; p < rom + length; p++) - sum += *p; + for (sum = 0; length; length--) + sum += *rom++; return sum == 0; } From 53fee04f318222a3179ca5933d8bda82c1eef17a Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 38/94] [PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole This patch resolves the issue of running with numa=fake=X on kernel command line on x86_64 machines that have big IO hole. While calculating the size of each node now we look at the total hole size in that range. Previously there were nodes that only had IO holes in them causing kernel boot problems. We now use the NODE_MIN_SIZE (64MB) as the minimum size of memory that any node must have. 
We reduce the number of allocated nodes if the number of nodes specified on kernel command line results in any node getting memory smaller than NODE_MIN_SIZE. This change allows the extra memory to be incremented in NODE_MIN_SIZE granule and uniformly distribute among as many nodes (called big nodes) as possible. [akpm@osdl.org: build fix] Signed-off-by: David Rientjes Signed-off-by: Paul Menage Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 31 ++++++++++ arch/x86_64/mm/numa.c | 110 +++++++++++++++++++++++++++++++----- include/asm-x86_64/e820.h | 1 + include/asm-x86_64/mmzone.h | 5 ++ 4 files changed, 133 insertions(+), 14 deletions(-) diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 9d67955bbc31..4651fd22b213 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -190,6 +190,37 @@ unsigned long __init e820_end_of_ram(void) return end_pfn; } +/* + * Find the hole size in the range. + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram); +} + /* * Mark e820 reserved areas as busy for the resource manager. */ diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 1ec16ea97519..d3f747dd61d3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -272,31 +272,113 @@ void __init numa_init_array(void) } #ifdef CONFIG_NUMA_EMU +/* Numa emulation */ int numa_fake __initdata = 0; -/* Numa emulation */ +/* + * This function is used to find out if the start and end correspond to + * different zones. + */ +int zone_cross_over(unsigned long start, unsigned long end) +{ + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && + (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)< 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. 
+ */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index 855fb4a454b6..6216fa3f2802 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void); extern void e820_print_map(char *who); extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern unsigned long e820_hole_size(unsigned long start, unsigned long end); extern void e820_setup_gap(void); extern void e820_register_active_regions(int nid, diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index 39ef106986eb..fb558fb1d211 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) extern int pfn_valid(unsigned long pfn); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE (64*1024*1024) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) +#endif + #endif #endif From c49c5330c9592f29a69bb2ea8f6e7fd5d9c151e8 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 39/94] [PATCH] x86-64: Remove fastcall references in x86_64 code Unlike x86, x86_64 already passes arguments in registers. The use of regparm attribute makes no difference in produced code, and the use of fastcall just bloats the code. 
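For readers who have not met these annotations: on i386, fastcall/FASTCALL expanded to regparm(3), moving the first three integer arguments off the stack and into %eax/%edx/%ecx, while on x86-64 the psABI already passes the first six integer arguments in registers, so the attribute can only add noise. A small sketch of the difference; the macro definition shown reflects the i386 convention of the era and should be treated as illustrative:

#include <stdio.h>

#ifdef __i386__
/* i386: ask gcc to pass the first three args in %eax/%edx/%ecx. */
#define fastcall __attribute__((regparm(3)))
#else
/* x86-64: the ABI already uses %rdi/%rsi/%rdx/... for arguments, so
 * the attribute would add nothing -- which is why the patch drops it. */
#define fastcall
#endif

static fastcall long add3(long a, long b, long c)
{
	return a + b + c;
}

int main(void)
{
	printf("add3 = %ld\n", add3(1, 2, 3));
	return 0;
}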
Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/acpi/sleep.c | 2 +- arch/x86_64/kernel/x8664_ksyms.c | 4 ++-- include/asm-x86_64/hw_irq.h | 2 +- include/asm-x86_64/mutex.h | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 5ebf62c7a3d2..23178ce6c783 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0; unsigned long acpi_video_flags; extern char wakeup_start, wakeup_end; -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); +extern unsigned long acpi_copy_wakeup_routine(unsigned long); static pgd_t low_ptr; diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 23a7da312f30..0dffae69f4ad 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); #ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +extern void __write_lock_failed(rwlock_t *rw); +extern void __read_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); #endif diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h index 179cce755aa7..552df5f10a6d 100644 --- a/include/asm-x86_64/hw_irq.h +++ b/include/asm-x86_64/hw_irq.h @@ -91,7 +91,7 @@ extern void enable_8259A_irq(unsigned int irq); extern int i8259A_irq_pending(unsigned int irq); extern void make_8259A_irq(unsigned int irq); extern void init_8259A(int aeoi); -extern void FASTCALL(send_IPI_self(int vector)); +extern void send_IPI_self(int vector); extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); diff --git a/include/asm-x86_64/mutex.h b/include/asm-x86_64/mutex.h index 16396b1de3e4..6c2949a3c677 100644 --- a/include/asm-x86_64/mutex.h +++ b/include/asm-x86_64/mutex.h @@ -21,7 +21,7 @@ do { \ unsigned long dummy; \ \ typecheck(atomic_t *, v); \ - typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ \ __asm__ __volatile__( \ LOCK_PREFIX " decl (%%rdi) \n" \ @@ -47,7 +47,7 @@ do { \ */ static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int fastcall (*fail_fn)(atomic_t *)) + int (*fail_fn)(atomic_t *)) { if (unlikely(atomic_dec_return(count) < 0)) return fail_fn(count); @@ -67,7 +67,7 @@ do { \ unsigned long dummy; \ \ typecheck(atomic_t *, v); \ - typecheck_fn(fastcall void (*)(atomic_t *), fail_fn); \ + typecheck_fn(void (*)(atomic_t *), fail_fn); \ \ __asm__ __volatile__( \ LOCK_PREFIX " incl (%%rdi) \n" \ From 4c3cbf75b262433afc90b5c35510d1e5744d3b94 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 40/94] [PATCH] x86-64: Use constant instead of raw number in x86_64 ioperm.c This is a tiny cleanup to increase readability Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/ioport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index fe063d3cfe42..745b1f0f494e 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int 
level, struct pt_regs *regs) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); return 0; } From 1676193937a538fdb92a2916a86a705093cfd613 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 41/94] [PATCH] x86-64: Handle 32 bit PerfMon Counter writes cleanly in x86_64 nmi_watchdog P6 CPUs and Core/Core 2 CPUs, which have the 'architectural perf mon' feature, only support writes of the low 32 bits of the Performance Monitoring Counters. Bits 32..39 are sign-extended based on bit 31 and bits 40..63 are reserved and should be zero. This patch changes the x86_64 NMI handler to handle this case cleanly. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9cb42ecb7f89..e59cda134166 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + unsigned int retval = hz; + + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { + retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + return retval; +} + int __init check_nmi_watchdog (void) { int *counts; @@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. 
- * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } kfree(counts); @@ -634,7 +642,9 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -855,15 +865,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* ARCH_PERFMON has 32 bit counter writes */ + wrmsr(wd->perfctr_msr, + (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); + } else { + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. From 90ce4bc4542c10b63dc6482ac920ff1226a6e5ff Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH 42/94] [PATCH] i386: Handle 32 bit PerfMon Counter writes cleanly in i386 nmi_watchdog Change i386 nmi handler to handle 32 bit perfmon counter MSR writes cleanly. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 64 +++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 1a6f8bb8881c..7d3f4e22d6fb 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -216,6 +216,28 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -281,18 +303,10 @@ static int __init check_nmi_watchdog(void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. 
- * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - nmi_hz = count + 1; + + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } } @@ -442,6 +456,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) wrmsrl(perfctr_msr, 0 - count); } +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + /* Note that these events don't tick when the CPU idles. This means the frequency varies with CPU load. */ @@ -531,7 +556,8 @@ static int setup_p6_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -704,7 +730,8 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -956,6 +983,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { @@ -964,9 +993,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * other P6 variant. * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL); + } else { + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. From 44264261d8fb87849118e41b2735bd95db28126f Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 43/94] [PATCH] i386: Handle 32 bit PerfMon Counter writes cleanly in oprofile Handle these 32 bit perfmon counter MSR writes cleanly in oprofile. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Andi Kleen --- arch/i386/oprofile/op_model_ppro.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c index ca2447e05e15..c554f52cb808 100644 --- a/arch/i386/oprofile/op_model_ppro.c +++ b/arch/i386/oprofile/op_model_ppro.c @@ -24,7 +24,8 @@ #define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 
1 : 0) #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) -#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) +#define CTR_32BIT_WRITE(l,msrs,c) \ + do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) #define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) @@ -79,7 +80,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs,i))) continue; - CTR_WRITE(1, msrs, i); + CTR_32BIT_WRITE(1, msrs, i); } /* enable active counters */ @@ -87,7 +88,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) { reset_value[i] = counter_config[i].count; - CTR_WRITE(counter_config[i].count, msrs, i); + CTR_32BIT_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -116,7 +117,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + CTR_32BIT_WRITE(reset_value[i], msrs, i); } } From 57d307720c9a60038f134b0567ca302b88313a0a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 44/94] [PATCH] x86-64: cleanup Doc/x86_64/ files Fix typos. Lots of whitespace changes for readability and consistency. Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen --- Documentation/x86_64/boot-options.txt | 27 ++++++++++----------------- Documentation/x86_64/cpu-hotplug-spec | 2 +- Documentation/x86_64/kernel-stacks | 26 +++++++++++++------------- Documentation/x86_64/mm.txt | 22 +++++++++++----------- 4 files changed, 35 insertions(+), 42 deletions(-) diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 0d653993f361..625a21db0c2a 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -226,9 +226,9 @@ IOMMU (input/output memory management unit) is 20. memaper[=] Allocate an own aperture over RAM with size 32MB< Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 45/94] [PATCH] x86-64: list x86_64 quilt tree List x86_64 quilt tree in MAINTAINERS. Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f85c603b02a3..011dd1db3e1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3768,6 +3768,7 @@ P: Andi Kleen M: ak@suse.de L: discuss@x86-64.org W: http://www.x86-64.org +T: quilt ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt-current S: Maintained YAM DRIVER FOR AX.25 From 9b355897562fe2291248a7aec8e479c2c98cf117 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 46/94] [PATCH] x86: simplify notify_page_fault() Remove all parameters from this function that aren't really variable. 
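The same folding applies anywhere every call site passes identical constants. A minimal user-space sketch of the refactoring (illustrative names and values only, not the kernel's types):

#include <stdio.h>

/* Before: every caller repeats constants that never vary. */
static int notify(const char *str, int trapnr, int signr, long err)
{
	return printf("%s: trap %d, signal %d, err %ld\n", str, trapnr, signr, err);
}

/* After: the constants live in one place; callers pass only what varies. */
static int notify_page_fault_demo(long err)
{
	return notify("page fault", 14, 11 /* SIGSEGV on x86 Linux */, err);
}

int main(void)
{
	notify_page_fault_demo(4);	/* only the error code still varies */
	return 0;
}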
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/fault.c | 18 ++++++++---------- arch/x86_64/mm/fault.c | 18 ++++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cba9b3894a33..b8c4e259fc8b 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -46,17 +46,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* @@ -327,8 +327,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) return; - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -337,8 +336,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* It's safe to allow irq's after cr2 has been saved and the vmalloc diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 49e8cf2e06f8..6ada7231f3ab 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* Sometimes the CPU reports invalid exceptions on prefetch. @@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. 
If we fixup a prefetch @@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; if (likely(regs->eflags & X86_EFLAGS_IF)) From 24ce0e96f2dea558762c994d054ea2f3c01fa95a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 47/94] [PATCH] x86-64: Tighten mce_amd driver MSR reads while debugging an unrelated problem in Xen, I noticed odd reads from non-existent MSRs. Having now found time to look why these happen, I came up with below patch, which - prevents accessing MCi_MISCj with j > 0 when the block pointer in MCi_MISC0 is zero - accesses only contiguous MCi_MISCj until a non-implemented one is found - doesn't touch unimplemented blocks in mce_threshold_interrupt at all - gives names to two bits previously derived from MASK_VALID_HI (it took me some time to understand the code without this) The first three items, besides being apparently closer to the spec, should namely help cutting down on the time mce_threshold_interrupt() takes. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/mce_amd.c | 40 ++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 93c707257637..cd8dbe57b33a 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -37,6 +37,8 @@ #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 #define MASK_LVTOFF_HI 0x00F00000 #define MASK_COUNT_EN_HI 0x00080000 #define MASK_INT_TYPE_HI 0x00060000 @@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (!block) @@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -206,8 +216,8 @@ asmlinkage void mce_threshold_interrupt(void) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (high & MASK_OVERFLOW_HI) { @@ -385,7 +395,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; 
if (rdmsr_safe(address, &low, &high)) - goto recurse; + return 0; if (!(high & MASK_VALID_HI)) { if (block) @@ -394,8 +404,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) goto recurse; b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); From a98f0dd34d94ea0b5f3816196bea5dba467827bb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 48/94] [PATCH] x86-64: Allow to run a program when a machine check event is detected When a machine check event is detected (including an AMD RevF threshold overflow event), allow a "trigger" program to be run. This allows user space to react to such events sooner. The trigger is configured using a new trigger entry in the machinecheck sysfs interface. It is currently shared between all CPUs. I also fixed the AMD threshold handler to run the machine check polling code immediately, to actually log any events that might have caused the threshold interrupt. Also added some documentation for the mce sysfs interface. Signed-off-by: Andi Kleen --- Documentation/x86_64/machinecheck | 70 +++++++++++++++++++++++++++++++ arch/x86_64/kernel/mce.c | 66 +++++++++++++++++++++++------ arch/x86_64/kernel/mce_amd.c | 4 ++ include/asm-x86_64/mce.h | 2 + kernel/kmod.c | 44 ++++++++++++------- 5 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 Documentation/x86_64/machinecheck diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck new file mode 100644 index 000000000000..068a6d9904b9 --- /dev/null +++ b/Documentation/x86_64/machinecheck @@ -0,0 +1,70 @@ + +Configurable sysfs parameters for the x86-64 machine check code. + +Machine checks report internal hardware error conditions detected +by the CPU. Uncorrected errors typically cause a machine check +(often with panic), corrected ones cause a machine check log entry. + +Machine checks are organized in banks (normally associated with +a hardware subsystem) and subevents in a bank. The exact meaning +of the banks and subevents is CPU specific. + +mcelog knows how to decode them. + +When you see the "Machine check errors logged" message in the system +log, mcelog should be run to collect and decode machine check entries +from /dev/mcelog. Normally mcelog should be run regularly from a cronjob. + +Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN +(N = CPU number) + +The directory contains some configurable entries: + +Entries: + +bankNctl +(N = bank number) + 64-bit hex bitmask enabling/disabling specific subevents for bank N + When a bit in the bitmask is zero then the respective + subevent will not be reported. + By default all events are enabled. + Note that the BIOS maintains another mask to disable specific events + per bank. This is not visible here. + +The following entries appear for each CPU, but they are truly shared +between all CPUs. + +check_interval + How often to poll for corrected machine check errors, in seconds + (Note output is hexadecimal). Default 5 minutes. + +tolerant + Tolerance level. When a machine check exception occurs for a non + corrected machine check, the kernel can take different actions. + Since machine check exceptions can happen at any time, it is sometimes + risky for the kernel to kill a process because it defies + normal kernel locking rules.
The tolerance level configures + how hard the kernel tries to recover even at some risk of deadlock. + + 0: always panic, + 1: panic if deadlock possible, + 2: try to avoid panic, + 3: never panic or exit (for testing only) + + Default: 1 + + Note this only makes a difference if the CPU allows recovery + from a machine check exception. Current x86 CPUs generally do not. + +trigger + Program to run when a machine check event is detected. + This is an alternative to running mcelog regularly from cron + and allows events to be detected faster. + +TBD document entries for AMD threshold interrupt configuration + +For more details about the x86 machine check architecture +see the Intel and AMD architecture manuals from their developer websites. + +For more details about the architecture +see http://one.firstfloor.org/~andi/mce.pdf diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bdb54a2c9f18..8011a8e1c7d4 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,10 @@ static unsigned long console_logged; static int notify_user; static int rip_msr; static int mce_bootlog = 1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; /* * Lockless MCE logging infrastructure. @@ -57,6 +62,7 @@ struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { @@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +static void do_mce_trigger(void) +{ + static atomic_t mce_logged; + int events = atomic_read(&mce_events); + if (events != atomic_read(&mce_logged) && trigger[0]) { + /* Small race window, but should be harmless. */ + atomic_set(&mce_logged, events); + call_usermodehelper(trigger, trigger_argv, NULL, -1); + } +} + /* * The actual machine check handler */ @@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) + if (!regs) { + /* Normal interrupt context here. Call trigger for any new + events. */ + do_mce_trigger(); goto out; + } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe.
*/ @@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); +/* TBD should generate these dynamically based on number of available banks */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static struct sysdev_attribute * bank_attributes[NR_BANKS] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; + +static ssize_t show_trigger(struct sys_device *s, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +{ + char *p; + int len; + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + if (p) *p = 0; + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant, &attr_check_interval, &attr_trigger, + NULL +}; /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) err = sysdev_register(&per_cpu(device_mce,cpu)); if (!err) { - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_create_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); } return err; } @@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu) { int i; - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index cd8dbe57b33a..d0bd5d66e103 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void) (high & MASK_LOCKED_HI)) continue; + /* Log the machine check that caused the threshold + event.
*/ + do_machine_check(NULL, 0); + if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h index 5a11146d6d9c..177e92b4019b 100644 --- a/include/asm-x86_64/mce.h +++ b/include/asm-x86_64/mce.h @@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status); extern atomic_t mce_entry; +extern void do_machine_check(struct pt_regs *, long); + #endif #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -217,7 +217,10 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + if (sub_info->wait < 0) + kfree(sub_info); + else + complete(sub_info->complete); return 0; } @@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); + if (wait < 0) + return; + if (pid < 0) { sub_info->retval = pid; complete(sub_info->complete); @@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) * @envp: null-terminated environment list * @session_keyring: session keyring for process (NULL for an empty keyring) * @wait: wait for the application to finish and return status. + * when -1 don't wait at all, but you get no useful error back when + * the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. @@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .ring = session_keyring, - .wait = wait, - .retval = 0, - }; + struct subprocess_info *sub_info; + int retval; if (!khelper_wq) return -EBUSY; @@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &sub_info.work); + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + return -ENOMEM; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->complete = &done; + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + sub_info->ring = session_keyring; + sub_info->wait = wait; + + queue_work(khelper_wq, &sub_info->work); + if (wait < 0) /* task has freed sub_info */ + return 0; wait_for_completion(&done); - return sub_info.retval; + retval = sub_info->retval; + kfree(sub_info); + return retval; } EXPORT_SYMBOL(call_usermodehelper_keys); From 930f8b8bcde30b501fdf00fb7624aefb9bf35f47 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 49/94] [PATCH] x86-64: remove get_pmd() Function is dead. 
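The kmod.c change above deserves a concrete illustration: with wait < 0 the caller fires off the helper and never learns whether the exec succeeded. A user-space sketch of the same fire-and-forget contract, using fork/exec in place of the kernel's khelper queue (the helper path is just a stand-in):

#include <signal.h>
#include <unistd.h>

static int run_helper_nowait(char *path, char *argv[])
{
	pid_t pid;

	signal(SIGCHLD, SIG_IGN);	/* let the kernel reap the child */
	pid = fork();
	if (pid < 0)
		return -1;		/* could not even start the helper */
	if (pid == 0) {
		execv(path, argv);
		_exit(127);		/* exec failure is invisible to the caller */
	}
	return 0;			/* "launched", nothing more is known */
}

int main(void)
{
	char *argv[] = { "/bin/true", NULL };	/* stand-in for an mce trigger */

	return run_helper_nowait(argv[0], argv);
}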
Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- include/asm-x86_64/pgalloc.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 43d4c333a8b1..4e28b6060a5e 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -18,11 +18,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *p set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); } -static inline pmd_t *get_pmd(void) -{ - return (pmd_t *)get_zeroed_page(GFP_KERNEL); -} - static inline void pmd_free(pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); From 8c40ad02e5b026902b8ce134f895b3b09803db39 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 50/94] [PATCH] i386: Small cleanup to TLB flush code - Remove outdated comment - Use cpu_relax() in a busy loop Signed-off-by: Andi Kleen --- arch/i386/kernel/smp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index ffc4f65c5189..9bd9637ae692 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -375,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. + * AK: x86-64 has a faster method that could be ported. */ spin_lock(&tlbstate_lock); @@ -401,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; From edf8dd36b53fdd558bc9a8ac5be793d27e110f90 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 51/94] [PATCH] x86-64: Kconfig typos Some typos in Kconfig. Signed-off-by: Nicolas Kaiser Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index a55382a1bb46..7982cbc3bc94 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -152,18 +152,18 @@ config MPSC Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs with Intel Extended Memory 64 Technology(EM64T). For details see . - Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distingush them + Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the + Netburst core and shouldn't use this option. You can distinguish them using the cpu family field - in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one - (this rule only applies to system that support EM64T) + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one + (this rule only applies to systems that support EM64T) config MCORE2 bool "Intel Core2 / newer Xeon" help Optimize for Intel Core2 and newer Xeons (51xx) - You can distingush the newer Xeons from the older ones using - the cpu family field in /proc/cpuinfo. 15 is a older Xeon + You can distinguish the newer Xeons from the older ones using + the cpu family field in /proc/cpuinfo. 15 is an older Xeon (use CONFIG_MPSC then), 6 is a newer one. This rule only applies to CPUs that support EM64T. 
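The cpu_relax() change in the TLB-flush cleanup above is the canonical busy-wait idiom. A user-space sketch of the same loop, assuming x86 (the PAUSE hint is what cpu_relax() expands to there):

#include <stdatomic.h>

static atomic_int flush_done;

/* Spin until another thread signals completion. PAUSE tells the CPU
 * this is a busy-wait, saving power and helping SMT siblings; it
 * replaces the heavier mb() the old loop used. */
static void wait_for_flush(void)
{
	while (!atomic_load_explicit(&flush_done, memory_order_acquire))
		__builtin_ia32_pause();	/* ~ cpu_relax() on x86 */
}

int main(void)
{
	atomic_store_explicit(&flush_done, 1, memory_order_release);
	wait_for_flush();	/* returns immediately once the flag is set */
	return 0;
}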
From d958f143329e685d114725b64fe6bef22994c74c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 52/94] [PATCH] i386: use smp_call_function_single() It will execute rdmsr and wrmsr only on the cpu we need. Signed-off-by: Alexey Dobriyan Signed-off-by: Andi Kleen --- arch/i386/kernel/msr.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 4e14264f392a..bcaa6e9b6197 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) #ifdef CONFIG_SMP struct msr_command { - int cpu; int err; u32 reg; u32 data[2]; @@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); } static void msr_smp_rdmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); } static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) @@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) if (cpu == smp_processor_id()) { ret = wrmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data[0] = eax; cmd.data[1] = edx; - smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1); ret = cmd.err; } preempt_enable(); @@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) if (cpu == smp_processor_id()) { ret = rdmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; - smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1); *eax = cmd.data[0]; *edx = cmd.data[1]; From ad4e680fb2220518de5118a8e734240d4c374fe2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 53/94] [PATCH] i386: use smp_call_function_single() It will execute cpuid only on the cpu we need.
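Both conversions share one shape: instead of interrupting every CPU and filtering in the handler, route the call to the one CPU that matters. A runnable miniature of that shape, with plain function calls standing in for IPIs:

#include <stdio.h>

#define NR_WORKERS 4

struct command { int target; int data; };

/* Old shape: every worker is interrupted; all but one return at once. */
static void broadcast_style(int self, struct command *cmd)
{
	if (cmd->target != self)
		return;		/* a wasted wakeup */
	printf("worker %d handles %d\n", self, cmd->data);
}

/* New shape: the caller routes the call to exactly one worker. */
static void single_style(int target, struct command *cmd)
{
	printf("worker %d handles %d\n", target, cmd->data);
}

int main(void)
{
	struct command cmd = { 2, 42 };
	int i;

	for (i = 0; i < NR_WORKERS; i++)
		broadcast_style(i, &cmd);	/* smp_call_function() analogue */
	single_style(cmd.target, &cmd);		/* smp_call_function_single() analogue */
	return 0;
}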
Signed-off-by: Alexey Dobriyan Signed-off-by: Andi Kleen --- arch/i386/kernel/cpuid.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index 4da75fa3208d..eeae0d992337 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -48,7 +48,6 @@ static struct class *cpuid_class; #ifdef CONFIG_SMP struct cpuid_command { - int cpu; u32 reg; u32 *data; }; @@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block) { struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], + cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); } @@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data) if (cpu == smp_processor_id()) { cpuid(reg, &data[0], &data[1], &data[2], &data[3]); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data = data; - smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); } preempt_enable(); } From b0957f1a3a7687bfaf5b0bfe402b50985ea2f06b Mon Sep 17 00:00:00 2001 From: Josef 'Jeff' Sipek Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 54/94] [PATCH] x86-64: Fix preprocessor condition Old code was legal standard C, but apparently not sparse-C. Signed-off-by: Josef 'Jeff' Sipek Signed-off-by: Andi Kleen --- include/asm-x86_64/io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86_64/io.h b/include/asm-x86_64/io.h index f5d84bb7c948..de2cd9a2303a 100644 --- a/include/asm-x86_64/io.h +++ b/include/asm-x86_64/io.h @@ -100,7 +100,7 @@ __OUTS(l) #define IO_SPACE_LIMIT 0xffff -#if defined(__KERNEL__) && __x86_64__ +#if defined(__KERNEL__) && defined(__x86_64__) #include From 016d6f35803667ffbe3e7bba8b58a6b611fac998 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 55/94] [PATCH] i386: Convert /proc/apm to seqfile Byte-to-byte identical /proc/apm here. 
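For reference, the single_open()/seq_file pattern used in this conversion, reduced to a sketch for a hypothetical /proc/example file (2.6.20-era procfs API, error handling trimmed; not part of this series):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "hello %d\n", 42);	/* no manual buffer arithmetic */
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	struct proc_dir_entry *p = create_proc_entry("example", 0, NULL);

	if (p)
		p->proc_fops = &example_fops;
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");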
Signed-off-by: Alexey Dobriyan Signed-off-by: Andi Kleen --- arch/i386/kernel/apm.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index db99a8948dae..f9ba0af7ee1f 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -211,6 +211,7 @@ #include #include #include +#include #include #include #include @@ -1636,9 +1637,8 @@ static int do_open(struct inode * inode, struct file * filp) return 0; } -static int apm_get_info(char *buf, char **start, off_t fpos, int length) +static int proc_apm_show(struct seq_file *m, void *v) { - char * p; unsigned short bx; unsigned short cx; unsigned short dx; @@ -1650,8 +1650,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) int time_units = -1; char *units = "?"; - p = buf; - if ((num_online_cpus() == 1) && !(error = apm_get_power_status(&bx, &cx, &dx))) { ac_line_status = (bx >> 8) & 0xff; @@ -1705,7 +1703,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) -1: Unknown 8) min = minutes; sec = seconds */ - p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", driver_version, (apm_info.bios.version >> 8) & 0xff, apm_info.bios.version & 0xff, @@ -1716,10 +1714,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) percentage, time_units, units); - - return p - buf; + return 0; } +static int proc_apm_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_apm_show, NULL); +} + +static const struct file_operations apm_file_ops = { + .owner = THIS_MODULE, + .open = proc_apm_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int apm(void *unused) { unsigned short bx; @@ -2341,9 +2351,9 @@ static int __init apm_init(void) set_base(gdt[APM_DS >> 3], __va((unsigned long)apm_info.bios.dseg << 4)); - apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); + apm_proc = create_proc_entry("apm", 0, NULL); if (apm_proc) - apm_proc->owner = THIS_MODULE; + apm_proc->proc_fops = &apm_file_ops; kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { From 6c5806cae50717f31878d0da29109b10610ab862 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH 56/94] [PATCH] i386: fix size_or_mask and size_and_mask mtrr: fix size_or_mask and size_and_mask This fixes two bugs in /proc/mtrr interface: o If physical address size crosses the 44 bit boundary size_or_mask is evaluated wrong. o size_and_mask limits width of physical base address for an MTRR to be less than 44 bits. TBD: later patch had one more change, but I think that was bogus. 
TBD: need to double check Signed-off-by: Andreas Herrmann Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/main.c | 6 +++--- arch/i386/kernel/cpu/mtrr/mtrr.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 16bb7ea87145..0acfb6a5a220 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -50,7 +50,7 @@ u32 num_var_ranges = 0; unsigned int *usage_table; static DEFINE_MUTEX(mtrr_mutex); -u32 size_or_mask, size_and_mask; +u64 size_or_mask, size_and_mask; static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; @@ -662,8 +662,8 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; + size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { /* VIA C* family have Intel style MTRRs, but diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index d61ea9db6cfe..289dfe6030e3 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -84,7 +84,7 @@ void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); -extern u32 size_or_mask, size_and_mask; +extern u64 size_or_mask, size_and_mask; extern struct mtrr_ops * mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) From 2f7a2a79c3ebb44f8b1b7d9b4fd3a650eb69e544 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 57/94] [PATCH] x86-64: - Ignore long SMI interrupts in clock calibration code - update 1 Add failsafe mechanism to HPET/TSC clock calibration. Signed-off-by: Jack Steiner Updated to include failsafe mechanism & additional community feedback. Patch built on latest 2.6.20-rc4-mm1 tree. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 335cc91c49b7..9c7fba30dac1 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc); #define TICK_COUNT 100000000 #define TICK_MIN 5000 +#define MAX_READ_RETRIES 5 /* * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none @@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc); static void __init read_hpet_tsc(int *hpet, int *tsc) { - int tsc1, tsc2, hpet1; + int tsc1, tsc2, hpet1, retries = 0; + static int msg; do { tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN); + } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); + if (retries >= MAX_READ_RETRIES && !msg++) + printk(KERN_WARNING + "hpet.c: exceeded max retries to read HPET & TSC\n"); *hpet = hpet1; *tsc = tsc2; } From f49481bc50fce428521497977861b8115666dbe7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 58/94] [PATCH] x86-64: Check return value of putreg in PTRACE_SETREGS This means that if an illegal value is set for the segment registers, ptrace will now error out with an errno instead of silently ignoring it.
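The bug class fixed here is worth a small demonstration: OR-ing return codes together mangles -errno values and keeps going after a failure. A runnable sketch of the corrected loop shape (getreg_demo is a made-up stand-in for __get_user/putreg):

#include <stdio.h>

/* Demo only: fails at index 2 with -EFAULT, succeeds elsewhere. */
static int getreg_demo(int i, long *val)
{
	if (i == 2)
		return -14;	/* -EFAULT */
	*val = i;
	return 0;
}

int main(void)
{
	long tmp;
	int i, ret = 0;

	for (i = 0; i < 4; i++) {
		ret = getreg_demo(i, &tmp);
		if (ret)
			break;	/* stop at the first failure, keep its code */
	}
	printf("ret = %d\n", ret);	/* -14; an OR of codes would be garbage */
	return 0;
}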
Signed-off-by: Andi Kleen --- arch/x86_64/kernel/ptrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index addc14af0c56..4326a690a509 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } ret = 0; for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __get_user(tmp, (unsigned long __user *) data); - putreg(child, ui, tmp); + ret = __get_user(tmp, (unsigned long __user *) data); + if (ret) + break; + ret = putreg(child, ui, tmp); + if (ret) + break; data += sizeof(long); } break; From 9a11ff68273f440b1d33fcc4d550ffc881e6a0b4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 59/94] [PATCH] x86-64: Unexport __supported_pte_mask The symbol is needed to manipulate page tables, and modules shouldn't do that. Leftover from 2.4, but no in tree module should need it now. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8c4b80fe71a1..6a70b55f719d 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off From ffb6017563aa15f9a8cff9a30b861d42c2695894 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 60/94] [PATCH] x86-64: x86_64 - Fix FS/GS registers for VT execution Initialize FS and GS to __KERNEL_DS as well. The actual value of them is not important, but it is important to reload them in protected mode. At this time, they still retain the real mode values from initial boot. VT disallows execution of code under such conditions, which means hardware virtualization can not be used to boot the kernel on Intel platforms, making the boot time painfully slow. This requires moving the GS load before the load of GS_BASE, so just move all the segments loads there to keep them together in the code. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 1e6f80870679..598a4d0351fc 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -163,6 +163,20 @@ startup_64: */ lgdt cpu_gdt_descr + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + /* * Setup up a dummy PDA. this is just for some early bootup code * that does in_interrupt() @@ -173,12 +187,6 @@ startup_64: shrq $32,%rdx wrmsr - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - /* esi is pointer to real mode structure with interesting info. 
pass it to C */ movl %esi, %edi From 00edefae050c2c2d1e26fa9984f8f529fbc45989 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 61/94] [PATCH] x86-64: Fix off by one error in IOMMU boundary checking Should be harmless because there is normally no memory there, but technically it was incorrect. Pointed out by Leo Duran Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-gart.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index fc1960f1f243..030eb3753358 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; if (force_iommu) mmu = 1; @@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; return mmu; } From fc986db4fc1e773e240a19bc8b407ead88982cea Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 62/94] [PATCH] x86-64: Don't reserve ROMs We trust the e820 table, so explicitly reserving ROMs shouldn't be needed. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 130 +------------------------------------ 1 file changed, 2 insertions(+), 128 deletions(-) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f330f8285499..e82c1a155af5 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -138,128 +138,6 @@ struct resource code_resource = { .flags = IORESOURCE_RAM, }; -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = 
adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; - start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -524,15 +402,11 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); + * We trust e820 completely. No explicit ROM probing in memory. 
+ */ e820_reserve_resources(); e820_mark_nosave_regions(); - request_resource(&iomem_resource, &video_ram_resource); - { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ From 2fa8a050a0026eadbb39a2f281011991e00fe29a Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 63/94] [PATCH] x86-64: define dma noncoherent API functions x86-64 is missing these: Signed-off-by: Jeff Garzik Signed-off-by: Andi Kleen --- include/asm-x86_64/dma-mapping.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h index 49dbab09ef2b..d2af227f06d0 100644 --- a/include/asm-x86_64/dma-mapping.h +++ b/include/asm-x86_64/dma-mapping.h @@ -66,6 +66,9 @@ static inline int dma_mapping_error(dma_addr_t dma_addr) +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) + extern void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr, From 310adfdd9153f6ae818981a38a48dd2330990d8d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 64/94] [PATCH] x86-64: robustify bad_dma_address handling - set bad_dma_address explicitly to 0x0 - reserve 32 pages from bad_dma_address and up - WARN_ON() a driver feeding us bad_dma_address Thanks to Leo Duran for the suggestion. Signed-off-by: Muli Ben-Yehuda Signed-off-by: Andi Kleen Cc: Leo Duran Cc: Job Mason --- arch/x86_64/kernel/pci-calgary.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 3d65b1d4c2b3..04480c3b68f5 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = { #define PHB_DEBUG_STUFF_OFFSET 0x0020 +#define EMERGENCY_PAGES 32 /* = 128KB */ unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, { unsigned long entry; unsigned long badbit; + unsigned long badend; + + /* were we called with bad_dma_address? 
*/ + badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + WARN_ON(1); + return; + } entry = dma_addr >> PAGE_SHIFT; @@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) u64 start; struct iommu_table *tbl = dev->sysdata; - /* reserve bad_dma_address in case it's a legal address */ - iommu_range_reserve(tbl, bad_dma_address, 1); + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ start = (640 * 1024); @@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void) } force_iommu = 1; + bad_dma_address = 0x0; dma_ops = &calgary_dma_ops; return 0; From 5d0e600d903caa09e790824cc5812f0d97113b23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 65/94] [PATCH] x86: fix laptop bootup hang in init_acpi() During kernel bootup, a new T60 laptop (CoreDuo, 32-bit) hangs about 10%-20% of the time in acpi_init(): Calling initcall 0xc055ce1a: topology_init+0x0/0x2f() Calling initcall 0xc055d75e: mtrr_init_finialize+0x0/0x2c() Calling initcall 0xc05664f3: param_sysfs_init+0x0/0x175() Calling initcall 0xc014cb65: pm_sysrq_init+0x0/0x17() Calling initcall 0xc0569f99: init_bio+0x0/0xf4() Calling initcall 0xc056b865: genhd_device_init+0x0/0x50() Calling initcall 0xc056c4bd: fbmem_init+0x0/0x87() Calling initcall 0xc056dd74: acpi_init+0x0/0x1ee() It's a hard hang that not even an NMI could punch through! Frustratingly, adding printks or function tracing to the ACPI code made the hangs go away ... After some time an additional detail emerged: disabling the NMI watchdog made these occasional hangs go away. So i spent the better part of today trying to debug this and trying out various theories when i finally found the likely reason for the hang: if acpi_ns_initialize_devices() executes an _INI AML method and an NMI happens to hit that AML execution at the wrong moment, the machine would hang. (my theory is that this must be some sort of chipset setup method doing stores to chipset mmio registers?) Unfortunately given the characteristics of the hang it was nearly impossible to figure out which of the numerous AML methods is impacted by this problem. As a workaround i wrote an interface to disable chipset-based NMIs while executing _INI sections - and indeed this fixed the hang. I did a boot-loop of 100 separate reboots and none hung - while without the patch it would hang every 5-10 attempts. Out of caution i did not touch the nmi_watchdog=2 case (it's not related to the chipset anyway and didn't hang). I implemented this for both x86_64 and i686, tested the i686 laptop both with nmi_watchdog=1 [which triggered the hangs] and nmi_watchdog=2, and tested an Athlon64 box with the 64-bit kernel as well. Everything builds and works with the patch applied.
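Condensed, the workaround simply brackets the AML evaluation with the two new helpers. A sketch of that shape (kernel functions, so not runnable stand-alone; the real hunks follow below):

/* Condensed shape of the workaround: mask timer NMIs on every CPU,
 * evaluate the possibly NMI-sensitive _INI method, then unmask. */
static acpi_status evaluate_ini_nmi_safe(struct acpi_evaluate_info *info)
{
	acpi_status status;

	acpi_nmi_disable();		/* masks APIC_LVT0 via on_each_cpu() */
	status = acpi_ns_evaluate(info);
	acpi_nmi_enable();
	return status;
}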
Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Len Brown Signed-off-by: Andrew Morton --- arch/i386/kernel/nmi.c | 28 ++++++++++++++++++++++++++++ arch/x86_64/kernel/nmi.c | 27 +++++++++++++++++++++++++++ drivers/acpi/namespace/nsinit.c | 9 +++++++++ include/linux/nmi.h | 9 ++++++++- 4 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 7d3f4e22d6fb..b11abacc5cfd 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -383,6 +383,34 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index e59cda134166..269fe585e71e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -368,6 +368,33 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/namespace/nsinit.c index 326af8fc0ce7..33db2241044e 100644 --- a/drivers/acpi/namespace/nsinit.c +++ b/drivers/acpi/namespace/nsinit.c @@ -45,6 +45,7 @@ #include #include #include +#include #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsinit") @@ -534,7 +535,15 @@ acpi_ns_init_one_device(acpi_handle obj_handle, info->parameter_type = ACPI_PARAM_ARGS; info->flags = ACPI_IGNORE_RETURN_VALUE; + /* + * Some hardware relies on this being executed as atomically + * as possible (without an NMI being received in the middle of + * this) - so disable NMIs and initialize the device: + */ + acpi_nmi_disable(); status = acpi_ns_evaluate(info); + acpi_nmi_enable(); + if (ACPI_SUCCESS(status)) { walk_info->num_INI++; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index acb4ed130247..29af2d5df097 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -17,8 +17,15 @@ #ifdef ARCH_HAS_NMI_WATCHDOG #include extern void touch_nmi_watchdog(void); +extern void acpi_nmi_disable(void); +extern void acpi_nmi_enable(void); #else -# define touch_nmi_watchdog() touch_softlockup_watchdog() +static inline void touch_nmi_watchdog(void) +{ + touch_softlockup_watchdog(); +} +static inline void acpi_nmi_disable(void) { } +static inline void acpi_nmi_enable(void) { } #endif #ifndef trigger_all_cpu_backtrace From 
30b82ea08c3365a6fc916250ff2ad634717fc81b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 66/94] [PATCH] i386: All Transmeta CPUs have constant TSCs All Transmeta CPUs ever produced have constant-rate TSCs. Signed-off-by: H. Peter Anvin Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/transmeta.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 4056fb7d2cdf..b536e810d27e 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) wrmsr(0x80860004, ~0, uk); c->x86_capability[0] = cpuid_edx(0x00000001); wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); /* If we can run i686 user-space code, call us an i686 */ #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) From 3101673b659b916c965271c7f7c9b99cb353c01c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 67/94] [PATCH] i386: avoid gcc extension setcc() in math-emu is written as a gcc extension statement expression macro that returns a value. However, it's not used that way and it's not needed like that, so just make it an inline function so that we don't use an extension when it's not needed. Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Christoph Hellwig Cc: Segher Boessenkool Signed-off-by: Andrew Morton --- arch/i386/math-emu/status_w.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/i386/math-emu/status_w.h b/arch/i386/math-emu/status_w.h index 78d7b7689dd6..59e73302aa60 100644 --- a/arch/i386/math-emu/status_w.h +++ b/arch/i386/math-emu/status_w.h @@ -48,9 +48,11 @@ #define status_word() \ ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) -#define setcc(cc) ({ \ - partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); \ - partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); }) +static inline void setcc(int cc) +{ + partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); + partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); +} #ifdef PECULIAR_486 /* Default, this conveys no information, but an 80486 does it. */ From 2632f01a66d75f4ad59653a7efa506c6ea6845d0 Mon Sep 17 00:00:00 2001 From: takada Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 68/94] [PATCH] i386: support Classic MediaGXm I hope to support the "classic" MediaGXm in the kernel. The DIR1 register of the MediaGXm (or Geode) holds the following values, which identify the CPU. For example, my MediaGXm shows 0x42. We can read National Semiconductor's datasheet without any NDAs. http://www.national.com/pf/GX/GXLV.html From the datasheets: DIR1 0x30 - 0x33 GXm rev. 1.0 - 2.3 0x34 - 0x4f GXm rev. 2.4 - 3.x 0x5x GXm rev. 5.0 - 5.4 0x6x GXLV 0x7x (unknown) 0x8x Gx1 The nsc driver in X accepts 0x30 through 0x82. What does 0x7x mean?
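The datasheet table is easier to check in code form; a small standalone classifier over the DIR1 ranges quoted above (illustrative only - in the kernel, dir1 is read from the Cyrix configuration registers rather than passed as a parameter):

#include <stdio.h>

/* Map a Cyrix/NatSemi DIR1 revision byte to a core name, following
 * the GXLV datasheet table in the commit message. */
static const char *dir1_to_core(unsigned char dir1)
{
	if (dir1 >= 0x30 && dir1 <= 0x5f)
		return "GXm";
	if (dir1 >= 0x60 && dir1 <= 0x6f)
		return "GXLV";
	if (dir1 >= 0x70 && dir1 <= 0x7f)
		return "unknown 0x7x part";
	if (dir1 >= 0x80 && dir1 <= 0x8f)
		return "GX1";
	return "not a Geode-class device";
}

int main(void)
{
	const unsigned char samples[] = { 0x33, 0x42, 0x5a, 0x66, 0x75, 0x82 };
	size_t i;

	for (i = 0; i < sizeof(samples); i++)
		printf("DIR1=0x%02x -> %s\n", samples[i], dir1_to_core(samples[i]));
	return 0;
}

The geode_configure() gate in the diff below corresponds to the GXm and GXLV rows plus GX1, deliberately skipping the unknown 0x7x range.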
Cc: Jordan Crouse Cc: Andi Kleen Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index c0c3b59de32c..69b263f5632a 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -285,10 +285,15 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); - /* GXlv/GXm/GX1 */ - if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? : 0x7x + * GX1 : 0x8x GX1 datasheet 56 + */ + if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f)) geode_configure(); get_model_name(c); /* get CPU marketing name */ return; From 47a55cd795656d11bb18a7885583361f02a6baa8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:24 +0100 Subject: [PATCH 69/94] [PATCH] i386: entry.S END/ENDPROC annotations Annotate i386/kernel/entry.S with END/ENDPROC to assist disassemblers and other analysis tools. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/entry.S | 41 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d4b4ffc9eacb..18bddcb8e9e8 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -227,6 +227,7 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET -4 jmp syscall_exit CFI_ENDPROC +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, @@ -258,6 +259,7 @@ ENTRY(resume_userspace) # int/exception return? 
jne work_pending jmp restore_all +END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) @@ -272,6 +274,7 @@ need_resched: jz restore_all call preempt_schedule_irq jmp need_resched +END(resume_kernel) #endif CFI_ENDPROC @@ -359,6 +362,7 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection +ENDPROC(sysenter_entry) # system call handler stub ENTRY(system_call) @@ -459,6 +463,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC +ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN @@ -504,6 +509,7 @@ work_notifysig_v86: xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig +END(work_pending) # perform syscall exit tracing ALIGN @@ -519,6 +525,7 @@ syscall_trace_entry: cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit +END(syscall_trace_entry) # perform syscall exit tracing ALIGN @@ -532,6 +539,7 @@ syscall_exit_work: movl $1, %edx call do_syscall_trace jmp resume_userspace +END(syscall_exit_work) CFI_ENDPROC RING0_INT_FRAME # can't unwind into user space anyway @@ -542,10 +550,12 @@ syscall_fault: GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace +END(syscall_fault) syscall_badsys: movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace +END(syscall_badsys) CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ @@ -581,9 +591,9 @@ syscall_badsys: ENTRY(interrupt) .text -vector=0 ENTRY(irq_entries_start) RING0_INT_FRAME +vector=0 .rept NR_IRQS ALIGN .if vector @@ -592,11 +602,16 @@ ENTRY(irq_entries_start) 1: pushl $~(vector) CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt -.data + .previous .long 1b -.text + .text vector=vector+1 .endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous /* * the CPU automatically disables interrupts when executing an IRQ vector, @@ -609,6 +624,7 @@ common_interrupt: movl %esp,%eax call do_IRQ jmp ret_from_intr +ENDPROC(common_interrupt) CFI_ENDPROC #define BUILD_INTERRUPT(name, nr) \ @@ -621,7 +637,8 @@ ENTRY(name) \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; \ - CFI_ENDPROC + CFI_ENDPROC; \ +ENDPROC(name) /* The include is where all of the SMP etc. 
interrupts come from */ #include "entry_arch.h" @@ -697,6 +714,7 @@ ENTRY(coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME @@ -706,6 +724,7 @@ ENTRY(simd_coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME @@ -726,6 +745,7 @@ device_not_available_emulate: CFI_ADJUST_CFA_OFFSET -4 jmp ret_from_exception CFI_ENDPROC +END(device_not_available) /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -869,10 +889,12 @@ ENTRY(native_iret) .align 4 .long 1b,iret_exc .previous +END(native_iret) ENTRY(native_irq_enable_sysexit) sti sysexit +END(native_irq_enable_sysexit) #endif KPROBE_ENTRY(int3) @@ -895,6 +917,7 @@ ENTRY(overflow) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(overflow) ENTRY(bounds) RING0_INT_FRAME @@ -904,6 +927,7 @@ ENTRY(bounds) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(bounds) ENTRY(invalid_op) RING0_INT_FRAME @@ -913,6 +937,7 @@ ENTRY(invalid_op) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME @@ -922,6 +947,7 @@ ENTRY(coprocessor_segment_overrun) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME @@ -929,6 +955,7 @@ ENTRY(invalid_TSS) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME @@ -936,6 +963,7 @@ ENTRY(segment_not_present) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME @@ -943,6 +971,7 @@ ENTRY(stack_segment) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) RING0_EC_FRAME @@ -958,6 +987,7 @@ ENTRY(alignment_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME @@ -967,6 +997,7 @@ ENTRY(divide_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) @@ -977,6 +1008,7 @@ ENTRY(machine_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(machine_check) #endif ENTRY(spurious_interrupt_bug) @@ -987,6 +1019,7 @@ ENTRY(spurious_interrupt_bug) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder From f0a5a58aa812b31fd9f197c4ba48245942364eae Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 70/94] [PATCH] x86-64: clean up sparsemem memory_present call Eliminate the arch-specific memory_present call in x86_64 NUMA code by utilizing sparse_memory_present_with_active_regions.
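For orientation, the generic helper simply walks the registered active ranges and calls memory_present() on each, which is why the per-node loop in numa.c becomes redundant; roughly (a simplified sketch of the mm/page_alloc.c helper of this era, not the verbatim source):

void __init sparse_memory_present_with_active_regions(int nid)
{
	int i;

	/* nid == MAX_NUMNODES means "every node", so one call covers
	 * what the removed for_each_online_node() loop used to do */
	for_each_active_range_index_in_nid(i, nid)
		memory_present(early_node_map[i].nid,
			       early_node_map[i].start_pfn,
			       early_node_map[i].end_pfn);
}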
Acked-by: Mel Gorman Signed-off-by: Bob Picco Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/mm/numa.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index d3f747dd61d3..41b8fb069924 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -460,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_SPARSEMEM -static void __init arch_sparse_init(void) -{ - int i; - - for_each_online_node(i) - memory_present(i, node_start_pfn(i), node_end_pfn(i)); - - sparse_init(); -} -#else -#define arch_sparse_init() do {} while (0) -#endif - void __init paging_init(void) { int i; @@ -483,7 +469,8 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = end_pfn; - arch_sparse_init(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); for_each_online_node(i) { setup_node_zones(i); From a4af60aa64c828b7c047e7a67b2f896d4bfbd700 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 71/94] [PATCH] i386: Remove unused kernel config option X86_XADD Remove the unused kernel config option X86_XADD, which is not referenced by any source or header file. Signed-off-by: Robert P. J. Day Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/Kconfig.cpu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 2aecfba4ac4f..b99c0e2a4e63 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -226,11 +226,6 @@ config X86_CMPXCHG depends on !M386 default y -config X86_XADD - bool - depends on !M386 - default y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC From ee4eff6ff6cbfc8ce38131058a18802bf6206879 Mon Sep 17 00:00:00 2001 From: Benjamin Romer Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 72/94] [PATCH] x86-64: update IO-APIC dest field to 8-bit for xAPIC On the Unisys ES7000/ONE system, we encountered a problem where performing a kexec reboot or dump on any cell other than cell 0 causes the system timer to stop working, resulting in a hang during timer calibration in the new kernel. We traced the problem to one line of code in disable_IO_APIC(), which needs to restore the timer's IO-APIC configuration before rebooting. The code is currently using the 4-bit physical destination field, rather than using the 8-bit logical destination field, and it cuts off the upper 4 bits of the timer's APIC ID. If we change this to use the logical destination field, the timer works and we can kexec on the upper cells. This was tested on two different cells (0 and 2) in an ES7000/ONE system. For reference, the relevant Intel xAPIC spec is kept at ftp://download.intel.com/design/chipsets/e8501/datashts/30962001.pdf, specifically on page 334. Signed-off-by: Benjamin M Romer Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Eric W.
Biederman" Cc: Vivek Goyal Signed-off-by: Andrew Morton --- arch/x86_64/kernel/io_apic.c | 24 +++++++++++------------- include/asm-x86_64/io_apic.h | 14 ++------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 6be6730acb5c..566e64d966c4 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); @@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); } if (!apic && !IO_APIC_IRQ(irq)) @@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (vector < 0) return; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); @@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in */ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", + printk(KERN_DEBUG " %02x %03X ", i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest + entry.dest ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", @@ -1293,8 +1292,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(apic_read(APIC_ID)); + entry.dest = GET_APIC_ID(apic_read(APIC_ID)); /* * Add it to the IO-APIC irq-routing table: @@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; @@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h index 561ecbfd4cb5..f4fb238c89f1 100644 --- a/include/asm-x86_64/io_apic.h +++ b/include/asm-x86_64/io_apic.h @@ -85,18 +85,8 @@ struct IO_APIC_route_entry { mask : 1, /* 0: enabled, 1: disabled */ __reserved_2 : 15; - union { struct { __u32 - __reserved_1 : 24, - physical_dest : 4, - __reserved_2 : 4; - } physical; - - struct { __u32 - __reserved_1 : 24, - logical_dest : 8; - } logical; - } dest; - + __u32 __reserved_3 : 24, + dest : 8; } __attribute__ ((packed)); /* From 3e94fb8f54c5305ed472e0867cd67d53e05bfb64 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 73/94] [PATCH] x86-64: avoid warning message livelock I've seen my box paralyzed by an endless spew of "rtc: lost some interrupts at 1024Hz." messages on the serial console. What seems to be happening is that something real causes an interrupt to be lost and triggers the message. But then printing the message to the serial console (from the hpet interrupt handler) takes more than 1/1024th of a second, and then some more interrupts are lost, so the message triggers again.... Fix this by adding a printk_ratelimit() before printing the warning.
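printk_ratelimit() is a token-bucket limiter (by default a burst of 10 messages, refilled over 5-second windows). A standalone sketch of the idea with those defaults hard-coded for illustration - the kernel's implementation differs in detail:

#include <stdio.h>
#include <time.h>

/* Token-bucket limiter in the spirit of printk_ratelimit():
 * allow a burst, then throttle until the window refills. */
static int ratelimit(void)
{
	static time_t window_start;
	static int tokens = 10;			/* burst size */
	time_t now = time(NULL);

	if (now - window_start >= 5) {		/* 5 s refill window */
		window_start = now;
		tokens = 10;
	}
	if (tokens > 0) {
		tokens--;
		return 1;			/* ok to print */
	}
	return 0;				/* suppressed */
}

int main(void)
{
	int printed = 0, i;

	for (i = 0; i < 1000; i++)
		if (ratelimit())
			printed++;
	printf("printed %d of 1000 messages\n", printed);
	return 0;
}

The point of the fix is exactly this throttling: the slow serial printk can no longer cause the interrupt loss that re-triggers the message.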
Signed-off-by: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 9c7fba30dac1..3cc6886f1fb7 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1226,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } From 8469adde5932f2879688fd5f183a6e9dadbf7b9f Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 74/94] [PATCH] x86-64: Minor patch for compilation warning in x86_64 signal code If DEBUG_SIG is enabled in the source code, ia32_signal.c compiles with a warning due to a wrong format string. The attached patch fixes that. It is quite a minor update, since by default DEBUG_SIG is not enabled and cannot be turned on without code modification. Signed-off-by: Evgeniy Polyakov Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/ia32/ia32_signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index ff499ef2a1ba..490f7c1b7c84 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -495,7 +495,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -601,7 +601,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif From 86c418374223be3f328b5522545196db02c8ceda Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 75/94] [PATCH] i386: add option to show more code in oops reports Sometimes developers need to see more object code in an oops report, e.g. when the kernel may be corrupted at runtime. Add the "code_bytes" option for this. Signed-off-by: Chuck Ebbert Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/kernel-parameters.txt | 5 +++++ arch/i386/kernel/traps.c | 20 ++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 733a736bc6c8..22b19962a1a2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -364,6 +364,11 @@ and is between 256 and 4096 characters. It is defined in the file clocksource is not available, it defaults to PIT. Format: { pit | tsc | cyclone | pmtmr } + code_bytes [IA32] How many bytes of object code to print in an + oops report.
+ Range: 0 - 8192 + Default: 64 + disable_8254_timer enable_8254_timer [IA32/X86_64] Disable/Enable interrupt 0 timer routing diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 4ec21037a361..af0d3f70a817 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); int kstack_depth_to_print = 24; +static unsigned int code_bytes = 64; ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -325,7 +326,8 @@ void show_registers(struct pt_regs *regs) */ if (in_kernel) { u8 *eip; - int code_bytes = 64; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); @@ -333,14 +335,14 @@ printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - 43; + eip = (u8 *)regs->eip - code_prologue; if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { /* try starting at EIP */ eip = (u8 *)regs->eip; - code_bytes = 32; + code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_bytes; i++, eip++) { + for (i = 0; i < code_len; i++, eip++) { if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { printk(" Bad EIP value."); @@ -1192,3 +1194,13 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); From bcde1ebb81c51ebdfa02887703e4d21c1bbc2431 Mon Sep 17 00:00:00 2001 From: TAKADA Yoshihito Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 76/94] [PATCH] i386: geode configuration fixes The original code never wrote the modified value back to the CCR4 register. This patch makes the update actually reach the register.
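The bug class in miniature: read a register, OR bits into a local copy, forget the write-back. A standalone mock to show it - getCx86()/setCx86() here are stand-ins for the kernel's port-0x22/0x23 Cyrix register accessors, backed by an array instead of hardware; CX86_CCR4 (0xe8) is the real register index:

#include <stdio.h>

static unsigned char regs[256];

/* Mock accessors in place of the kernel's getCx86()/setCx86() */
static unsigned char getCx86(int reg) { return regs[reg]; }
static void setCx86(int reg, unsigned char v) { regs[reg] = v; }

#define CX86_CCR4 0xe8

int main(void)
{
	/* Buggy pattern: the hardware register never changes. */
	unsigned char ccr4 = getCx86(CX86_CCR4);
	ccr4 |= 0x38;				/* modifies only the copy */
	printf("after buggy code, CCR4 = 0x%02x\n", getCx86(CX86_CCR4));

	/* Fixed pattern from the patch: read-modify-write in one go. */
	setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
	printf("after fix,        CCR4 = 0x%02x\n", getCx86(CX86_CCR4));
	return 0;
}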
Cc: Jordan Crouse Acked-by: Alan Cox Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index 69b263f5632a..408a74e5c424 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -161,19 +161,19 @@ static void __cpuinit set_cx86_inc(void) static void __cpuinit geode_configure(void) { unsigned long flags; - u8 ccr3, ccr4; + u8 ccr3; local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ccr4 = getCx86(CX86_CCR4); - ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */ - - setCx86(CX86_CCR3, ccr3); + + /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); set_cx86_reorder(); @@ -420,15 +420,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) if (dir0 == 5 || dir0 == 3) { - unsigned char ccr3, ccr4; + unsigned char ccr3; unsigned long flags; printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ccr4 = getCx86(CX86_CCR4); - setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } } From 2fb12a9bca5ad9aa6dcd2c639b4a7656a8843ef8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 77/94] [PATCH] x86-64: survive having no irq mapping for a vector Occasionally the kernel has bugs that result in no irq being found for a given cpu vector. If we acknowledge the irq the system has a good chance of continuing even though we dropped an irq message. If we continue to simply print a message and not acknowledge the irq the system is likely to become non-responsive shortly thereafter. AK: Fixed compilation for UP kernels Signed-off-by: Eric W.
Biederman Signed-off-by: Andi Kleen Cc: "Luigi Genoni" Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/irq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 0c06af6c13bc..3bc30d2c13d3 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -18,6 +18,7 @@ #include #include #include +#include atomic_t irq_err_count; @@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) if (likely(irq < NR_IRQS)) generic_handle_irq(irq); - else if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", - __func__, smp_processor_id(), vector); + else { + if (!disable_apic) + ack_APIC_irq(); + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", + __func__, smp_processor_id(), vector); + } irq_exit(); From 9f6026b8c308365d955faaf31dd0f457266d11f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 78/94] [PATCH] x86-64: Fix wrong gcc check in bitops.h gcc 5.0 will likely not have the constraint problem Signed-off-by: Andi Kleen --- include/asm-x86_64/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 8da9609070f4..d4dbbe5f7bd9 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -7,7 +7,7 @@ #include -#if __GNUC__ < 4 || __GNUC_MINOR__ < 1 +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc versions. */ #define ADDR "=m" (*(volatile long *) addr) From 1a1eecd1c272f704f135a7d8060ec3da1c201b4c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 79/94] [PATCH] i386: Remove fastcall in paravirt.[ch] Not needed because fastcall is always the default now Signed-off-by: Andi Kleen --- arch/i386/kernel/paravirt.c | 102 ++++++++++++++--------------- include/asm-i386/paravirt.h | 126 ++++++++++++++++++------------------ 2 files changed, 114 insertions(+), 114 deletions(-) diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 2003733310dc..ebe82552ad30 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) return insn_len; } -static fastcall unsigned long native_get_debugreg(int regno) +static unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc!
*/ @@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno) return val; } -static fastcall void native_set_debugreg(int regno, unsigned long value) +static void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -146,55 +146,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } -static fastcall void native_clts(void) +static void native_clts(void) { asm volatile ("clts"); } -static fastcall unsigned long native_read_cr0(void) +static unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr0(unsigned long val) +static void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } -static fastcall unsigned long native_read_cr2(void) +static unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr2(unsigned long val) +static void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } -static fastcall unsigned long native_read_cr3(void) +static unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr3(unsigned long val) +static void native_write_cr3(unsigned long val) { asm volatile("movl %0,%%cr3": :"r" (val)); } -static fastcall unsigned long native_read_cr4(void) +static unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); return val; } -static fastcall unsigned long native_read_cr4_safe(void) +static unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void) return val; } -static fastcall void native_write_cr4(unsigned long val) +static void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } -static fastcall unsigned long native_save_fl(void) +static unsigned long native_save_fl(void) { unsigned long f; asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); return f; } -static fastcall void native_restore_fl(unsigned long f) +static void native_restore_fl(unsigned long f) { asm volatile("pushl %0 ; popfl": /* no output */ :"g" (f) :"memory", "cc"); } -static fastcall void native_irq_disable(void) +static void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static fastcall void native_irq_enable(void) +static void native_irq_enable(void) { asm volatile("sti": : :"memory"); } -static fastcall void native_safe_halt(void) +static void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static fastcall void native_halt(void) +static void native_halt(void) { asm volatile("hlt": : :"memory"); } -static fastcall void native_wbinvd(void) +static void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +static unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; @@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) return val; } -static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +static int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned 
int msr, unsigned long long val) return err; } -static fastcall unsigned long long native_read_tsc(void) +static unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=A" (val)); return val; } -static fastcall unsigned long long native_read_pmc(void) +static unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=A" (val)); return val; } -static fastcall void native_load_tr_desc(void) +static void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } -static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +static void native_load_gdt(const struct Xgt_desc_struct *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); } -static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +static void native_load_idt(const struct Xgt_desc_struct *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } -static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +static void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=m" (*dtr)); } -static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +static void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=m" (*dtr)); } -static fastcall unsigned long native_store_tr(void) +static unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=r" (tr)); return tr; } -static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +static void native_load_tls(struct thread_struct *t, unsigned int cpu) { #define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] C(0); C(1); C(2); @@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 lp[1] = entry_high; } -static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_load_esp0(struct tss_struct *tss, +static void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss, } } -static fastcall void native_io_delay(void) +static void native_io_delay(void) { asm volatile("outb %al,$0x80"); } -static fastcall void native_flush_tlb(void) +static void native_flush_tlb(void) { __native_flush_tlb(); } @@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void) * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. 
*/ -static fastcall void native_flush_tlb_global(void) +static void native_flush_tlb_global(void) { __native_flush_tlb_global(); } -static fastcall void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(u32 addr) { __native_flush_tlb_single(addr); } #ifndef CONFIG_X86_PAE -static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +static void native_set_pte(pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; } #else /* CONFIG_X86_PAE */ -static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +static void native_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { ptep->pte_low = 0; smp_wmb(); @@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) { set_64bit((unsigned long long *)ptep,pte_val(pteval)); } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); } -static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +static void native_set_pud(pud_t *pudp, pud_t pudval) { *pudp = pudval; } -static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; } -static fastcall void native_pmd_clear(pmd_t *pmd) +static void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; *tmp = 0; @@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd) #endif /* CONFIG_X86_PAE */ /* These are in entry.S */ -extern fastcall void native_iret(void); -extern fastcall void native_irq_enable_sysexit(void); +extern void native_iret(void); +extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index 12ef95924da6..6317e0a4d735 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -59,102 +59,102 @@ struct paravirt_ops convention. This makes it easier to implement inline assembler replacements. 
*/ - void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx, + void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - unsigned long (fastcall *get_debugreg)(int regno); - void (fastcall *set_debugreg)(int regno, unsigned long value); + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); - void (fastcall *clts)(void); + void (*clts)(void); - unsigned long (fastcall *read_cr0)(void); - void (fastcall *write_cr0)(unsigned long); + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); - unsigned long (fastcall *read_cr2)(void); - void (fastcall *write_cr2)(unsigned long); + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); - unsigned long (fastcall *read_cr3)(void); - void (fastcall *write_cr3)(unsigned long); + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); - unsigned long (fastcall *read_cr4_safe)(void); - unsigned long (fastcall *read_cr4)(void); - void (fastcall *write_cr4)(unsigned long); + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); - unsigned long (fastcall *save_fl)(void); - void (fastcall *restore_fl)(unsigned long); - void (fastcall *irq_disable)(void); - void (fastcall *irq_enable)(void); - void (fastcall *safe_halt)(void); - void (fastcall *halt)(void); - void (fastcall *wbinvd)(void); + unsigned long (*save_fl)(void); + void (*restore_fl)(unsigned long); + void (*irq_disable)(void); + void (*irq_enable)(void); + void (*safe_halt)(void); + void (*halt)(void); + void (*wbinvd)(void); /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (fastcall *read_msr)(unsigned int msr, int *err); - int (fastcall *write_msr)(unsigned int msr, u64 val); + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, u64 val); - u64 (fastcall *read_tsc)(void); - u64 (fastcall *read_pmc)(void); + u64 (*read_tsc)(void); + u64 (*read_pmc)(void); - void (fastcall *load_tr_desc)(void); - void (fastcall *load_gdt)(const struct Xgt_desc_struct *); - void (fastcall *load_idt)(const struct Xgt_desc_struct *); - void (fastcall *store_gdt)(struct Xgt_desc_struct *); - void (fastcall *store_idt)(struct Xgt_desc_struct *); - void (fastcall *set_ldt)(const void *desc, unsigned entries); - unsigned long (fastcall *store_tr)(void); - void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu); - void (fastcall *write_ldt_entry)(void *dt, int entrynum, + void (*load_tr_desc)(void); + void (*load_gdt)(const struct Xgt_desc_struct *); + void (*load_idt)(const struct Xgt_desc_struct *); + void (*store_gdt)(struct Xgt_desc_struct *); + void (*store_idt)(struct Xgt_desc_struct *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); + void (*write_ldt_entry)(void *dt, int entrynum, u32 low, u32 high); - void (fastcall *write_gdt_entry)(void *dt, int entrynum, + void (*write_gdt_entry)(void *dt, int entrynum, u32 low, u32 high); - void (fastcall *write_idt_entry)(void *dt, int entrynum, + void (*write_idt_entry)(void *dt, int entrynum, u32 low, u32 high); - void (fastcall *load_esp0)(struct tss_struct *tss, + void (*load_esp0)(struct tss_struct *tss, struct thread_struct *thread); - void (fastcall *set_iopl_mask)(unsigned mask); + void (*set_iopl_mask)(unsigned mask); - void (fastcall *io_delay)(void); + void (*io_delay)(void); void (*const_udelay)(unsigned long loops); #ifdef 
CONFIG_X86_LOCAL_APIC - void (fastcall *apic_write)(unsigned long reg, unsigned long v); - void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v); - unsigned long (fastcall *apic_read)(unsigned long reg); + void (*apic_write)(unsigned long reg, unsigned long v); + void (*apic_write_atomic)(unsigned long reg, unsigned long v); + unsigned long (*apic_read)(unsigned long reg); void (*setup_boot_clock)(void); void (*setup_secondary_clock)(void); #endif - void (fastcall *flush_tlb_user)(void); - void (fastcall *flush_tlb_kernel)(void); - void (fastcall *flush_tlb_single)(u32 addr); + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(u32 addr); - void (fastcall *alloc_pt)(u32 pfn); - void (fastcall *alloc_pd)(u32 pfn); - void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); - void (fastcall *release_pt)(u32 pfn); - void (fastcall *release_pd)(u32 pfn); + void (*alloc_pt)(u32 pfn); + void (*alloc_pd)(u32 pfn); + void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count); + void (*release_pt)(u32 pfn); + void (*release_pd)(u32 pfn); - void (fastcall *set_pte)(pte_t *ptep, pte_t pteval); - void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); - void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); - void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep); #ifdef CONFIG_X86_PAE - void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); - void (fastcall *set_pud)(pud_t *pudp, pud_t pudval); - void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (fastcall *pmd_clear)(pmd_t *pmdp); + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); + void (*set_pud)(pud_t *pudp, pud_t pudval); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); #endif - void (fastcall *set_lazy_mode)(int mode); + void (*set_lazy_mode)(int mode); /* These two are jmp to, not actually called. */ - void (fastcall *irq_enable_sysexit)(void); - void (fastcall *iret)(void); + void (*irq_enable_sysexit)(void); + void (*iret)(void); - void (fastcall *startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); + void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); }; /* Mark a paravirt probe function. */ From f790cd30d002949a12623b2a8cec4d4e5a8887ef Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 80/94] [PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in /proc/cpuinfo Just various new acronyms. The new popcnt bit is in the middle of Intel space. This looks a little weird, but I've been assured it's ok. Also I fixed RDTSCP for i386 which was at the wrong place. For i386 and x86-64. 
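A quick way to see the new strings on real hardware is to decode the same bits from userspace; a standalone sketch (bit positions follow the i386 tables in the diff below: leaf 1 ECX bit 23 for popcnt, leaf 0x80000001 ECX bits 0-10 for the AMD-defined set; the build relies on GCC's <cpuid.h>):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		printf("popcnt: %s\n", (ecx >> 23) & 1 ? "yes" : "no");

	if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx)) {
		/* Same order as the i386 AMD-defined (#2) table below */
		static const char *amd2[] = {
			"lahf_lm", "cmp_legacy", "svm", "extapic",
			"cr8_legacy", "abm", "sse4a", "misalignsse",
			"3dnowprefetch", "osvw", "ibs",
		};
		unsigned int i;

		for (i = 0; i < sizeof(amd2) / sizeof(amd2[0]); i++)
			printf("%-14s %s\n", amd2[i],
			       (ecx >> i) & 1 ? "yes" : "no");
	}
	return 0;
}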
Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 14 +++++++++----- arch/x86_64/kernel/setup.c | 14 ++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 6624d8583c42..47e3ebbfb28d 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm", + "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", NULL, - /* nothing */ /* constant_tsc - moved to flags */ + NULL, /* constant_tsc - moved to flags */ + /* nothing */ }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index e82c1a155af5..13daaf359d21 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -942,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -960,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -970,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", + "altmovcr8", "abm", "sse4a", + "misalignsse", "3dnowprefetch", + "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -982,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", + NULL, /* tsc invariant mapped to constant_tsc */ NULL, /* nothing */ /* constant_tsc - moved to flags */ }; From 0a4599c894d880763eec6cb93f6c246dac6c3269 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 81/94] [PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs For i386/x86-64. Straightforward -- just reuse the Family 0xf code. Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 6 ++++-- arch/x86_64/kernel/nmi.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index b11abacc5cfd..5d8a07c20281 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) + || (boot_cpu_data.x86 == 16)); case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -817,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused) if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) return; if (!setup_k7_watchdog()) return; diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 269fe585e71e..486f4c61a948 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15; + return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; From 2ba1ff2b796746722fc4fe8bdcd1f30a834e3d0a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 82/94] [PATCH] i386: Fix warning in microcode.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix bogus gcc warning linux/arch/i386/kernel/microcode.c:387: warning: ‘new_mc’ may be used uninitialized in this function Signed-off-by: Andi Kleen --- arch/i386/kernel/microcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 381252bae3d8..b8f16633a6ec 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -384,7 +384,7 @@ static int do_microcode_update (void) { long cursor = 0; int error = 0; - void *new_mc; + void *new_mc = NULL; int cpu; cpumask_t old; From 7de6d3618b09c39fdaa6125e23fcf465a65bc266 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH 83/94] [PATCH] i386: Fix warning in cpu initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix bogus warning linux/arch/i386/kernel/cpu/transmeta.c:12: warning: ‘cpu_freq’ may be used uninitialized in this function Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/transmeta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index
b536e810d27e..5678d46863c6 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ From 120fad72401ebec2a126c16cc48f56c28f3eefe2 Mon Sep 17 00:00:00 2001 From: Alan Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 84/94] [PATCH] i386: Fix Cyrix MediaGX detection The old Cyrix 5520 CPU detection code relied upon the PCI layer setup being done earlier than the CPU setup, which is no longer true. Fortunately we know that if the processor is a MediaGX we can do type 1 pci config accesses to check the companion chip. We thus do those directly and from this find the 5520 and implement the workarounds for the timer problem. The original report came from takada@mbf.nifty.com; I sent a proposed patch, which Takada then corrected, tested and sent back to the list on 10th January. Submitting for merging as it seems to have been missed. AK: Changed to use pci-direct.h and fix warning for !CONFIG_PCI (later AK: originally from akpm) Signed-off-by: Alan Cox Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jordan Crouse Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/cyrix.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index 408a74e5c424..de27bd07bc9c 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "cpu.h" @@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void) } -#ifdef CONFIG_PCI -static struct pci_device_id __cpuinitdata cyrix_55x0[] = { - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, - { }, -}; -#endif - static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; @@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ #ifdef CONFIG_PCI + { + u32 vendor, device; /* It isn't really a PCI quirk directly, but the cure is the same. The MediaGX has deep magic SMM stuff that handles the SB emulation. It thows away the fifo on disable_dma() which @@ -273,12 +268,19 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); /* * The 5510/5520 companion chips have a funky PIT. */ - if (pci_dev_present(cyrix_55x0)) + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) pit_latch_buggy = 1; + } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ From 9fbbd4dd17d0712054368e5e939e28b2456bfe1b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 85/94] [PATCH] x86: Don't require the vDSO for handling a.out signals and in other strange binfmts. The vDSO is not necessarily mapped there.
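Condensed, the selection logic this introduces on both the i386 and ia32-compat signal paths looks like this (a sketch of the priority order, not the literal hunks below; frame->retcode is the small sigreturn trampoline that setup_frame() already writes into the signal frame itself):

	void __user *restorer;

	/* hasvdso is the new linux_binfmt flag; only binfmt_elf sets it */
	if (current->binfmt->hasvdso)
		restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
	else
		restorer = (void *)&frame->retcode;	/* on-stack trampoline */

	/* an application-registered restorer still wins */
	if (ka->sa.sa_flags & SA_RESTORER)
		restorer = ka->sa.sa_restorer;

For a.out and other vDSO-less binfmts this simply restores the pre-vDSO behaviour of returning through the trampoline on the stack.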
Signed-off-by: Andi Kleen --- arch/i386/kernel/signal.c | 6 +++++- arch/x86_64/ia32/ia32_signal.c | 7 ++++++- fs/binfmt_elf.c | 3 ++- include/linux/binfmts.h | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 8f4afcc7d2ab..4f99e870c986 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + if (current->binfmt->hasvdso) + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 490f7c1b7c84..359eacc38509 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, /* Return stub is in 32bit vsyscall page */ { - void __user *restorer = VSYSCALL32_SIGRETURN; + void __user *restorer; + if (current->binfmt->hasvdso) + restorer = VSYSCALL32_SIGRETURN; + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 669dbe5b0317..51db1182b27e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -76,7 +76,8 @@ static struct linux_binfmt elf_format = { .load_binary = load_elf_binary, .load_shlib = load_elf_library, .core_dump = elf_core_dump, - .min_coredump = ELF_EXEC_PAGESIZE + .min_coredump = ELF_EXEC_PAGESIZE, + .hasvdso = 1 }; #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index c1e82c514443..2d956cd566ae 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -59,6 +59,7 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(long signr, struct pt_regs * regs, struct file * file); unsigned long min_coredump; /* minimal dump size */ + int hasvdso; }; extern int register_binfmt(struct linux_binfmt *); From 9af3cf054615862c86efcf55a37bb40f0d96e406 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 86/94] [PATCH] x86_64: Wire up compat epoll_pwait > Which reminds me that I think that MIPS is using the non-compat version > of sys_epoll_pwait for compat syscalls. But maybe MIPS doesn't need a compat > syscall for some reason. Dunno.
Which reminds me that x86_64 i386 compat doesn't wire up sys_epoll_pwait ;-) Signed-off-by: Ralf Baechle Signed-off-by: Andi Kleen --- arch/x86_64/ia32/ia32entry.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5f32cf4de5fb..eda7a0d4dc15 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -718,4 +718,5 @@ ia32_sys_call_table: .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_epoll_pwait ia32_syscall_end: From 992af68147299bb635be97f789e4f66ba7add477 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 87/94] [PATCH] i386: paravirt unhandled fallthrough The current code simply calls "start_kernel" directly if we're under a hypervisor and no paravirt_ops backend wants us, because paravirt.c registers that as a backend. This was always a vain hope; start_kernel won't get far without setup. It's also impossible for paravirt_ops backends which don't sit in the arch/i386/kernel directory: they can't link before paravirt.o anyway. Keep it simple: if we pass all the registered paravirt probes, BUG(). Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 2 -- arch/i386/kernel/head.S | 7 ++++++- arch/i386/kernel/paravirt.c | 3 --- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 97f1e961d684..223969377de4 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -41,8 +41,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_VMI) += vmi.o vmitime.o - -# Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 734be5572eb9..b322f72ffaad 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -513,10 +513,11 @@ startup_paravirt: pushl %ecx pushl %eax - /* paravirt.o is last in link, and that probe fn never returns */ pushl $__start_paravirtprobe 1: movl 0(%esp), %eax + cmpl $__stop_paravirtprobe, %eax + je unhandled_paravirt pushl (%eax) movl 8(%esp), %eax call *(%esp) @@ -528,6 +529,10 @@ startup_paravirt: addl $4, (%esp) jmp 1b + +unhandled_paravirt: + /* Nothing wanted us: we're screwed. */ + ud2 #endif /* diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index ebe82552ad30..c156ecfa3872 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -482,9 +482,6 @@ static int __init print_banner(void) } core_initcall(print_banner); -/* We simply declare start_kernel to be the paravirt probe of last resort. */ -paravirt_probe(start_kernel); - struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, From 105fddb862d3da2f414329ff7719794fb2bd706b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 88/94] [PATCH] i386: Move mce_disabled to asm/mce.h Allows external actors to disable mce. 
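As a hedged illustration of what this change permits: only the mce_disabled symbol and its meaning come from the patch, and the caller below is invented for the example.

	#include <asm/mce.h>

	/* Hypothetical example: e.g. a guest/paravirt setup path that
	   knows machine checks are not usable on this platform. */
	static void __init example_platform_setup(void)
	{
		mce_disabled = 1;	/* keeps mcheck_init() from enabling MCE */
	}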
Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mcheck/mce.h | 2 +- include/asm-i386/mce.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h index 84fd4cf7d0fb..81fb6e2d35f3 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.h +++ b/arch/i386/kernel/cpu/mcheck/mce.h @@ -1,4 +1,5 @@ #include +#include <asm/mce.h> void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); @@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); -extern int mce_disabled; extern int nr_mce_banks; diff --git a/include/asm-i386/mce.h b/include/asm-i386/mce.h index 7cc1a973bf00..b0a02ee34ffd 100644 --- a/include/asm-i386/mce.h +++ b/include/asm-i386/mce.h @@ -3,3 +3,5 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do {} while(0) #endif + +extern int mce_disabled; From 2a57ff1a7051f0936b57342a57c25658d7ca3cc6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 89/94] [PATCH] i386: Rename cpu_gdt_descr and remove extern declaration from smpboot.c When I implemented the DECLARE_PER_CPU(var) macros, I was careful that people couldn't use "var" in a non-percpu context by prepending percpu__. I never considered that this would allow them to overload the same name for a per-cpu and a non-percpu variable. It is only one of many horrors in the i386 boot code, but let's rename the non-percpu cpu_gdt_descr to early_gdt_descr (not boot_gdt_descr, that's something else...).
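For context, the overload was possible because the per-cpu macros mangle the symbol name. A simplified sketch of the 2.6.20-era definition (the real one lives in include/asm-generic/percpu.h and spells the prefix per_cpu__):

	/* Simplified: DECLARE_PER_CPU hides the prefix from the programmer. */
	#define DECLARE_PER_CPU(type, name) \
		extern __typeof__(type) per_cpu__##name

	DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
	/* ...declares per_cpu__cpu_gdt_descr, so this plain variable is a
	   different symbol entirely: */
	struct Xgt_desc_struct cpu_gdt_descr;
	/* Both link fine while looking identical in the source -- hence
	   the rename to early_gdt_descr. */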
Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen =================================================================== --- arch/i386/kernel/head.S | 6 +++--- arch/i386/kernel/smpboot.c | 1 - include/asm-i386/desc.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index b322f72ffaad..3fa7f9389afe 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -319,7 +319,7 @@ is386: movl $2,%ecx # set MP call check_x87 call setup_pda - lgdt cpu_gdt_descr + lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers @@ -375,7 +375,7 @@ ENTRY(setup_pda) movl start_pda, %eax /* slot the PDA address into the GDT */ - mov cpu_gdt_descr+2, %ecx + mov early_gdt_descr+2, %ecx mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ shr $16, %eax mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ @@ -597,7 +597,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(cpu_gdt_descr) +ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 5a00b07e7194..f46a4d095e6c 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -623,7 +623,6 @@ extern struct { unsigned short ss; } stack_start; extern struct i386_pda *start_pda; -extern struct Xgt_desc_struct cpu_gdt_descr; #ifdef CONFIG_NUMA diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h index f398cc456448..050831f34f71 100644 --- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -22,7 +22,7 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr; DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); - +extern struct Xgt_desc_struct early_gdt_descr; static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { From 40d22c1b5675e428b3f3f9a945d0bd62e94ca2f1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 90/94] [PATCH] i386: Remove extern declaration from mm/discontig.c, put in header. Extern declarations belong in headers. Times, they are a'changin. 
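The rationale, as a schematic example (all names below are invented for illustration): when the declaration sits in a header that the defining file itself includes, the compiler checks declaration against definition, whereas a private extern in some .c file can silently drift out of sync with the real definition.

	/* foo.h -- the one shared declaration */
	extern unsigned long foo_tables_end;

	/* foo.c -- defines it, and includes foo.h so a type mismatch
	   becomes a compile error rather than a link- or run-time surprise */
	#include "foo.h"
	unsigned long foo_tables_end;

	/* bar.c -- consumers include the header, never redeclare */
	#include "foo.h"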
Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen =================================================================== --- arch/i386/mm/discontig.c | 1 - include/asm-i386/setup.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index e0c390d6ceb5..aa58720f6871 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -101,7 +101,6 @@ extern void find_max_pfn(void); extern void add_one_highpage_init(struct page *, int, int); extern struct e820map e820; -extern unsigned long init_pg_tables_end; extern unsigned long highend_pfn, highstart_pfn; extern unsigned long max_low_pfn; extern unsigned long totalram_pages; diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h index 76316275d6f9..0e8077cbfdac 100644 --- a/include/asm-i386/setup.h +++ b/include/asm-i386/setup.h @@ -77,6 +77,8 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map); void __init add_memory_region(unsigned long long start, unsigned long long size, int type); +extern unsigned long init_pg_tables_end; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ From 62cc49396e593dd71c6595302bb10b085aefbfa5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 91/94] [PATCH] x86: Unify pcspeaker platform device code between i386/x86-64 Trivial cleanup. Only change is that it is always compiled in now on x86-64 like on i386. Signed-off-by: Andi Kleen --- arch/i386/kernel/Makefile | 1 + arch/i386/kernel/pcspeaker.c | 20 ++++++++++++++++++++ arch/i386/kernel/setup.c | 26 -------------------------- arch/x86_64/kernel/Makefile | 2 ++ arch/x86_64/kernel/setup.c | 20 -------------------- 5 files changed, 23 insertions(+), 46 deletions(-) create mode 100644 arch/i386/kernel/pcspeaker.c diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 223969377de4..cbe4e601885c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_VMI) += vmi.o vmitime.o obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-y += pcspeaker.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c new file mode 100644 index 000000000000..bc1f2d3ea277 --- /dev/null +++ b/arch/i386/kernel/pcspeaker.c @@ -0,0 +1,20 @@ +#include +#include +#include + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index bd8c218d94af..122623dcc6e1 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -660,28 +659,3 @@ void __init setup_arch(char **cmdline_p) #endif tsc_init(); } - -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); - -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:8 - * End: - */ diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 3c7cbff04d3d..ae399458024b 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ 
-43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o +pcspeaker-y += ../../i386/kernel/pcspeaker.o diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 13daaf359d21..3d98b696881d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1104,23 +1104,3 @@ struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) -#include -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); -#endif From 98838ec984b78c625bbf9a5daaf001cd216b8f86 Mon Sep 17 00:00:00 2001 From: Giuliano Procida Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 92/94] [PATCH] i386: fix 32-bit ioctls on x64_32 [MTRR] fix 32-bit ioctls on x64_32 Signed-off-by: Giuliano Procida Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/if.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index ee771f305f96..c7d8f1756745 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) default: return -ENOTTY; case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 0); break; case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 0); break; case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); @@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) break; case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 1); break; case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + 
case MTRRIOC32_KILL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del_page(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); From 22c5ace7290b792faf64ffe90cf933950fbf52db Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 93/94] [PATCH] i386: Fix broken CONFIG_COMPAT_VDSO on i386 After updating several machines to 2.6.20, I can't boot anymore the single one of them that supports the NX bit and is configured as a 32-bit system. My understanding is that the VDSO changes in 2.6.20-rc7 were not fully cooked, in that with that config option enabled VDSO_SYM(x) now equals x, meaning that an address in the fixmap area is now being passed to apps via AT_SYSINFO. However, the page is mapped with PAGE_READONLY rather than PAGE_READONLY_EXEC. I'm not certain whether having app code go through the fixmap area is intended, but in case it is here is the simple patch that makes things work again. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index bc882a2b1db6..13ca54a85a1c 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -78,7 +78,7 @@ int __init sysenter_setup(void) syscall_pages[0] = virt_to_page(syscall_page); #ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); #endif From 126b1922367fbe5513daa675a2abd13ed3917f4e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH 94/94] [PATCH] x86-64: Remove mk_pte_phys() - Convert last user to pfn_pte - Remove mk_pte_phys Suggested by Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/mm/pageattr.c | 4 +++- include/asm-x86_64/pgtable.h | 9 --------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index ccb91dd996a9..65c5eaa59905 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) pud_t *pud; pmd_t *pmd; pte_t large_pte; + unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; + large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 59901c690a0d..730bd6028416 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -359,15 +359,6 @@ static inline int pmd_large(pmd_t pte) { #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) -/* physical address -> PTE */ -static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) -{ - pte_t pte; - pte_val(pte) = physpage | 
pgprot_val(pgprot); - pte_val(pte) &= __supported_pte_mask; - return pte; -} - /* Change flags of a PTE */ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) {