From: kernel.org
Subject: 2.6.25
Patch-mainline: 2.6.25
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Automatically created from "patches.kernel.org/patch-2.6.25" by xen-port-patches.py

--- sle11sp1-2010-03-22.orig/arch/x86/Kconfig	2010-02-09 16:52:20.000000000 +0100
+++ sle11sp1-2010-03-22/arch/x86/Kconfig	2010-02-09 16:53:24.000000000 +0100
@@ -39,7 +39,7 @@ config X86
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_KVM
+	select HAVE_KVM if !XEN
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
@@ -227,14 +227,12 @@ config X86_TRAMPOLINE
 	default y
 
 config X86_NO_TSS
-	bool
+	def_bool y
 	depends on XEN
-	default y
 
 config X86_NO_IDT
-	bool
+	def_bool y
 	depends on XEN
-	default y
 
 config X86_32_LAZY_GS
 	def_bool y
@@ -880,9 +878,8 @@ config X86_VISWS_APIC
 	depends on X86_32 && X86_VISWS
 
 config X86_XEN_GENAPIC
-	bool
+	def_bool y
 	depends on X86_64_XEN
-	default y
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
@@ -1331,7 +1328,7 @@ config ARCH_PROC_KCORE_TEXT
 
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on X86_64
+	depends on X86_64 && !X86_64_XEN
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
@@ -2025,10 +2022,10 @@ config PCI_MMCONFIG
 	depends on X86_64 && PCI && ACPI
 
 config XEN_PCIDEV_FRONTEND
-	bool "Xen PCI Frontend" if X86_64
+	def_bool y
+	prompt "Xen PCI Frontend" if X86_64
 	depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
 	select HOTPLUG
-	default y
 	help
 	  The PCI device frontend driver allows the kernel to import arbitrary
 	  PCI devices from a PCI backend to support PCI driver domains.
@@ -2036,7 +2033,6 @@ config XEN_PCIDEV_FRONTEND
 config XEN_PCIDEV_FE_DEBUG
 	bool "Xen PCI Frontend Debugging"
 	depends on XEN_PCIDEV_FRONTEND
-	default n
 	help
 	  Enables some debug statements within the PCI Frontend.
 
--- sle11sp1-2010-03-22.orig/arch/x86/Kconfig.debug	2009-10-15 11:53:21.000000000 +0200
+++ sle11sp1-2010-03-22/arch/x86/Kconfig.debug	2009-11-06 10:51:25.000000000 +0100
@@ -264,6 +264,7 @@ config DEBUG_BOOT_PARAMS
 	bool "Debug boot parameters"
 	depends on DEBUG_KERNEL
 	depends on DEBUG_FS
+	depends on !XEN
 	---help---
 	  This option will cause struct boot_params to be exported via debugfs.
 
--- sle11sp1-2010-03-22.orig/arch/x86/ia32/ia32entry-xen.S 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/ia32/ia32entry-xen.S 2009-11-06 10:51:25.000000000 +0100 @@ -12,7 +12,6 @@ #include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> -#include <asm/vsyscall32.h> #include <asm/irqflags.h> #include <linux/linkage.h> @@ -99,10 +98,11 @@ ENTRY(ia32_sysenter_target) CFI_RESTORE rcx movl %ebp,%ebp /* zero extension */ movl %eax,%eax + movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d movl $__USER32_DS,40(%rsp) movq %rbp,32(%rsp) movl $__USER32_CS,16(%rsp) - movl $VSYSCALL32_SYSEXIT,8(%rsp) + movq %r10,8(%rsp) movq %rax,(%rsp) cld SAVE_ARGS 0,0,1 @@ -582,8 +582,8 @@ ia32_sys_call_table: .quad compat_sys_futex /* 240 */ .quad compat_sys_sched_setaffinity .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area + .quad sys_set_thread_area + .quad sys_get_thread_area .quad compat_sys_io_setup /* 245 */ .quad sys_io_destroy .quad compat_sys_io_getevents @@ -661,7 +661,9 @@ ia32_sys_call_table: .quad sys_epoll_pwait .quad compat_sys_utimensat /* 320 */ .quad compat_sys_signalfd - .quad compat_sys_timerfd + .quad sys_timerfd_create .quad sys_eventfd .quad sys32_fallocate + .quad compat_sys_timerfd_settime /* 325 */ + .quad compat_sys_timerfd_gettime ia32_syscall_end: --- sle11sp1-2010-03-22.orig/arch/x86/kernel/Makefile 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/Makefile 2009-11-06 10:51:25.000000000 +0100 @@ -134,11 +134,10 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_XEN) += nmi_64.o time_64-$(CONFIG_XEN) += time_32.o pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o endif disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o -disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += mpparse_64.o -%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) := --- sle11sp1-2010-03-22.orig/arch/x86/kernel/acpi/boot.c 2010-03-22 12:18:02.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/acpi/boot.c 2010-02-17 14:50:06.000000000 +0100 @@ -113,6 +113,11 @@ char *__init __acpi_map_table(unsigned l if (!phys || !size) return NULL; +#ifdef CONFIG_XEN + if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT)) + return isa_bus_to_virt(phys); +#endif + return early_ioremap(phys, size); } void __init __acpi_unmap_table(char *map, unsigned long size) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/acpi/sleep-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,95 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> + +#include <asm/smp.h> + +#ifndef CONFIG_ACPI_PV_SLEEP +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_realmode_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); +#endif + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. 
+ */ +int acpi_save_state_mem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if (!acpi_wakeup_address) { + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + return -ENOMEM; + } + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); +#endif + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. + */ +void __init acpi_reserve_bootmem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +#endif +} + + +#ifndef CONFIG_ACPI_PV_SLEEP +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); +#endif /* CONFIG_ACPI_PV_SLEEP */ --- sle11sp1-2010-03-22.orig/arch/x86/kernel/acpi/sleep_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,117 +0,0 @@ -/* - * sleep.c - x86-specific ACPI sleep support. - * - * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> - */ - -#include <linux/acpi.h> -#include <linux/bootmem.h> -#include <linux/dmi.h> -#include <linux/cpumask.h> - -#include <asm/smp.h> - -#ifndef CONFIG_ACPI_PV_SLEEP -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); -#endif - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - if (!acpi_wakeup_address) - return 1; - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); -#endif - return 0; -} - -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. 
- */ -void __init acpi_reserve_bootmem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if (!acpi_wakeup_address) - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -#endif -} - -#ifndef CONFIG_ACPI_PV_SLEEP -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - -/* Ouch, we want to delete this. We already have better version in userspace, in - s2ram from suspend.sf.net project */ -static __init int reset_videomode_after_s3(const struct dmi_system_id *d) -{ - acpi_realmode_flags |= 2; - return 0; -} - -static __initdata struct dmi_system_id acpisleep_dmi_table[] = { - { /* Reset video mode after returning from ACPI S3 sleep */ - .callback = reset_videomode_after_s3, - .ident = "Toshiba Satellite 4030cdt", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - }, - }, - {} -}; - -static int __init acpisleep_dmi_init(void) -{ - dmi_check_system(acpisleep_dmi_table); - return 0; -} - -core_initcall(acpisleep_dmi_init); -#endif /* CONFIG_ACPI_PV_SLEEP */ --- sle11sp1-2010-03-22.orig/arch/x86/kernel/acpi/sleep_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,125 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/acpi.h> -#include <linux/cpumask.h> - -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -#ifndef CONFIG_ACPI_PV_SLEEP -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); -#endif - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); -#endif - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ -#ifndef CONFIG_ACPI_PV_SLEEP - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -#endif -} - -#ifndef CONFIG_ACPI_PV_SLEEP -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); -#endif /* CONFIG_ACPI_PV_SLEEP */ - --- sle11sp1-2010-03-22.orig/arch/x86/kernel/apic_32-xen.c 2009-11-06 10:49:47.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/apic_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -86,7 +86,7 @@ int setup_profiling_timer(unsigned int m * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. 
*/ -int __init APIC_init_uniprocessor (void) +int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_IO_APIC if (smp_found_config) --- sle11sp1-2010-03-22.orig/arch/x86/kernel/apic_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/apic_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -34,34 +34,17 @@ #include <asm/hpet.h> #include <asm/idle.h> -int apic_verbosity; +int disable_apic; /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Debug level, exported for io_apic.c */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at irq %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} +int apic_verbosity; -void smp_local_timer_interrupt(void) +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) { #ifndef CONFIG_XEN int cpu = smp_processor_id(); @@ -121,11 +104,34 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(); + local_apic_timer_interrupt(); irq_exit(); set_irq_regs(old_regs); } +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + return 1; +} + +/* + * Local APIC interrupts + */ + /* * This interrupt should _never_ happen with our APIC/SMP architecture */ @@ -150,7 +156,6 @@ asmlinkage void smp_spurious_interrupt(v /* * This interrupt should never happen with our APIC/SMP architecture */ - asmlinkage void smp_error_interrupt(void) { unsigned int v, v1; @@ -178,19 +183,3 @@ asmlinkage void smp_error_interrupt(void smp_processor_id(), v , v1); irq_exit(); } - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. 
- */ -int __init APIC_init_uniprocessor (void) -{ -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif - - return 1; -} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/asm-offsets_32.c 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/asm-offsets_32.c 2009-11-06 10:51:25.000000000 +0100 @@ -24,8 +24,10 @@ #include <xen/interface/xen.h> #endif +#ifdef CONFIG_LGUEST_GUEST #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" +#endif /* workaround for a warning with -Wmissing-prototypes */ void foo(void); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/cpu/common-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/cpu/common-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -27,45 +27,50 @@ #include "cpu.h" DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, #ifndef CONFIG_XEN /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. 
*/ - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, #endif - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + static int cachesize_override __cpuinitdata = -1; -static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; -static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern int disable_pse; - static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ @@ -214,16 +219,8 @@ static void __cpuinit get_cpu_vendor(str static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPUs to not use it... */ - disable_x86_fxsr = 1; - - /* - * ... and clear the bits early in the boot_cpu_data - * so that the bootup process doesn't try to do this - * either. - */ - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -231,7 +228,7 @@ __setup("nofxsr", x86_fxsr_setup); static int __init x86_sep_setup(char * s) { - disable_x86_sep = 1; + setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; } __setup("nosep", x86_sep_setup); @@ -268,10 +265,10 @@ static int __cpuinit have_cpuid_p(void) void __init cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; if (c->cpuid_level >= 0x00000001) { @@ -284,9 +281,38 @@ void __init cpu_detect(struct cpuinfo_x8 if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; - if (cap0 & (1<<19)) + if (cap0 & (1<<19)) { c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + } + } +} +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + unsigned int ebx; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (have_cpuid_p()) { + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + } + } /* Do minimum CPU detection early. 
@@ -300,6 +326,7 @@ static void __init early_cpu_detect(void struct cpuinfo_x86 *c = &boot_cpu_data; c->x86_cache_alignment = 32; + c->x86_clflush_size = 32; if (!have_cpuid_p()) return; @@ -307,19 +334,30 @@ static void __init early_cpu_detect(void cpu_detect(c); get_cpu_vendor(c, 1); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + + early_get_cap(c); } static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; - int ebx; + unsigned int ebx; if (have_cpuid_p()) { /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); get_cpu_vendor(c, 0); /* Initialize the standard set of capabilities */ @@ -366,8 +404,6 @@ static void __cpuinit generic_identify(s init_scattered_cpuid_features(c); } - early_intel_workaround(c); - #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif @@ -401,7 +437,7 @@ __setup("serialnumber", x86_serial_nr_se /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -427,20 +463,9 @@ static void __cpuinit identify_cpu(struc generic_identify(c); - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { + if (this_cpu->c_identify) this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -462,23 +487,6 @@ static void __cpuinit identify_cpu(struc * we do "generic changes." */ - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - /* SEP disabled? */ - if (disable_x86_sep) - clear_bit(X86_FEATURE_SEP, c->x86_capability); - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; @@ -491,13 +499,6 @@ static void __cpuinit identify_cpu(struc c->x86, c->x86_model); } - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -510,8 +511,14 @@ static void __cpuinit identify_cpu(struc boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] &= ~cleared_cpu_caps[i]; + /* Init Machine Check Exception if available. 
*/ mcheck_init(c); + + select_idle_routine(c); } void __init identify_boot_cpu(void) @@ -519,7 +526,6 @@ void __init identify_boot_cpu(void) identify_cpu(&boot_cpu_data); sysenter_setup(); enable_sep_cpu(); - mtrr_bp_init(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -576,6 +582,13 @@ void __cpuinit detect_ht(struct cpuinfo_ } #endif +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -599,6 +612,17 @@ void __cpuinit print_cpu_info(struct cpu printk("\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) @@ -608,16 +632,6 @@ cpumask_t cpu_initialized __cpuinitdata * They will insert themselves into the cpu_devs structure. * Then, when cpu_init() is called, we can just iterate over that array. */ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); - void __init early_cpu_init(void) { intel_cpu_init(); @@ -629,21 +643,13 @@ void __init early_cpu_init(void) nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. - */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif } /* Make sure %fs is initialized properly in idle threads */ -struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } @@ -651,7 +657,7 @@ struct pt_regs * __devinit idle_regs(str * it's on the real one. */ void switch_to_new_gdt(void) { - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; unsigned long va, frames[16]; int f; @@ -694,12 +700,6 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? 
****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } switch_to_new_gdt(); @@ -712,7 +712,7 @@ void __cpuinit cpu_init(void) BUG(); enter_lazy_tlb(&init_mm, curr); - load_esp0(t, thread); + load_sp0(t, thread); load_LDT(&init_mm.context); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/cpu/mtrr/main-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -33,7 +33,7 @@ struct mtrr_ops generic_mtrr_ops = { struct mtrr_ops *mtrr_if = &generic_mtrr_ops; unsigned int num_var_ranges; -unsigned int *usage_table; +unsigned int mtrr_usage_table[MAX_VAR_RANGES]; static void __init set_num_var_ranges(void) { @@ -52,17 +52,12 @@ static void __init init_table(void) int i, max; max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } for (i = 0; i < max; i++) - usage_table[i] = 0; + mtrr_usage_table[i] = 0; } int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) + unsigned int type, bool increment) { int error; struct xen_platform_op op; @@ -81,7 +76,7 @@ int mtrr_add_page(unsigned long base, un } if (increment) - ++usage_table[op.u.add_memtype.reg]; + ++mtrr_usage_table[op.u.add_memtype.reg]; mutex_unlock(&mtrr_mutex); @@ -103,7 +98,7 @@ static int mtrr_check(unsigned long base int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) + bool increment) { if (mtrr_check(base, size)) return -EINVAL; @@ -136,11 +131,11 @@ int mtrr_del_page(int reg, unsigned long goto out; } } - if (usage_table[reg] < 1) { + if (mtrr_usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) { + if (--mtrr_usage_table[reg] < 1) { op.cmd = XENPF_del_memtype; op.u.del_memtype.handle = 0; op.u.del_memtype.reg = reg; --- sle11sp1-2010-03-22.orig/arch/x86/kernel/e820_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/e820_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -7,7 +7,6 @@ #include <linux/kexec.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/suspend.h> @@ -18,11 +17,6 @@ #include <asm/setup.h> #include <xen/interface/memory.h> -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - struct e820map e820; struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -38,26 +32,6 @@ unsigned long pci_mem_start = 0x10000000 EXPORT_SYMBOL(pci_mem_start); #endif extern int user_defined_memmap; -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; static struct resource system_rom_resource = { .name = "System ROM", @@ -112,60 +86,6 @@ static struct resource video_rom_resourc .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource 
standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) @@ -272,10 +192,9 @@ static struct e820map machine_e820; * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -324,39 +243,6 @@ legacy_init_iomem_resources(struct resou #undef e820 -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - /* Nothing to do if not running in dom0. */ - if (!is_initial_xendomain()) - return 0; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, - &data_resource, &bss_resource); - else - legacy_init_iomem_resources(&code_resource, - &data_resource, &bss_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not @@ -393,19 +279,17 @@ void __init add_memory_region(unsigned l { int x; - if (!efi_enabled) { - x = e820.nr_map; + x = e820.nr_map; - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; } /* add_memory_region */ /* @@ -642,29 +526,6 @@ int __init copy_e820_map(struct e820entr } /* - * Callback for efi_memory_walk. 
- */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* * Find the highest page frame number we have available */ void __init find_max_pfn(void) @@ -672,11 +533,6 @@ void __init find_max_pfn(void) int i; max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; @@ -694,34 +550,12 @@ void __init find_max_pfn(void) } /* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* * Register fully available low RAM pages with the bootmem allocator. */ void __init register_bootmem_low_pages(unsigned long max_low_pfn) { int i; - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size; /* @@ -842,56 +676,12 @@ void __init print_memory_map(char *who) } } -static __init __always_inline void efi_limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - efi_memory_desc_t *md, *next_md; - void *p, *p1; - int i, j; - - j = 0; - p1 = memmap.map; - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - next_md = p1; - current_addr = md->phys_addr + - PFN_PHYS(md->num_pages); - if (is_available_memory(md)) { - if (md->phys_addr >= size) continue; - memcpy(next_md, md, memmap.desc_size); - if (current_addr >= size) { - next_md->num_pages -= - PFN_UP(current_addr-size); - } - p1 += memmap.desc_size; - next_md = p1; - j++; - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == - EFI_MEMORY_RUNTIME) { - /* In order to make runtime services - * available we have to include runtime - * memory regions in memory map */ - memcpy(next_md, md, memmap.desc_size); - p1 += memmap.desc_size; - next_md = p1; - j++; - } - } - memmap.nr_map = j; - memmap.map_end = memmap.map + - (memmap.nr_map * memmap.desc_size); -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; int i; print_memory_map("limit_regions start"); - if (efi_enabled) { - efi_limit_regions(size); - return; - } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -1043,3 +833,44 @@ static int __init parse_memmap(char *arg return 0; } early_param("memmap", parse_memmap); + +#ifndef CONFIG_XEN +void __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? 
*/ + if (ei->addr >= start && ei->size <= size) { + ei->type = new_type; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + } +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + print_memory_map("modified"); +} +#endif --- sle11sp1-2010-03-22.orig/arch/x86/kernel/e820_64-xen.c 2009-12-04 11:01:22.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/e820_64-xen.c 2009-12-04 11:03:21.000000000 +0100 @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -26,6 +26,7 @@ #include <asm/proto.h> #include <asm/setup.h> #include <asm/sections.h> +#include <asm/kdebug.h> #include <xen/interface/memory.h> struct e820map e820 __initdata; @@ -33,98 +34,105 @@ struct e820map e820 __initdata; struct e820map machine_e820; #endif -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); +unsigned long end_pfn; #ifndef CONFIG_XEN -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; #endif -/* +/* * Last pfn which the user wants to use. */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource, bss_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; +/* + * Early reserved memory areas. 
+ */ +#define MAX_EARLY_RES 20 +struct early_res { + unsigned long start, end; + char name[16]; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { #ifndef CONFIG_XEN - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image+ramdisk_size; - - if (last >= ramdisk_image && addr < ramdisk_end) { - *addrp = PAGE_ALIGN(ramdisk_end); - return 1; - } - } + { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ +#ifdef CONFIG_SMP + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" }, #endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; - } +#endif + {} +}; - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; +void __init reserve_early(unsigned long start, unsigned long end, char *name) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", + start, end - 1, name?name:"", r->start, r->end - 1, r->name); } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); +} -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; +void __init early_res_to_bootmem(void) +{ + int i; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, + r->start, r->end - 1, r->name); + reserve_bootmem_generic(r->start, r->end - r->start); } -#endif - /* XXX ramdisk image here? */ -#else - if (last < (table_end<<PAGE_SHIFT)) { - *addrp = table_end << PAGE_SHIFT; - return 1; +} + +/* Check for already reserved areas */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + int i; + unsigned long addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = r->end; + changed = 1; + goto again; + } } -#endif - return 0; -} + return changed; +} /* * This function checks if any part of the range <start,end> is mapped * with type. 
*/ -int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +int +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) +{ int i; #ifndef CONFIG_XEN - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; #else if (!is_initial_xendomain()) return 0; @@ -132,12 +140,12 @@ int e820_any_mapped(unsigned long start, const struct e820entry *ei = &machine_e820.map[i]; #endif - if (type && ei->type != type) + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -148,7 +156,8 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; @@ -173,65 +182,77 @@ int __init e820_all_mapped(unsigned long */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area with specified alignment in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size, unsigned long align) +{ + int i; + unsigned long mask = ~(align - 1); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; - last = PAGE_ALIGN(addr) + size; + addr = (addr + align - 1) & mask; + last = addr + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. 
*/ -void __init e820_reserve_resources(struct e820entry *e820, int nr_map) +void __init e820_reserve_resources(struct e820entry *e820, int nr_map, + struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; for (i = 0; i < nr_map; i++) { @@ -249,14 +270,14 @@ void __init e820_reserve_resources(struc request_resource(&iomem_resource, res); if (e820[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ #ifndef CONFIG_XEN - request_resource(res, &code_resource); - request_resource(res, &data_resource); - request_resource(res, &bss_resource); + request_resource(res, code_resource); + request_resource(res, data_resource); + request_resource(res, bss_resource); #endif #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) @@ -368,9 +389,9 @@ e820_register_active_regions(int nid, un #endif } -/* +/* * Add a memory region to the kernel e820 map. - */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -395,9 +416,7 @@ unsigned long __init e820_hole_size(unsi { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -409,28 +428,31 @@ unsigned long __init e820_hole_size(unsi return end - start - (ram << PAGE_SHIFT); } -void __init e820_print_map(char *who) +static void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -438,11 +460,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -462,7 +484,8 @@ static int __init sanitize_e820_map(stru int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... 
Sample memory map (w/overlaps): ____22__________________ @@ -504,22 +527,23 @@ static int __init sanitize_e820_map(stru old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) + for (i = 0; i < old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) return -1; /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) + for (i = 0; i < 2 * old_nr; i++) change_point[i] = &change_point_list[i]; /* record all known change-points (starting and ending addresses), omitting those that are for empty memory regions */ chgidx = 0; - for (i=0; i < old_nr; i++) { + for (i = 0; i < old_nr; i++) { if (biosmap[i].size != 0) { change_point[chgidx]->addr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -529,75 +553,106 @@ static int __init sanitize_e820_map(stru still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove 
entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; i<overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -612,7 +667,7 @@ static int __init sanitize_e820_map(stru * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { #ifndef CONFIG_XEN /* Only one memory region (or negative)? Ignore it */ @@ -633,7 +688,7 @@ static int __init copy_e820_map(struct e return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); #ifdef CONFIG_XEN if (is_initial_xendomain()) { @@ -652,15 +707,17 @@ static int __init copy_e820_map(struct e return 0; } -void early_panic(char *msg) +static void early_panic(char *msg) { early_printk(msg); panic(msg); } -#ifndef CONFIG_XEN -void __init setup_memory_region(void) +/* We're not void only for x86 32-bit compat */ +char * __init machine_specific_memory_setup(void) { +#ifndef CONFIG_XEN + char *who = "BIOS-e820"; /* * Try to copy the BIOS-supplied E820-map. 
* @@ -670,14 +727,8 @@ void __init setup_memory_region(void) sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); -} - #else /* CONFIG_XEN */ - -void __init setup_memory_region(void) -{ + char *who = "Xen"; int rc; struct xen_memory_map memmap; /* @@ -705,11 +756,13 @@ void __init setup_memory_region(void) if (copy_e820_map(map, (char)memmap.nr_entries) < 0) early_panic("Cannot find a valid memory map"); - +#endif printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("Xen"); + e820_print_map(who); + + /* In case someone cares... */ + return who; } -#endif static int __init parse_memopt(char *p) { @@ -720,7 +773,7 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; end = end_user_pfn<<PAGE_SHIFT; i = e820.nr_map-1; @@ -738,7 +791,7 @@ static int __init parse_memopt(char *p) } return 0; -} +} early_param("mem", parse_memopt); static int userdef __initdata; @@ -750,9 +803,9 @@ static int __init parse_memmap_opt(char if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is * reset. */ e820_register_active_regions(0, 0, -1UL); @@ -769,6 +822,8 @@ static int __init parse_memmap_opt(char mem_size = memparse(p, &p); if (p == oldp) return -EINVAL; + + userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); @@ -788,11 +843,58 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { + char nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + printk(KERN_INFO "user-defined physical RAM map:\n"); e820_print_map("user"); } } +#ifndef CONFIG_XEN +void __init update_memory_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + int i; + + BUG_ON(old_type == new_type); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + if (ei->type != old_type) + continue; + /* totally covered? 
*/ + if (ei->addr >= start && ei->size <= size) { + ei->type = new_type; + continue; + } + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(start + size, ei->addr + ei->size); + if (final_start >= final_end) + continue; + add_memory_region(final_start, final_end - final_start, + new_type); + } +} + +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} +#endif + unsigned long pci_mem_start = 0xaeedbabe; EXPORT_SYMBOL(pci_mem_start); @@ -836,8 +938,10 @@ __init void e820_setup_gap(struct e820en if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -850,8 +954,9 @@ __init void e820_setup_gap(struct e820en /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) --- sle11sp1-2010-03-22.orig/arch/x86/kernel/early_printk-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/early_printk-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -222,7 +222,7 @@ static struct console simnow_console = { }; /* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; +static struct console *early_console = &early_vga_console; static int early_console_initialized = 0; void early_printk(const char *fmt, ...) --- sle11sp1-2010-03-22.orig/arch/x86/kernel/entry_32-xen.S 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/entry_32-xen.S 2009-11-06 10:51:25.000000000 +0100 @@ -59,7 +59,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). * * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -282,16 +282,21 @@ END(resume_kernel) #endif CFI_ENDPROC + .macro test_tif ti_reg # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg) + .endm + /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. 
*/ # sysenter call handler stub -ENTRY(sysenter_entry) +ENTRY(ia32_sysenter_target) CFI_STARTPROC simple CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl SYSENTER_stack_esp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * No need to follow this irqs on/off section: the syscall @@ -334,9 +339,7 @@ sysenter_past_esp: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + test_tif %ebp jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -354,7 +357,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSEXIT + ENABLE_INTERRUPTS_SYSCALL_RET CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -363,10 +366,10 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection -ENDPROC(sysenter_entry) +ENDPROC(ia32_sysenter_target) # pv sysenter call handler stub -ENTRY(sysenter_entry_pv) +ENTRY(ia32pv_sysenter_target) RING0_INT_FRAME movl $__USER_DS,16(%esp) movl %ebp,12(%esp) @@ -389,7 +392,7 @@ ENTRY(sysenter_entry_pv) .previous /* fall through */ CFI_ENDPROC -ENDPROC(sysenter_entry_pv) +ENDPROC(ia32pv_sysenter_target) # system call handler stub ENTRY(system_call) @@ -398,9 +401,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + test_tif %ebp jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -452,7 +453,8 @@ restore_nocheck_notrace: RESTORE_REGS addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 -1: INTERRUPT_RETURN +irq_return: + INTERRUPT_RETURN .section .fixup,"ax" iret_exc: pushl $0 # no error code @@ -461,7 +463,7 @@ iret_exc: .previous .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long irq_return,iret_exc .previous CFI_RESTORE_STATE @@ -657,7 +659,7 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.data +.section .rodata,"a" ENTRY(interrupt) .text @@ -963,7 +965,7 @@ END(device_not_available) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. 
* @@ -975,7 +977,7 @@ END(device_not_available) cmpw $__KERNEL_CS,4(%esp); \ jne ok; \ label: \ - movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + movl SYSENTER_stack_sp0+offset(%esp),%esp; \ CFI_DEF_CFA esp, 0; \ CFI_UNDEFINED eip; \ pushfl; \ @@ -990,7 +992,7 @@ label: \ KPROBE_ENTRY(debug) RING0_INT_FRAME #ifndef CONFIG_XEN - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: @@ -1023,7 +1025,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -1036,7 +1038,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) + cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ @@ -1089,12 +1091,8 @@ nmi_espfix_stack: RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 -1: INTERRUPT_RETURN + jmp irq_return CFI_ENDPROC -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous #else KPROBE_ENTRY(nmi) RING0_INT_FRAME @@ -1112,17 +1110,17 @@ KPROBE_END(nmi) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) -1: iret + iret .section __ex_table,"a" .align 4 - .long 1b,iret_exc + .long native_iret, iret_exc .previous END(native_iret) -ENTRY(native_irq_enable_sysexit) +ENTRY(native_irq_enable_syscall_ret) sti sysexit -END(native_irq_enable_sysexit) +END(native_irq_enable_syscall_ret) #endif KPROBE_ENTRY(int3) @@ -1271,7 +1269,144 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#include <asm/alternative-asm.h> + + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + test_tif %ebp + jnz cstar_trace_entry + cmpl $nr_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +cstar_set_tif: + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $nr_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + xorl %edx,%edx + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call do_syscall_trace + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + testl %eax,%eax + jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl PT_ORIG_EAX(%esp),%eax + cmpl $nr_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + xorl %edx,%edx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call do_syscall_trace + testl %eax,%eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl PT_ORIG_EAX(%esp),%eax + cmpl $nr_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(ret_from_fork) + .section .rodata,"a" #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +#include <asm/unistd.h> +cstar_special: +nr=0 +mask=0 +.rept nr_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +#define sys_call_table cstar_call_table +#define sys_fork cstar_set_tif +#define sys_clone cstar_set_tif +#define sys_vfork cstar_set_tif +#include "syscall_table_32.S" +#undef sys_call_table +#undef sys_fork +#undef sys_clone +#undef sys_vfork --- sle11sp1-2010-03-22.orig/arch/x86/kernel/entry_64-xen.S 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/entry_64-xen.S 2009-11-06 10:51:25.000000000 +0100 @@ -54,17 +54,22 @@ #include <asm/page.h> #include <asm/irqflags.h> #include <asm/errno.h> -#include <xen/interface/arch-x86_64.h> +#include <xen/interface/xen.h> #include <xen/interface/features.h> -#include "xen_entry_64.S" - .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_PARAVIRT +ENTRY(native_irq_enable_syscall_ret) + movq 
%gs:pda_oldrsp,%rsp + swapgs + sysretq +#endif /* CONFIG_PARAVIRT */ + .macro TRACE_IRQS_IRETQ offset=ARGOFFSET #ifdef CONFIG_TRACE_IRQFLAGS @@ -277,7 +282,7 @@ ret_from_sys_call: sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -287,7 +292,7 @@ sysret_check: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET VGCF_IN_SYSCALL @@ -298,7 +303,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -309,9 +314,8 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + ENABLE_INTERRUPTS(CLBR_NONE) + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f /* Really a signal */ @@ -323,7 +327,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -355,7 +359,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testb $3,CS-ARGOFFSET(%rsp) jnz 1f @@ -381,22 +385,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -411,7 +413,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -419,7 +421,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -474,6 +476,7 @@ ENTRY(stub_execve) CFI_REGISTER rip, r11 SAVE_REST FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx call sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) @@ -526,11 +529,10 @@ retint_check: retint_restore_args: /* return to kernel space */ movl EFLAGS-REST_SKIP(%rsp), %eax shr $9, %eax # EAX[0] == IRET_EFLAGS.IF - XEN_GET_VCPU_INFO(%rsi) + GET_VCPU_INFO andb evtchn_upcall_mask(%rsi),%al andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask jnz restore_all_enable_events # != 0 => enable event delivery - XEN_PUT_VCPU_INFO(%rsi) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET 0 @@ -541,31 +543,29 @@ retint_careful: bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) -/* sti */ + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - XEN_BLOCK_EVENTS(%rsi) -/* cli */ + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check retint_signal: - testl 
$(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_restore_args TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -702,7 +702,7 @@ END(spurious_interrupt) rdmsr testl %edx,%edx js 1f - swapgs + SWAPGS xorl %ebx,%ebx 1: #endif @@ -719,8 +719,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif -/* cli */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -749,10 +748,10 @@ paranoid_swapgs\trace: .if \trace TRACE_IRQS_IRETQ 0 .endif - swapgs + SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - iretq + jmp irq_return paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -767,11 +766,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -780,9 +779,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_ANY) call schedule - cli + DISABLE_INTERRUPTS(CLBR_ANY) .if \trace TRACE_IRQS_OFF .endif @@ -846,8 +845,7 @@ error_call_handler: call *%rax error_exit: RESTORE_REST -/* cli */ - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testb $3,CS-ARGOFFSET(%rsp) @@ -875,7 +873,7 @@ error_kernelspace: iret run with kernel gs again, so don't set the user space flag. B stepping K8s sometimes report an truncated RIP for IRET exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp + leaq irq_return(%rip),%rbp cmpq %rbp,RIP(%rsp) je error_swapgs movl %ebp,%ebp /* zero extend */ @@ -930,19 +928,17 @@ END(do_hypervisor_callback) restore_all_enable_events: CFI_DEFAULT_STACK adj=1 TRACE_IRQS_ON - XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up... + __ENABLE_INTERRUPTS scrit: /**** START OF CRITICAL REGION ****/ - XEN_TEST_PENDING(%rsi) + __TEST_PENDING CFI_REMEMBER_STATE jnz 14f # process more events if necessary... 
- XEN_PUT_VCPU_INFO(%rsi) RESTORE_ARGS 0,8,0 HYPERVISOR_IRET 0 CFI_RESTORE_STATE -14: XEN_LOCKED_BLOCK_EVENTS(%rsi) - XEN_PUT_VCPU_INFO(%rsi) +14: __DISABLE_INTERRUPTS SAVE_REST movq %rsp,%rdi # set the argument again jmp 11b @@ -1086,15 +1082,16 @@ ENDPROC(child_rip) * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack */ ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL + movq %rsp,%rcx call sys_execve movq %rax, RAX(%rsp) RESTORE_REST @@ -1144,7 +1141,7 @@ do_nmi_callback: call do_nmi orl $NMI_MASK,EFLAGS(%rsp) RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) jmp retint_restore_args --- sle11sp1-2010-03-22.orig/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/fixup.c 2009-11-06 10:51:25.000000000 +0100 @@ -36,7 +36,7 @@ #define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) -fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) { static unsigned long printed = 0; char info[100]; --- sle11sp1-2010-03-22.orig/arch/x86/kernel/head64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/head64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -16,6 +16,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/percpu.h> +#include <linux/start_kernel.h> #include <linux/module.h> #include <asm/processor.h> @@ -26,6 +27,8 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/e820.h> unsigned long start_pfn; @@ -34,7 +37,7 @@ static void __init zap_identity_mappings { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + __flush_tlb_all(); } /* Don't add a printk in there. 
printk relies on the PDA which is not initialized @@ -72,6 +75,37 @@ EXPORT_SYMBOL(machine_to_phys_mapping); unsigned int machine_to_phys_order; EXPORT_SYMBOL(machine_to_phys_order); +#define EBDA_ADDR_POINTER 0x40E + +static __init void reserve_ebda(void) +{ +#ifndef CONFIG_XEN + unsigned ebda_addr, ebda_size; + + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + + if (!ebda_addr) + return; + + ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; + + reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA"); +#endif +} + void __init x86_64_start_kernel(char * real_mode_data) { struct xen_machphys_mapping mapping; @@ -103,8 +137,16 @@ void __init x86_64_start_kernel(char * r /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + /* Cleanup the over mapped high alias */ + cleanup_highmap(); + + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); #endif @@ -115,8 +157,19 @@ void __init x86_64_start_kernel(char * r pda_init(0); copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif + + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); + + reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), + start_pfn << PAGE_SHIFT, "Xen provided"); + + reserve_ebda(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. 
+ */ + start_kernel(); } --- sle11sp1-2010-03-22.orig/arch/x86/kernel/head_32-xen.S 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/head_32-xen.S 2009-11-06 10:51:25.000000000 +0100 @@ -3,6 +3,7 @@ .text #include <linux/elfnote.h> #include <linux/threads.h> +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> @@ -88,7 +89,7 @@ ENTRY(_stext) */ .section ".bss.page_aligned","wa" .align PAGE_SIZE_asm -ENTRY(swapper_pg_pmd) +ENTRY(swapper_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 --- sle11sp1-2010-03-22.orig/arch/x86/kernel/io_apic_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/io_apic_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -35,6 +35,7 @@ #include <linux/htirq.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #include <asm/io.h> #include <asm/smp.h> @@ -48,8 +49,6 @@ #include <mach_apic.h> #include <mach_apicdef.h> -#include "io_ports.h" - #ifdef CONFIG_XEN #include <xen/interface/xen.h> #include <xen/interface/physdev.h> @@ -400,7 +399,7 @@ static void set_ioapic_affinity_irq(unsi # include <asm/processor.h> /* kernel_thread() */ # include <linux/kernel_stat.h> /* kstat */ # include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> /* time_after() */ +# include <linux/timer.h> #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) @@ -777,7 +776,7 @@ late_initcall(balanced_irq_init); #endif #ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { #ifndef CONFIG_XEN unsigned int cfg; @@ -1959,7 +1958,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; @@ -2142,7 +2141,7 @@ static struct irq_chip lapic_chip __read .eoi = ack_apic, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -2155,7 +2154,7 @@ static void setup_nmi (void) */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(); apic_printk(APIC_VERBOSE, " done.\n"); } @@ -2479,7 +2478,7 @@ static int ioapic_resume(struct sys_devi } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; --- sle11sp1-2010-03-22.orig/arch/x86/kernel/io_apic_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/io_apic_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -32,9 +32,11 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/dmar.h> +#include <linux/jiffies.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif +#include <linux/bootmem.h> #include <asm/idle.h> #include <asm/io.h> @@ -1064,7 +1066,7 @@ void __apicdebuginit print_local_APIC(vo v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... 
APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1165,7 +1167,7 @@ void __apicdebuginit print_PIC(void) } #endif /* !CONFIG_XEN */ -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; #ifndef CONFIG_XEN @@ -1299,7 +1301,7 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; } @@ -1412,7 +1414,7 @@ static void irq_complete_move(unsigned i if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; @@ -1439,7 +1441,7 @@ static void ack_apic_level(unsigned int int do_unmask_irq = 0; irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; @@ -1570,7 +1572,7 @@ static struct hw_interrupt_type lapic_ir .end = end_lapic_irq, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -1583,7 +1585,7 @@ static void setup_nmi (void) */ printk(KERN_INFO "activating NMI Watchdog ..."); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); printk(" done.\n"); } @@ -1659,7 +1661,7 @@ static inline void unlock_ExtINT_logic(v * * FIXME: really need to revamp this for modern platforms only. */ -static inline void check_timer(void) +static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; @@ -1863,7 +1865,7 @@ static int ioapic_resume(struct sys_devi } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; @@ -2303,5 +2305,93 @@ void __init setup_ioapic_dest(void) } } #endif -#endif /* !CONFIG_XEN */ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = 
ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif /* !CONFIG_XEN */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ioport-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,112 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <linux/syscalls.h> +#include <xen/interface/physdev.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) +{ + unsigned int i; + + for (i = base; i < base + extent; i++) { + if (new_value) + __set_bit(i, bitmap); + else + __clear_bit(i, bitmap); + } +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = ¤t->thread; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + */ +static int do_iopl(unsigned int level, struct thread_struct *t) +{ + unsigned int old = t->iopl >> 12; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? 
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage long sys_iopl(unsigned long regsp) +{ + struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ +#endif + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, t); + if (rc < 0) + goto out; + + t->iopl = level << 12; + set_iopl_mask(t->iopl); +out: + return rc; +} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/ioport_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,121 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> -#include <xen/interface/physdev.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? ~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } - - if (length > 0) { - mask = ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - } -} - - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - struct thread_struct * t = ¤t->thread; - unsigned long *bitmap; - struct physdev_set_iobitmap set_iobitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); - set_iobitmap.nr_ports = IO_BITMAP_BITS; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &set_iobitmap)); - } - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. 
- */ - -asmlinkage long sys_iopl(unsigned long unused) -{ - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - struct thread_struct *t = ¤t->thread; - unsigned int old = (t->iopl >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - t->iopl = level << 12; - set_iopl_mask(t->iopl); - return 0; -} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/ioport_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,99 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> -#include <xen/interface/physdev.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - struct thread_struct * t = ¤t->thread; - unsigned long *bitmap; - struct physdev_set_iobitmap set_iobitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - - set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); - set_iobitmap.nr_ports = IO_BITMAP_BITS; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, - &set_iobitmap)); - } - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - */ - -asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs) -{ - unsigned int old_iopl = current->thread.iopl; - struct physdev_set_iopl set_iopl; - - if (new_iopl > 3) - return -EINVAL; - - /* Need "raw I/O" privileges for direct port access. */ - if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* Change our version of the privilege levels. */ - current->thread.iopl = new_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (new_iopl == 0) ? 
1 : new_iopl; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); - - return 0; -} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/irq_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/irq_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_r #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_r * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_r (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" --- sle11sp1-2010-03-22.orig/arch/x86/kernel/irq_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/irq_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -20,6 +20,28 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. 
-AK + */ + if (!disable_apic) + ack_APIC_irq(); +#endif +} + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -33,11 +55,11 @@ static inline void stack_overflow_check( u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -150,7 +172,7 @@ asmlinkage unsigned int do_IRQ(struct pt struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned irq = ~regs->orig_rax; + unsigned irq = ~regs->orig_ax; /*exit_idle();*/ /*irq_enter();*/ @@ -251,14 +273,3 @@ asmlinkage void do_softirq(void) } local_irq_restore(flags); } - -#ifndef CONFIG_X86_LOCAL_APIC -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at irq %02x\n", irq); -} -#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ldt-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,272 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/vmalloc.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/mmu_context.h> + +#ifdef CONFIG_SMP +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = (void *)__get_free_page(GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? 
*/ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); +#endif + make_pages_readonly(newldt, + (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable(oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + put_page(virt_to_page(oldldt)); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + make_pages_readonly(new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + memset(&mm->context, 0, sizeof(mm->context)); + mutex_init(&mm->context.lock); + old_mm = current->mm; + if (old_mm) + mm->context.vdso = old_mm->context.vdso; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + /* CHECKME: Can this ever happen ? */ + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable(mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + put_page(virt_to_page(mm->context.ldt)); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? 
*/ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct desc_struct ldt; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + memset(&ldt, 0, sizeof(ldt)); + goto install; + } + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + + /* Install the new entry ... */ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/ldt_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,265 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/mmu_context.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt; - void *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); -#endif - make_pages_readonly( - pc->ldt, - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - 
smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable( - oldldt, - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly( - new->ldt, - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - mm->context.has_foreign_mappings = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * No need to lock the MM as we are the last user - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable( - mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - - err = 0; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - /* Allow LDTs to be cleared by the user. 
*/ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, - entry_1, entry_2); - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/ldt_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,271 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/pgalloc.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); -#endif - make_pages_readonly( - pc->ldt, - (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable( - oldldt, - (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly( - new->ldt, - (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. 
- */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - memset(&mm->context, 0, sizeof(mm->context)); - mutex_init(&mm->context.lock); - old_mm = current->mm; - if (old_mm) - mm->context.vdso = old_mm->context.vdso; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable( - mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, - XENFEAT_writable_descriptor_tables); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - mach_lp = arbitrary_virt_to_machine(lp); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ...
*/ -install: - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/machine_kexec_64.c 2009-12-04 10:44:49.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/machine_kexec_64.c 2009-11-06 10:51:25.000000000 +0100 @@ -404,7 +404,9 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA --- sle11sp1-2010-03-22.orig/arch/x86/kernel/microcode-xen.c 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/microcode-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -167,7 +167,7 @@ static int request_microcode(void) } op.cmd = XENPF_microcode_update; - set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data); + set_xen_guest_handle(op.u.microcode.data, firmware->data); op.u.microcode.length = firmware->size; error = HYPERVISOR_platform_op(&op); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/mpparse_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/mpparse_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -unsigned int __cpuinitdata num_processors; +unsigned int num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -265,7 +265,7 @@ static void __init MP_ioapic_info (struc if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", @@ -412,9 +412,9 @@ static int __init smp_read_mpc(struct mp mps_oem_check(mpc, oem, str); - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + printk("APIC at: 0x%X\n", mpc->mpc_lapic); - /* + /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. 
*/ @@ -728,7 +728,7 @@ static int __init smp_scan_config (unsig unsigned long *bp = isa_bus_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); @@ -742,9 +742,10 @@ static int __init smp_scan_config (unsig smp_found_config = 1; #ifndef CONFIG_XEN - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute @@ -759,11 +760,12 @@ static int __init smp_scan_config (unsig unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size); + reserve_bootmem(mpf->mpf_physptr, size, + BOOTMEM_DEFAULT); } #else - printk(KERN_INFO "found SMP MP-table at %08lx\n", - ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - isa_bus_to_virt(base)) + base); #endif mpf_found = mpf; @@ -940,14 +942,14 @@ void __init mp_register_ioapic(u8 id, u3 */ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + + mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); } void __init @@ -1063,15 +1065,16 @@ void __init mp_config_acpi_legacy_irqs ( } #define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; + static int pci_irq = IRQ_COMPRESSION_START; /* - * Mapping between Global System Interrups, which + * Mapping between Global System Interrupts, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ @@ -1108,12 +1111,16 @@ int mp_register_gsi(u32 gsi, int trigger if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi_to_irq[gsi]; + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { /* * For PCI devices assign IRQs in order, avoiding gaps * due to unused I/O APIC pins. 
--- sle11sp1-2010-03-22.orig/arch/x86/kernel/mpparse_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/mpparse_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -60,14 +60,20 @@ unsigned int boot_cpu_id = -1U; EXPORT_SYMBOL(boot_cpu_id); /* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; +unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +#ifndef CONFIG_XEN +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata + = { [0 ... NR_CPUS-1] = BAD_APICID }; +void *x86_bios_cpu_apicid_early_ptr; +#endif +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* @@ -119,24 +125,22 @@ static void __cpuinit MP_processor_info( physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* - * bios_cpu_apicid is required to have processors listed + * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; } - bios_cpu_apicid[cpu] = m->mpc_apicid; - /* - * We get called early in the the start_kernel initialization - * process when the per_cpu data area is not yet setup, so we - * use a static array that is removed after the per_cpu data - * area is created. - */ - if (x86_cpu_to_apicid_ptr) { - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* are we being called early in kernel startup? */ + if (x86_cpu_to_apicid_early_ptr) { + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + + cpu_to_apicid[cpu] = m->mpc_apicid; + bios_cpu_apicid[cpu] = m->mpc_apicid; } else { per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; } cpu_set(cpu, cpu_possible_map); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/pci-dma-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/pci-dma-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -431,3 +431,23 @@ dma_sync_single_for_device(struct device swiotlb_sync_single_for_device(dev, dma_handle, size, direction); } EXPORT_SYMBOL(dma_sync_single_for_device); + +void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_sg_for_cpu); + +void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_sg_for_device(dev,sg,nelems,direction); + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_sg_for_device); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/process_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/process_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/interrupt.h> #include <linux/utsname.h> #include <linux/delay.h> @@ -59,8 +58,10 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> +#include <asm/kdebug.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); static int hlt_counter; @@ 
-78,7 +79,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - return ((unsigned long *)tsk->thread.esp)[3]; + return ((unsigned long *)tsk->thread.sp)[3]; } /* @@ -86,7 +87,6 @@ unsigned long thread_saved_pc(struct tas */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) { @@ -107,7 +107,7 @@ EXPORT_SYMBOL(enable_hlt); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { cpu_relax(); } @@ -122,10 +122,19 @@ static void xen_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } #ifdef CONFIG_APM_MODULE @@ -168,13 +177,13 @@ void cpu_idle(void) while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - check_pgt_cache(); rmb(); idle = xen_idle; /* no alternatives */ + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, 0); + if (cpu_is_offline(cpu)) play_dead(); @@ -192,40 +201,19 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. 
- */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -251,15 +239,15 @@ void __show_registers(struct pt_regs *re { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (&regs->esp); + sp = (unsigned long) (&regs->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -272,17 +260,17 @@ void __show_registers(struct pt_regs *re init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -310,12 +298,12 @@ void __show_registers(struct pt_regs *re void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, &regs->esp); + show_trace(NULL, regs, &regs->sp, regs->bp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -329,16 +317,16 @@ int kernel_thread(int (*fn)(void *), voi memset(&regs, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process..
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); @@ -368,7 +356,12 @@ void flush_thread(void) { struct task_struct *tsk = current; - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* @@ -393,7 +386,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -403,17 +396,19 @@ int copy_thread(int nr, unsigned long cl childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs,p->thread.gs); + savesegment(gs, p->thread.gs); tsk = current; + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) + p->thread.ip = (unsigned long) cstar_ret_from_fork; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -424,34 +419,17 @@ int copy_thread(int nr, unsigned long cl set_tsk_thread_flag(p, TIF_IO_BITMAP); } + err = 0; + /* * Set a new TLS for the child thread? */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); p->thread.iopl = current->thread.iopl; - err = 0; - out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; @@ -459,67 +437,8 @@ int copy_thread(int nr, unsigned long cl return err; } -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - int i; - -/* changed the size calculations - should hopefully work better.
lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -} -EXPORT_SYMBOL(dump_thread); - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - #ifdef CONFIG_SECCOMP -void hard_disable_TSC(void) +static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } @@ -534,7 +453,7 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } -void hard_enable_TSC(void) +static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } @@ -543,18 +462,32 @@ void hard_enable_TSC(void) static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *next; + struct thread_struct *prev, *next; + unsigned long debugctl; + prev = &prev_p->thread; next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + } + + if (next->debugctlmsr != debugctl) + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); } #ifdef CONFIG_SECCOMP @@ -567,6 +500,14 @@ __switch_to_xtra(struct task_struct *pre hard_enable_TSC(); } #endif + +#ifdef X86_BTS + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif } /* @@ -592,11 +533,11 @@ __switch_to_xtra(struct task_struct *pre * More important, however, is the fact that this allows us much * more flexibility. 
* - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -632,12 +573,12 @@ struct task_struct fastcall * __switch_t #endif /* - * Reload esp0. - * This is load_esp0(tss, next) with a multicall. + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. */ mcl->op = __HYPERVISOR_stack_switch; mcl->args[0] = __KERNEL_DS; - mcl->args[1] = next->esp0; + mcl->args[1] = next->sp0; mcl++; /* @@ -734,7 +675,7 @@ struct task_struct fastcall * __switch_t asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -743,12 +684,12 @@ asmlinkage int sys_clone(struct pt_regs unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); } @@ -764,7 +705,7 @@ asmlinkage int sys_clone(struct pt_regs */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL); } /* @@ -775,18 +716,15 @@ asmlinkage int sys_execve(struct pt_regs int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, &regs); if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); } @@ -800,145 +738,37 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last.
*/ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = &current->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = &current->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - memset(&info, 0, sizeof(info)); - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ?
: mm->brk; +} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/process_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/process_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -3,7 +3,7 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * + * * X86-64 port * Andi Kleen. * @@ -22,19 +22,18 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/fs.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/module.h> -#include <linux/a.out.h> #include <linux/interrupt.h> +#include <linux/utsname.h> #include <linux/delay.h> +#include <linux/module.h> #include <linux/ptrace.h> -#include <linux/utsname.h> #include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -73,7 +72,6 @@ EXPORT_SYMBOL(boot_option_idle_override) */ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); -static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -81,13 +79,6 @@ void idle_notifier_register(struct notif { atomic_notifier_chain_register(&idle_notifier, n); } -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); void enter_idle(void) { @@ -116,7 +107,7 @@ void exit_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); cpu_relax(); @@ -131,10 +122,19 @@ static void xen_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -161,19 +161,15 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); - if (__get_cpu_var(cpu_idle_state)) - __get_cpu_var(cpu_idle_state) = 0; - - tick_nohz_stop_sched_tick(); - rmb(); idle = xen_idle; /* no alternatives */ if (cpu_is_offline(smp_processor_id())) @@ -203,49 +199,27 @@ static void do_nothing(void *unused) { } +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. 
+ */ void cpu_idle_wait(void) { - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); @@ -260,13 +234,13 @@ static int __init idle_setup (char *str) } early_param("idle", idle_setup); -/* Prints also some state that isn't saved in the pt_regs */ +/* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; printk("\n"); print_modules(); @@ -275,16 +249,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -318,7 +292,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } /* @@ -329,7 +303,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { + if (me->thread.io_bitmap_ptr) { #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); #endif @@ -382,7 +356,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. 
*/ @@ -405,26 +379,21 @@ void release_thread(struct task_struct * static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) { - struct user_desc ud = { + struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; + struct desc_struct *desc = t->thread.tls_array; desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); + fill_ldt(desc, &ud); } static inline u32 read_32bit_tls(struct task_struct *t, int tls) { - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); + return get_desc_base(&t->thread.tls_array[tls]); } /* @@ -436,7 +405,7 @@ void prepare_to_copy(struct task_struct unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -448,14 +417,14 @@ int copy_thread(int nr, unsigned long cl (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; - - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); @@ -476,7 +445,7 @@ int copy_thread(int nr, unsigned long cl memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); - } + } /* * Set a new TLS for the child thread? 
@@ -484,7 +453,8 @@ int copy_thread(int nr, unsigned long cl if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -502,26 +472,32 @@ out: return err; } -static inline void __save_init_fpu( struct task_struct *tsk ) -{ - asm volatile( "rex64 ; fxsave %0 ; fnclex" - : "=m" (tsk->thread.i387.fxsave)); - tsk->thread_info->status &= ~TS_USEDFPU; -} - /* * This special macro can be used to load a debugging register */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p) + struct task_struct *next_p) { struct thread_struct *prev, *next; + unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + } + + if (next->debugctlmsr != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); @@ -531,12 +507,20 @@ static inline void __switch_to_xtra(stru loaddebug(next, 6); loaddebug(next, 7); } + +#ifdef X86_BTS + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); +#endif } /* * switch_to(x,y) should switch tasks from x to y. * - * This could still be optimized: + * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * @@ -547,7 +531,7 @@ __switch_to(struct task_struct *prev_p, { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); #ifndef CONFIG_X86_NO_TSS struct tss_struct *tss = &per_cpu(init_tss, cpu); #endif @@ -581,11 +565,12 @@ __switch_to(struct task_struct *prev_p, prev_p->fpu_counter = 0; /* - * Reload esp0, LDT and the page table pointer: + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. */ mcl->op = __HYPERVISOR_stack_switch; mcl->args[0] = __KERNEL_DS; - mcl->args[1] = next->rsp0; + mcl->args[1] = next->sp0; mcl++; /* @@ -593,11 +578,12 @@ __switch_to(struct task_struct *prev_p, * This is load_TLS(next, cpu) with multicalls. */ #define C(i) do { \ - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ mcl->op = __HYPERVISOR_update_descriptor; \ mcl->args[0] = virt_to_machine( \ - &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \ - mcl->args[1] = next->tls_array[i]; \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ mcl++; \ } \ } while (0) @@ -605,7 +591,7 @@ __switch_to(struct task_struct *prev_p, #undef C if (unlikely(prev->iopl != next->iopl)) { - iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl; + iopl_op.iopl = (next->iopl == 0) ? 
1 : (next->iopl >> 12) & 3; #if CONFIG_XEN_COMPAT > 0x030002 mcl->op = __HYPERVISOR_physdev_op; mcl->args[0] = PHYSDEVOP_set_iopl; @@ -669,8 +655,8 @@ __switch_to(struct task_struct *prev_p, /* * Switch the PDA context. */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); + prev->usersp = read_pda(oldrsp); + write_pda(oldrsp, next->usersp); write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -687,7 +673,8 @@ __switch_to(struct task_struct *prev_p, /* * Now maybe reload the debug registers */ - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p); /* If the task has used fpu the last 5 timeslices, just do a full @@ -702,23 +689,18 @@ __switch_to(struct task_struct *prev_p, /* * sys_execve() executes a new program. */ -asmlinkage +asmlinkage long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) + char __user * __user *envp, struct pt_regs *regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); - if (IS_ERR(filename)) + if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, &regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } + error = do_execve(filename, argv, envp, regs); putname(filename); return error; } @@ -728,18 +710,18 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_IA32); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that - 32bit childs are affected again.
*/ current->personality &= ~READ_IMPLIES_EXEC; } asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -747,7 +729,7 @@ sys_clone(unsigned long clone_flags, uns void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -763,29 +745,29 @@ sys_clone(unsigned long clone_flags, uns */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; - fp = *(u64 *)(p->thread.rsp); + fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; @@ -827,19 +809,19 @@ long do_arch_prctl(struct task_struct *t /* Not strictly needed for fs, but do it for symmetry with gs */ if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. 
*/ - if (addr <= 0xffffffff) { + if (addr <= 0xffffffff) { set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); + if (doit) { + load_TLS(&task->thread, cpu); asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; - } else { + } else { task->thread.fsindex = 0; task->thread.fs = addr; if (doit) { @@ -852,24 +834,24 @@ long do_arch_prctl(struct task_struct *t } put_cpu(); break; - case ARCH_GET_FS: { - unsigned long base; + case ARCH_GET_FS: { + unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); else if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; + ret = put_user(base, (unsigned long __user *)addr); + break; } - case ARCH_GET_GS: { + case ARCH_GET_GS: { unsigned long base; unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else @@ -877,40 +859,21 @@ long do_arch_prctl(struct task_struct *t } else base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)addr); break; } default: ret = -EINVAL; break; - } + } - return ret; -} + return ret; +} long sys_arch_prctl(int code, unsigned long addr) { return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - boot_option_idle_override = 1; - return 1; } unsigned long arch_align_stack(unsigned long sp) @@ -919,3 +882,9 @@ unsigned long arch_align_stack(unsigned sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} --- sle11sp1-2010-03-22.orig/arch/x86/kernel/quirks-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/quirks-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -9,7 +9,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; - u32 word; + u16 word; /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) @@ -24,14 +24,17 @@ static void __devinit quirk_intel_irqbal pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); - /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + /* + * read xTPR register. We may not have a pci_dev for device 8 + * because it might be hidden until the above write. + */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); if (!(word & (1 << 13))) { struct xen_platform_op op; - printk(KERN_INFO "Intel E7520/7320/7525 detected. 
" - "Disabling irq balancing and affinity\n"); + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); op.cmd = XENPF_platform_quirk; op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; WARN_ON(HYPERVISOR_platform_op(&op)); @@ -102,14 +105,16 @@ static void ich_force_enable_hpet(struct pci_read_config_dword(dev, 0xF0, &rcba); rcba &= 0xFFFFC000; if (rcba == 0) { - printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); return; } /* use bits 31:14, 16 kB aligned */ rcba_base = ioremap_nocache(rcba, 0x4000); if (rcba_base == NULL) { - printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); return; } @@ -120,8 +125,8 @@ static void ich_force_enable_hpet(struct /* HPET is enabled in HPTC. Just not reported by BIOS */ val = val & 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); iounmap(rcba_base); return; } @@ -140,11 +145,12 @@ static void ich_force_enable_hpet(struct if (err) { force_hpet_address = 0; iounmap(rcba_base); - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); } else { force_hpet_resume_type = ICH_FORCE_HPET_RESUME; - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); } } @@ -160,6 +166,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); static struct pci_dev *cached_dev; @@ -204,8 +212,8 @@ static void old_ich_force_enable_hpet(st if (val & 0x4) { val &= 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); return; } @@ -225,14 +233,14 @@ static void old_ich_force_enable_hpet(st /* HPET is enabled in HPTC. 
Just not reported by BIOS */ val &= 0x3; force_hpet_address = 0xFED00000 | (val << 12); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); cached_dev = dev; force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; return; } - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); } /* @@ -290,8 +298,8 @@ static void vt8237_force_enable_hpet(str */ if (val & 0x80) { force_hpet_address = (val & ~0x3ff); - printk(KERN_DEBUG "HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); return; } @@ -305,14 +313,14 @@ static void vt8237_force_enable_hpet(str pci_read_config_dword(dev, 0x68, &val); if (val & 0x80) { force_hpet_address = (val & ~0x3ff); - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", - force_hpet_address); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); cached_dev = dev; force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; return; } - printk(KERN_DEBUG "Failed to force enable HPET\n"); + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, @@ -340,7 +348,7 @@ static void nvidia_force_enable_hpet(str pci_read_config_dword(dev, 0x44, &val); force_hpet_address = val & 0xfffffffe; force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; - printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; return; @@ -353,6 +361,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N nvidia_force_enable_hpet); /* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, nvidia_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, @@ -373,19 +383,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_N void force_hpet_resume(void) { switch (force_hpet_resume_type) { - case ICH_FORCE_HPET_RESUME: - return ich_force_hpet_resume(); - - case OLD_ICH_FORCE_HPET_RESUME: - return old_ich_force_hpet_resume(); - - case VT8237_FORCE_HPET_RESUME: - return vt8237_force_hpet_resume(); - - case NVIDIA_FORCE_HPET_RESUME: - return nvidia_force_hpet_resume(); - - default: + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + default: break; } } --- sle11sp1-2010-03-22.orig/arch/x86/kernel/rtc.c 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/rtc.c 2009-11-06 10:51:25.000000000 +0100 @@ -171,6 +171,11 @@ int update_persistent_clock(struct times unsigned long flags; int retval; +#ifdef CONFIG_XEN + if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) + return 0; +#endif + spin_lock_irqsave(&rtc_lock, flags); retval = x86_platform.set_wallclock(now.tv_sec); spin_unlock_irqrestore(&rtc_lock, flags); @@ -183,6 +188,10 @@ void read_persistent_clock(struct timesp { unsigned long retval, flags; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return xen_read_persistent_clock(); +#endif spin_lock_irqsave(&rtc_lock, flags); retval = 
x86_platform.get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/setup64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -31,7 +31,11 @@ #include <asm/hypervisor.h> #endif +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; @@ -47,6 +51,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __attr unsigned long __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL(__supported_pte_mask); + static int do_not_nx __cpuinitdata = 0; /* noexec=on|off @@ -90,6 +95,45 @@ static int __init nonx32_setup(char *str __setup("noexec32=", nonx32_setup); /* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ +static void __init setup_per_cpu_maps(void) +{ +#ifndef CONFIG_XEN + int cpu; + + for_each_possible_cpu(cpu) { +#ifdef CONFIG_SMP + if (per_cpu_offset(cpu)) { +#endif + per_cpu(x86_cpu_to_apicid, cpu) = + x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = + x86_bios_cpu_apicid_init[cpu]; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + x86_cpu_to_node_map_init[cpu]; +#endif +#ifdef CONFIG_SMP + } + else + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", + cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + x86_cpu_to_apicid_early_ptr = NULL; + x86_bios_cpu_apicid_early_ptr = NULL; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = NULL; +#endif +#endif +} + +/* * Great future plan: * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
* Always point %gs to its beginning @@ -109,19 +153,24 @@ void __init setup_per_cpu_areas(void) printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(size); +#else + int node = early_cpu_to_node(i); - if (!NODE_DATA(cpu_to_node(i))) { - printk("cpu with no node %d, num_online_nodes %d\n", - i, num_online_nodes()); + if (!node_online(node) || !NODE_DATA(node)) ptr = alloc_bootmem_pages(size); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); - } + else + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); +#endif if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); cpu_pda(i)->data_offset = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } + + /* setup percpu data maps early */ + setup_per_cpu_maps(); } #ifdef CONFIG_XEN @@ -224,7 +273,8 @@ void syscall_init(void) wrmsrl(MSR_CSTAR, ignore_sysret); /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); #endif #ifdef CONFIG_IA32_EMULATION syscall32_cpu_init (); @@ -303,7 +353,7 @@ void __cpuinit cpu_init (void) */ #ifndef CONFIG_XEN if (cpu) - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); #endif cpu_gdt_descr[cpu].size = GDT_SIZE; @@ -334,10 +384,10 @@ void __cpuinit cpu_init (void) v, cpu); } estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. --- sle11sp1-2010-03-22.orig/arch/x86/kernel/setup_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -47,9 +47,12 @@ #include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> +#include <linux/pci.h> +#include <linux/init_ohci1394_dma.h> #include <video/edid.h> +#include <asm/mtrr.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> @@ -79,14 +82,83 @@ static struct notifier_block xen_panic_b xen_panic_event, NULL, 0 /* try to go last */ }; -int disable_pse __cpuinitdata = 0; - /* * Machine setup.. 
*/ -extern struct resource code_resource; -extern struct resource data_resource; -extern struct resource bss_resource; +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -94,13 +166,16 @@ struct cpuinfo_x86 new_cpu_data __cpuini struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; EXPORT_SYMBOL(boot_cpu_data); +#ifndef CONFIG_X86_PAE unsigned long mmu_cr4_features; +#else +unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; unsigned int machine_submodel_id; unsigned int BIOS_revision; -unsigned int mca_pentium_flag; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -131,13 +206,17 @@ extern int root_mountflags; unsigned long saved_videomode; -#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 +#define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif /* * Point at the empty zero page to start with. We map the real shared_info @@ -198,8 +277,7 @@ static int __init parse_mem(char *arg) return -EINVAL; if (strcmp(arg, "nopentium") == 0) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; + setup_clear_cpu_cap(X86_FEATURE_PSE); } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to @@ -208,7 +286,7 @@ static int __init parse_mem(char *arg) * trim the existing memory map. 
*/ unsigned long long mem_size; - + mem_size = memparse(arg, &arg); limit_regions(mem_size); user_defined_memmap = 1; @@ -350,7 +428,7 @@ static void __init reserve_ebda_region(v unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT); } #endif @@ -365,8 +443,6 @@ static unsigned long __init setup_memory min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) + xen_start_info->nr_pt_frames; - find_max_pfn(); - max_low_pfn = find_max_low_pfn(); #ifdef CONFIG_HIGHMEM @@ -449,7 +525,8 @@ static void __init reserve_crashkernel(v (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); + reserve_bootmem(crash_base, crash_size, + BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); @@ -463,6 +540,99 @@ static inline void __init reserve_crashk {} #endif +#ifdef CONFIG_BLK_DEV_INITRD + +static bool do_relocate_initrd = false; + +static void __init reserve_initrd(void) +{ + unsigned long ramdisk_image = __pa(xen_start_info->mod_start); + unsigned long ramdisk_size = xen_start_info->mod_len; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + + initrd_start = 0; + + if (!xen_start_info->mod_start || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + if (ramdisk_end < ramdisk_image) { + printk(KERN_ERR "initrd wraps around end of memory, " + "disabling initrd\n"); + return; + } + if (ramdisk_size >= end_of_lowmem/2) { + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + return; + } + + /* We need to move the initrd down into lowmem */ + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + + /* Note: this includes all the lowmem currently occupied by + the initrd, we rely on that fact to keep the data intact. 
*/ + reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT); + initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + + do_relocate_initrd = true; +} + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +static void __init relocate_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + unsigned long slop, clen, mapaddr; + char *p, *q; + + if (!do_relocate_initrd) + return; + + ramdisk_here = initrd_start - PAGE_OFFSET; + + q = (char *)initrd_start; + + /* Copy any lowmem portion of the initrd */ + if (ramdisk_image < end_of_lowmem) { + clen = end_of_lowmem - ramdisk_image; + p = (char *)__va(ramdisk_image); + memcpy(q, p, clen); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + /* Copy the highmem portion of the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_ioremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } +} + +#endif /* CONFIG_BLK_DEV_INITRD */ + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -480,14 +650,15 @@ void __init setup_bootmem_allocator(void * bootmem allocator with an invalid RAM area. */ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text), + BOOTMEM_DEFAULT); #ifndef CONFIG_XEN /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ - reserve_bootmem(0, PAGE_SIZE); + reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT); /* reserve EBDA region, it's a 4K region */ reserve_ebda_region(); @@ -497,7 +668,7 @@ void __init setup_bootmem_allocator(void unless you have no PS/2 mouse plugged in. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 == 6) - reserve_bootmem(0xa0000 - 4096, 4096); + reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT); #ifdef CONFIG_SMP /* @@ -505,7 +676,7 @@ void __init setup_bootmem_allocator(void * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. 
(see the GDT stuff) */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); + reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT); #endif #ifdef CONFIG_ACPI_SLEEP /* @@ -513,29 +684,12 @@ void __init setup_bootmem_allocator(void */ acpi_reserve_bootmem(); #endif - numa_kva_reserve(); #endif /* !CONFIG_XEN */ #ifdef CONFIG_BLK_DEV_INITRD - if (xen_start_info->mod_start) { - unsigned long ramdisk_image = __pa(xen_start_info->mod_start); - unsigned long ramdisk_size = xen_start_info->mod_len; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - - if (ramdisk_end <= end_of_lowmem) { - /*reserve_bootmem(ramdisk_image, ramdisk_size);*/ - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; - initrd_below_start_ok = 1; - } else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; - } - } + reserve_initrd(); #endif + numa_kva_reserve(); reserve_crashkernel(); } @@ -602,20 +756,14 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); + early_ioremap_init(); #ifdef CONFIG_SMP prefill_possible_map(); #endif - /* - * FIXME: This isn't an official loader_type right - * now but does currently work with elilo. - * If we were configured as an EFI kernel, check to make - * sure that we were loaded correctly from elilo and that - * the system table is valid. If not, then initialize normally. - */ #ifdef CONFIG_EFI - if ((boot_params.hdr.type_of_loader == 0x50) && - boot_params.efi_info.efi_systab) + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) efi_enabled = 1; #endif @@ -655,12 +803,9 @@ void __init setup_arch(char **cmdline_p) #endif ARCH_SETUP - if (efi_enabled) - efi_init(); - else { - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); - } + + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); copy_edd(); @@ -693,6 +838,17 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + if (efi_enabled) + efi_init(); + + /* update e820 for memory not covered by WB MTRRs */ + find_max_pfn(); + mtrr_bp_init(); +#ifndef CONFIG_XEN + if (mtrr_trim_uncached_memory(max_pfn)) + find_max_pfn(); +#endif + max_low_pfn = setup_memory(); #ifdef CONFIG_VMI @@ -717,6 +873,16 @@ void __init setup_arch(char **cmdline_p) smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + remapped_pgdat_init(); sparse_init(); zone_sizes_init(); @@ -802,16 +968,20 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. 
*/ +#ifdef CONFIG_BLK_DEV_INITRD + relocate_initrd(); +#endif + paravirt_post_allocator_init(); if (is_initial_xendomain()) dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); -#endif - if (efi_enabled) - efi_map_memmap(); +#endif set_iopl.iopl = 1; WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); @@ -829,7 +999,7 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) +#ifndef CONFIG_XEN early_quirks(); #endif @@ -875,3 +1045,30 @@ xen_panic_event(struct notifier_block *t /* we're never actually going to get here... */ return NOTIFY_DONE; } + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + /* Nothing to do if not running in dom0. */ + if (!is_initial_xendomain()) + return 0; + + printk(KERN_INFO "Setting up standard PCI resources\n"); + init_iomem_resources(&code_resource, &data_resource, &bss_resource); + + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/setup_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -15,7 +15,6 @@ #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/a.out.h> #include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/delay.h> @@ -30,6 +29,7 @@ #include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/pci.h> +#include <linux/efi.h> #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> @@ -39,10 +39,13 @@ #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> +#include <linux/uaccess.h> +#include <linux/init_ohci1394_dma.h> #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> +#include <asm/vsyscall.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/msr.h> @@ -50,6 +53,7 @@ #include <video/edid.h> #include <asm/e820.h> #include <asm/dma.h> +#include <asm/gart.h> #include <asm/mpspec.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -59,6 +63,9 @@ #include <asm/sections.h> #include <asm/dmi.h> #include <asm/cacheflush.h> +#include <asm/mce.h> +#include <asm/ds.h> +#include <asm/topology.h> #ifdef CONFIG_XEN #include <linux/percpu.h> #include <xen/interface/physdev.h> @@ -108,6 +115,8 @@ EXPORT_SYMBOL(xen_start_info); struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + unsigned long mmu_cr4_features; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ @@ -117,7 +126,7 @@ unsigned long saved_video_mode; int force_mwait __cpuinitdata; -/* +/* * Early DMI memory */ int dmi_alloc_index; @@ -163,25 +172,27 @@ struct resource standard_io_resources[] #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) -struct resource data_resource = { +static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource code_resource = { +static struct resource 
code_resource = { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource bss_resource = { +static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -205,9 +216,10 @@ contig_initmem_init(unsigned long start_ unsigned long bootmap_size, bootmap; bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, + PAGE_SIZE); if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); + panic("Cannot find bootmem map of size %ld\n", bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); #ifdef CONFIG_XEN @@ -216,8 +228,8 @@ contig_initmem_init(unsigned long start_ else #endif free_bootmem_with_active_regions(0, end_pfn); - reserve_bootmem(bootmap, bootmap_size); -} + reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); +} #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -250,27 +262,35 @@ static inline void copy_edd(void) #ifndef CONFIG_XEN static void __init reserve_crashkernel(void) { - unsigned long long free_mem; + unsigned long long total_mem; unsigned long long crash_size, crash_base; int ret; - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + total_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; - ret = parse_crashkernel(boot_command_line, free_mem, + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret == 0 && crash_size) { - if (crash_base > 0) { - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(free_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size); - } else + if (crash_base <= 0) { printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); + return; + } + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation failed - " + "memory is in use\n"); + return; + } + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; } } #else @@ -281,37 +301,21 @@ static inline void __init reserve_crashk {} #endif -#ifndef CONFIG_XEN -#define EBDA_ADDR_POINTER 0x40E - -unsigned __initdata ebda_addr; -unsigned __initdata ebda_size; - -static void discover_ebda(void) +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) __init memory_setup(void) { - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - ebda_size = *(unsigned short *)__va(ebda_addr); - - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & 
~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; + machine_specific_memory_setup(); } -#else -#define discover_ebda() ((void)0) -#endif +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ void __init setup_arch(char **cmdline_p) { + unsigned i; + #ifdef CONFIG_XEN extern struct e820map machine_e820; @@ -320,6 +324,11 @@ void __init setup_arch(char **cmdline_p) /* Register a call for panic conditions. */ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + early_ioremap_init(); + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); screen_info = boot_params.screen_info; @@ -336,11 +345,6 @@ void __init setup_arch(char **cmdline_p) screen_info.orig_video_isVGA = 0; copy_edid(); - - WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_writable_pagetables)); - - ARCH_SETUP #else printk(KERN_INFO "Command line: %s\n", boot_command_line); @@ -356,7 +360,15 @@ void __init setup_arch(char **cmdline_p) rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif - setup_memory_region(); +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) + efi_enabled = 1; +#endif + + ARCH_SETUP + + memory_setup(); copy_edd(); if (!boot_params.hdr.root_flags) @@ -380,28 +392,51 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + finish_e820_parsing(); + early_gart_iommu_check(); + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); +#ifndef CONFIG_XEN + if (mtrr_trim_uncached_memory(end_pfn)) { + e820_register_active_regions(0, 0, -1UL); + end_pfn = e820_end_of_ram(); + } +#endif + num_physpages = end_pfn; + max_mapnr = end_pfn; check_efer(); - discover_ebda(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + if (efi_enabled) + efi_init(); if (is_initial_xendomain()) dmi_scan_machine(); + io_delay_init(); + #if defined(CONFIG_SMP) && !defined(CONFIG_XEN) - /* setup to use the static apicid table during kernel startup */ - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; + /* setup to use the early static init tables during kernel startup */ + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; +#endif #endif /* How many end-of-memory variables you have, grandma! */ @@ -420,54 +455,25 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); + numa_initmem_init(0, end_pfn); #else contig_initmem_init(0, end_pfn); #endif -#ifdef CONFIG_XEN - /* - * Reserve kernel, physmap, start info, initial page tables, and - * direct mapping. 
- */ - reserve_bootmem_generic(__pa_symbol(&_text), - (table_end << PAGE_SHIFT) - __pa_symbol(&_text)); -#else - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - reserve_bootmem_generic(__pa_symbol(&_text), - __pa_symbol(&_end) - __pa_symbol(&_text)); + early_res_to_bootmem(); +#ifndef CONFIG_XEN +#ifdef CONFIG_ACPI_SLEEP /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. + * Reserve low memory region for sleep support. */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - if (ebda_addr) - reserve_bootmem_generic(ebda_addr, ebda_size); -#ifdef CONFIG_NUMA - /* reserve nodemap region */ - if (nodemap_addr) - reserve_bootmem_generic(nodemap_addr, nodemap_size); + acpi_reserve_bootmem(); #endif -#ifdef CONFIG_SMP - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); -#endif + if (efi_enabled) + efi_reserve_bootmem(); #endif -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif #ifdef CONFIG_BLK_DEV_INITRD #ifdef CONFIG_XEN if (xen_start_info->mod_start) { @@ -491,6 +497,8 @@ void __init setup_arch(char **cmdline_p) initrd_below_start_ok = 1; #endif } else { + /* Assumes everything on node 0 */ + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", ramdisk_end, end_of_mem); @@ -500,10 +508,11 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); paging_init(); + map_vsyscall(); #ifdef CONFIG_X86_LOCAL_APIC /* - * Find and reserve possible boot-time SMP configuration: - */ + * Find and reserve possible boot-time SMP configuration: + */ find_smp_config(); #endif #ifdef CONFIG_XEN @@ -591,16 +600,10 @@ void __init setup_arch(char **cmdline_p) #endif #endif -#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) +#ifndef CONFIG_XEN early_quirks(); #endif - /* - * set this early, so we dont allocate cpu0 - * if MADT list doesnt list BSP first - * mpparse.c/MP_processor_info() allocates logical cpu numbers. - */ - cpu_set(0, cpu_present_map); #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). 
@@ -624,6 +627,7 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #ifndef CONFIG_XEN init_apic_mappings(); + ioapic_init_mappings(); #endif #endif #if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) @@ -635,18 +639,17 @@ void __init setup_arch(char **cmdline_p) */ #ifdef CONFIG_XEN if (is_initial_xendomain()) - e820_reserve_resources(machine_e820.map, machine_e820.nr_map); + e820_reserve_resources(machine_e820.map, machine_e820.nr_map, + &code_resource, &data_resource, &bss_resource); #else - e820_reserve_resources(e820.map, e820.nr_map); + e820_reserve_resources(e820.map, e820.nr_map, + &code_resource, &data_resource, &bss_resource); e820_mark_nosave_regions(); #endif - { - unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); - } #ifdef CONFIG_XEN if (is_initial_xendomain()) @@ -680,7 +683,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif @@ -724,9 +728,10 @@ static void __cpuinit display_cacheinfo( if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; } @@ -740,27 +745,25 @@ static void __cpuinit display_cacheinfo( printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } } #ifdef CONFIG_NUMA -static int nearby_node(int apicid) +static int __cpuinit nearby_node(int apicid) { - int i; + int i, node; + for (i = apicid - 1; i >= 0; i--) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -772,7 +775,7 @@ static int nearby_node(int apicid) * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. 
*/ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned bits; @@ -781,7 +784,54 @@ static void __init amd_detect_cmp(struct int node = 0; unsigned apicid = hard_smp_processor_id(); #endif - unsigned ecx = cpuid_ecx(0x80000008); + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + c->phys_proc_id = phys_pkg_id(bits); + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); c->x86_max_cores = (ecx & 0xff) + 1; @@ -794,37 +844,8 @@ static void __init amd_detect_cmp(struct bits++; } - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); + c->x86_coreid_bits = bits; - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif #endif } @@ -841,8 +862,8 @@ static void __init amd_detect_cmp(struct /* AMD systems with C1E don't have a working lAPIC timer. Check for that. 
*/ static __cpuinit int amd_apic_timer_broken(void) { - u32 lo, hi; - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + switch (eax & CPUID_XFAM) { case CPUID_XFAM_K8: if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) @@ -861,6 +882,15 @@ static __cpuinit int amd_apic_timer_brok } #endif +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -871,7 +901,7 @@ static void __cpuinit init_amd(struct cp /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 - * + * * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ @@ -884,35 +914,32 @@ static void __cpuinit init_amd(struct cp /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - + clear_bit(0*32+31, (unsigned long *)&c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); level = get_model_name(c); if (!level) { - switch (c->x86) { + switch (c->x86) { case 15: /* Should distinguish Models here, but this is only a fallback anyways. */ strcpy(c->x86_model_id, "Hammer"); - break; - } - } + break; + } + } display_cacheinfo(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - /* Multi core CPU? 
*/ if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); @@ -924,14 +951,10 @@ static void __cpuinit init_amd(struct cp num_cache_leaves = 3; if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_K8, &c->x86_capability); - - /* RDTSC can be speculated around */ - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_K8); - /* Family 10 doesn't support C states in MWAIT so don't use it */ - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); #ifndef CONFIG_XEN if (amd_apic_timer_broken()) @@ -939,28 +962,29 @@ static void __cpuinit init_amd(struct cp #endif } -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; cpuid(1, &eax, &ebx, &ecx, &edx); if (!cpu_has(c, X86_FEATURE_HT)) return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { + } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } @@ -970,7 +994,7 @@ static void __cpuinit detect_ht(struct c smp_num_siblings = smp_num_siblings / c->x86_max_cores; - index_msb = get_count_order(smp_num_siblings) ; + index_msb = get_count_order(smp_num_siblings); core_bits = get_count_order(c->x86_max_cores); @@ -979,8 +1003,10 @@ static void __cpuinit detect_ht(struct c } out: if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); } #endif @@ -1004,7 +1030,7 @@ static int __cpuinit intel_num_cpu_cores return 1; } -static void srat_detect_node(void) +static void __cpuinit srat_detect_node(void) { #ifdef CONFIG_NUMA unsigned node; @@ -1014,7 +1040,7 @@ static void srat_detect_node(void) /* Don't do the funky fallback heuristics the AMD version employs for now. 
*/ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE || !node_online(node)) node = first_node(node_online_map); numa_set_node(cpu, node); @@ -1022,28 +1048,39 @@ static void srat_detect_node(void) #endif } +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; init_intel_cacheinfo(c); - if (c->cpuid_level > 9 ) { + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } if (cpu_has_ds) { unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) - set_bit(X86_FEATURE_BTS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) - set_bit(X86_FEATURE_PEBS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_PEBS); } + + if (cpu_has_bts) + ds_init_intel(c); + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -1060,14 +1097,11 @@ static void __cpuinit init_intel(struct c->x86_cache_alignment = c->x86_clflush_size * 2; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); if (c->x86 == 6) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 15) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - c->x86_max_cores = intel_num_cpu_cores(c); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -1084,18 +1118,12 @@ static void __cpuinit get_cpu_vendor(str c->x86_vendor = X86_VENDOR_UNKNOWN; } -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. 
*/ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { - u32 tfms; + u32 tfms, xlvl; c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; @@ -1106,6 +1134,7 @@ void __cpuinit early_identify_cpu(struct c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_max_cores = 1; + c->x86_coreid_bits = 0; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -1114,7 +1143,7 @@ void __cpuinit early_identify_cpu(struct (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - + get_cpu_vendor(c); /* Initialize the standard set of capabilities */ @@ -1132,7 +1161,7 @@ void __cpuinit early_identify_cpu(struct c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) + if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ @@ -1142,18 +1171,6 @@ void __cpuinit early_identify_cpu(struct #ifdef CONFIG_SMP c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -1174,6 +1191,30 @@ void __cpuinit identify_cpu(struct cpuin c->x86_capability[2] = cpuid_edx(0x80860001); } + c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + +} + +/* + * This does the hard work of actually picking apart the CPU stuff... 
+ */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + early_identify_cpu(c); + init_scattered_cpuid_features(c); #ifndef CONFIG_XEN @@ -1205,8 +1246,7 @@ void __cpuinit identify_cpu(struct cpuin break; } - select_idle_routine(c); - detect_ht(c); + detect_ht(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -1216,31 +1256,55 @@ void __cpuinit identify_cpu(struct cpuin */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] &= ~cleared_cpu_caps[i]; + #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + select_idle_routine(c); + if (c != &boot_cpu_data) mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + } - + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); + printk(KERN_CONT "%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); + else + printk(KERN_CONT "\n"); +} - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); else - printk("\n"); + return 0; + return 1; } +__setup("clearcpuid=", setup_disablecpuid); /* * Get CPU information for use by the procfs. @@ -1249,116 +1313,41 @@ void __cpuinit print_cpu_info(struct cpu static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - int cpu = 0; - - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. 
- */ - static const char *const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char *const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ - }; - + int cpu = 0, i; #ifdef CONFIG_SMP cpu = c->cpu_index; #endif - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)cpu, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); + if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { + + if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get((unsigned)cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + freq / 1000, (freq % 1000)); } /* Cache size */ - if (c->x86_cache_size >= 0) + if (c->x86_cache_size >= 0) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - + #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); @@ -1367,48 +1356,43 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } -#endif +#endif seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", c->cpuid_level); - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - } - + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); - if (c->x86_tlbsize > 0) + if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0]?" ":"", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } } seq_printf(m, "\n\n"); @@ -1435,8 +1419,8 @@ static void c_stop(struct seq_file *m, v { } -struct seq_operations cpuinfo_op = { - .start =c_start, +const struct seq_operations cpuinfo_op = { + .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, --- sle11sp1-2010-03-22.orig/arch/x86/kernel/smp_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/smp_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -168,7 +168,7 @@ void __send_IPI_shortcut(unsigned int sh } } -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector); } @@ -224,13 +224,14 @@ static DEFINE_SPINLOCK(tlbstate_lock); * We need to reload %cr3 since the page tables may be going * away from under us.. 
*/ -void leave_mm(unsigned long cpu) +void leave_mm(int cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * --- sle11sp1-2010-03-22.orig/arch/x86/kernel/smp_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/smp_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -33,7 +33,7 @@ #ifndef CONFIG_XEN /* - * Smarter SMP flushing macros. + * Smarter SMP flushing macros. * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -41,15 +41,15 @@ * * Optimizations Manfred Spraul <manfred@colorfullife.com> * - * More scalable flush, from Andi Kleen + * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into * the right per cpu variable for the flush data. * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. * In future when interrupts are split into per CPU domains this could be * fixed, at the cost of triggering multiple IPIs in some cases. */ @@ -59,7 +59,6 @@ union smp_flush_state { cpumask_t flush_cpumask; struct mm_struct *flush_mm; unsigned long flush_va; -#define FLUSH_ALL -1ULL spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; @@ -71,16 +70,17 @@ union smp_flush_state { static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm(unsigned long cpu) +void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -89,25 +89,25 @@ static inline void leave_mm(unsigned lon * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 * was in lazy tlb mode. * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. + * Now cpu0 accepts tlb flushes for the new mm. * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. + * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. * 1b) thread switch without mm change * cpu active_mm is correct, cpu0 already handles * flush ipis. 
* 1b1) set cpu mmu_state to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. * 1b3) if the bit was 0: leave_mm was called, flush the tlb. * 2) switch %%esp, ie current * @@ -141,12 +141,12 @@ asmlinkage void smp_invalidate_interrupt * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) goto out; - /* + /* * This was a BUG() but until someone can quote me the * line from the intel manual that guarantees an IPI to * multiple CPUs is retried _only_ on the erroring CPUs @@ -154,10 +154,10 @@ asmlinkage void smp_invalidate_interrupt * * BUG(); */ - + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == FLUSH_ALL) + if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(f->flush_va); @@ -170,19 +170,22 @@ out: add_pda(irq_tlb_count, 1); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { int sender; union smp_flush_state *f; + cpumask_t cpumask = *cpumaskp; /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); - /* Could avoid this lock when - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - probably not worth checking this for a cache-hot lock. */ + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ spin_lock(&f->tlbstate_lock); f->flush_mm = mm; @@ -206,14 +209,14 @@ static void flush_tlb_others(cpumask_t c int __cpuinit init_smp_flush(void) { int i; + for_each_cpu_mask(i, cpu_possible_map) { spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } - core_initcall(init_smp_flush); - + void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; @@ -225,10 +228,9 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -245,11 +247,10 @@ void flush_tlb_mm (struct mm_struct * mm leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -263,8 +264,8 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) @@ -272,7 +273,6 @@ void flush_tlb_page(struct vm_area_struc preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -330,11 +330,9 @@ void unlock_ipi_call_lock(void) * this function sends a 'generic call function' IPI to all other CPU * of the system defined in the mask. 
*/ - -static int -__smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +static int __smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -422,11 +420,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int nonatomic, int wait) { /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); + int ret, me = get_cpu(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -476,9 +473,9 @@ static void stop_this_cpu(void *dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); disable_all_local_evtchn(); - for (;;) + for (;;) halt(); -} +} void smp_send_stop(void) { --- sle11sp1-2010-03-22.orig/arch/x86/kernel/time-xen.c 2010-03-01 14:31:25.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/time-xen.c 2010-02-09 16:53:28.000000000 +0100 @@ -28,47 +28,19 @@ * serialize accesses to xtime/lost_ticks). */ -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/time.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/efi.h> #include <linux/sysctl.h> #include <linux/percpu.h> #include <linux/kernel_stat.h> #include <linux/posix-timers.h> #include <linux/cpufreq.h> #include <linux/clocksource.h> +#include <linux/sysdev.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> #include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/timer.h> #include <asm/time.h> -#include <asm/sections.h> - -#include "mach_time.h" - -#include <linux/timex.h> - -#include <asm/hpet.h> - -#include <asm/arch_hooks.h> #include <xen/evtchn.h> #include <xen/sysctl.h> @@ -88,9 +60,6 @@ volatile unsigned long __jiffies __secti unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); - /* These are peridically updated in shared_info, and then copied here. */ struct shadow_time_info { u64 tsc_timestamp; /* TSC at last update of time vals. */ @@ -153,6 +122,11 @@ static int __init __independent_wallcloc } __setup("independent_wallclock", __independent_wallclock); +int xen_independent_wallclock(void) +{ + return independent_wallclock; +} + /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */ static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */ static int __init __permitted_clock_jitter(char *str) @@ -209,7 +183,6 @@ static inline u64 get64(volatile u64 *pt return res; #else return *ptr; -#define cmpxchg64 cmpxchg #endif } @@ -224,7 +197,6 @@ static inline u64 get64_local(volatile u return res; #else return *ptr; -#define cmpxchg64_local cmpxchg_local #endif } @@ -330,35 +302,6 @@ static inline int time_values_up_to_date return (dst->version == src->version); } -/* - * This is a special lock that is owned by the CPU and holds the index - * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. 
- */ -volatile unsigned long cmos_lock = 0; -EXPORT_SYMBOL(cmos_lock); - -/* Routines for accessing the CMOS RAM/RTC. */ -unsigned char rtc_cmos_read(unsigned char addr) -{ - unsigned char val; - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - val = inb_p(RTC_PORT(1)); - lock_cmos_suffix(addr); - return val; -} -EXPORT_SYMBOL(rtc_cmos_read); - -void rtc_cmos_write(unsigned char val, unsigned char addr) -{ - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - outb_p(val, RTC_PORT(1)); - lock_cmos_suffix(addr); -} -EXPORT_SYMBOL(rtc_cmos_write); - static void sync_xen_wallclock(unsigned long dummy); static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0); static void sync_xen_wallclock(unsigned long dummy) @@ -367,7 +310,8 @@ static void sync_xen_wallclock(unsigned s64 nsec; struct xen_platform_op op; - if (!ntp_synced() || independent_wallclock || !is_initial_xendomain()) + BUG_ON(!is_initial_xendomain()); + if (!ntp_synced() || independent_wallclock) return; write_seqlock_irq(&xtime_lock); @@ -390,23 +334,6 @@ static void sync_xen_wallclock(unsigned mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ); } -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval; - unsigned long flags; - - if (independent_wallclock || !is_initial_xendomain()) - return 0; - - /* gets recalled with irq locally disabled */ - /* XXX - does irqsave resolve this? -johnstul */ - spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - static unsigned long long local_clock(void) { unsigned int cpu = get_cpu(); @@ -416,7 +343,7 @@ static unsigned long long local_clock(vo do { local_time_version = shadow->version; - barrier(); + rdtsc_barrier(); time = shadow->system_timestamp + get_nsec_offset(shadow); if (!time_values_up_to_date(cpu)) get_time_values_from_xen(cpu); @@ -489,28 +416,24 @@ unsigned long profile_pc(struct pt_regs #if defined(CONFIG_SMP) || defined(__x86_64__) # ifdef __i386__ - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) # else if (!user_mode(regs) # endif && in_lock_functions(pc)) { # ifdef CONFIG_FRAME_POINTER -# ifdef __i386__ - return ((unsigned long *)regs->ebp)[1]; -# else - return ((unsigned long *)regs->rbp)[1]; -# endif + return ((unsigned long *)regs->bp)[1]; # else # ifdef __i386__ - unsigned long *sp = (unsigned long *)&regs->esp; + unsigned long *sp = (unsigned long *)&regs->sp; # else - unsigned long *sp = (unsigned long *)regs->rsp; + unsigned long *sp = (unsigned long *)regs->sp; # endif /* Return address is either directly at stack pointer - or above a saved eflags. Eflags has bits 22-31 zero, + or above a saved flags. Eflags has bits 22-31 zero, kernel addresses don't.
*/ - if (sp[0] >> 22) + if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; @@ -749,25 +672,32 @@ static void init_missing_ticks_accountin runstate->time[RUNSTATE_offline]; } -/* not static: needed by APM */ -unsigned long read_persistent_clock(void) +unsigned long xen_read_persistent_clock(void) { - unsigned long retval; - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); + const shared_info_t *s = HYPERVISOR_shared_info; + u32 version, sec, nsec; + u64 delta; - retval = get_wallclock(); + do { + version = s->wc_version; + rmb(); + sec = s->wc_sec; + nsec = s->wc_nsec; + rmb(); + } while ((s->wc_version & 1) | (version ^ s->wc_version)); - spin_unlock_irqrestore(&rtc_lock, flags); + delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec; + do_div(delta, NSEC_PER_SEC); - return retval; + return delta; } -int update_persistent_clock(struct timespec now) +int xen_update_persistent_clock(void) { + if (!is_initial_xendomain()) + return -1; mod_timer(&sync_xen_wallclock_timer, jiffies + 1); - return set_rtc_mmss(now.tv_sec); + return 0; } extern void (*late_time_init)(void); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/traps_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/traps_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -79,7 +79,8 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif asmlinkage void divide_error(void); @@ -109,6 +110,34 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? "); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) { return p > (void *)tinfo && @@ -122,48 +151,35 @@ struct stack_frame { }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)ebp; - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { - struct stack_frame *next; - unsigned long addr; + struct stack_frame *frame = (struct stack_frame *)bp; - addr = frame->return_address; - ops->address(data, addr); - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function). Also, - * we can never allow a frame pointer to - * move downwards! 
- */ - next = frame->next_frame; - if (next <= frame) - break; - frame = next; - } -#else while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { unsigned long addr; - addr = *stack++; - if (__kernel_text_address(addr)) - ops->address(data, addr); + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; } -#endif - return ebp; + return bp; } #define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long ebp = 0; - if (!task) task = current; @@ -171,17 +187,17 @@ void dump_trace(struct task_struct *task unsigned long dummy; stack = &dummy; if (task != current) - stack = (unsigned long *)task->thread.esp; + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER - if (!ebp) { + if (!bp) { if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); + /* Grab bp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (bp) : ); } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; } } #endif @@ -190,7 +206,7 @@ void dump_trace(struct task_struct *task struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data); /* Should be after the line below, but somewhere in early boot context comes out corrupted and we can't reference it -AK */ @@ -225,9 +241,11 @@ static int print_trace_stack(void *data, /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? 
"); print_symbol("%s\n", addr); touch_nmi_watchdog(); } @@ -241,32 +259,32 @@ static const struct stacktrace_ops print static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack) + unsigned long *stack, unsigned long bp) { - show_trace_log_lvl(task, regs, stack, ""); + show_trace_log_lvl(task, regs, stack, bp, ""); } static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *esp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; - if (esp == NULL) { + if (sp == NULL) { if (task) - esp = (unsigned long*)task->thread.esp; + sp = (unsigned long*)task->thread.sp; else - esp = (unsigned long *)&esp; + sp = (unsigned long *)&sp; } - stack = esp; + stack = sp; for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; @@ -275,13 +293,13 @@ static void show_stack_log_lvl(struct ta printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, regs, esp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *sp) { printk(" "); - show_stack_log_lvl(task, NULL, esp, ""); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -290,13 +308,19 @@ void show_stack(struct task_struct *task void dump_stack(void) { unsigned long stack; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(current, NULL, &stack); + show_trace(current, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -315,30 +339,30 @@ void show_registers(struct pt_regs *regs * time of the fault.. 
*/ if (!user_mode_vm(regs)) { - u8 *eip; + u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - code_prologue; - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { /* try starting at EIP */ - eip = (u8 *)regs->eip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_len; i++, eip++) { - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (ip == (u8 *)regs->ip) printk("<%02x> ", c); else printk("%02x ", c); @@ -347,18 +371,57 @@ void show_registers(struct pt_regs printk("\n"); } -int is_valid_bugaddr(unsigned long eip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (eip < PAGE_OFFSET) + if (ip < PAGE_OFFSET) return 0; - if (probe_kernel_address((unsigned short *)eip, ud2)) + if (probe_kernel_address((unsigned short *)ip, ud2)) return 0; return ud2 == 0x0b0f; } +static int die_counter; + +int __kprobes __die(const char * str, struct pt_regs * regs, long err) +{ + unsigned long sp; + unsigned short ss; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != + NOTIFY_STOP) { + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (&regs->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; + } else { + return 1; + } +} + /* * This is gone through when something in the kernel has done something bad and * is about to be terminated.
@@ -374,7 +437,6 @@ void die(const char * str, struct pt_reg .lock_owner = -1, .lock_owner_depth = 0 }; - static int die_counter; unsigned long flags; oops_enter(); @@ -390,43 +452,13 @@ void die(const char * str, struct pt_reg raw_local_irq_save(flags); if (++die.lock_owner_depth < 3) { - unsigned long esp; - unsigned short ss; - - report_bug(regs->eip, regs); - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, - ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); + report_bug(regs->ip, regs); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != - NOTIFY_STOP) { - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - esp = (unsigned long) (&regs->esp); - savesegment(ss, ss); - if (user_mode(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); - print_symbol("%s", regs->eip); - printk(" SS:ESP %04x:%08lx\n", ss, esp); - } - else + if (__die(str, regs, err)) regs = NULL; - } else + } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } bust_spinlocks(0); die.lock_owner = -1; @@ -462,7 +494,7 @@ static void __kprobes do_trap(int trapnr { struct task_struct *tsk = current; - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -508,7 +540,7 @@ static void __kprobes do_trap(int trapnr } #define DO_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -517,7 +549,7 @@ fastcall void do_##name(struct pt_regs * } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ if (irq) \ @@ -533,7 +565,7 @@ fastcall void do_##name(struct pt_regs * } #define DO_VM86_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -542,7 +574,7 @@ fastcall void do_##name(struct pt_regs * } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -556,13 +588,13 @@ fastcall void do_##name(struct pt_regs * do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) @@
-570,10 +602,10 @@ DO_ERROR(12, SIGBUS, "stack segment", s DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) -fastcall void __kprobes do_general_protection(struct pt_regs * regs, +void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -582,11 +614,14 @@ fastcall void __kprobes do_general_prote current->thread.error_code = error_code; current->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", current->comm, task_pid_nr(current), - regs->eip, regs->esp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return; @@ -675,8 +710,8 @@ void __kprobes die_nmi(struct pt_regs *r */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); @@ -733,7 +768,7 @@ static __kprobes void default_do_nmi(str static int ignore_nmis; -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +__kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -762,7 +797,7 @@ void restart_nmi(void) } #ifdef CONFIG_KPROBES -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) +void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); @@ -798,7 +833,7 @@ fastcall void __kprobes do_int3(struct p * find every occurrence of the TF bit that could be saved away even * by user code) */ -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) +void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -807,24 +842,30 @@ fastcall void __kprobes do_debug(struct get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. 
+ */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) + if (!tsk->thread.debugreg7) goto clear_dr7; } - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; + tsk->thread.debugreg6 = condition; /* * Single-stepping through TF: make sure we ignore any events in @@ -856,7 +897,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; return; } @@ -865,7 +906,7 @@ clear_TF_reenable: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *eip) +void math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -881,7 +922,7 @@ void math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -924,13 +965,13 @@ void math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; - math_error((void __user *)regs->eip); + math_error((void __user *)regs->ip); } -static void simd_math_error(void __user *eip) +static void simd_math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -946,7 +987,7 @@ static void simd_math_error(void __user info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -978,19 +1019,19 @@ static void simd_math_error(void __user force_sig_info(SIGFPE, &info, task); } -fastcall void do_simd_coprocessor_error(struct pt_regs * regs, +void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->eip); + simd_math_error((void __user *)regs->ip); } else { /* * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. */ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; @@ -1003,7 +1044,7 @@ fastcall void do_simd_coprocessor_error( } #ifndef CONFIG_XEN -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, +void do_spurious_interrupt_bug(struct pt_regs * regs, long error_code) { #if 0 @@ -1012,7 +1053,7 @@ fastcall void do_spurious_interrupt_bug( #endif } -fastcall unsigned long patch_espfix_desc(unsigned long uesp, +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; @@ -1072,7 +1113,7 @@ asmlinkage void math_emulate(long arg) * NB. 
All these are "trap gates" (i.e. events_mask isn't set) except * for those that specify <dpl>|4 in the second field. */ -static trap_info_t __cpuinitdata trap_table[] = { +static const trap_info_t __cpuinitconst trap_table[] = { { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, @@ -1105,17 +1146,12 @@ void __init trap_init(void) if (ret) printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generate a build-time error if the alignment is wrong. + */ + BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generates a compile-time "error: zero width for bit-field" if - * the alignment is wrong. - */ - struct fxsrAlignAssert { - int _:!(offsetof(struct task_struct, - thread.i387.fxsave) & 15); - }; - printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/traps_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/traps_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -74,38 +74,41 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +static unsigned int code_bytes = 64; + static inline void conditional_sti(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_sti(struct pt_regs *regs) { - preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_cli(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ - preempt_enable_no_resched(); + dec_preempt_count(); } int kstack_depth_to_print = 12; -#ifdef CONFIG_KALLSYMS -void printk_address(unsigned long address) +void printk_address(unsigned long address, int reliable) { +#ifdef CONFIG_KALLSYMS unsigned long offset = 0, symsize; const char *symname; char *modname; char *delim = ":"; - char namebuf[128]; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -113,17 +116,17 @@ void printk_address(unsigned long addres printk(" [<%016lx>]\n", address); return; } + if (!reliable) + strcpy(reliab, "? 
"); + if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", - address, delim, modname, delim, symname, offset, symsize); -} + modname = delim = ""; + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); #else -void printk_address(unsigned long address) -{ printk(" [<%016lx>]\n", address); -} #endif +} static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -210,14 +213,53 @@ static unsigned long *in_exception_stack * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) { - void *t = (void *)tinfo; - return p > t && p < t + THREAD_SIZE - 3; + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + + +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; } void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -227,36 +269,28 @@ void dump_trace(struct task_struct *tsk, if (!tsk) tsk = current; + tinfo = task_thread_info(tsk); if (!stack) { unsigned long dummy; stack = &dummy; if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.rsp; + stack = (unsigned long *)tsk->thread.sp; } - /* - * Print function call entries within a stack. 'cond' is the - * "end of stackframe" condition, that the 'stack++' - * iteration will eventually trigger. - */ -#define HANDLE_STACK(cond) \ - do while (cond) { \ - unsigned long addr = *stack++; \ - /* Use unlocked access here because except for NMIs \ - we should be already protected against module unloads */ \ - if (__kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. 
\ - */ \ - ops->address(data, addr); \ - } \ - } while (0) +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (tsk == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp):); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) tsk->thread.sp; + } + } +#endif + + /* * Print function call entries in all stacks, starting at the @@ -272,7 +306,9 @@ void dump_trace(struct task_struct *tsk, if (estack_end) { if (ops->stack(data, id) < 0) break; - HANDLE_STACK (stack < estack_end); + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -290,7 +326,8 @@ void dump_trace(struct task_struct *tsk, if (stack >= irqstack && stack < irqstack_end) { if (ops->stack(data, "IRQ") < 0) break; - HANDLE_STACK (stack < irqstack_end); + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); /* * We link to the next stack (which would be * the process stack normally) the last @@ -308,9 +345,7 @@ void dump_trace(struct task_struct *tsk, /* * This handles the process stack: */ - tinfo = task_thread_info(tsk); - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); -#undef HANDLE_STACK + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -333,10 +368,10 @@ static int print_trace_stack(void *data, return 0; } -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk_address(addr); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { @@ -347,15 +382,17 @@ static const struct stacktrace_ops print }; void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, + unsigned long bp) { unsigned long *stack; int i; @@ -366,14 +403,14 @@ _show_stack(struct task_struct *tsk, str // debugging aid: "show_stack(NULL, NULL);" prints the // back trace for this cpu. 
- if (rsp == NULL) { + if (sp == NULL) { if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; + sp = (unsigned long *)tsk->thread.sp; else - rsp = (unsigned long *)&rsp; + sp = (unsigned long *)&sp; } - stack = rsp; + stack = sp; for(i=0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { @@ -389,12 +426,12 @@ _show_stack(struct task_struct *tsk, str printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, rsp); + show_trace(tsk, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +void show_stack(struct task_struct *tsk, unsigned long * sp) { - _show_stack(tsk, NULL, rsp); + _show_stack(tsk, NULL, sp, 0); } /* @@ -403,13 +440,19 @@ void show_stack(struct task_struct *tsk, void dump_stack(void) { unsigned long dummy; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy); + show_trace(NULL, NULL, &dummy, bp); } EXPORT_SYMBOL(dump_stack); @@ -417,12 +460,15 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; + unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; + u8 *ip; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; - rsp = regs->rsp; + sp = regs->sp; + ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -432,45 +478,43 @@ void show_registers(struct pt_regs *regs * When in-kernel, we also print out the stack and code at the * time of the fault.. 
*/ - if (in_kernel) { + if (!user_mode(regs)) { + unsigned char c; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + printk("\n"); - printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) - goto bad; - - for (i=0; i<20; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: + printk(KERN_EMERG "Code: "); + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *) regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad RIP value."); break; } - printk("%02x ", c); + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); } -int is_valid_bugaddr(unsigned long rip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) return 0; return ud2 == 0x0b0f; } -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -EXPORT_SYMBOL(out_of_line_bug); -#endif - static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -498,7 +542,7 @@ unsigned __kprobes long oops_begin(void) return flags; } -void __kprobes oops_end(unsigned long flags) +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { die_owner = -1; bust_spinlocks(0); @@ -507,12 +551,17 @@ void __kprobes oops_end(unsigned long fl /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } if (panic_on_oops) panic("Fatal exception"); oops_exit(); + do_exit(signr); } -void __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); @@ -526,15 +575,17 @@ void __kprobes __die(const char * str, s printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); if (kexec_should_crash(current)) crash_kexec(regs); + return 0; } void die(const char * str, struct pt_regs * regs, long err) @@ -542,11 +593,11 @@ void die(const char * str, struct pt_reg unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip, regs); + report_bug(regs->ip, regs); - __die(str, regs, err); - oops_end(flags); - do_exit(SIGSEGV); + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); } #if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_SYSCTL) @@ -564,10 +615,10 @@ void __kprobes die_nmi(char *str, struct crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags); + oops_end(flags, NULL, SIGBUS); nmi_exit(); local_irq_enable(); - do_exit(SIGSEGV); + do_exit(SIGBUS); } #endif @@ -592,11 +643,14 @@ static void __kprobes do_trap(int trapnr tsk->thread.trap_no = 
trapnr; if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } if (info) force_sig_info(signr, info, tsk); @@ -606,19 +660,12 @@ static void __kprobes do_trap(int trapnr } - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) - regs->rip = fixup->fixup; - else { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); } + return; } #define DO_ERROR(trapnr, signr, str, name) \ @@ -647,10 +694,10 @@ asmlinkage void do_##name(struct pt_regs do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -698,32 +745,28 @@ asmlinkage void __kprobes do_general_pro tsk->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, tsk); return; } - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } + if (fixup_exception(regs)) + return; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); } static __kprobes void @@ -833,15 +876,15 @@ asmlinkage __kprobes struct pt_regs *syn { struct pt_regs *regs = eregs; /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) + if (eregs == (struct pt_regs *)eregs->sp) ; /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. Move to kernel process stack. 
*/ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) *regs = *eregs; return regs; @@ -859,6 +902,12 @@ asmlinkage void __kprobes do_debug(struc get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; @@ -874,27 +923,14 @@ asmlinkage void __kprobes do_debug(struc tsk->thread.debugreg6 = condition; - /* Mask out spurious TF errors due to lazy TF clearing */ + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ if (!user_mode(regs)) goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } } /* Ok, finally something we can handle */ @@ -903,7 +939,7 @@ asmlinkage void __kprobes do_debug(struc info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: @@ -913,18 +949,15 @@ clear_dr7: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; + if (fixup_exception(regs)) return 1; - } + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); /* Illegal floating point operation in the kernel */ current->thread.trap_no = trapnr; @@ -939,7 +972,7 @@ static int kernel_math_error(struct pt_r */ asmlinkage void do_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short cwd, swd; @@ -959,7 +992,7 @@ asmlinkage void do_coprocessor_error(str info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 
0x3f is the exception bits in these regs, 0x200 is the @@ -1008,7 +1041,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short mxcsr; @@ -1028,7 +1061,7 @@ asmlinkage void do_simd_coprocessor_erro info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1092,13 +1125,14 @@ asmlinkage void math_state_restore(void) task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); /* * NB. All these are "interrupt gates" (i.e. events_mask is set) because we * specify <dpl>|4 in the second field. */ -static trap_info_t __cpuinitdata trap_table[] = { +static const trap_info_t __cpuinitconst trap_table[] = { { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error }, { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, @@ -1169,3 +1203,14 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); + + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); --- sle11sp1-2010-03-22.orig/arch/x86/kernel/vsyscall_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/kernel/vsyscall_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -43,12 +43,7 @@ #include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" -#define __pa_vsymbol(x) \ - ({unsigned long v; \ - extern char __vsyscall_0; \ - asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) +#define __syscall_clobber "r11","cx","memory" /* * vsyscall_gtod_data contains data that is : @@ -102,7 +97,7 @@ static __always_inline void do_get_tz(st static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) { int ret; - asm volatile("vsysc2: syscall" + asm volatile("syscall" : "=a" (ret) : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); @@ -112,7 +107,7 @@ static __always_inline int gettimeofday( static __always_inline long time_syscall(long *t) { long secs; - asm volatile("vsysc1: syscall" + asm volatile("syscall" : "=a" (secs) : "0" (__NR_time),"D" (t) : __syscall_clobber); return secs; @@ -190,7 +185,7 @@ time_t __vsyscall(1) vtime(time_t *t) long __vsyscall(2) vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - unsigned int dummy, p; + unsigned int p; unsigned long j = 0; /* Fast cache - only recompute value once per jiffies and avoid @@ -205,7 +200,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); @@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void) #ifdef CONFIG_SYSCTL -#define SYSCALL 0x050f -#define NOP2 0x9090 - -/* - * NOP out syscall in vsyscall page when not needed. 
- */ -static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int +vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - extern u16 vsysc1, vsysc2; - u16 __iomem *map1; - u16 __iomem *map2; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - if (!write) - return ret; - /* gcc has some trouble with __va(__pa()), so just do it this - way. */ - map1 = ioremap(__pa_vsymbol(&vsysc1), 2); - if (!map1) - return -ENOMEM; - map2 = ioremap(__pa_vsymbol(&vsysc2), 2); - if (!map2) { - ret = -ENOMEM; - goto out; - } - if (!vsyscall_gtod_data.sysctl_enabled) { - writew(SYSCALL, map1); - writew(SYSCALL, map2); - } else { - writew(NOP2, map1); - writew(NOP2, map2); - } - iounmap(map2); -out: - iounmap(map1); - return ret; + return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); } static ctl_table kernel_table2[] = { @@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] = .child = kernel_table2 }, {} }; - #endif /* Assume __initcall executes before all user space. Hopefully kmod @@ -301,7 +264,7 @@ static void __cpuinit vsyscall_set_cpu(i d |= cpu; d |= (node & 0xf) << 12; d |= (node >> 4) << 48; - if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu) + if (HYPERVISOR_update_descriptor(virt_to_machine(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU), d)) BUG(); @@ -322,7 +285,7 @@ cpu_vsyscall_notifier(struct notifier_bl return NOTIFY_DONE; } -static void __init map_vsyscall(void) +void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); @@ -338,7 +301,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); #ifdef CONFIG_XEN vsyscall_gtod_data.sysctl_enabled = 0; /* disable vgettimeofay() */ if (boot_cpu_has(X86_FEATURE_RDTSCP)) --- sle11sp1-2010-03-22.orig/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,36 +0,0 @@ -/* - * Copied from arch/xen/i386/kernel/entry.S - */ -/* Offsets into shared_info_t. 
*/ -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 6 - -#ifdef CONFIG_SMP -//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) -#define preempt_disable(reg) -#define preempt_enable(reg) -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ - movq %gs:pda_cpunumber,reg ; \ - shl $32, reg ; \ - shr $32-sizeof_vcpu_shift,reg ; \ - addq HYPERVISOR_shared_info,reg -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff -#else -#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg -#define XEN_PUT_VCPU_INFO(reg) -#define XEN_PUT_VCPU_INFO_fixup -#endif - -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ - XEN_LOCKED_BLOCK_EVENTS(reg) ; \ - XEN_PUT_VCPU_INFO(reg) -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ - XEN_PUT_VCPU_INFO(reg) -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) --- sle11sp1-2010-03-22.orig/arch/x86/mach-xen/setup.c 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mach-xen/setup.c 2009-11-06 10:51:25.000000000 +0100 @@ -161,15 +161,12 @@ void __init machine_specific_arch_setup( /* Do an early initialization of the fixmap area */ { - extern pte_t swapper_pg_pmd[PTRS_PER_PTE]; + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; unsigned long addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); - pgd_t *pgd = (pgd_t *)xen_start_info->pt_base; - pud_t *pud = pud_offset(pgd + pgd_index(addr), addr); + pud_t *pud = pud_offset(swapper_pg_dir + pgd_index(addr), addr); pmd_t *pmd = pmd_offset(pud, addr); - swapper_pg_dir = pgd; - init_mm.pgd = pgd; - make_lowmem_page_readonly(swapper_pg_pmd, XENFEAT_writable_page_tables); - set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_pmd) | _PAGE_TABLE)); + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); } } --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/fault-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,1025 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 
+ */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/compiler.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> + +#include <asm/system.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm-generic/sections.h> + +/* + * Page fault error code bits + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch + */ +#define PF_PROT (1<<0) +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +static inline int notify_page_fault(struct pt_regs *regs) +{ +#ifdef CONFIG_KPROBES + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +#ifdef CONFIG_X86_32 + if (!user_mode_vm(regs)) { +#else + if (!user_mode(regs)) { +#endif + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +#else + return 0; +#endif +} + +/* + * X86_32 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * X86_64 + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner + */ +static int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + scan_more = ((instr_lo & 7) == 0x6); + break; +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. 
*/ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + + if (probe_kernel_address(instr, opcode)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + +#ifdef CONFIG_X86_64 +static int bad_address(void *p) +{ + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); +} +#endif + +static void dump_pagetable(unsigned long address) +{ +#ifdef CONFIG_X86_32 + __typeof__(pte_val(__pte(0))) page; + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", page); + if ((page & _PAGE_PRESENT) + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn) { + page = mfn_to_pfn(page >> PAGE_SHIFT); + page <<= PAGE_SHIFT; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_CONT "*pde = %016Lx ", page); + page &= ~_PAGE_NX; + } +#else + printk("*pde = %08lx ", page); +#endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page & _PAGE_PRESENT) + && mfn_to_local_pfn(page >> PAGE_SHIFT) < max_low_pfn + && !(page & _PAGE_PSE)) { + page = mfn_to_pfn(page >> PAGE_SHIFT); + page <<= PAGE_SHIFT; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk(KERN_CONT "*pte = %0*Lx ", sizeof(page)*2, (u64)page); + } + + printk(KERN_CONT "\n"); +#else /* CONFIG_X86_64 */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk(KERN_CONT "PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk(KERN_CONT "PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk(KERN_CONT "PTE %lx", pte_val(*pte)); +ret: + printk(KERN_CONT "\n"); + return; +bad: + printk("BAD\n"); +#endif +} + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
+ */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) { + bool lazy = x86_read_percpu(xen_lazy_mmu); + + x86_write_percpu(xen_lazy_mmu, false); +#if CONFIG_XEN_COMPAT > 0x030002 + set_pmd(pmd, *pmd_k); +#else + /* + * When running on older Xen we must launder *pmd_k through + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. + */ + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); +#endif + x86_write_percpu(xen_lazy_mmu, lazy); + } else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} +#endif + +#ifdef CONFIG_X86_64 +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. + Does nothing for X86_32 + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + static int warned; + if (address != regs->ip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal + * addresses >4GB. We catch this in the page fault handler because these + * addresses are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return 1; +#endif + return 0; +} + +void do_invalid_op(struct pt_regs *, unsigned long); + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_X86_32 + if (!oops_may_print()) + return; +#endif + +#ifdef CONFIG_X86_PAE + if (error_code & PF_INSTR) { + unsigned int level; + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? 
" + "(uid: %d)\n", current->uid); + } +#endif + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); +#ifdef CONFIG_X86_32 + printk(KERN_CONT " at %08lx\n", address); +#else + printk(KERN_CONT " at %016lx\n", address); +#endif + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + dump_pagetable(address); +} + +#ifdef CONFIG_X86_64 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) + regs = NULL; + oops_end(flags, regs, SIGKILL); +} +#endif + +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * Handle a spurious fault caused by a stale TLB entry. This allows + * us to lazily refresh the TLB when increasing the permissions of a + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very + * expensive since that implies doing a full cross-processor TLB + * flush, even if no stale TLB entries exist on other processors. + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static int spurious_fault(unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + return spurious_fault_check(error_code, pte); +} + +/* + * X86_32 + * Handle a fault on the vmalloc or module mapping area + * + * X86_64 + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ +#ifdef CONFIG_X86_32 + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +#else + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. 
*/ + + /* On Xen the line below does not always work. Needs investigating! */ + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +#endif +} + +int show_unhandled_signals = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long address; + int write, si_code; + int fault; +#ifdef CONFIG_X86_64 + unsigned long flags; +#endif + + /* + * We can fault from pretty much anywhere, with unknown IRQ state. + */ + trace_hardirqs_fixup(); + + /* Set the "privileged fault" bit to something sane. */ + if (user_mode_vm(regs)) + error_code |= PF_USER; + else + error_code &= ~PF_USER; + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = read_cr2(); + + si_code = SEGV_MAPERR; + + if (notify_page_fault(regs)) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ +#ifdef CONFIG_X86_32 + if (unlikely(address >= TASK_SIZE)) { +#else + if (unlikely(address >= TASK_SIZE64)) { +#endif + /* Faults in hypervisor area can never be patched up. */ +#if defined(CONFIG_X86_XEN) + if (address >= hypervisor_virt_start) + goto bad_area_nosemaphore; +#elif defined(CONFIG_X86_64_XEN) + if (address >= HYPERVISOR_VIRT_START + && address < HYPERVISOR_VIRT_END) + goto bad_area_nosemaphore; +#endif + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + vmalloc_fault(address) >= 0) + return; + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + +#ifdef CONFIG_X86_32 + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. 
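A side note on the hunk above: do_page_fault() only attempts the lazy vmalloc sync for kernel addresses when the fault is a not-present, kernel-mode access with no reserved-bit violation, i.e. !(error_code & (PF_RSVD|PF_USER|PF_PROT)). The sketch below shows that bit test in isolation; the PF_* values mirror the #defines added earlier in fault-xen.c, while may_be_vmalloc_fault() is an illustrative name, not a function from this patch.

#include <stdio.h>

#define PF_PROT  (1 << 0)  /* protection fault (page was present) */
#define PF_WRITE (1 << 1)  /* write access */
#define PF_USER  (1 << 2)  /* fault taken in user mode */
#define PF_RSVD  (1 << 3)  /* reserved bit set in a paging entry */
#define PF_INSTR (1 << 4)  /* instruction fetch */

/* Illustrative helper: may this kernel-space fault be a missing vmalloc
 * mapping that lazy page-table syncing can repair? */
static int may_be_vmalloc_fault(unsigned long error_code)
{
	/* Only a not-present, kernel-mode, non-reserved-bit fault qualifies. */
	return !(error_code & (PF_RSVD | PF_USER | PF_PROT));
}

int main(void)
{
	printf("%d\n", may_be_vmalloc_fault(0));                  /* 1: kernel read, not present */
	printf("%d\n", may_be_vmalloc_fault(PF_WRITE));           /* 1: kernel write, not present */
	printf("%d\n", may_be_vmalloc_fault(PF_PROT | PF_WRITE)); /* 0: page was present */
	printf("%d\n", may_be_vmalloc_fault(PF_USER));            /* 0: user-mode fault */
	return 0;
}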
*/ + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; +#else /* CONFIG_X86_64 */ + if (likely(regs->flags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + /* + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; +again: +#endif + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535,$31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + +#ifdef CONFIG_X86_32 +survive: +#endif + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + +#ifdef CONFIG_X86_32 + /* + * Did it hit the DOS screen memory VA from vm86 mode? 
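A side note on the stack-cushion check in the hunk above: the limit address + 65536 + 32 * sizeof(unsigned long) < regs->sp simply allows for the worst case of "enter $65535,$31", which pushes 32 frame pointers and then drops the stack pointer by 65535. The arithmetic, assuming 8-byte longs, works out as below.

#include <stdio.h>

int main(void)
{
	/* Cushion below %sp tolerated by the fault handler:
	 * "enter $65535,$31" may push 32 frame pointers (32 * 8 bytes here)
	 * and then subtract 65535 from the stack pointer. */
	unsigned long cushion = 65536 + 32 * sizeof(unsigned long);

	printf("cushion = %lu bytes\n", cushion); /* 65792 with 8-byte longs */
	return 0;
}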
+ */ + if (v8086_mode(regs)) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } +#endif + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( +#ifdef CONFIG_X86_32 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", +#else + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", +#endif + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->ip, + regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + if (__die("Oops", regs, error_code)) + regs = NULL; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, SIGKILL); +#endif + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_global_init(tsk)) { + yield(); +#ifdef CONFIG_X86_32 + down_read(&mm->mmap_sem); + goto survive; +#else + goto again; +#endif + } + + printk("VM: killing process %s\n", tsk->comm); + if (error_code & PF_USER) + do_group_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? 
Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ +#ifdef CONFIG_X86_32 + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. + * This change works just fine with 2-level paging too. + */ +#define sync_index(a) ((a) >> PMD_SHIFT) + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); + static unsigned long start = TASK_SIZE; + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + BUILD_BUG_ON(TASK_SIZE & ~PMD_MASK); + for (address = start; + address < hypervisor_virt_start; + address += PMD_SIZE) { + if (!test_bit(sync_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + /* XEN: failure path assumes non-empty pgd_list. */ + if (unlikely(list_empty(&pgd_list))) { + spin_unlock_irqrestore(&pgd_lock, flags); + return; + } + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(sync_index(address), insync); + } + if (address == start && test_bit(sync_index(address), insync)) + start = address + PMD_SIZE; + } +#else /* CONFIG_X86_64 */ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. 
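A side note on the vmalloc_sync_all() hunk above: the BUILD_BUG_ON checks that close the function just below encode the reason the modules area needs no syncing, namely that it shares a single PGD slot with the kernel image, which every page table already maps. The sketch below illustrates that invariant with the standard x86-64 4-level paging values (PGDIR_SHIFT 39, 512 PGD entries); the two addresses are only representative of this era's layout, not the exact kernel constants.

#include <stdio.h>

/* x86-64, 4-level paging: each PGD entry covers 2^39 bytes (512 GiB). */
#define PGDIR_SHIFT   39
#define PTRS_PER_PGD  512
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

int main(void)
{
	/* Illustrative addresses only; the real __START_KERNEL and
	 * MODULES_END come from the kernel's memory layout headers. */
	unsigned long start_kernel = 0xffffffff80000000UL;
	unsigned long modules_end  = 0xfffffffffff00000UL;

	printf("pgd_index(kernel text)  = %lu\n", pgd_index(start_kernel));
	printf("pgd_index(modules area) = %lu\n", pgd_index(modules_end - 1));
	/* Both print 511: the modules area lives in the same PGD slot as the
	 * kernel image, so vmalloc_sync_all() has nothing extra to propagate
	 * for it into per-process page tables. */
	return 0;
}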
*/ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +#endif +} --- sle11sp1-2010-03-22.orig/arch/x86/mm/fault_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,757 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/bootmem.h> /* for max_low_pfn */ -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/desc.h> -#include <asm/segment.h> - -extern void die(const char *,struct pt_regs *,long); - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. - * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { - base = seg << 4; - *eip_limit = base + 0xffff; - return base + (eip & 0xffff); - } - - /* The standard kernel/user address space limit. */ - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; - - /* By far the most common cases. */ - if (likely(SEGMENT_IS_FLAT_CODE(seg))) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - mutex_lock(¤t->mm->context.lock); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. 
*/ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - mutex_unlock(¤t->mm->context.lock); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 6)) { - /* Catch an obscure case of prefetch inside an NX page. 
*/ - if (nx_enabled && (error_code & 16)) - return 0; - return __is_prefetch(regs, addr); - } - return 0; -} - -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); -} - -fastcall void do_invalid_op(struct pt_regs *, unsigned long); - -#ifdef CONFIG_X86_PAE -static void dump_fault_path(unsigned long address) -{ - unsigned long *p, page; - unsigned long mfn; - - page = read_cr3(); - p = (unsigned long *)__va(page); - p += (address >> 30) * 2; - printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); - if (p[0] & _PAGE_PRESENT) { - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); - page = mfn_to_pfn(mfn) << PAGE_SHIFT; - p = (unsigned long *)__va(page); - address &= 0x3fffffff; - p += (address >> 21) * 2; - printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", - page, p[1], p[0]); - mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); -#ifdef CONFIG_HIGHPTE - if (mfn_to_pfn(mfn) >= highstart_pfn) - return; -#endif - if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) { - page = mfn_to_pfn(mfn) << PAGE_SHIFT; - p = (unsigned long *) __va(page); - address &= 0x001fffff; - p += (address >> 12) * 2; - printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n", - page, p[1], p[0]); - } - } -} -#else -static void dump_fault_path(unsigned long address) -{ - unsigned long page; - - page = read_cr3(); - page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT]; - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, - machine_to_phys(page)); - /* - * We must not directly access the pte in the highpte - * case if the page table is located in highmem. - * And lets rather not kmap-atomic the pte, just in case - * it's allocated already. - */ - if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - page = machine_to_phys(page & PAGE_MASK); - page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, - machine_to_phys(page)); - } -} -#endif - -static int spurious_fault(struct pt_regs *regs, - unsigned long address, - unsigned long error_code) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Reserved-bit violation or user access to kernel space? */ - if (error_code & 0x0c) - return 0; - - pgd = init_mm.pgd + pgd_index(address); - if (!pgd_present(*pgd)) - return 0; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return 0; - if ((error_code & 0x02) && !pte_write(*pte)) - return 0; -#ifdef CONFIG_X86_PAE - if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX)) - return 0; -#endif - - return 1; -} - -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. 
- */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - bool lazy = x86_read_percpu(xen_lazy_mmu); - - x86_write_percpu(xen_lazy_mmu, false); -#if CONFIG_XEN_COMPAT > 0x030002 - set_pmd(pmd, *pmd_k); -#else - /* - * When running on older Xen we must launder *pmd_k through - * pmd_val() to ensure that _PAGE_PRESENT is correctly set. - */ - set_pmd(pmd, __pmd(pmd_val(*pmd_k))); -#endif - x86_write_percpu(xen_lazy_mmu, lazy); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} - -/* - * Handle a fault on the vmalloc or module mapping area - * - * This assumes no large pages in there. - */ -static inline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - int write, si_code; - int fault; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - /* get the address */ - address = read_cr2(); - - /* Set the "privileged fault" bit to something sane. */ - error_code &= ~4; - error_code |= (regs->xcs & 2) << 1; - if (regs->eflags & X86_EFLAGS_VM) - error_code |= 4; - - tsk = current; - - si_code = SEGV_MAPERR; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { -#ifdef CONFIG_XEN - /* Faults in hypervisor area can never be patched up. */ - if (address >= hypervisor_virt_start) - goto bad_area_nosemaphore; -#endif - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) - return; - /* Can take a spurious fault if mapping changes R/O -> R/W. */ - if (spurious_fault(regs, address, error_code)) - return; - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. 
- */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - - mm = tsk->mm; - - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault.. - */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * Accessing the stack below %esp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * It's possible to have interrupts off here. 
- */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk("%s%s[%d]: segfault at %08lx eip %08lx " - "esp %08lx error %lx\n", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->eip, - regs->esp, error_code); - } - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (oops_may_print()) { -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); - } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL " - "pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging" - " request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT "printing eip: %08lx\n", regs->eip); - dump_fault_path(address); - } - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -void vmalloc_sync_all(void) -{ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. - * This change works just fine with 2-level paging too. 
- */ -#define sync_index(a) ((a) >> PMD_SHIFT) - static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); - static unsigned long start = TASK_SIZE; - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; - address >= TASK_SIZE && address < hypervisor_virt_start; - address += 1UL << PMD_SHIFT) { - if (!test_bit(sync_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - /* XEN: failure path assumes non-empty pgd_list. */ - if (unlikely(!pgd_list)) { - spin_unlock_irqrestore(&pgd_lock, flags); - return; - } - for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { - BUG_ON(page != pgd_list); - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(sync_index(address), insync); - } - if (address == start && test_bit(sync_index(address), insync)) - start = address + (1UL << PMD_SHIFT); - } -} --- sle11sp1-2010-03-22.orig/arch/x86/mm/fault_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,686 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm-generic/sections.h> - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. 
- Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. 
- Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - /* On Xen the line below does not always work. Needs investigating! */ - /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -int show_unhandled_signals = 1; - - -#define MEM_VERBOSE 1 - -#ifdef MEM_VERBOSE -#define MEM_LOG(_f, _a...) \ - printk("fault.c:[%d]-> " _f "\n", \ - __LINE__ , ## _a ) -#else -#define MEM_LOG(_f, _a...) ((void)0) -#endif - -static int spurious_fault(struct pt_regs *regs, - unsigned long address, - unsigned long error_code) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - -#ifdef CONFIG_XEN - /* Faults in hypervisor area are never spurious. */ - if ((address >= HYPERVISOR_VIRT_START) && - (address < HYPERVISOR_VIRT_END)) - return 0; -#endif - - /* Reserved-bit violation or user access to kernel space? 
*/ - if (error_code & (PF_RSVD|PF_USER)) - return 0; - - pgd = init_mm.pgd + pgd_index(address); - if (!pgd_present(*pgd)) - return 0; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return 0; - if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; - if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX)) - return 0; - - return 1; -} - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - if (!user_mode(regs)) - error_code &= ~PF_USER; /* means kernel */ - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Can take a spurious fault if mapping changes R/O -> R/W. */ - if (spurious_fault(regs, address, error_code)) - return; - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. 
- * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? 
*/ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. 
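vmalloc_sync_all() above only ever copies top-level (PGD) slots from init_mm into the other page tables on the pgd_list. That is sufficient because of how coarse one PGD entry is with 4-level paging on x86-64; a one-line standalone reminder (not from the patch):

/* Illustrative only: one PGD entry covers 2^39 bytes (512 entries of
 * 1 GB each), so the whole vmalloc area spans only a few PGD slots
 * and syncing at that level is cheap. */
#include <stdio.h>

int main(void)
{
	unsigned long long pgdir_size = 1ULL << 39;

	printf("one x86-64 PGD entry maps %llu GB of virtual address space\n",
	       pgdir_size >> 30);
	return 0;
}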
*/ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} --- sle11sp1-2010-03-22.orig/arch/x86/mm/highmem_32-xen.c 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mm/highmem_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -18,6 +18,49 @@ void kunmap(struct page *page) kunmap_high(page); } +static void debug_kmap_atomic_prot(enum km_type type) +{ +#ifdef CONFIG_DEBUG_HIGHMEM + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +#endif +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -37,6 +80,8 @@ void *kmap_atomic_prot(struct page *page if (!PageHighMem(page)) return page_address(page); + debug_kmap_atomic_prot(type); + idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); --- sle11sp1-2010-03-22.orig/arch/x86/mm/hypervisor.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mm/hypervisor.c 2009-11-06 10:51:25.000000000 +0100 @@ -873,15 +873,11 @@ int xen_limit_pages_to_max_mfn( } EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn); -#ifdef __i386__ -int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) +int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) { - __u32 *lp = (__u32 *)((char *)ldt + entry * 8); - maddr_t mach_lp = arbitrary_virt_to_machine(lp); - return HYPERVISOR_update_descriptor( - mach_lp, (u64)entry_a | ((u64)entry_b<<32)); + maddr_t mach_lp = arbitrary_virt_to_machine(ldt + entry); + return HYPERVISOR_update_descriptor(mach_lp, *(const u64*)desc); } -#endif #define MAX_BATCHED_FULL_PTES 32 --- sle11sp1-2010-03-22.orig/arch/x86/mm/init_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mm/init_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -27,13 +27,13 @@ #include <linux/bootmem.h> #include <linux/slab.h> #include <linux/proc_fs.h> -#include <linux/efi.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> #include <linux/dma-mapping.h> #include <linux/scatterlist.h> +#include <asm/asm.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -42,18 +42,22 @@ #include <asm/fixmap.h> #include <asm/e820.h> #include <asm/apic.h> +#include <asm/bugs.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/hypervisor.h> #include <asm/swiotlb.h> +#include <asm/setup.h> +#include <asm/cacheflush.h> unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct 
mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; -static int noinline do_test_wp_bit(void); +static noinline int do_test_wp_bit(void); /* * Creates a middle page table and puts a pointer to it in the @@ -64,17 +68,16 @@ static pmd_t * __init one_md_table_init( { pud_t *pud; pmd_t *pmd_table; - + #ifdef CONFIG_X86_PAE if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); + BUG_ON(pmd_table != pmd_offset(pud, 0)); } #endif pud = pud_offset(pgd, 0); @@ -85,7 +88,7 @@ static pmd_t * __init one_md_table_init( /* * Create a page table and place a pointer to it in a middle page - * directory entry. + * directory entry: */ static pte_t * __init one_page_table_init(pmd_t *pmd) { @@ -99,9 +102,10 @@ static pte_t * __init one_page_table_ini #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) + if (!page_table) { page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); make_lowmem_page_readonly(page_table, @@ -114,22 +118,21 @@ static pte_t * __init one_page_table_ini } /* - * This function initializes a certain range of kernel virtual memory + * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without * checking the pgd every time. */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -139,7 +142,8 @@ static void __init page_table_range_init for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); pmd = pmd + pmd_index(vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { if (vaddr < hypervisor_virt_start) one_page_table_init(pmd); @@ -157,17 +161,17 @@ static inline int is_kernel_text(unsigne } /* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. 
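For orientation while reading one_md_table_init()/one_page_table_init() above: page_table_range_init() walks the tables purely by index arithmetic on the virtual address. A small standalone program (PAE constants assumed here, not taken from the patch) shows how a 32-bit address splits into those indices:

/* Illustrative only: index split of a 32-bit virtual address with PAE. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT   12
#define PMD_SHIFT    21		/* one PMD entry maps 2 MB under PAE */
#define PGDIR_SHIFT  30		/* one PGD entry maps 1 GB under PAE */
#define PTRS_PER_PMD 512
#define PTRS_PER_PGD 4

int main(void)
{
	uint32_t vaddr = 0xc0400000;	/* an example lowmem address */

	printf("vaddr %#x: pgd_index=%u pmd_index=%u pte_index=%u\n", vaddr,
	       (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1),
	       (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1),
	       (vaddr >> PAGE_SHIFT) & 511);
	return 0;
}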
+ * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) { + int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; unsigned long max_ram_pfn = xen_start_info->nr_pages; if (max_ram_pfn > max_low_pfn) @@ -195,36 +199,49 @@ static void __init kernel_physical_mappi if (pfn >= max_low_pfn) continue; pmd += pmd_idx; - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - if (address >= hypervisor_virt_start) + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + if (addr >= hypervisor_virt_start) continue; - /* Map with big pages if possible, otherwise create normal page tables. */ + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); + continue; + } + pte = one_page_table_init(pmd); + + for (pte += pte_ofs; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + + /* XEN: Only map initial RAM allocation. */ + if ((pfn >= max_ram_pfn) || pte_present(*pte)) + continue; + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; - for (pte += pte_ofs; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { - /* XEN: Only map initial RAM allocation. */ - if ((pfn >= max_ram_pfn) || pte_present(*pte)) - continue; - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - pte_ofs = 0; + set_pte(pte, pfn_pte(pfn, prot)); } + pte_ofs = 0; } pmd_idx = 0; } @@ -245,57 +262,23 @@ static inline int page_kills_ppro(unsign #endif -int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. 
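The rewritten loop above switches to PAGE_KERNEL_LARGE_EXEC only when either end of the large-page span falls inside the kernel text. A minimal standalone sketch of that overlap test follows; the text boundaries are made-up stand-ins for _text/_etext:

/* Illustrative only: the "does this large page touch kernel text?" test. */
#include <stdio.h>

static unsigned long text_start = 0xc0100000UL;	/* assumed _text  */
static unsigned long text_end   = 0xc0400000UL;	/* assumed _etext */

static int is_kernel_text(unsigned long addr)
{
	return addr >= text_start && addr < text_end;
}

int main(void)
{
	unsigned long addr  = 0xc0200000UL;		/* start of a 2 MB span  */
	unsigned long addr2 = addr + (2UL << 20) - 1;	/* last byte of the span */

	printf("span %#lx-%#lx overlaps kernel text: %s\n", addr, addr2,
	       is_kernel_text(addr) || is_kernel_text(addr2) ? "yes" : "no");
	return 0;
}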
- */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} static void __init kmap_init(void) { unsigned long kmap_vstart; - /* cache the first kmap pte */ + /* + * Cache the first kmap pte: + */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); @@ -304,11 +287,11 @@ static void __init kmap_init(void) static void __init permanent_kmaps_init(pgd_t *pgd_base) { + unsigned long vaddr; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr; vaddr = PKMAP_BASE; page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); @@ -317,7 +300,7 @@ static void __init permanent_kmaps_init( pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = pte; } static void __meminit free_new_highpage(struct page *page, int pfn) @@ -336,7 +319,8 @@ void __init add_one_highpage_init(struct SetPageReserved(page); } -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) +static int __meminit +add_one_highpage_hotplug(struct page *page, unsigned long pfn) { free_new_highpage(page, pfn); totalram_pages++; @@ -344,6 +328,7 @@ static int __meminit add_one_highpage_ho max_mapnr = max(pfn, max_mapnr); #endif num_physpages++; + return 0; } @@ -351,7 +336,7 @@ static int __meminit add_one_highpage_ho * Not currently handling the NUMA case. * Assuming single node and all memory that * has been added dynamically that would be - * onlined here is in HIGHMEM + * onlined here is in HIGHMEM. */ void __meminit online_page(struct page *page) { @@ -359,13 +344,11 @@ void __meminit online_page(struct page * add_one_highpage_hotplug(page, page_to_pfn(page)); } - -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else +#ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn && pfn < xen_start_info->nr_pages; pfn++) { /* @@ -383,23 +366,18 @@ static void __init set_highmem_pages_ini totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ +#endif /* !CONFIG_NUMA */ #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) +# define kmap_init() do { } while (0) +# define permanent_kmaps_init(pgd_base) do { } while (0) +# define set_highmem_pages_init(bad_ppro) do { } while (0) #endif /* CONFIG_HIGHMEM */ -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +pteval_t __PAGE_KERNEL = _PAGE_KERNEL; EXPORT_SYMBOL(__PAGE_KERNEL); -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; pgd_t *swapper_pg_dir; @@ -417,9 +395,8 @@ static void __init xen_pagetable_setup_d * the boot process. 
* * If we're booting on native hardware, this will be a pagetable - * constructed in arch/i386/kernel/head.S, and not running in PAE mode - * (even if we'll end up running in PAE). The root of the pagetable - * will be swapper_pg_dir. + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. * * If we're booting paravirtualized under a hypervisor, then there are * more options: we may already be running PAE, and the pagetable may @@ -431,10 +408,10 @@ static void __init xen_pagetable_setup_d * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init (void) +static void __init pagetable_init(void) { - unsigned long vaddr, end; pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; + unsigned long vaddr, end; xen_pagetable_setup_start(pgd_base); @@ -456,34 +433,36 @@ static void __init pagetable_init (void) * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ + early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); permanent_kmaps_init(pgd_base); xen_pagetable_setup_done(pgd_base); } -#if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI) +#if defined(CONFIG_ACPI_SLEEP) && !defined(CONFIG_XEN) /* - * Swap suspend & friends need this for resume because things like the intel-agp + * ACPI suspend needs this for resume, because things like the intel-agp * driver might have split up a kernel 4MB mapping. */ -char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); +char swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned(PAGE_SIZE))); static inline void save_pg_dir(void) { memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); } -#else +#else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) { } -#endif +#endif /* !CONFIG_ACPI_SLEEP */ -void zap_low_mappings (void) +void zap_low_mappings(void) { int i; @@ -495,22 +474,24 @@ void zap_low_mappings (void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. 
*/ - for (i = 0; i < USER_PTRS_PER_PGD; i++) + for (i = 0; i < USER_PTRS_PER_PGD; i++) { #if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); #endif + } flush_tlb_all(); } -int nx_enabled = 0; +int nx_enabled; + +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE -static int disable_nx __initdata = 0; -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; -EXPORT_SYMBOL_GPL(__supported_pte_mask); +static int disable_nx __initdata; /* * noexec = on|off @@ -527,11 +508,14 @@ static int __init noexec_setup(char *str __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } - } else if (!strcmp(str,"off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else - return -EINVAL; + } else { + if (!strcmp(str, "off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } else { + return -EINVAL; + } + } return 0; } @@ -543,6 +527,7 @@ static void __init set_nx(void) if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { rdmsr(MSR_EFER, l, h); l |= EFER_NX; @@ -552,35 +537,6 @@ static void __init set_nx(void) } } } - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - pte_update_defer(&init_mm, vaddr, pte); - __flush_tlb_all(); -out: - return ret; -} - #endif /* @@ -597,21 +553,10 @@ void __init paging_init(void) #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #endif - pagetable_init(); -#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - * when running as xen domain we are already in PAE mode at - * this point. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif __flush_tlb_all(); kmap_init(); @@ -638,10 +583,10 @@ void __init paging_init(void) * used to involve black magic jumps to work around some nasty CPU bugs, * but fortunately the switch to using exceptions got rid of all that. */ - static void __init test_wp_bit(void) { - printk("Checking if this processor honours the WP bit even in supervisor mode... "); + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); @@ -649,23 +594,22 @@ static void __init test_wp_bit(void) clear_fixmap(FIX_WP_TEST); if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); + printk(KERN_CONT "No.\n"); #ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); + panic( + "This kernel doesn't support CPU's with broken WP. 
Recompile it for a 386!"); #endif } else { - printk("Ok.\n"); + printk(KERN_CONT "Ok.\n"); } } -static struct kcore_list kcore_mem, kcore_vmalloc; +static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { - extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; + int tmp, bad_ppro; unsigned long pfn; #if defined(CONFIG_SWIOTLB) @@ -675,19 +619,19 @@ void __init mem_init(void) #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR + "fixmap and kmap areas overlap - this will crash\n"); printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, + FIXADDR_START); BUG(); } #endif - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); /* XEN: init low-mem pages outside initial allocation. */ @@ -699,7 +643,7 @@ void __init mem_init(void) reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* - * Only count reserved RAM pages + * Only count reserved RAM pages: */ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; @@ -710,11 +654,12 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, @@ -725,54 +670,53 @@ void __init mem_init(void) ); #if 1 /* double-sanity-check paranoia */ - printk("virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, #ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, #endif - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - 
VMALLOC_START) >> 20, - (unsigned long)__va(0), (unsigned long)high_memory, - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, - (unsigned long)&__init_begin, (unsigned long)&__init_end, - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, - (unsigned long)&_etext, (unsigned long)&_edata, - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, - (unsigned long)&_text, (unsigned long)&_etext, - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); #ifdef CONFIG_HIGHMEM - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); - BUG_ON(VMALLOC_END > PKMAP_BASE); + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); - BUG_ON((unsigned long)high_memory > VMALLOC_START); + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); + cpa_init(); + /* * Subtle. SMP is doing it's boot stuff late (because it has to * fork idle threads) - but it also needs low mappings for the @@ -796,49 +740,35 @@ int arch_add_memory(int nid, u64 start, return __add_pages(zone, start_pfn, nr_pages); } - #endif -struct kmem_cache *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); -} - /* * This function cannot be __init, since exceptions don't work in that * section. Put this after the callers, so that it cannot be inlined. 
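The layout banner printed above follows from a handful of constants. As a rough illustration (classic i386 defaults assumed: 3G/1G split, a FIXADDR_TOP just under 4 GB, and the 128 MB vmalloc reserve from __VMALLOC_RESERVE earlier), this standalone snippet reproduces the familiar ~896 MB lowmem figure:

/* Illustrative only: i386 lowmem/vmalloc split arithmetic, constants assumed. */
#include <stdio.h>

#define PAGE_OFFSET	0xc0000000UL	/* start of the kernel's 1 GB window */
#define FIXADDR_TOP	0xfffff000UL	/* assumed top of the fixmap area    */
#define VMALLOC_RESERVE	(128UL << 20)	/* __VMALLOC_RESERVE                 */

int main(void)
{
	unsigned long kernel_va = FIXADDR_TOP - PAGE_OFFSET;
	unsigned long max_lowmem = kernel_va - VMALLOC_RESERVE;

	printf("kernel virtual window: %lu MB\n", kernel_va >> 20);
	printf("directly mapped lowmem: about %lu MB, rest is vmalloc/fixmap\n",
	       max_lowmem >> 20);
	return 0;
}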
*/ -static int noinline do_test_wp_bit(void) +static noinline int do_test_wp_bit(void) { char tmp_reg; int flag; __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" "2: \n" - ".section __ex_table,\"a\"\n" - " .align 4 \n" - " .long 1b,2b \n" - ".previous \n" + _ASM_EXTABLE(1b,2b) :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) :"2" (1) :"memory"); - + return flag; } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -851,32 +781,58 @@ void mark_rodata_ro(void) if (num_possible_cpus() <= 1) #endif { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif } #endif start += size; size = (unsigned long)__end_rodata - start; - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk("Write protecting the kernel read-only data: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); - /* - * change_page_attr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif } #endif void free_init_pages(char *what, unsigned long begin, unsigned long end) { +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else unsigned long addr; + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. 
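do_test_wp_bit() above relies on an exception-table entry (now generated with _ASM_EXTABLE) so that a faulting supervisor write simply resumes at a fixup label. A loose userspace analogue, shown purely for illustration, does the same probe with a SIGSEGV handler and siglongjmp standing in for the fixup:

/* Illustrative userspace analogue only, not code from the patch. */
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static sigjmp_buf fixup;

static void segv(int sig)
{
	(void)sig;
	siglongjmp(fixup, 1);		/* jump to the "fixup" label */
}

int main(void)
{
	struct sigaction sa;
	char *page = mmap(NULL, 4096, PROT_READ,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED)
		return 1;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = segv;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	if (sigsetjmp(fixup, 1) == 0) {
		page[0] = 1;		/* faults: the mapping is read-only */
		puts("write succeeded (no write protection here)");
	} else {
		puts("write faulted and was recovered via the fixup path");
	}
	return 0;
}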
+ */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -885,6 +841,7 @@ void free_init_pages(char *what, unsigne totalram_pages++; } printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); +#endif } void free_initmem(void) @@ -900,4 +857,3 @@ void free_initrd_mem(unsigned long start free_init_pages("initrd memory", start, end); } #endif - --- sle11sp1-2010-03-22.orig/arch/x86/mm/init_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mm/init_64-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -46,14 +46,13 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> +#include <asm/cacheflush.h> #include <xen/features.h> -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); #if CONFIG_XEN_COMPAT <= 0x030002 @@ -80,7 +79,21 @@ extern pte_t level1_fixmap_pgt[PTRS_PER_ (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \ __START_KERNEL_map))) -static void __meminit early_make_page_readonly(void *va, unsigned int feature) +pmd_t *__init early_get_pmd(unsigned long va) +{ + unsigned long addr; + unsigned long *page = (unsigned long *)init_level4_pgt; + + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + + addr = page[pud_index(va)]; + addr_to_page(addr, page); + + return (pmd_t *)&page[pmd_index(va)]; +} + +void __meminit early_make_page_readonly(void *va, unsigned int feature) { unsigned long addr, _va = (unsigned long)va; pte_t pte, *ptep; @@ -107,76 +120,6 @@ static void __meminit early_make_page_re BUG(); } -static void __make_page_readonly(void *va) -{ - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; - unsigned long addr = (unsigned long) va; - - pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); - ptep = pte_offset_kernel(pmd, addr); - - pte.pte = ptep->pte & ~_PAGE_RW; - if (HYPERVISOR_update_va_mapping(addr, pte, 0)) - xen_l1_entry_update(ptep, pte); /* fallback */ - - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) - __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT)); -} - -static void __make_page_writable(void *va) -{ - pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; - unsigned long addr = (unsigned long) va; - - pgd = pgd_offset_k(addr); - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); - ptep = pte_offset_kernel(pmd, addr); - - pte.pte = ptep->pte | _PAGE_RW; - if (HYPERVISOR_update_va_mapping(addr, pte, 0)) - xen_l1_entry_update(ptep, pte); /* fallback */ - - if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) - __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT)); -} - -void make_page_readonly(void *va, unsigned int feature) -{ - if (!xen_feature(feature)) - __make_page_readonly(va); -} - -void make_page_writable(void *va, unsigned int feature) -{ - if (!xen_feature(feature)) - __make_page_writable(va); -} - -void make_pages_readonly(void *va, unsigned nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - __make_page_readonly(va); - va = (void*)((unsigned long)va + PAGE_SIZE); - } -} - -void make_pages_writable(void *va, unsigned nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - __make_page_writable(va); - va = (void*)((unsigned long)va + PAGE_SIZE); - } -} - /* * NOTE: pagetable_init 
alloc all the fixmap pagetables contiguous on the * physical space so we can cache the place of the first one and move @@ -187,22 +130,26 @@ void show_mem(void) { long i, total = 0, reserved = 0; long shared = 0, cached = 0; - pg_data_t *pgdat; struct page *page; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", + nr_swap_pages << (PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* + * This loop can take a while with 256 GB and + * 4k pages so defer the NMI watchdog: + */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) touch_nmi_watchdog(); - } + if (!pfn_valid(pgdat->node_start_pfn + i)) continue; + page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -211,58 +158,67 @@ void show_mem(void) cached++; else if (page_count(page)) shared += page_count(page) - 1; - } + } } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n", reserved); + printk(KERN_INFO "%lu pages shared\n", shared); + printk(KERN_INFO "%lu pages swap cached\n", cached); } +static unsigned long __meminitdata table_start; +static unsigned long __meminitdata table_end; static __init void *spp_getpage(void) -{ +{ void *ptr; + if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else if (start_pfn < table_end) { ptr = __va(start_pfn << PAGE_SHIFT); start_pfn++; memset(ptr, 0, PAGE_SIZE); } else ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - Dprintk("spp_getpage %p\n", ptr); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + return ptr; -} +} #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address)) #define pud_offset_u(address) (level3_user_pgt + pud_index(address)) -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot, int user_mode) +static __init void +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot, int user_mode) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte, new_pte; - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr)); if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr)); if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); make_page_readonly(pmd, XENFEAT_writable_page_tables); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! 
%p <-> %p\n", pmd, pmd_offset(pud,0)); + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); return; } } @@ -272,7 +228,7 @@ static __init void set_pte_phys(unsigned make_page_readonly(pte, XENFEAT_writable_page_tables); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; } } @@ -294,30 +250,30 @@ static __init void set_pte_phys(unsigned __flush_tlb_one(vaddr); } -static __init void set_pte_phys_ma(unsigned long vaddr, - unsigned long phys, pgprot_t prot) +static __init void +set_pte_phys_ma(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte, new_pte; - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_phys_ma %lx to %lx\n", vaddr, phys); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { - - pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); make_page_readonly(pmd, XENFEAT_writable_page_tables); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); - return; + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); } } pmd = pmd_offset(pud, vaddr); @@ -326,7 +282,7 @@ static __init void set_pte_phys_ma(unsig make_page_readonly(pte, XENFEAT_writable_page_tables); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; } } @@ -350,14 +306,44 @@ static __init void set_pte_phys_ma(unsig __flush_tlb_one(vaddr); } +#ifndef CONFIG_XEN +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_addr holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _end. _end is + * rounded up to the 2MB boundary. 
This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + pmd_t *last_pmd = pmd + PTRS_PER_PMD; + + for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + if (!pmd_present(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} +#endif + /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); + printk(KERN_ERR "Invalid __set_fixmap\n"); return; } switch (idx) { @@ -375,16 +361,14 @@ __set_fixmap (enum fixed_addresses idx, } } -unsigned long __meminitdata table_start, table_end; - static __meminit void *alloc_static_page(unsigned long *phys) { unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map; if (after_bootmem) { void *adr = (void *)get_zeroed_page(GFP_ATOMIC); - *phys = __pa(adr); + return adr; } @@ -396,7 +380,7 @@ static __meminit void *alloc_static_page #define PTE_SIZE PAGE_SIZE -static inline int make_readonly(unsigned long paddr) +static inline int __meminit make_readonly(unsigned long paddr) { extern char __vsyscall_0; int readonly = 0; @@ -430,33 +414,38 @@ static inline int make_readonly(unsigned /* Must run before zap_low_mappings */ __meminit void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long vaddr; pmd_t *pmd, *last_pmd; + unsigned long vaddr; int i, pmds; pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; vaddr = __START_KERNEL_map; pmd = level2_kernel_pgt; last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { for (i = 0; i < pmds; i++) { if (pmd_present(pmd[i])) - goto next; + goto continue_outer_loop; } vaddr += addr & ~PMD_MASK; addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + __flush_tlb_all(); + return (void *)vaddr; - next: +continue_outer_loop: ; } printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); return NULL; } -/* To avoid virtual aliases later */ +/* + * To avoid virtual aliases later: + */ __meminit void early_iounmap(void *addr, unsigned long size) { unsigned long vaddr; @@ -466,9 +455,11 @@ __meminit void early_iounmap(void *addr, vaddr = (unsigned long)addr; pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) pmd_clear(pmd + i); - __flush_tlb(); + + __flush_tlb_all(); } #endif @@ -517,18 +508,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud,0); + pmd_t *pmd = pmd_offset(pud, 0); spin_lock(&init_mm.page_table_lock); phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ +static void __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +{ int i = pud_index(addr); - for (; i < PTRS_PER_PUD; i++, addr = (addr & 
PUD_MASK) + PUD_SIZE ) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -550,8 +542,8 @@ static void __meminit phys_pud_init(pud_ early_make_page_readonly(pmd, XENFEAT_writable_page_tables); } - __flush_tlb(); -} + __flush_tlb_all(); +} void __init xen_init_pt(void) { @@ -637,6 +629,7 @@ void __init xen_init_pt(void) static void __init extend_init_mapping(unsigned long tables_space) { unsigned long va = __START_KERNEL_map; + unsigned long start = start_pfn; unsigned long phys, addr, *pte_page; pmd_t *pmd; pte_t *pte, new_pte; @@ -713,6 +706,10 @@ static void __init extend_init_mapping(u BUG(); va += PAGE_SIZE; } + + if (start_pfn > start) + reserve_early(start << PAGE_SHIFT, + start_pfn << PAGE_SHIFT, "INITMAP"); } static void __init find_early_table_space(unsigned long end) @@ -737,7 +734,7 @@ static void __init find_early_table_spac (table_start << PAGE_SHIFT) + tables); } -static void xen_finish_init_mapping(void) +static void __init xen_finish_init_mapping(void) { unsigned long i, start, end; @@ -769,13 +766,6 @@ static void xen_finish_init_mapping(void /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */ table_end = ~0UL; - /* - * Prefetch pte's for the bt_ioremap() area. It gets used before the - * boot-time allocator is online, so allocate-on-demand would fail. - */ - for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++) - __set_fixmap(i, 0, __pgprot(0)); - /* Switch to the real shared_info page, and clear the dummy page. */ set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); @@ -795,20 +785,23 @@ static void xen_finish_init_mapping(void table_end = start_pfn; } -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) -{ +{ unsigned long next; - Dprintk("init_memory_mapping\n"); + pr_debug("init_memory_mapping\n"); - /* + /* * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. 
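find_early_table_space() above sizes the early page-table pool from simple round-ups, and the new reserve_early() call then pins that early range so nothing else reuses it. For a made-up 8 GB example the arithmetic looks like this (standalone sketch, constants assumed for 4k pages on x86-64):

/* Illustrative only: pages needed to direct-map "end" bytes of RAM. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PMD_SIZE	(2ULL << 20)	/* one PMD entry maps 2 MB */
#define PUD_SIZE	(1ULL << 30)	/* one PUD entry maps 1 GB */
#define PTRS_PER_PMD	512ULL
#define PTRS_PER_PUD	512ULL

int main(void)
{
	uint64_t end = 8ULL << 30;	/* example: 8 GB of RAM */

	/* One PUD page covers 512 GB, one PMD page covers 1 GB. */
	uint64_t puds = (end + PUD_SIZE * PTRS_PER_PUD - 1) /
			(PUD_SIZE * PTRS_PER_PUD);
	uint64_t pmds = (end + PMD_SIZE * PTRS_PER_PMD - 1) /
			(PMD_SIZE * PTRS_PER_PMD);

	printf("%llu PUD page(s), %llu PMD page(s), %llu bytes of tables\n",
	       (unsigned long long)puds, (unsigned long long)pmds,
	       (unsigned long long)((puds + pmds) * PAGE_SIZE));
	return 0;
}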
*/ if (!after_bootmem) find_early_table_space(end); @@ -817,8 +810,8 @@ void __init_refok init_memory_mapping(un end = (unsigned long)__va(end); for (; start < end; start = next) { - unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; pud_t *pud; if (after_bootmem) @@ -826,8 +819,8 @@ void __init_refok init_memory_mapping(un else pud = alloc_static_page(&pud_phys); next = start + PGDIR_SIZE; - if (next > end) - next = end; + if (next > end) + next = end; phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) { early_make_page_readonly(pud, XENFEAT_writable_page_tables); @@ -841,12 +834,17 @@ void __init_refok init_memory_mapping(un } __flush_tlb_all(); + + if (!after_bootmem) + reserve_early(table_start << PAGE_SHIFT, + table_end << PAGE_SHIFT, "PGTABLE"); } #ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -860,40 +858,6 @@ void __init paging_init(void) } #endif -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - /* * Memory hotplug specific functions */ @@ -919,16 +883,12 @@ int arch_add_memory(int nid, u64 start, unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, (start + size -1)); + init_memory_mapping(start, start + size-1); ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; + WARN_ON(1); return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -942,36 +902,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. 
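paging_init() above only fills in the zone ceilings; on x86-64 those are fixed at 16 MB for ZONE_DMA and 4 GB for ZONE_DMA32. A tiny standalone reminder of the pfn values involved (not from the patch):

/* Illustrative only: x86-64 zone boundaries as page frame numbers. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA_PFN	((16ULL << 20) >> PAGE_SHIFT)
#define MAX_DMA32_PFN	((4ULL << 30) >> PAGE_SHIFT)

int main(void)
{
	printf("ZONE_DMA   ends at pfn %#llx (%llu MB)\n",
	       MAX_DMA_PFN, (MAX_DMA_PFN << PAGE_SHIFT) >> 20);
	printf("ZONE_DMA32 ends at pfn %#llx (%llu MB)\n",
	       MAX_DMA32_PFN, (MAX_DMA32_PFN << PAGE_SHIFT) >> 20);
	return 0;
}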
- */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, + kcore_modules, kcore_vsyscall; void __init mem_init(void) { @@ -980,8 +912,7 @@ void __init mem_init(void) pci_iommu_alloc(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + /* clear_bss() already clear the empty_zero_page */ reservedpages = 0; @@ -998,7 +929,6 @@ void __init mem_init(void) } reservedpages = end_pfn - totalram_pages - absent_pages_in_range(0, end_pfn); - after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -1006,46 +936,64 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); kclist_add(&kcore_kernel, &_stext, _end - _stext); kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START); - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), end_pfn << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); + + cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr; + unsigned long addr = begin; - if (begin >= end) + if (addr >= end) return; + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), POISON_FREE_INITMEM, PAGE_SIZE); if (addr >= __START_KERNEL_map) { /* make_readonly() reports all kernel addresses. 
*/ - __make_page_writable(__va(__pa(addr))); - change_page_attr_addr(addr, 1, __pgprot(0)); + if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)), + pfn_pte(__pa(addr) >> PAGE_SHIFT, + PAGE_KERNEL), + 0)) + BUG(); + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + BUG(); } free_page(addr); totalram_pages++; } - if (addr > __START_KERNEL_map) - global_flush_tlb(); +#endif } void free_initmem(void) @@ -1056,6 +1004,8 @@ void free_initmem(void) } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -1077,18 +1027,27 @@ void mark_rodata_ro(void) if (end <= start) return; - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. + * The rodata section (but not the kernel text!) should also be + * not-executable. */ - global_flush_tlb(); + start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + set_memory_nx(start, (end - start) >> PAGE_SHIFT); + + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif } #endif @@ -1099,17 +1058,21 @@ void free_initrd_mem(unsigned long start } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); #endif unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ + /* + * This can happen with kdump kernels when accessing + * firmware tables: + */ if (pfn < end_pfn_map) return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); return; @@ -1117,9 +1080,9 @@ void __init reserve_bootmem_generic(unsi /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); +#else + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif #ifndef CONFIG_XEN if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { @@ -1129,46 +1092,49 @@ void __init reserve_bootmem_generic(unsi #endif } -int kern_addr_valid(unsigned long addr) -{ +int kern_addr_valid(unsigned long addr) +{ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; if (above != 0 && above != -1UL) - return 0; - + return 0; + pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) return 0; pud = pud_offset(pgd, addr); if (pud_none(*pud)) - return 0; + return 0; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; + if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) return 0; + return pfn_valid(pte_pfn(*pte)); } -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 
32bit has a real VMA now and does - not need special handling anymore. */ - +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) @@ -1183,14 +1149,17 @@ struct vm_area_struct *get_gate_vma(stru int in_gate_area(struct task_struct *task, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); } -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. +/* + * Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives: */ int in_gate_area_no_task(unsigned long addr) { @@ -1210,8 +1179,8 @@ const char *arch_vma_name(struct vm_area /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long size, int node) +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) { unsigned long addr = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + size); @@ -1226,6 +1195,7 @@ int __meminit vmemmap_populate(struct pa pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); if (!pud) return -ENOMEM; @@ -1233,20 +1203,22 @@ int __meminit vmemmap_populate(struct pa pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { pte_t entry; - void *p = vmemmap_alloc_block(PMD_SIZE, node); + void *p; + + p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); - mk_pte_huge(entry); - set_pmd(pmd, __pmd(pte_val(entry))); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd_ma(__pte_val(entry))); printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", addr, addr + PMD_SIZE - 1, p, node); - } else + } else { vmemmap_verify((pte_t *)pmd, node, addr, next); + } } - return 0; } #endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/ioremap-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,687 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. 
+ * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> + +enum ioremap_mode { + IOR_MODE_UNCACHED, + IOR_MODE_CACHED, +}; + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + +#endif + +static int direct_remap_area_pte_fn(pte_t *pte, + struct page *pmd_page, + unsigned long address, + void *data) +{ + mmu_update_t **v = (mmu_update_t **)data; + + BUG_ON(!pte_none(*pte)); + + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + (*v)++; + + return 0; +} + +static int __direct_remap_pfn_range(struct mm_struct *mm, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int rc; + unsigned long i, start_address; + mmu_update_t *u, *v, *w; + + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (u == NULL) + return -ENOMEM; + + start_address = address; + + flush_cache_all(); + + for (i = 0; i < size; i += PAGE_SIZE) { + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { + /* Flush a full batch after filling in the PTE ptrs. */ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) + goto out; + v = w = u; + start_address = address; + } + + /* + * Fill in the machine address: PTE ptr is done later by + * apply_to_page_range(). + */ + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; + + mfn++; + address += PAGE_SIZE; + v++; + } + + if (v != u) { + /* Final batch. 
*/ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) + goto out; + } + + rc = 0; + + out: + flush_tlb_all(); + + free_page((unsigned long)u); + + return rc; +} + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return remap_pfn_range(vma, address, mfn, size, prot); + + if (domid == DOMID_SELF) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + vma->vm_mm->context.has_foreign_mappings = 1; + + return __direct_remap_pfn_range( + vma->vm_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_remap_pfn_range); + +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + return __direct_remap_pfn_range( + &init_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_kernel_remap_pfn_range); + +static int lookup_pte_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + uint64_t *ptep = (uint64_t *)data; + if (ptep) + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + return 0; +} + +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep) +{ + return apply_to_page_range(mm, address, PAGE_SIZE, + lookup_pte_fn, ptep); +} + +EXPORT_SYMBOL(create_lookup_pte_addr); + +static int noop_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + return 0; +} + +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size) +{ + return apply_to_page_range(mm, address, size, noop_fn, NULL); +} + +EXPORT_SYMBOL(touch_pte_range); + +#ifdef CONFIG_X86_32 +int page_is_ram(unsigned long pagenr) +{ + unsigned long addr, end; + int i; + +#ifndef CONFIG_XEN + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + if (pagenr == 0) + return 0; + + /* + * Second special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + */ + if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && + pagenr < (BIOS_END >> PAGE_SHIFT)) + return 0; +#endif + + for (i = 0; i < e820.nr_map; i++) { + /* + * Not usable memory: + */ + if (e820.map[i].type != E820_RAM) + continue; + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; + + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} +#endif + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, + enum ioremap_mode mode) +{ + unsigned long nrpages = size >> PAGE_SHIFT; + int err; + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + err = set_memory_uc(vaddr, nrpages); + break; + case IOR_MODE_CACHED: + err = set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! 
We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, + enum ioremap_mode mode) +{ + unsigned long mfn, offset, last_addr, vaddr; + struct vm_struct *area; + pgprot_t prot; + domid_t domid = DOMID_IO; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (is_initial_xendomain() && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + for (mfn = PFN_DOWN(phys_addr); mfn < PFN_UP(last_addr); mfn++) { + unsigned long pfn = mfn_to_local_pfn(mfn); + + if (pfn >= max_pfn) + continue; + + domid = DOMID_SELF; + + if (pfn >= max_pfn_mapped) /* bogus */ + continue; + + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + return NULL; + } + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + /* + * FIXME: we will use UC MINUS for now, as video fb drivers + * depend on it. Upcoming ioremap_wc() will fix this behavior. + */ + prot = PAGE_KERNEL_UC_MINUS; + break; + case IOR_MODE_CACHED: + prot = PAGE_KERNEL; + break; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP | (mode << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + vaddr = (unsigned long) area->addr; + if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr), + size, prot, domid)) { + free_vm_area(area); + return NULL; + } + + if (ioremap_change_attr(vaddr, size, mode) < 0) { + iounmap((void __iomem *) vaddr); + return NULL; + } + + return (void __iomem *) (vaddr + offset); +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); +} +EXPORT_SYMBOL(ioremap_nocache); + +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_CACHED); +} +EXPORT_SYMBOL(ioremap_cache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. 
+ */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + if ((p->flags >> 20) != IOR_MODE_CACHED) { + unsigned long n = get_vm_area_size(p) >> PAGE_SHIFT; + unsigned long mfn = p->phys_addr; + unsigned long va = (unsigned long)addr; + + for (; n > 0; n--, mfn++, va += PAGE_SIZE) + if (mfn_to_local_pfn(mfn) < max_pfn) + set_memory_wb(va, 1); + } + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] + __attribute__((aligned(PAGE_SIZE))); + +#ifdef CONFIG_X86_32 +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; +} +#else +#define early_ioremap_pmd early_get_pmd +#define make_lowmem_page_readonly early_make_page_readonly +#define make_lowmem_page_writable make_page_writable +#endif + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + return &bm_pte[pte_index(addr)]; +} + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +#ifdef CONFIG_X86_32 +void __init early_ioremap_clear(void) +{ + pmd_t *pmd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_clear()\n"); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + pmd_clear(pmd); + 
make_lowmem_page_writable(bm_pte, XENFEAT_writable_page_tables); + /* paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); */ + __flush_tlb_all(); +} + +void __init early_ioremap_reset(void) +{ + enum fixed_addresses idx; + unsigned long addr, phys; + pte_t *pte; + + after_paging_init = 1; + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { + addr = fix_to_virt(idx); + pte = early_ioremap_pte(addr); + if (pte_present(*pte)) { + phys = __pte_val(*pte) & PAGE_MASK; + set_fixmap(idx, phys); + } + } +} +#endif /* CONFIG_X86_32 */ + +static void __init __early_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) + set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags)); + else + pte_clear(NULL, addr, pte); + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + unsigned long phys) +{ + if (after_paging_init) + set_fixmap(idx, phys); + else + __early_set_fixmap(idx, phys, PAGE_KERNEL); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + + +int __initdata early_ioremap_nested; + +static int __init check_early_ioremap_leak(void) +{ + if (!early_ioremap_nested) + return 0; + + printk(KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + early_ioremap_nested); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + WARN_ON(1); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages, nesting; + enum fixed_addresses idx0, idx; + + WARN_ON(system_state != SYSTEM_BOOTING); + + nesting = early_ioremap_nested; + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", + phys_addr, size, nesting); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + if (nesting >= FIX_BTMAPS_NESTING) { + WARN_ON(1); + return NULL; + } + early_ioremap_nested++; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. 
+ */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + + return (void *) (offset + fix_to_virt(idx0)); +} + +void __init early_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + unsigned int nesting; + + nesting = --early_ioremap_nested; + WARN_ON(nesting < 0); + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, nesting); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} --- sle11sp1-2010-03-22.orig/arch/x86/mm/ioremap_32-xen.c 2009-11-06 10:51:07.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,445 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> -#include <linux/sched.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -#define ISA_START_ADDRESS 0x0 -#define ISA_END_ADDRESS 0x100000 - -static int direct_remap_area_pte_fn(pte_t *pte, - struct page *pmd_page, - unsigned long address, - void *data) -{ - mmu_update_t **v = (mmu_update_t **)data; - - BUG_ON(!pte_none(*pte)); - - (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); - (*v)++; - - return 0; -} - -static int __direct_remap_pfn_range(struct mm_struct *mm, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int rc; - unsigned long i, start_address; - mmu_update_t *u, *v, *w; - - u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (u == NULL) - return -ENOMEM; - - start_address = address; - - flush_cache_all(); - - for (i = 0; i < size; i += PAGE_SIZE) { - if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { - /* Flush a full batch after filling in the PTE ptrs. */ - rc = apply_to_page_range(mm, start_address, - address - start_address, - direct_remap_area_pte_fn, &w); - if (rc) - goto out; - rc = -EFAULT; - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) - goto out; - v = w = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * apply_to_page_range(). - */ - v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; - - mfn++; - address += PAGE_SIZE; - v++; - } - - if (v != u) { - /* Final batch. 
*/ - rc = apply_to_page_range(mm, start_address, - address - start_address, - direct_remap_area_pte_fn, &w); - if (rc) - goto out; - rc = -EFAULT; - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) - goto out; - } - - rc = 0; - - out: - flush_tlb_all(); - - free_page((unsigned long)u); - - return rc; -} - -int direct_remap_pfn_range(struct vm_area_struct *vma, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return remap_pfn_range(vma, address, mfn, size, prot); - - if (domid == DOMID_SELF) - return -EINVAL; - - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - - vma->vm_mm->context.has_foreign_mappings = 1; - - return __direct_remap_pfn_range( - vma->vm_mm, address, mfn, size, prot, domid); -} -EXPORT_SYMBOL(direct_remap_pfn_range); - -int direct_kernel_remap_pfn_range(unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - return __direct_remap_pfn_range( - &init_mm, address, mfn, size, prot, domid); -} -EXPORT_SYMBOL(direct_kernel_remap_pfn_range); - -static int lookup_pte_fn( - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) -{ - uint64_t *ptep = (uint64_t *)data; - if (ptep) - *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << - PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); - return 0; -} - -int create_lookup_pte_addr(struct mm_struct *mm, - unsigned long address, - uint64_t *ptep) -{ - return apply_to_page_range(mm, address, PAGE_SIZE, - lookup_pte_fn, ptep); -} - -EXPORT_SYMBOL(create_lookup_pte_addr); - -static int noop_fn( - pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) -{ - return 0; -} - -int touch_pte_range(struct mm_struct *mm, - unsigned long address, - unsigned long size) -{ - return apply_to_page_range(mm, address, size, noop_fn, NULL); -} - -EXPORT_SYMBOL(touch_pte_range); - -/* - * Does @address reside within a non-highmem page that is local to this virtual - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). - * See the comment that accompanies mfn_to_local_pfn() in page.h to understand - * why this works. - */ -static inline int is_local_lowmem(unsigned long address) -{ - extern unsigned long max_low_pfn; - return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn); -} - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t prot; - domid_t domid = DOMID_IO; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_initial_xendomain() && - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) isa_bus_to_virt(phys_addr); - - /* - * Don't allow anybody to remap normal RAM that we're using.. 
- */ - if (is_local_lowmem(phys_addr)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = bus_to_virt(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - - domid = DOMID_SELF; - } - - prot = __pgprot(_KERNPG_TABLE | flags); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, - phys_addr>>PAGE_SHIFT, - size, prot, domid)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (is_local_lowmem(last_addr)) { - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if ((void __force *)addr <= high_memory) - return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. - */ - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. 
Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), - get_vm_area_size(p) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_initial_xendomain() && - phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return isa_bus_to_virt(phys_addr); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/pageattr-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,1414 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/interrupt.h> + +#include <asm/e820.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/mmu_context.h> + +#ifndef CONFIG_X86_64 +#define TASK_SIZE64 TASK_SIZE +#endif + +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). 
The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in dead locks, and the order of acquires + * doesn't matter. + */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + +#define PIN_BATCH sizeof(void *) +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); + +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, + unsigned int cpu, unsigned int seq) +{ + unsigned long pfn = page_to_pfn(page); + + if (PageHighMem(page)) { + if (pgprot_val(flags) & _PAGE_RW) + ClearPagePinned(page); + else + SetPagePinned(page); + } else { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); + if (unlikely(++seq == PIN_BATCH)) { + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + PIN_BATCH, NULL))) + BUG(); + seq = 0; + } + } + + return seq; +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + int g,u,m; + unsigned int cpu, seq; + multicall_entry_t *mcl; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + cpu = get_cpu(); + + /* + * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables + * may not be the 'current' task's pagetables (e.g., current may be + * 32-bit, but the pagetables may be for a 64-bit task). + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. 
+ */ + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); + } + } + } + + mcl = per_cpu(pb_mcl, cpu); +#ifdef CONFIG_X86_64 + if (unlikely(seq > PIN_BATCH - 2)) { + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) + BUG(); + seq = 0; + } + MULTI_update_va_mapping(mcl + seq, + (unsigned long)__user_pgd(pgd_base), + pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), + 0); + MULTI_update_va_mapping(mcl + seq + 1, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) + BUG(); +#else + if (likely(seq != 0)) { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + seq + 1, NULL))) + BUG(); + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH)) + BUG(); +#endif + + put_cpu(); +} + +static void __pgd_pin(pgd_t *pgd) +{ + pgd_walk(pgd, PAGE_KERNEL_RO); + kmap_flush_unused(); + xen_pgd_pin(__pa(pgd)); /* kernel */ +#ifdef CONFIG_X86_64 + xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ +#endif + SetPagePinned(virt_to_page(pgd)); +} + +static void __pgd_unpin(pgd_t *pgd) +{ + xen_pgd_unpin(__pa(pgd)); +#ifdef CONFIG_X86_64 + xen_pgd_unpin(__pa(__user_pgd(pgd))); +#endif + pgd_walk(pgd, PAGE_KERNEL); + ClearPagePinned(virt_to_page(pgd)); +} + +void pgd_test_and_unpin(pgd_t *pgd) +{ + if (PagePinned(virt_to_page(pgd))) + __pgd_unpin(pgd); +} + +void mm_pin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_pin(mm->pgd); + pin_unlock(mm); +} + +void mm_unpin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + __pgd_unpin(mm->pgd); + pin_unlock(mm); +} + +void mm_pin_all(void) +{ + struct page *page; + unsigned long flags; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + /* + * Allow uninterrupted access to the pgd_list. Also protects + * __pgd_pin() by disabling preemption. + * All other CPUs must be at a safe point (e.g., in stop_machine + * or offlined entirely). + */ + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) + __pgd_pin((pgd_t *)page_address(page)); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} + +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + if (!PagePinned(virt_to_page(mm->pgd))) + mm_pin(mm); +} + +void arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
+ */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if (PagePinned(virt_to_page(mm->pgd)) + && atomic_read(&mm->mm_count) == 1 + && !mm->context.has_foreign_mappings) + mm_unpin(mm); +} + +static void _pte_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pte_free(page); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) { + pgtable_page_ctor(pte); + SetPageForeign(pte, _pte_free); + init_page_count(pte); + } + return pte; +} + +void __pte_free(pgtable_t pte) +{ + if (!PageHighMem(pte)) { + unsigned long va = (unsigned long)page_address(pte); + unsigned int level; + pte_t *ptep = lookup_address(va, &level); + + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); + if (!pte_write(*ptep) + && HYPERVISOR_update_va_mapping(va, + mk_pte(pte, PAGE_KERNEL), + 0)) + BUG(); + } else +#ifdef CONFIG_HIGHPTE + ClearPagePinned(pte); +#else + BUG(); +#endif + + ClearPageForeign(pte); + init_page_count(pte); + pgtable_page_dtor(pte); + __free_page(pte); +} + +#if PAGETABLE_LEVELS >= 3 +static void _pmd_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + __pmd_free(page); +} + +pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pmd; + + pmd = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + if (!pmd) + return NULL; + SetPageForeign(pmd, _pmd_free); + init_page_count(pmd); + return page_address(pmd); +} + +void __pmd_free(pgtable_t pmd) +{ + unsigned long va = (unsigned long)page_address(pmd); + unsigned int level; + pte_t *ptep = lookup_address(va, &level); + + BUG_ON(!ptep || level != PG_LEVEL_4K || !pte_present(*ptep)); + if (!pte_write(*ptep) + && HYPERVISOR_update_va_mapping(va, mk_pte(pmd, PAGE_KERNEL), 0)) + BUG(); + + ClearPageForeign(pmd); + init_page_count(pmd); + __free_page(pmd); +} +#endif + +/* blktap and gntdev need this, as otherwise they would implicitly (and + * needlessly, as they never use it) reference init_mm. */ +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int full) +{ + return ptep_get_and_clear_full(vma ? 
vma->vm_mm : &init_mm, + addr, ptep, full); +} +EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full); + +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long vaddr; + pgprot_t mask_set; + pgprot_t mask_clr; + int numpages; + int flushtlb; + unsigned long pfn; +}; + +#ifdef CONFIG_X86_64 + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; +} + +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} + +static void __cpa_flush_all(void *arg) +{ + unsigned long cache = (unsigned long)arg; + + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (cache && boot_cpu_data.x86_model >= 4) + wbinvd(); +} + +static void cpa_flush_all(unsigned long cache) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1, 1); + + if (!cache) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (__pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) +{ + pgprot_t forbidden = __pgprot(0); + +#ifndef CONFIG_XEN + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 
+ */ + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_NX; +#endif + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on. On + * 64bit we do not enforce !NX on the low mapping + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The .rodata section needs to be read-only. Using the pfn + * catches all aliases. + */ + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_RW; + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + + return pte_offset_kernel(pmd, address); +} + +/* + * Set the new pmd in all the pgds we know about: + */ +static void __set_pmd_pte(pte_t *kpte, unsigned long address, + unsigned int level, pte_t pte) +{ + /* change init_mm */ + switch(level) { + case PG_LEVEL_2M: + xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte))); + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte))); + break; +#endif + default: + BUG(); + } +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte))); + } + } +#endif +} + +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int i, do_split = 1; + unsigned int level; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + psize = PUD_PAGE_SIZE; + pmask = PUD_PAGE_MASK; + break; +#endif + default: + do_split = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. 
Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = pte_pgprot(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + /* + * old_pte points to the large page base address. So we need + * to add the offset of the virtual address: + */ + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + new_prot = static_protections(new_prot, address, pfn); + + /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + if (pfn < max_mapnr) { + addr = address + PAGE_SIZE; + for (i = 1; i < cpa->numpages && ++pfn < max_mapnr; + i++, addr += PAGE_SIZE) { + pgprot_t chk_prot = static_protections(new_prot, addr, pfn); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + } + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, level, new_pte); + cpa->flushtlb = 1; + do_split = 0; + } + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + return do_split; +} + +static LIST_HEAD(page_pool); +static unsigned long pool_size, pool_pages, pool_low; +static unsigned long pool_used, pool_failed; + +static void cpa_fill_pool(struct page **ret) +{ + gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; + + /* + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: + */ + if (current->flags & PF_MEMALLOC) + return; + current->flags |= PF_MEMALLOC; + + /* + * Allocate atomically from atomic contexts: + */ + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; + + while (pool_pages < pool_size || (ret && !*ret)) { + p = alloc_pages(gfp, 0); + if (!p) { + pool_failed++; + break; + } + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); + list_add(&p->lru, &page_pool); + pool_pages++; + spin_unlock_irqrestore(&pgd_lock, flags); + } + + current->flags &= ~PF_MEMALLOC; +} + +#define SHIFT_MB (20 - PAGE_SHIFT) +#define ROUND_MB_GB ((1 << 10) - 1) +#define SHIFT_MB_GB 10 +#define POOL_PAGES_PER_GB 16 + +void __init cpa_init(void) +{ + struct sysinfo si; + unsigned long gb; + + si_meminfo(&si); + /* + * Calculate the number of pool pages: + * + * Convert totalram (nr of pages) to MiB and round to the next + * GiB. 
Shift MiB to Gib and multiply the result by + * POOL_PAGES_PER_GB: + */ + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } + pool_low = pool_size; + + cpa_fill_pool(NULL); + printk(KERN_DEBUG + "CPA: page pool initialized %lu of %lu pages preallocated\n", + pool_pages, pool_size); +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + unsigned long flags, mfn, mfninc = 1; + unsigned int i, level; + pte_t *pbase, *tmp; + pgprot_t ref_prot; + struct page *base; + + /* + * Get a page from the pool. The pool list is protected by the + * pgd_lock, which we have to take anyway for the split + * operation: + */ + spin_lock_irqsave(&pgd_lock, flags); + if (list_empty(&page_pool)) { + spin_unlock_irqrestore(&pgd_lock, flags); + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; + } + + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + pbase = (pte_t *)page_address(base); +#ifdef CONFIG_X86_32 + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); +#endif + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + } +#endif + + /* + * Get the target mfn from the original entry: + */ + mfn = __pte_mfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc) + set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot)); + + /* + * Install the new, split up pagetable. Important details here: + * + * On Intel the NX bit of all levels must be cleared to make a + * page executable. See section 4.13.2 of Intel 64 and IA-32 + * Architectures Software Developer's Manual). + * + * Mark the entry present. The current mapping might be + * set to not present, which we preserved above. + */ + if (!xen_feature(XENFEAT_writable_page_tables) && + HYPERVISOR_update_va_mapping((unsigned long)pbase, + mk_pte(base, PAGE_KERNEL_RO), 0)) + BUG(); + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + pgprot_val(ref_prot) |= _PAGE_PRESENT; + __set_pmd_pte(kpte, address, level, mk_pte(base, ref_prot)); + base = NULL; + +out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) { + list_add(&base->lru, &page_pool); + pool_pages++; + } else + pool_used++; + spin_unlock_irqrestore(&pgd_lock, flags); + + return 0; +} + +static int __change_page_attr(struct cpa_data *cpa, int primary) +{ + unsigned long address = cpa->vaddr; + int do_split, err; + unsigned int level; + pte_t *kpte, old_pte; + +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return primary ? -EINVAL : 0; + + old_pte = *kpte; + if (!__pte_val(old_pte)) { + if (!primary) + return 0; + printk(KERN_WARNING "CPA: called for zero pte. 
" + "vaddr = %lx cpa->vaddr = %lx\n", address, + cpa->vaddr); + WARN_ON(1); + return -EINVAL; + } + + if (level == PG_LEVEL_4K) { + pte_t new_pte; + pgprot_t new_prot = pte_pgprot(old_pte); + unsigned long mfn = __pte_mfn(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + new_prot = static_protections(new_prot, address, + mfn_to_local_pfn(mfn)); + + /* + * We need to keep the mfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot)); + cpa->pfn = mfn_to_local_pfn(mfn); + /* + * Do we really change anything ? + */ + if (__pte_val(old_pte) != __pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flushtlb = 1; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(kpte, address); + if (!err) { + cpa->flushtlb = 1; + goto repeat; + } + + return err; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); + +static int cpa_process_alias(struct cpa_data *cpa) +{ + struct cpa_data alias_cpa; + int ret = 0; + + if (cpa->pfn > max_pfn_mapped) + return 0; + + /* + * No need to redo, when the primary call touched the direct + * mapping already: + */ + if (!within(cpa->vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + + alias_cpa = *cpa; + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + } + +#ifdef CONFIG_X86_64 + if (ret) + return ret; + /* + * No need to redo, when the primary call touched the high + * mapping already: + */ + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + return 0; + + /* + * If the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) + return 0; + + alias_cpa = *cpa; + alias_cpa.vaddr = + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map; + + /* + * The high mapping range is imprecise, so ignore the return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); +#endif + return ret; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +{ + int ret, numpages = cpa->numpages; + + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + + ret = __change_page_attr(cpa, checkalias); + if (ret) + return ret; + + if (checkalias) { + ret = cpa_process_alias(cpa); + if (ret) + return ret; + } + + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. 
+ */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; + } + return 0; +} + +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + struct cpa_data cpa; + int ret, cache, checkalias; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + return 0; + + /* Ensure we are PAGE_SIZE aligned */ + if (addr & ~PAGE_MASK) { + addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + + cpa.vaddr = addr; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flushtlb = 0; + + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + + ret = __change_page_attr_set_clr(&cpa, checkalias); + + /* + * Check whether we really changed something: + */ + if (!cpa.flushtlb) + goto out; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) + cpa_flush_range(addr, numpages, cache); + else + cpa_flush_all(cache); + +out: + cpa_fill_pool(NULL); + + return ret; +} + +static inline int change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PCD)); +} +EXPORT_SYMBOL(set_memory_uc); + +int set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned 
long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0)}; + + return __change_page_attr_set_clr(&cpa, 1); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + + return __change_page_attr_set_clr(&cpa, 1); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + + /* + * The return value is ignored as the calls cannot fail. + * Large pages are kept enabled at boot time, and are + * split up quickly with DEBUG_PAGEALLOC. If a splitup + * fails here (due to temporary memory shortage) no damage + * is done because we just keep the largepage intact up + * to the next attempt when it will likely be split up: + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); + + /* + * Try to refill the page pool here. We can do this only after + * the tlb flush. 
+ */ + cpa_fill_pool(NULL); +} + +#ifdef CONFIG_HIBERNATION + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (__pte_val(*pte) & _PAGE_PRESENT); +} + +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +static inline int in_secondary_range(unsigned long va) +{ +#ifdef CONFIG_X86_64 + return va >= VMALLOC_START && va < VMALLOC_END; +#else + return va >= (unsigned long)high_memory; +#endif +} + +static void __make_page_readonly(unsigned long va) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K); + if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0)) + BUG(); + if (in_secondary_range(va)) { + unsigned long pfn = pte_pfn(*pte); + +#ifdef CONFIG_HIGHMEM + if (pfn >= highstart_pfn) + kmap_flush_unused(); /* flush stale writable kmaps */ + else +#endif + __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT)); + } +} + +static void __make_page_writable(unsigned long va) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K); + if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), 0)) + BUG(); + if (in_secondary_range(va)) { + unsigned long pfn = pte_pfn(*pte); + +#ifdef CONFIG_HIGHMEM + if (pfn < highstart_pfn) +#endif + __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT)); + } +} + +void make_page_readonly(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_readonly((unsigned long)va); +} + +void make_page_writable(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_writable((unsigned long)va); +} + +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) +{ + unsigned long addr; + + if (xen_feature(feature)) + return; + + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) + __make_page_readonly(addr); +} + +void make_pages_writable(void *va, unsigned int nr, unsigned int feature) +{ + unsigned long addr; + + if (xen_feature(feature)) + return; + + for (addr = (unsigned long)va; nr--; addr += PAGE_SIZE) + __make_page_writable(addr); +} + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif --- sle11sp1-2010-03-22.orig/arch/x86/mm/pageattr_64-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,542 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -#ifdef CONFIG_XEN -#include <asm/pgalloc.h> -#include <asm/mmu_context.h> - -static void _pin_lock(struct mm_struct *mm, int lock) { - if (lock) - spin_lock(&mm->page_table_lock); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - /* While mm->page_table_lock protects us against insertions and - * removals of higher level page table pages, it doesn't protect - * against updates of pte-s. Such updates, however, require the - * pte pages to be in consistent state (unpinned+writable or - * pinned+readonly). 
The pinning and attribute changes, however - * cannot be done atomically, which is why such updates must be - * prevented from happening concurrently. - * Note that no pte lock can ever elsewhere be acquired nesting - * with an already acquired one in the same mm, or with the mm's - * page_table_lock already acquired, as that would break in the - * non-split case (where all these are actually resolving to the - * one page_table_lock). Thus acquiring all of them here is not - * going to result in dead locks, and the order of acquires - * doesn't matter. - */ - { - pgd_t *pgd = mm->pgd; - unsigned g; - - for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { - pud_t *pud; - unsigned u; - - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - pmd_t *pmd; - unsigned m; - - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - spinlock_t *ptl; - - if (pmd_none(*pmd)) - continue; - ptl = pte_lockptr(0, pmd); - if (lock) - spin_lock(ptl); - else - spin_unlock(ptl); - } - } - } - } -#endif - if (!lock) - spin_unlock(&mm->page_table_lock); -} -#define pin_lock(mm) _pin_lock(mm, 1) -#define pin_unlock(mm) _pin_lock(mm, 0) - -#define PIN_BATCH 8 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); - -static inline unsigned int pgd_walk_set_prot(void *pt, pgprot_t flags, - unsigned int cpu, unsigned int seq) -{ - struct page *page = virt_to_page(pt); - unsigned long pfn = page_to_pfn(page); - - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); - if (unlikely(++seq == PIN_BATCH)) { - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), - PIN_BATCH, NULL))) - BUG(); - seq = 0; - } - - return seq; -} - -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) -{ - pgd_t *pgd = pgd_base; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int g,u,m; - unsigned int cpu, seq; - multicall_entry_t *mcl; - - cpu = get_cpu(); - - /* - * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not - * be the 'current' task's pagetables (e.g., current may be 32-bit, - * but the pagetables may be for a 64-bit task). - * Subtracting 1 from TASK_SIZE64 means the loop limit is correct - * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. 
- */ - for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - if (PTRS_PER_PUD > 1) /* not folded */ - seq = pgd_walk_set_prot(pud,flags,cpu,seq); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - if (PTRS_PER_PMD > 1) /* not folded */ - seq = pgd_walk_set_prot(pmd,flags,cpu,seq); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - if (pmd_none(*pmd)) - continue; - pte = pte_offset_kernel(pmd,0); - seq = pgd_walk_set_prot(pte,flags,cpu,seq); - } - } - } - - mcl = per_cpu(pb_mcl, cpu); - if (unlikely(seq > PIN_BATCH - 2)) { - if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) - BUG(); - seq = 0; - } - MULTI_update_va_mapping(mcl + seq, - (unsigned long)__user_pgd(pgd_base), - pfn_pte(virt_to_phys(__user_pgd(pgd_base))>>PAGE_SHIFT, flags), - 0); - MULTI_update_va_mapping(mcl + seq + 1, - (unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH); - if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) - BUG(); - - put_cpu(); -} - -static void __pgd_pin(pgd_t *pgd) -{ - pgd_walk(pgd, PAGE_KERNEL_RO); - xen_pgd_pin(__pa(pgd)); /* kernel */ - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ - SetPagePinned(virt_to_page(pgd)); -} - -static void __pgd_unpin(pgd_t *pgd) -{ - xen_pgd_unpin(__pa(pgd)); - xen_pgd_unpin(__pa(__user_pgd(pgd))); - pgd_walk(pgd, PAGE_KERNEL); - ClearPagePinned(virt_to_page(pgd)); -} - -void pgd_test_and_unpin(pgd_t *pgd) -{ - if (PagePinned(virt_to_page(pgd))) - __pgd_unpin(pgd); -} - -void mm_pin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - pin_lock(mm); - __pgd_pin(mm->pgd); - pin_unlock(mm); -} - -void mm_unpin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - pin_lock(mm); - __pgd_unpin(mm->pgd); - pin_unlock(mm); -} - -void mm_pin_all(void) -{ - struct page *page; - unsigned long flags; - - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - /* - * Allow uninterrupted access to the pgd_list. Also protects - * __pgd_pin() by disabling preemption. - * All other CPUs must be at a safe point (e.g., in stop_machine - * or offlined entirely). - */ - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!PagePinned(page)) - __pgd_pin((pgd_t *)page_address(page)); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (!PagePinned(virt_to_page(mm->pgd))) - mm_pin(mm); -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
- */ - if (tsk->active_mm == mm) { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); - - switch_mm(mm, &init_mm, tsk); - - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } - - task_unlock(tsk); - - if (PagePinned(virt_to_page(mm->pgd)) - && (atomic_read(&mm->mm_count) == 1) - && !mm->context.has_foreign_mappings) - mm_unpin(mm); -} - -static void _pte_free(struct page *page, unsigned int order) -{ - BUG_ON(order); - pte_free(page); -} - -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); - if (pte) { - SetPageForeign(pte, _pte_free); - init_page_count(pte); - } - return pte; -} - -void pte_free(struct page *pte) -{ - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); - - if (!pte_write(*virt_to_ptep(va))) - if (HYPERVISOR_update_va_mapping( - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)) - BUG(); - - ClearPageForeign(pte); - init_page_count(pte); - - __free_page(pte); -} -#endif /* CONFIG_XEN */ - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -void clflush_cache_range(void *adr, int size) -{ - int i; - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - clflush_cache_range(adr, PAGE_SIZE); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. 
- */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(__pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - pgprot_val(ref_prot2) &= ~_PAGE_NX; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - /* - * ..., but the XEN guest kernels (currently) do: - * If the pte was reserved, it means it was created at boot - * time (not via split_large_page) and in turn we must not - * replace it with a large page. - */ -#ifndef CONFIG_XEN - BUG_ON(PageReserved(kpte_page)); -#else - if (PageReserved(kpte_page)) - return 0; -#endif - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. 
- */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - /* - * Write-protect the semaphore, to exclude two contexts - * doing a list_replace_init() call in parallel and to - * exclude new additions to the deferred_pages list: - */ - down_write(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_write(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); --- sle11sp1-2010-03-22.orig/arch/x86/mm/pgtable_32-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/mm/pgtable_32-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -29,8 +29,6 @@ #include <xen/features.h> #include <asm/hypervisor.h> -static void pgd_test_and_unpin(pgd_t *pgd); - void show_mem(void) { int total = 0, reserved = 0; @@ -167,53 +165,6 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -static void _pte_free(struct page *page, unsigned int order) -{ - BUG_ON(order); - pte_free(page); -} - -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) { - SetPageForeign(pte, _pte_free); - init_page_count(pte); - } - return pte; -} - -void pte_free(struct page *pte) -{ - unsigned long pfn = page_to_pfn(pte); - - if (!PageHighMem(pte)) { - unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT); - - if (!pte_write(*virt_to_ptep(va))) - if (HYPERVISOR_update_va_mapping( - va, pfn_pte(pfn, PAGE_KERNEL), 0)) - BUG(); - } else - ClearPagePinned(pte); - - ClearPageForeign(pte); - init_page_count(pte); - - __free_page(pte); -} - -void pmd_ctor(struct kmem_cache *cache, void *pmd) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the @@ -224,224 +175,191 @@ void pmd_ctor(struct kmem_cache *cache, * vmalloc faults work because 
attached pagetables are never freed. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); - pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + + list_add(&page->lru, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page_private(page); - *pprev = next; - if (next) - set_page_private(next, (unsigned long)pprev); -} + struct page *page = virt_to_page(pgd); + list_del(&page->lru); +} +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -#if (PTRS_PER_PMD == 1) -/* Non-PAE pgd constructor */ -static void pgd_ctor(void *pgd) +static void pgd_ctor(void *p) { + pgd_t *pgd = p; unsigned long flags; - /* !PAE, no pagetable sharing */ + pgd_test_and_unpin(pgd); + + /* Clear usermode parts of PGD */ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); - /* must happen under lock */ - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} -#else /* PTRS_PER_PMD > 1 */ -/* PAE pgd constructor */ -static void pgd_ctor(void *pgd) -{ - /* PAE, kernel PMD may be shared */ - - if (SHARED_KERNEL_PMD) { - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + clone_pgd_range(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - } else { - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); } + + /* list required to sync kernel mapping updates */ + if (PAGETABLE_LEVELS == 2) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); } -#endif /* PTRS_PER_PMD */ static void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ - if (SHARED_KERNEL_PMD) - return; - - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + if (!SHARED_KERNEL_PMD) { + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } pgd_test_and_unpin(pgd); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -/* If we allocate a pmd for part of the kernel address space, then - make sure its initialized with the appropriate kernel mappings. - Otherwise use a cached zeroed pmd. */ -static pmd_t *pmd_cache_alloc(int idx) +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. 
+ */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { - pmd_t *pmd; + int i; - if (idx >= USER_PTRS_PER_PGD) { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; -#ifndef CONFIG_XEN - if (pmd) - memcpy(pmd, - (void *)pgd_page_vaddr(swapper_pg_dir[idx]), - sizeof(pmd_t) * PTRS_PER_PMD); -#endif - } else - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (__pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - return pmd; -} + pgdp[i] = xen_make_pgd(0); -static void pmd_cache_free(pmd_t *pmd, int idx) -{ - if (idx >= USER_PTRS_PER_PGD) { - make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables); - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); - free_page((unsigned long)pmd); - } else - kmem_cache_free(pmd_cache, pmd); + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } } -pgd_t *pgd_alloc(struct mm_struct *mm) +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) { + pud_t *pud; + pmd_t *pmds[UNSHARED_PTRS_PER_PGD]; + unsigned long addr, flags; int i; - pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); - pmd_t **pmds = NULL; - unsigned long flags; - - pgd_test_and_unpin(pgd); - - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; - -#ifdef CONFIG_XEN - if (!SHARED_KERNEL_PMD) { - /* - * We can race save/restore (if we sleep during a GFP_KERNEL memory - * allocation). We therefore store virtual addresses of pmds as they - * do not change across save/restore, and poke the machine addresses - * into the pgdir under the pgd_lock. - */ - pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); - if (!pmds) { - quicklist_free(0, pgd_dtor, pgd); - return NULL; - } - } -#endif - /* Allocate pmds, remember virtual addresses. */ - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pmd_t *pmd = pmd_cache_alloc(i); - - if (!pmd) + /* + * We can race save/restore (if we sleep during a GFP_KERNEL memory + * allocation). We therefore store virtual addresses of pmds as they + * do not change across save/restore, and poke the machine addresses + * into the pgdir under the pgd_lock. + */ + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; i++, addr += PUD_SIZE) { + pmds[i] = pmd_alloc_one(mm, addr); + if (!pmds[i]) goto out_oom; - - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); - if (pmds) - pmds[i] = pmd; - else - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } -#ifdef CONFIG_XEN - if (SHARED_KERNEL_PMD) - return pgd; - spin_lock_irqsave(&pgd_lock, flags); /* Protect against save/restore: move below 4GB under pgd_lock. */ - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { - int rc = xen_create_contiguous_region( - (unsigned long)pgd, 0, 32); - if (rc) { - spin_unlock_irqrestore(&pgd_lock, flags); - goto out_oom; - } + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb) + && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) { + spin_unlock_irqrestore(&pgd_lock, flags); +out_oom: + while (i--) + pmd_free(mm, pmds[i]); + return 0; } /* Copy kernel pmd contents and write-protect the new pmds. 
*/ - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { - memcpy(pmds[i], - (void *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - make_lowmem_page_readonly( - pmds[i], XENFEAT_writable_page_tables); - } + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + if (i >= USER_PTRS_PER_PGD) { + memcpy(pmds[i], + (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + make_lowmem_page_readonly( + pmds[i], XENFEAT_writable_page_tables); + } - /* It is safe to poke machine addresses of pmds under the pmd_lock. */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i]))); + /* It is safe to poke machine addresses of pmds under the pgd_lock. */ + pud_populate(mm, pud, pmds[i]); + } - /* Ensure this pgd gets picked up and pinned on save/restore. */ + /* List required to sync kernel mapping updates and + * to pin/unpin on save/restore. */ pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); - kfree(pmds); -#endif + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} - return pgd; +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ +} +#endif /* CONFIG_X86_PAE */ -out_oom: - if (!pmds) { - for (i--; i >= 0; i--) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - } else { - for (i--; i >= 0; i--) { - paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT); - pmd_cache_free(pmds[i], i); - } - kfree(pmds); +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + free_page((unsigned long)pgd); + pgd = NULL; } - quicklist_free(0, pgd_dtor, pgd); - return NULL; + + return pgd; } -void pgd_free(pgd_t *pgd) +void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - int i; - /* * After this the pgd should not be pinned for the duration of this * function's execution. We should never sleep and thus never race: @@ -450,39 +368,43 @@ void pgd_free(pgd_t *pgd) * 2. The machine addresses in PGD entries will not become invalid * due to a concurrent save/restore. 
*/ - pgd_test_and_unpin(pgd); + pgd_dtor(pgd); - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) { - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } + if (PTRS_PER_PMD > 1 && !xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region((unsigned long)pgd, 0); - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) - xen_destroy_contiguous_region((unsigned long)pgd, 0); - } + pgd_mop_up_pmds(mm, pgd); + free_page((unsigned long)pgd); +} - /* in the non-PAE case, free_pgtables() clears user pgd entries */ - quicklist_free(0, pgd_dtor, pgd); +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pt(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); } -void check_pgt_cache(void) +#ifdef CONFIG_X86_PAE + +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - quicklist_trim(0, pgd_dtor, 25, 16); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); } +#endif + void make_lowmem_page_readonly(void *va, unsigned int feature) { pte_t *pte; + unsigned int level; int rc; if (xen_feature(feature)) return; - pte = virt_to_ptep(va); + pte = lookup_address((unsigned long)va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); rc = HYPERVISOR_update_va_mapping( (unsigned long)va, pte_wrprotect(*pte), 0); BUG_ON(rc); @@ -491,313 +413,15 @@ void make_lowmem_page_readonly(void *va, void make_lowmem_page_writable(void *va, unsigned int feature) { pte_t *pte; + unsigned int level; int rc; if (xen_feature(feature)) return; - pte = virt_to_ptep(va); + pte = lookup_address((unsigned long)va, &level); + BUG_ON(!pte || level != PG_LEVEL_4K || !pte_present(*pte)); rc = HYPERVISOR_update_va_mapping( (unsigned long)va, pte_mkwrite(*pte), 0); BUG_ON(rc); } - -void make_page_readonly(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_wrprotect(*pte), 0); - if (rc) /* fallback? */ - xen_l1_entry_update(pte, pte_wrprotect(*pte)); - if ((unsigned long)va >= (unsigned long)high_memory) { - unsigned long pfn = pte_pfn(*pte); -#ifdef CONFIG_HIGHMEM - if (pfn >= highstart_pfn) - kmap_flush_unused(); /* flush stale writable kmaps */ - else -#endif - make_lowmem_page_readonly( - phys_to_virt(pfn << PAGE_SHIFT), feature); - } -} - -void make_page_writable(void *va, unsigned int feature) -{ - pte_t *pte; - int rc; - - if (xen_feature(feature)) - return; - - pte = virt_to_ptep(va); - rc = HYPERVISOR_update_va_mapping( - (unsigned long)va, pte_mkwrite(*pte), 0); - if (rc) /* fallback? 
*/ - xen_l1_entry_update(pte, pte_mkwrite(*pte)); - if ((unsigned long)va >= (unsigned long)high_memory) { - unsigned long pfn = pte_pfn(*pte); -#ifdef CONFIG_HIGHMEM - if (pfn < highstart_pfn) -#endif - make_lowmem_page_writable( - phys_to_virt(pfn << PAGE_SHIFT), feature); - } -} - -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - make_page_readonly(va, feature); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -void make_pages_writable(void *va, unsigned int nr, unsigned int feature) -{ - if (xen_feature(feature)) - return; - - while (nr-- != 0) { - make_page_writable(va, feature); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -static void _pin_lock(struct mm_struct *mm, int lock) { - if (lock) - spin_lock(&mm->page_table_lock); -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS - /* While mm->page_table_lock protects us against insertions and - * removals of higher level page table pages, it doesn't protect - * against updates of pte-s. Such updates, however, require the - * pte pages to be in consistent state (unpinned+writable or - * pinned+readonly). The pinning and attribute changes, however - * cannot be done atomically, which is why such updates must be - * prevented from happening concurrently. - * Note that no pte lock can ever elsewhere be acquired nesting - * with an already acquired one in the same mm, or with the mm's - * page_table_lock already acquired, as that would break in the - * non-split case (where all these are actually resolving to the - * one page_table_lock). Thus acquiring all of them here is not - * going to result in dead locks, and the order of acquires - * doesn't matter. - */ - { - pgd_t *pgd = mm->pgd; - unsigned g; - - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { - pud_t *pud; - unsigned u; - - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - pmd_t *pmd; - unsigned m; - - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - spinlock_t *ptl; - - if (pmd_none(*pmd)) - continue; - ptl = pte_lockptr(0, pmd); - if (lock) - spin_lock(ptl); - else - spin_unlock(ptl); - } - } - } - } -#endif - if (!lock) - spin_unlock(&mm->page_table_lock); -} -#define pin_lock(mm) _pin_lock(mm, 1) -#define pin_unlock(mm) _pin_lock(mm, 0) - -#define PIN_BATCH 4 -static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); - -static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, - unsigned int cpu, unsigned seq) -{ - unsigned long pfn = page_to_pfn(page); - - if (PageHighMem(page)) { - if (pgprot_val(flags) & _PAGE_RW) - ClearPagePinned(page); - else - SetPagePinned(page); - } else { - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); - if (unlikely(++seq == PIN_BATCH)) { - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), - PIN_BATCH, NULL))) - BUG(); - seq = 0; - } - } - - return seq; -} - -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) -{ - pgd_t *pgd = pgd_base; - pud_t *pud; - pmd_t *pmd; - int g, u, m; - unsigned int cpu, seq; - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - - cpu = get_cpu(); - - for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - if (PTRS_PER_PUD > 1) /* not folded */ - seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); - for (u = 0; 
u < PTRS_PER_PUD; u++, pud++) { - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - if (PTRS_PER_PMD > 1) /* not folded */ - seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - if (pmd_none(*pmd)) - continue; - seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); - } - } - } - - if (likely(seq != 0)) { - MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, - (unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH); - if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), - seq + 1, NULL))) - BUG(); - } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, - pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), - UVMF_TLB_FLUSH)) - BUG(); - - put_cpu(); -} - -static void __pgd_pin(pgd_t *pgd) -{ - pgd_walk(pgd, PAGE_KERNEL_RO); - kmap_flush_unused(); - xen_pgd_pin(__pa(pgd)); - SetPagePinned(virt_to_page(pgd)); -} - -static void __pgd_unpin(pgd_t *pgd) -{ - xen_pgd_unpin(__pa(pgd)); - pgd_walk(pgd, PAGE_KERNEL); - ClearPagePinned(virt_to_page(pgd)); -} - -static void pgd_test_and_unpin(pgd_t *pgd) -{ - if (PagePinned(virt_to_page(pgd))) - __pgd_unpin(pgd); -} - -void mm_pin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - pin_lock(mm); - __pgd_pin(mm->pgd); - pin_unlock(mm); -} - -void mm_unpin(struct mm_struct *mm) -{ - if (xen_feature(XENFEAT_writable_page_tables)) - return; - pin_lock(mm); - __pgd_unpin(mm->pgd); - pin_unlock(mm); -} - -void mm_pin_all(void) -{ - struct page *page; - unsigned long flags; - - if (xen_feature(XENFEAT_writable_page_tables)) - return; - - /* - * Allow uninterrupted access to the pgd_list. Also protects - * __pgd_pin() by disabling preemption. - * All other CPUs must be at a safe point (e.g., in stop_machine - * or offlined entirely). - */ - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = (struct page *)page->index) { - if (!PagePinned(page)) - __pgd_pin((pgd_t *)page_address(page)); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - if (!PagePinned(virt_to_page(mm->pgd))) - mm_pin(mm); -} - -void arch_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
- */ - if (tsk->active_mm == mm) { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); - - switch_mm(mm, &init_mm, tsk); - - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } - - task_unlock(tsk); - - if (PagePinned(virt_to_page(mm->pgd)) && - (atomic_read(&mm->mm_count) == 1) && - !mm->context.has_foreign_mappings) - mm_unpin(mm); -} --- sle11sp1-2010-03-22.orig/arch/x86/pci/irq-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/pci/irq-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -204,6 +204,7 @@ static int pirq_ali_get(struct pci_dev * { static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + WARN_ON_ONCE(pirq >= 16); return irqmap[read_config_nybble(router, 0x48, pirq-1)]; } @@ -211,7 +212,8 @@ static int pirq_ali_set(struct pci_dev * { static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; unsigned int val = irqmap[irq]; - + + WARN_ON_ONCE(pirq >= 16); if (val) { write_config_nybble(router, 0x48, pirq-1, val); return 1; @@ -261,12 +263,16 @@ static int pirq_via_set(struct pci_dev * static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } @@ -279,12 +285,16 @@ static int pirq_via586_set(struct pci_de static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); return read_config_nybble(router,0x43, pirqmap[pirq-1]); } static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); return 1; } @@ -423,6 +433,7 @@ static int pirq_sis_set(struct pci_dev * static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -432,6 +443,7 @@ static int pirq_vlsi_get(struct pci_dev static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -453,14 +465,14 @@ static int pirq_vlsi_set(struct pci_dev */ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - outb_p(pirq, 0xc00); + outb(pirq, 0xc00); return inb(0xc01) & 0xf; } static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - outb_p(pirq, 0xc00); - outb_p(irq, 0xc01); + outb(pirq, 0xc00); + outb(irq, 0xc01); return 1; } @@ -575,6 +587,10 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_ICH10_0: + case PCI_DEVICE_ID_INTEL_ICH10_1: + case PCI_DEVICE_ID_INTEL_ICH10_2: + case PCI_DEVICE_ID_INTEL_ICH10_3: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; --- sle11sp1-2010-03-22.orig/arch/x86/vdso/Makefile 2009-12-04 10:44:49.000000000 
+0100 +++ sle11sp1-2010-03-22/arch/x86/vdso/Makefile 2009-11-06 10:51:25.000000000 +0100 @@ -66,6 +66,7 @@ vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter xen-vdso32-$(subst 1,$(CONFIG_COMPAT),$(shell expr $(CONFIG_XEN_COMPAT)0 '<' 0x0302000)) += int80 +xen-vdso32-$(CONFIG_X86_32) += syscall vdso32.so-$(CONFIG_XEN) += $(xen-vdso32-y) vdso32-images = $(vdso32.so-y:%=vdso32-%.so) --- sle11sp1-2010-03-22.orig/arch/x86/vdso/vdso32/syscall.S 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/vdso/vdso32/syscall.S 2009-11-06 10:51:25.000000000 +0100 @@ -19,8 +19,10 @@ __kernel_vsyscall: .Lpush_ebp: movl %ecx, %ebp syscall +#ifndef CONFIG_XEN movl $__USER32_DS, %ecx movl %ecx, %ss +#endif movl %ebp, %ecx popl %ebp .Lpop_ebp: --- sle11sp1-2010-03-22.orig/arch/x86/vdso/vdso32.S 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/vdso/vdso32.S 2009-11-06 10:51:25.000000000 +0100 @@ -19,4 +19,16 @@ vdso32_sysenter_start: .incbin "arch/x86/vdso/vdso32-sysenter.so" vdso32_sysenter_end: +#if defined(CONFIG_X86_64_XEN) && CONFIG_XEN_COMPAT < 0x030200 + .globl vdso32_int80_start, vdso32_int80_end +vdso32_int80_start: + .incbin "arch/x86/vdso/vdso32-int80.so" +vdso32_int80_end: +#elif defined(CONFIG_X86_XEN) + .globl vdso32_syscall_start, vdso32_syscall_end +vdso32_syscall_start: + .incbin "arch/x86/vdso/vdso32-syscall.so" +vdso32_syscall_end: +#endif + __FINIT --- sle11sp1-2010-03-22.orig/arch/x86/vdso/vdso32-setup.c 2009-12-04 10:44:46.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/vdso/vdso32-setup.c 2009-11-06 10:51:25.000000000 +0100 @@ -26,10 +26,6 @@ #include <asm/vdso.h> #include <asm/proto.h> -#ifdef CONFIG_XEN -#include <xen/interface/callback.h> -#endif - enum { VDSO_DISABLED = 0, VDSO_ENABLED = 1, @@ -229,7 +225,6 @@ static inline void map_compat_vdso(int m void enable_sep_cpu(void) { -#ifndef CONFIG_XEN int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); @@ -244,35 +239,6 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); put_cpu(); -#else - extern asmlinkage void ia32pv_sysenter_target(void); - static struct callback_register sysenter = { - .type = CALLBACKTYPE_sysenter, - .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target }, - }; - - if (!boot_cpu_has(X86_FEATURE_SEP)) - return; - - get_cpu(); - - if (xen_feature(XENFEAT_supervisor_mode_kernel)) - sysenter.address.eip = (unsigned long)ia32_sysenter_target; - - switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) { - case 0: - break; -#if CONFIG_XEN_COMPAT < 0x030200 - case -ENOSYS: - sysenter.type = CALLBACKTYPE_sysenter_deprecated; - if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0) - break; -#endif - default: - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); - break; - } -#endif } static struct vm_area_struct gate_vma; --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/vdso/vdso32-setup-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -0,0 +1,506 @@ +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. 
+ */ + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/thread_info.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/string.h> +#include <linux/elf.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/msr.h> +#include <asm/pgtable.h> +#include <asm/unistd.h> +#include <asm/elf.h> +#include <asm/tlbflush.h> +#include <asm/vdso.h> +#include <asm/proto.h> + +#include <xen/interface/callback.h> + +enum { + VDSO_DISABLED = 0, + VDSO_ENABLED = 1, + VDSO_COMPAT = 2, +}; + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT VDSO_COMPAT +#else +#define VDSO_DEFAULT VDSO_ENABLED +#endif + +#ifdef CONFIG_X86_64 +#define vdso_enabled sysctl_vsyscall32 +#define arch_setup_additional_pages syscall32_setup_pages +#endif + +/* + * This is the difference between the prelinked addresses in the vDSO images + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO + * in the user address space. + */ +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0); + +EXPORT_SYMBOL_GPL(vdso_enabled); +#endif + +static __init void reloc_symtab(Elf32_Ehdr *ehdr, + unsigned offset, unsigned size) +{ + Elf32_Sym *sym = (void *)ehdr + offset; + unsigned nsym = size / sizeof(*sym); + unsigned i; + + for(i = 0; i < nsym; i++, sym++) { + if (sym->st_shndx == SHN_UNDEF || + sym->st_shndx == SHN_ABS) + continue; /* skip */ + + if (sym->st_shndx > SHN_LORESERVE) { + printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", + sym->st_shndx); + continue; + } + + switch(ELF_ST_TYPE(sym->st_info)) { + case STT_OBJECT: + case STT_FUNC: + case STT_SECTION: + case STT_FILE: + sym->st_value += VDSO_ADDR_ADJUST; + } + } +} + +static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) +{ + Elf32_Dyn *dyn = (void *)ehdr + offset; + + for(; dyn->d_tag != DT_NULL; dyn++) + switch(dyn->d_tag) { + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_DEBUG: + case DT_JMPREL: + case DT_VERSYM: + case DT_VERDEF: + case DT_VERNEED: + case DT_ADDRRNGLO ... DT_ADDRRNGHI: + /* definitely pointers needing relocation */ + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; + break; + + case DT_ENCODING ... OLD_DT_LOOS-1: + case DT_LOOS ... DT_HIOS-1: + /* Tags above DT_ENCODING are pointers if + they're even */ + if (dyn->d_tag >= DT_ENCODING && + (dyn->d_tag & 1) == 0) + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; + break; + + case DT_VERDEFNUM: + case DT_VERNEEDNUM: + case DT_FLAGS_1: + case DT_RELACOUNT: + case DT_RELCOUNT: + case DT_VALRNGLO ... DT_VALRNGHI: + /* definitely not pointers */ + break; + + case OLD_DT_LOOS ... DT_LOOS-1: + case DT_HIOS ... 
DT_VALRNGLO-1: + default: + if (dyn->d_tag > DT_ENCODING) + printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", + dyn->d_tag); + break; + } +} + +static __init void relocate_vdso(Elf32_Ehdr *ehdr) +{ + Elf32_Phdr *phdr; + Elf32_Shdr *shdr; + int i; + + BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || + !elf_check_arch_ia32(ehdr) || + ehdr->e_type != ET_DYN); + + ehdr->e_entry += VDSO_ADDR_ADJUST; + + /* rebase phdrs */ + phdr = (void *)ehdr + ehdr->e_phoff; + for (i = 0; i < ehdr->e_phnum; i++) { + phdr[i].p_vaddr += VDSO_ADDR_ADJUST; + + /* relocate dynamic stuff */ + if (phdr[i].p_type == PT_DYNAMIC) + reloc_dyn(ehdr, phdr[i].p_offset); + } + + /* rebase sections */ + shdr = (void *)ehdr + ehdr->e_shoff; + for(i = 0; i < ehdr->e_shnum; i++) { + if (!(shdr[i].sh_flags & SHF_ALLOC)) + continue; + + shdr[i].sh_addr += VDSO_ADDR_ADJUST; + + if (shdr[i].sh_type == SHT_SYMTAB || + shdr[i].sh_type == SHT_DYNSYM) + reloc_symtab(ehdr, shdr[i].sh_offset, + shdr[i].sh_size); + } +} + +/* + * These symbols are defined by vdso32.S to mark the bounds + * of the ELF DSO images included therein. + */ +extern const char vdso32_default_start, vdso32_default_end; +extern const char vdso32_sysenter_start, vdso32_sysenter_end; +static struct page *vdso32_pages[1]; + +#ifdef CONFIG_X86_64 + +#if CONFIG_XEN_COMPAT < 0x030200 +static int use_int80 = 1; +#endif +static int use_sysenter __read_mostly = -1; + +#define vdso32_sysenter() (use_sysenter > 0) + +/* May not be __init: called during resume */ +void syscall32_cpu_init(void) +{ + static const struct callback_register cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ia32_cstar_target + }; + static const struct callback_register sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = (unsigned long)ia32_sysenter_target + }; + + if ((HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0) || + (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)) +#if CONFIG_XEN_COMPAT < 0x030200 + return; + use_int80 = 0; +#else + BUG(); +#endif + + if (use_sysenter < 0) + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); +} + +#define compat_uses_vma 1 + +static inline void map_compat_vdso(int map) +{ +} + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) + +extern asmlinkage void ia32pv_cstar_target(void); +static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target }, +}; + +void __cpuinit enable_sep_cpu(void) +{ + extern asmlinkage void ia32pv_sysenter_target(void); + static struct callback_register __cpuinitdata sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target }, + }; + + if (boot_cpu_has(X86_FEATURE_SYSCALL)) { + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0) + BUG(); + return; + } + + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + sysenter.address.eip = (unsigned long)ia32_sysenter_target; + + switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) { + case 0: + break; +#if CONFIG_XEN_COMPAT < 0x030200 + case -ENOSYS: + sysenter.type = CALLBACKTYPE_sysenter_deprecated; + if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0) + break; +#endif + default: + setup_clear_cpu_cap(X86_FEATURE_SEP); + break; + } +} + +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = 
FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; +} + +#define compat_uses_vma 0 + +static void map_compat_vdso(int map) +{ + static int vdso_mapped; + + if (map == vdso_mapped) + return; + + vdso_mapped = map; + + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, + map ? PAGE_READONLY_EXEC : PAGE_NONE); + + /* flush stray tlbs */ + flush_tlb_all(); +} + +#endif /* CONFIG_X86_64 */ + +int __init sysenter_setup(void) +{ + void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); + const void *vsyscall; + size_t vsyscall_len; + + vdso32_pages[0] = virt_to_page(syscall_page); + +#ifdef CONFIG_X86_32 + gate_vma_init(); + + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#endif + +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT < 0x030200 + if (use_int80) { + extern const char vdso32_int80_start, vdso32_int80_end; + + vsyscall = &vdso32_int80_start; + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; + } else +#elif defined(CONFIG_X86_32) + if (boot_cpu_has(X86_FEATURE_SYSCALL) + && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD + || HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL); + barrier(); /* until clear_bit()'s constraints are correct ... */ + if (boot_cpu_has(X86_FEATURE_SYSCALL)) { + extern const char vdso32_syscall_start, vdso32_syscall_end; + + vsyscall = &vdso32_syscall_start; + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + } else +#endif + if (!vdso32_sysenter()) { + vsyscall = &vdso32_default_start; + vsyscall_len = &vdso32_default_end - &vdso32_default_start; + } else { + vsyscall = &vdso32_sysenter_start; + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + } + + memcpy(syscall_page, vsyscall, vsyscall_len); + relocate_vdso(syscall_page); + + return 0; +} + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + bool compat; + + down_write(&mm->mmap_sem); + + /* Test compat mode once here, in case someone + changes it via sysctl */ + compat = (vdso_enabled == VDSO_COMPAT); + + map_compat_vdso(compat); + + if (compat) + addr = VDSO_HIGH_BASE; + else { + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + } + + if (compat_uses_vma || !compat) { + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully + * interpretable later without matching up the same + * kernel and hardware config to see what PC values + * meant. 
+ */ + ret = install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdso32_pages); + + if (ret) + goto up_fail; + } + + current->mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = + VDSO32_SYMBOL(addr, SYSENTER_RETURN); + + up_fail: + up_write(&mm->mmap_sem); + + return ret; +} + +#ifdef CONFIG_X86_64 + +/* + * This must be done early in case we have an initrd containing 32-bit + * binaries (e.g., hotplug). This could be pushed upstream. + */ +core_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include <linux/sysctl.h> + +static ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &sysctl_vsyscall32, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table abi_root_table2[] = { + { + .ctl_name = CTL_ABI, + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif + +#else /* CONFIG_X86_32 */ + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + + /* Check to see if this task was created in compat vdso mode */ + if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) + return &gate_vma; + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} + +#endif /* CONFIG_X86_64 */ --- sle11sp1-2010-03-22.orig/drivers/pci/msi-xen.c 2009-12-04 11:01:50.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/pci/msi-xen.c 2009-12-04 11:03:15.000000000 +0100 @@ -45,6 +45,53 @@ struct msi_pirq_entry { int entry_nr; }; +/* Arch hooks */ + +int __attribute__ ((weak)) +arch_msi_check_device(struct pci_dev *dev, int nvec, int type) +{ + return 0; +} + +#ifndef CONFIG_XEN +int __attribute__ ((weak)) +arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) +{ + return 0; +} + +int __attribute__ ((weak)) +arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + struct msi_desc *entry; + int ret; + + list_for_each_entry(entry, &dev->msi_list, list) { + ret = arch_setup_msi_irq(dev, entry); + if (ret) + return ret; + } + + return 0; +} + +void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) +{ + return; +} + +void __attribute__ ((weak)) +arch_teardown_msi_irqs(struct pci_dev *dev) +{ + struct msi_desc *entry; + + list_for_each_entry(entry, &dev->msi_list, list) { + if (entry->irq != 0) + arch_teardown_msi_irq(entry->irq); + } +} +#endif + static void msi_set_enable(struct pci_dev *dev, int enable) { int pos; @@ -266,7 +313,6 @@ static void pci_intx_for_msi(struct pci_ pci_intx(dev, enable); } -#ifdef CONFIG_PM void pci_restore_msi_state(struct pci_dev *dev) { int rc; @@ -286,7 +332,7 @@ void pci_restore_msi_state(struct pci_de rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore); WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc); } -#endif /* CONFIG_PM */ +EXPORT_SYMBOL_GPL(pci_restore_msi_state); /** * msi_capability_init - configure device's MSI capability structure @@ -707,51 +753,3 @@ void 
pci_msi_init_pci_dev(struct pci_dev INIT_LIST_HEAD(&dev->msi_list); #endif } - - -/* Arch hooks */ - -int __attribute__ ((weak)) -arch_msi_check_device(struct pci_dev* dev, int nvec, int type) -{ - return 0; -} - -#ifndef CONFIG_XEN -int __attribute__ ((weak)) -arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *entry) -{ - return 0; -} - -int __attribute__ ((weak)) -arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct msi_desc *entry; - int ret; - - list_for_each_entry(entry, &dev->msi_list, list) { - ret = arch_setup_msi_irq(dev, entry); - if (ret) - return ret; - } - - return 0; -} - -void __attribute__ ((weak)) arch_teardown_msi_irq(unsigned int irq) -{ - return; -} - -void __attribute__ ((weak)) -arch_teardown_msi_irqs(struct pci_dev *dev) -{ - struct msi_desc *entry; - - list_for_each_entry(entry, &dev->msi_list, list) { - if (entry->irq != 0) - arch_teardown_msi_irq(entry->irq); - } -} -#endif --- sle11sp1-2010-03-22.orig/drivers/pci/pci.c 2010-03-22 12:11:11.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/pci/pci.c 2010-03-11 09:18:36.000000000 +0100 @@ -385,7 +385,12 @@ pci_find_parent_resource(const struct pc * Restore the BAR values for a given device, so as to make it * accessible by its driver. */ +#ifndef CONFIG_XEN static void +#else +EXPORT_SYMBOL_GPL(pci_restore_bars); +void +#endif pci_restore_bars(struct pci_dev *dev) { int i; --- sle11sp1-2010-03-22.orig/drivers/xen/balloon/sysfs.c 2009-11-06 10:46:41.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/balloon/sysfs.c 2009-11-06 10:51:25.000000000 +0100 @@ -104,7 +104,7 @@ static struct attribute_group balloon_in }; static struct sysdev_class balloon_sysdev_class = { - set_kset_name(BALLOON_CLASS_NAME), + .name = BALLOON_CLASS_NAME, }; static struct sys_device balloon_sysdev; --- sle11sp1-2010-03-22.orig/drivers/xen/blkback/blkback.c 2010-03-22 12:21:00.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/blkback/blkback.c 2010-01-04 12:38:24.000000000 +0100 @@ -150,7 +150,7 @@ static void unplug_queue(blkif_t *blkif) return; if (blkif->plug->unplug_fn) blkif->plug->unplug_fn(blkif->plug); - blk_put_queue(blkif->plug); + kobject_put(&blkif->plug->kobj); blkif->plug = NULL; } @@ -161,7 +161,8 @@ static void plug_queue(blkif_t *blkif, s if (q == blkif->plug) return; unplug_queue(blkif); - blk_get_queue(q); + WARN_ON(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)); + kobject_get(&q->kobj); blkif->plug = q; } --- sle11sp1-2010-03-22.orig/drivers/xen/blkfront/blkfront.c 2010-03-22 12:21:03.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/blkfront/blkfront.c 2010-03-22 12:21:12.000000000 +0100 @@ -739,7 +739,6 @@ static irqreturn_t blkif_int(int irq, vo RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; - int uptodate; spin_lock_irqsave(&blkif_io_lock, flags); @@ -764,13 +763,13 @@ static irqreturn_t blkif_int(int irq, vo ADD_ID_TO_FREELIST(info, id); - uptodate = (bret->status == BLKIF_RSP_OKAY); + ret = bret->status == BLKIF_RSP_OKAY ? 
0 : -EIO; switch (bret->operation) { case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk("blkfront: %s: write barrier op failed\n", info->gd->disk_name); - uptodate = -EOPNOTSUPP; + ret = -EOPNOTSUPP; info->feature_barrier = 0; xlvbd_barrier(info); } @@ -781,10 +780,8 @@ static irqreturn_t blkif_int(int irq, vo DPRINTK("Bad return from blkdev data " "request: %x\n", bret->status); - ret = end_that_request_first(req, uptodate, - req->hard_nr_sectors); + ret = __blk_end_request(req, ret, blk_rq_bytes(req)); BUG_ON(ret); - end_that_request_last(req, uptodate); break; default: BUG(); --- sle11sp1-2010-03-22.orig/drivers/xen/blktap/blktap.c 2009-11-06 10:46:27.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/blktap/blktap.c 2010-01-04 12:38:20.000000000 +0100 @@ -336,8 +336,8 @@ static pte_t blktap_clear_pte(struct vm_ uvstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); } if (vma->vm_file == NULL || uvaddr < uvstart) - return ptep_get_and_clear_full(vma->vm_mm, uvaddr, - ptep, is_fullmm); + return xen_ptep_get_and_clear_full(vma, uvaddr, ptep, + is_fullmm); /* TODO Should these be changed to if statements? */ BUG_ON(!info); @@ -380,8 +380,8 @@ static pte_t blktap_clear_pte(struct vm_ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap)); /* USING SHADOW PAGE TABLES. */ - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, - is_fullmm); + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep, + is_fullmm); } if (count) { --- sle11sp1-2010-03-22.orig/drivers/xen/blktap2/device.c 2010-01-04 12:36:00.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/blktap2/device.c 2010-01-04 12:37:57.000000000 +0100 @@ -163,9 +163,9 @@ blktap_map_uaddr_fn(pte_t *ptep, struct } static int -blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte) +blktap_map_uaddr(struct vm_area_struct *vma, unsigned long address, pte_t pte) { - return apply_to_page_range(mm, address, + return apply_to_page_range(vma ? 
vma->vm_mm : NULL, address, PAGE_SIZE, blktap_map_uaddr_fn, &pte); } @@ -173,18 +173,29 @@ static int blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page, unsigned long addr, void *data) { - struct mm_struct *mm = (struct mm_struct *)data; + struct vm_area_struct *vma = data; BTDBG("ptep %p\n", ptep); - pte_clear(mm, addr, ptep); + xen_ptep_get_and_clear_full(vma, addr, ptep, 1); return 0; } static int -blktap_umap_uaddr(struct mm_struct *mm, unsigned long address) +blktap_umap_uaddr(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = NULL; + + if (!vma) { +#ifdef CONFIG_X86 + if (HYPERVISOR_update_va_mapping(address, __pte(0), + UVMF_INVLPG|UVMF_ALL)) + BUG(); + return 1; +#endif + } else + mm = vma->vm_mm; return apply_to_page_range(mm, address, - PAGE_SIZE, blktap_umap_uaddr_fn, mm); + PAGE_SIZE, blktap_umap_uaddr_fn, vma); } static inline void @@ -198,17 +209,10 @@ flush_tlb_kernel_page(unsigned long kvad } static void -blktap_device_end_dequeued_request(struct blktap_device *dev, - struct request *req, int uptodate) +blktap_device_end_dequeued_request(struct request *req, int ret) { - int ret; - - ret = end_that_request_first(req, uptodate, req->hard_nr_sectors); - BUG_ON(ret); - - spin_lock_irq(&dev->lock); - end_that_request_last(req, uptodate); - spin_unlock_irq(&dev->lock); + if (blk_end_request(req, ret, blk_rq_bytes(req))) + BUG(); } /* @@ -336,8 +340,8 @@ blktap_unmap(struct blktap *tap, struct if (!xen_feature(XENFEAT_auto_translated_physmap) && request->handles[i].kernel == INVALID_GRANT_HANDLE) { - blktap_umap_uaddr(&init_mm, kvaddr); - flush_tlb_kernel_page(kvaddr); + if (blktap_umap_uaddr(NULL, kvaddr) == 0) + flush_tlb_kernel_page(kvaddr); set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); } @@ -377,7 +381,7 @@ blktap_device_fail_pending_requests(stru blktap_unmap(tap, request); req = (struct request *)(unsigned long)request->id; - blktap_device_end_dequeued_request(dev, req, 0); + blktap_device_end_dequeued_request(req, -ENODEV); blktap_request_free(tap, request); } @@ -400,16 +404,11 @@ blktap_device_finish_request(struct blkt blkif_response_t *res, struct blktap_request *request) { - int uptodate; struct request *req; - struct blktap_device *dev; - - dev = &tap->device; blktap_unmap(tap, request); req = (struct request *)(unsigned long)request->id; - uptodate = (res->status == BLKIF_RSP_OKAY); BTDBG("req %p res status %d operation %d/%d id %lld\n", req, res->status, res->operation, request->operation, @@ -421,7 +420,8 @@ blktap_device_finish_request(struct blkt if (unlikely(res->status != BLKIF_RSP_OKAY)) BTERR("Bad return from device data " "request: %x\n", res->status); - blktap_device_end_dequeued_request(dev, req, uptodate); + blktap_device_end_dequeued_request(req, + res->status == BLKIF_RSP_OKAY ? 
0 : -EIO); break; default: BUG(); @@ -571,9 +571,9 @@ blktap_map(struct blktap *tap, if (!xen_feature(XENFEAT_auto_translated_physmap)) { pte = mk_pte(page, ring->vma->vm_page_prot); - blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte)); + blktap_map_uaddr(ring->vma, uvaddr, pte_mkwrite(pte)); flush_tlb_page(ring->vma, uvaddr); - blktap_map_uaddr(&init_mm, kvaddr, mk_pte(page, PAGE_KERNEL)); + blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL)); flush_tlb_kernel_page(kvaddr); set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte)); @@ -896,7 +896,7 @@ blktap_device_run_queue(struct blktap *t if (!err) queued++; else { - blktap_device_end_dequeued_request(dev, req, 0); + blktap_device_end_dequeued_request(req, err); blktap_request_free(tap, request); } --- sle11sp1-2010-03-22.orig/drivers/xen/blktap2/ring.c 2009-12-16 11:43:21.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/blktap2/ring.c 2009-11-06 10:51:25.000000000 +0100 @@ -103,8 +103,8 @@ blktap_ring_clear_pte(struct vm_area_str * mapped region. */ if (uvaddr < ring->user_vstart) - return ptep_get_and_clear_full(vma->vm_mm, uvaddr, - ptep, is_fullmm); + return xen_ptep_get_and_clear_full(vma, uvaddr, + ptep, is_fullmm); offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT); usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; @@ -146,8 +146,8 @@ blktap_ring_clear_pte(struct vm_area_str khandle->user); count++; } else - copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, - is_fullmm); + copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep, + is_fullmm); if (count) if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, --- sle11sp1-2010-03-22.orig/drivers/xen/core/Makefile 2008-07-21 11:00:33.000000000 +0200 +++ sle11sp1-2010-03-22/drivers/xen/core/Makefile 2009-11-06 10:51:25.000000000 +0100 @@ -10,5 +10,6 @@ obj-$(CONFIG_SYS_HYPERVISOR) += hypervis obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o +obj-$(CONFIG_X86_SMP) += spinlock.o obj-$(CONFIG_KEXEC) += machine_kexec.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o --- sle11sp1-2010-03-22.orig/drivers/xen/core/evtchn.c 2010-02-09 16:48:26.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/core/evtchn.c 2010-02-09 16:53:05.000000000 +0100 @@ -194,7 +194,7 @@ static inline unsigned int cpu_from_evtc /* Upcall to generic IRQ layer. */ #ifdef CONFIG_X86 -extern fastcall unsigned int do_IRQ(struct pt_regs *regs); +extern unsigned int do_IRQ(struct pt_regs *regs); void __init xen_init_IRQ(void); void __init init_IRQ(void) { @@ -203,13 +203,11 @@ void __init init_IRQ(void) } #if defined (__i386__) static inline void exit_idle(void) {} -#define IRQ_REG orig_eax #elif defined (__x86_64__) #include <asm/idle.h> -#define IRQ_REG orig_rax #endif #define do_IRQ(irq, regs) do { \ - (regs)->IRQ_REG = ~(irq); \ + (regs)->orig_ax = ~(irq); \ do_IRQ((regs)); \ } while (0) #endif @@ -676,13 +674,12 @@ static void set_affinity_irq(unsigned in int resend_irq_on_evtchn(unsigned int irq) { int masked, evtchn = evtchn_from_irq(irq); - shared_info_t *s = HYPERVISOR_shared_info; if (!VALID_EVTCHN(evtchn)) return 1; masked = test_and_set_evtchn_mask(evtchn); - synch_set_bit(evtchn, s->evtchn_pending); + set_evtchn(evtchn); if (!masked) unmask_evtchn(evtchn); @@ -975,6 +972,43 @@ void disable_all_local_evtchn(void) synch_set_bit(i, &s->evtchn_mask[0]); } +/* Clear an irq's pending state, in preparation for polling on it. 
*/ +void xen_clear_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +/* Set an irq's pending state, to avoid blocking on it. */ +void xen_set_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + set_evtchn(evtchn); +} + +/* Test an irq's pending state. */ +int xen_test_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + return VALID_EVTCHN(evtchn) && test_evtchn(evtchn); +} + +/* Poll waiting for an irq to become pending. In the usual case, the + irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ + evtchn_port_t evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn) + && HYPERVISOR_poll_no_timeout(&evtchn, 1)) + BUG(); +} + static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -1028,8 +1062,8 @@ static void restore_cpu_ipis(unsigned in bind_evtchn_to_cpu(evtchn, cpu); /* Ready for use. */ - unmask_evtchn(evtchn); - + if (!(irq_desc[irq].status & IRQ_DISABLED)) + unmask_evtchn(evtchn); } } --- sle11sp1-2010-03-22.orig/drivers/xen/core/hypervisor_sysfs.c 2009-11-06 10:49:47.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/core/hypervisor_sysfs.c 2009-11-06 10:51:25.000000000 +0100 @@ -50,7 +50,7 @@ static int __init hypervisor_subsys_init if (!is_running_on_xen()) return -ENODEV; - hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type; + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; return 0; } --- sle11sp1-2010-03-22.orig/drivers/xen/core/smpboot.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/core/smpboot.c 2010-03-19 14:41:36.000000000 +0100 @@ -72,6 +72,10 @@ void __init prefill_possible_map(void) return; for (i = 0; i < NR_CPUS; i++) { +#ifndef CONFIG_HOTPLUG_CPU + if (i >= setup_max_cpus) + break; +#endif rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) cpu_set(i, cpu_possible_map); @@ -134,6 +138,10 @@ static int __cpuinit xen_smp_intr_init(u goto fail; per_cpu(callfunc_irq, cpu) = rc; + rc = xen_spinlock_init(cpu); + if (rc < 0) + goto fail; + if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0)) goto fail; @@ -144,6 +152,7 @@ static int __cpuinit xen_smp_intr_init(u unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); if (per_cpu(callfunc_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + xen_spinlock_cleanup(cpu); return rc; } @@ -155,6 +164,7 @@ static void xen_smp_intr_exit(unsigned i unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + xen_spinlock_cleanup(cpu); } #endif @@ -207,36 +217,25 @@ static void __cpuinit cpu_initialize_con smp_trap_init(ctxt.trap_ctxt); ctxt.ldt_ents = 0; - ctxt.gdt_ents = GDT_SIZE / 8; - -#ifdef __i386__ ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu)); + ctxt.gdt_ents = GDT_SIZE / 8; ctxt.user_regs.cs = __KERNEL_CS; - ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt.kernel_ss = __KERNEL_DS; - ctxt.kernel_sp = idle->thread.esp0; + ctxt.kernel_sp = idle->thread.sp0; - ctxt.event_callback_cs = __KERNEL_CS; ctxt.event_callback_eip = (unsigned long)hypervisor_callback; - ctxt.failsafe_callback_cs = __KERNEL_CS; ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; +#ifdef __i386__ + ctxt.event_callback_cs = __KERNEL_CS; + ctxt.failsafe_callback_cs = __KERNEL_CS; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); 
ctxt.user_regs.fs = __KERNEL_PERCPU; #else /* __x86_64__ */ - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address); - - ctxt.user_regs.cs = __KERNEL_CS; - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); - - ctxt.kernel_ss = __KERNEL_DS; - ctxt.kernel_sp = idle->thread.rsp0; - - ctxt.event_callback_eip = (unsigned long)hypervisor_callback; - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.syscall_callback_eip = (unsigned long)system_call; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/drivers/xen/core/spinlock.c 2010-02-24 16:10:40.000000000 +0100 @@ -0,0 +1,246 @@ +/* + * Xen spinlock functions + * + * See arch/x86/xen/smp.c for copyright and credits for derived + * portions of this file. + */ +#define XEN_SPINLOCK_SOURCE +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <xen/evtchn.h> + +extern irqreturn_t smp_reschedule_interrupt(int, void *); + +static DEFINE_PER_CPU(int, spinlock_irq) = -1; +static char spinlock_name[NR_CPUS][15]; + +struct spinning { + raw_spinlock_t *lock; + unsigned int ticket; + struct spinning *prev; +}; +static DEFINE_PER_CPU(struct spinning *, spinning); +/* + * Protect removal of objects: Addition can be done lockless, and even + * removal itself doesn't need protection - what needs to be prevented is + * removed objects going out of scope (as they're allocated on the stack. + */ +static DEFINE_PER_CPU(raw_rwlock_t, spinning_rm_lock) = __RAW_RW_LOCK_UNLOCKED; + +int __cpuinit xen_spinlock_init(unsigned int cpu) +{ + int rc; + + sprintf(spinlock_name[cpu], "spinlock%u", cpu); + rc = bind_ipi_to_irqhandler(SPIN_UNLOCK_VECTOR, + cpu, + smp_reschedule_interrupt, + IRQF_DISABLED|IRQF_NOBALANCING, + spinlock_name[cpu], + NULL); + if (rc < 0) + return rc; + + disable_irq(rc); /* make sure it's never delivered */ + per_cpu(spinlock_irq, cpu) = rc; + + return 0; +} + +void __cpuinit xen_spinlock_cleanup(unsigned int cpu) +{ + if (per_cpu(spinlock_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(spinlock_irq, cpu), NULL); + per_cpu(spinlock_irq, cpu) = -1; +} + +static unsigned int spin_adjust(struct spinning *spinning, + const raw_spinlock_t *lock, + unsigned int token) +{ + for (; spinning; spinning = spinning->prev) + if (spinning->lock == lock) { + unsigned int ticket = spinning->ticket; + + if (unlikely(!(ticket + 1))) + break; + spinning->ticket = token >> TICKET_SHIFT; + token = (token & ((1 << TICKET_SHIFT) - 1)) + | (ticket << TICKET_SHIFT); + break; + } + + return token; +} + +unsigned int xen_spin_adjust(const raw_spinlock_t *lock, unsigned int token) +{ + return spin_adjust(__get_cpu_var(spinning), lock, token); +} + +bool xen_spin_wait(raw_spinlock_t *lock, unsigned int *ptok, + unsigned int flags) +{ + int irq = __get_cpu_var(spinlock_irq); + bool rc; + typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask; + raw_rwlock_t *rm_lock; + struct spinning spinning, *other; + + /* If kicker interrupt not initialized yet, just spin. 
*/ + if (unlikely(irq < 0) || unlikely(!cpu_online(raw_smp_processor_id()))) + return false; + + /* announce we're spinning */ + spinning.ticket = *ptok >> TICKET_SHIFT; + spinning.lock = lock; + spinning.prev = __get_cpu_var(spinning); + smp_wmb(); + __get_cpu_var(spinning) = &spinning; + upcall_mask = current_vcpu_info()->evtchn_upcall_mask; + + do { + bool nested = false; + + xen_clear_irq_pending(irq); + + /* + * Check again to make sure it didn't become free while + * we weren't looking. + */ + if (lock->cur == spinning.ticket) { + /* + * If we interrupted another spinlock while it was + * blocking, make sure it doesn't block (again) + * without rechecking the lock. + */ + if (spinning.prev) + xen_set_irq_pending(irq); + rc = true; + break; + } + + for (other = spinning.prev; other; other = other->prev) { + if (other->lock == lock) + nested = true; + else { + /* + * Return the ticket if we now own the lock. + * While just being desirable generally (to + * reduce latency on other CPUs), this is + * essential in the case where interrupts + * get re-enabled below. + * Try to get a new ticket right away (to + * reduce latency after the current lock was + * released), but don't acquire the lock. + */ + raw_spinlock_t *lock = other->lock; + + raw_local_irq_disable(); + while (lock->cur == other->ticket) { + unsigned int token; + bool kick, free; + + other->ticket = -1; + __raw_spin_unlock_body; + if (!kick) + break; + xen_spin_kick(lock, token); + __raw_spin_lock_preamble; + if (!free) + token = spin_adjust( + other->prev, lock, + token); + other->ticket = token >> TICKET_SHIFT; + smp_mb(); + } + } + } + + /* + * No need to use raw_local_irq_restore() here, as the + * intended event processing will happen with the poll + * call. + */ + current_vcpu_info()->evtchn_upcall_mask = + nested ? upcall_mask : flags; + + xen_poll_irq(irq); + + current_vcpu_info()->evtchn_upcall_mask = upcall_mask; + + rc = !xen_test_irq_pending(irq); + kstat_this_cpu.irqs[irq] += !rc; + } while (spinning.prev || rc); + + /* + * Leave the irq pending so that any interrupted blocker will + * re-check. + */ + + /* announce we're done */ + __get_cpu_var(spinning) = other = spinning.prev; + rm_lock = &__get_cpu_var(spinning_rm_lock); + raw_local_irq_disable(); + __raw_write_lock(rm_lock); + __raw_write_unlock(rm_lock); + *ptok = lock->cur | (spinning.ticket << TICKET_SHIFT); + + /* + * Obtain new tickets for (or acquire) all those locks where + * above we avoided acquiring them. 
+ */ + for (; other; other = other->prev) + if (!(other->ticket + 1)) { + unsigned int token; + bool free; + + lock = other->lock; + __raw_spin_lock_preamble; + if (!free) + token = spin_adjust(other->prev, lock, token); + other->ticket = token >> TICKET_SHIFT; + } + raw_local_irq_restore(upcall_mask); + + return rc; +} + +void xen_spin_kick(raw_spinlock_t *lock, unsigned int token) +{ + unsigned int cpu; + + token &= (1U << TICKET_SHIFT) - 1; + for_each_online_cpu(cpu) { + raw_rwlock_t *rm_lock; + unsigned long flags; + struct spinning *spinning; + + if (cpu == raw_smp_processor_id() || !per_cpu(spinning, cpu)) + continue; + + rm_lock = &per_cpu(spinning_rm_lock, cpu); + raw_local_irq_save(flags); + __raw_read_lock(rm_lock); + + spinning = per_cpu(spinning, cpu); + smp_rmb(); + while (spinning) { + if (spinning->lock == lock && spinning->ticket == token) + break; + spinning = spinning->prev; + } + + __raw_read_unlock(rm_lock); + raw_local_irq_restore(flags); + + if (unlikely(spinning)) { + notify_remote_via_irq(per_cpu(spinlock_irq, cpu)); + return; + } + } +} +EXPORT_SYMBOL(xen_spin_kick); --- sle11sp1-2010-03-22.orig/drivers/xen/core/xen_sysfs.c 2009-11-06 10:49:47.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/core/xen_sysfs.c 2009-11-06 10:51:25.000000000 +0100 @@ -30,12 +30,12 @@ HYPERVISOR_ATTR_RO(type); static int __init xen_sysfs_type_init(void) { - return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr); + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); } static void xen_sysfs_type_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr); + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); } /* xen version attributes */ @@ -91,13 +91,12 @@ static struct attribute_group version_gr static int __init xen_sysfs_version_init(void) { - return sysfs_create_group(&hypervisor_subsys.kobj, - &version_group); + return sysfs_create_group(hypervisor_kobj, &version_group); } static void xen_sysfs_version_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kobj, &version_group); + sysfs_remove_group(hypervisor_kobj, &version_group); } /* UUID */ @@ -126,12 +125,12 @@ HYPERVISOR_ATTR_RO(uuid); static int __init xen_sysfs_uuid_init(void) { - return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr); + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); } static void xen_sysfs_uuid_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr); + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); } /* xen compilation attributes */ @@ -204,14 +203,12 @@ static struct attribute_group xen_compil int __init static xen_compilation_init(void) { - return sysfs_create_group(&hypervisor_subsys.kobj, - &xen_compilation_group); + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); } static void xen_compilation_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kobj, - &xen_compilation_group); + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); } /* xen properties info */ @@ -325,14 +322,12 @@ static struct attribute_group xen_proper static int __init xen_properties_init(void) { - return sysfs_create_group(&hypervisor_subsys.kobj, - &xen_properties_group); + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); } static void xen_properties_destroy(void) { - sysfs_remove_group(&hypervisor_subsys.kobj, - &xen_properties_group); + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } #ifdef CONFIG_KEXEC @@ -350,13 +345,12 @@ HYPERVISOR_ATTR_RO(vmcoreinfo); static int 
__init xen_sysfs_vmcoreinfo_init(void) { - return sysfs_create_file(&hypervisor_subsys.kobj, - &vmcoreinfo_attr.attr); + return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr); } static void xen_sysfs_vmcoreinfo_destroy(void) { - sysfs_remove_file(&hypervisor_subsys.kobj, &vmcoreinfo_attr.attr); + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); } #endif --- sle11sp1-2010-03-22.orig/drivers/xen/gntdev/gntdev.c 2010-01-04 12:22:26.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/gntdev/gntdev.c 2010-01-04 12:38:10.000000000 +0100 @@ -791,7 +791,7 @@ static pte_t gntdev_clear_pte(struct vm_ op.status); } else { /* USING SHADOW PAGE TABLES. */ - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm); } /* Finally, we unmap the grant from kernel space. */ @@ -819,7 +819,7 @@ static pte_t gntdev_clear_pte(struct vm_ INVALID_P2M_ENTRY); } else { - copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm); } return copy; --- sle11sp1-2010-03-22.orig/drivers/xen/scsifront/scsifront.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/scsifront/scsifront.c 2009-11-06 10:51:25.000000000 +0100 @@ -260,19 +260,19 @@ static int map_data_for_request(struct v return -ENOMEM; } - if (sc->use_sg) { + if (scsi_bufflen(sc)) { /* quoted scsi_lib.c/scsi_req_map_sg . */ - struct scatterlist *sg, *sgl = (struct scatterlist *)sc->request_buffer; - unsigned int data_len = sc->request_bufflen; + struct scatterlist *sg, *sgl = scsi_sglist(sc); + unsigned int data_len = scsi_bufflen(sc); - nr_pages = (sc->request_bufflen + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT; if (nr_pages > VSCSIIF_SG_TABLESIZE) { printk(KERN_ERR "scsifront: Unable to map request_buffer for command!\n"); ref_cnt = (-E2BIG); goto big_to_sg; } - for_each_sg (sgl, sg, sc->use_sg, i) { + for_each_sg (sgl, sg, scsi_sg_count(sc), i) { page = sg_page(sg); off = sg->offset; len = sg->length; @@ -306,45 +306,6 @@ static int map_data_for_request(struct v ref_cnt++; } } - } else if (sc->request_bufflen) { - unsigned long end = ((unsigned long)sc->request_buffer - + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT; - - page = virt_to_page(sc->request_buffer); - nr_pages = end - start; - len = sc->request_bufflen; - - if (nr_pages > VSCSIIF_SG_TABLESIZE) { - ref_cnt = (-E2BIG); - goto big_to_sg; - } - - buffer_pfn = page_to_phys(page) >> PAGE_SHIFT; - - off = offset_in_page((unsigned long)sc->request_buffer); - for (i = 0; i < nr_pages; i++) { - bytes = PAGE_SIZE - off; - - if (bytes > len) - bytes = len; - - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id, - buffer_pfn, write); - - info->shadow[id].gref[i] = ref; - ring_req->seg[i].gref = ref; - ring_req->seg[i].offset = (uint16_t)off; - ring_req->seg[i].length = (uint16_t)bytes; - - buffer_pfn++; - len -= bytes; - off = 0; - ref_cnt++; - } } big_to_sg: --- sle11sp1-2010-03-22.orig/drivers/xen/usbfront/usbfront-dbg.c 2009-11-06 10:45:48.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/usbfront/usbfront-dbg.c 2009-11-06 10:51:25.000000000 +0100 @@ -43,17 +43,16 @@ * DEALINGS IN THE SOFTWARE. 
*/ -static ssize_t show_statistics(struct class_device *class_dev, char *buf) +static ssize_t show_statistics(struct device *dev, + struct device_attribute *attr, char *buf) { - struct usb_bus *bus; struct usb_hcd *hcd; struct usbfront_info *info; unsigned long flags; unsigned temp, size; char *next; - bus = class_get_devdata(class_dev); - hcd = bus->hcpriv; + hcd = dev_get_drvdata(dev); info = hcd_to_info(hcd); next = buf; size = PAGE_SIZE; @@ -85,18 +84,18 @@ static ssize_t show_statistics(struct cl return PAGE_SIZE - size; } -static CLASS_DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL); +static DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL); static inline void create_debug_file(struct usbfront_info *info) { - struct class_device *cldev = info_to_hcd(info)->self.class_dev; - if (class_device_create_file(cldev, &class_device_attr_statistics)) + struct device *dev = info_to_hcd(info)->self.controller; + if (device_create_file(dev, &dev_attr_statistics)) printk(KERN_WARNING "statistics file not created for %s\n", info_to_hcd(info)->self.bus_name); } static inline void remove_debug_file(struct usbfront_info *info) { - struct class_device *cldev = info_to_hcd(info)->self.class_dev; - class_device_remove_file(cldev, &class_device_attr_statistics); + struct device *dev = info_to_hcd(info)->self.controller; + device_remove_file(dev, &dev_attr_statistics); } --- sle11sp1-2010-03-22.orig/drivers/xen/xenoprof/xenoprofile.c 2010-01-07 09:58:23.000000000 +0100 +++ sle11sp1-2010-03-22/drivers/xen/xenoprof/xenoprofile.c 2010-01-07 09:58:35.000000000 +0100 @@ -78,7 +78,7 @@ static int xenoprof_resume(struct sys_de static struct sysdev_class oprofile_sysclass = { - set_kset_name("oprofile"), + .name = "oprofile", .resume = xenoprof_resume, .suspend = xenoprof_suspend }; --- sle11sp1-2010-03-22.orig/arch/x86/include/asm/e820.h 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/asm/e820.h 2009-11-06 10:51:25.000000000 +0100 @@ -129,7 +129,11 @@ extern char *default_machine_specific_me #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ +#ifndef CONFIG_XEN #define ISA_START_ADDRESS 0xa0000 +#else +#define ISA_START_ADDRESS 0 +#endif #define ISA_END_ADDRESS 0x100000 #define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/agp.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/agp.h 2009-11-06 10:51:25.000000000 +0100 @@ -13,18 +13,13 @@ * page. This avoids data corruption on some CPUs. */ -/* - * Caller's responsibility to call global_flush_tlb() for performance - * reasons - */ #define map_page_into_agp(page) ( \ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ - ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) + ?: set_pages_uc(page, 1)) #define unmap_page_from_agp(page) ( \ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ - change_page_attr(page, 1, PAGE_KERNEL)) -#define flush_agp_mappings() global_flush_tlb() + set_pages_wb(page, 1)) /* * Could use CLFLUSH here if the cpu supports it. 
But then it would --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/desc.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/desc.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,5 +1,404 @@ +#ifndef _ASM_DESC_H_ +#define _ASM_DESC_H_ + +#ifndef __ASSEMBLY__ +#include <asm/desc_defs.h> +#include <asm/ldt.h> +#include <asm/mmu.h> +#include <linux/smp.h> + +static inline void fill_ldt(struct desc_struct *desc, + const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + desc->base0 = info->base_addr & 0x0000ffff; + + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + desc->base2 = (info->base_addr & 0xff000000) >> 24; +} + +#ifndef CONFIG_X86_NO_IDT +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; +#endif + +#ifdef CONFIG_X86_64 +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; +extern struct desc_ptr cpu_gdt_descr[]; +/* the cpu gdt accessor */ +#define get_cpu_gdt_table(x) ((struct desc_struct *)cpu_gdt_descr[x].address) + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, unsigned short seg) + +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | + (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + return !(desc[0] | desc[1]); +} + +#ifndef CONFIG_XEN +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) \ + native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) \ + native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) + +static inline void native_write_idt_entry(gate_desc *idt, int entry, + const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); +} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void native_write_gdt_entry(struct desc_struct *gdt, int 
entry, + const void *desc, int type) +{ + unsigned int size; + switch (type) { + case DESC_TSS: + size = sizeof(tss_desc); + break; + case DESC_LDT: + size = sizeof(ldt_desc); + break; + default: + size = sizeof(struct desc_struct); + break; + } + memcpy(&gdt[entry], desc, size); +} +#endif + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +#ifndef CONFIG_XEN +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + memset(desc, 0, sizeof(*desc)); + desc->limit0 = size & 0xFFFF; + desc->base0 = PTR_LOW(addr); + desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + __asm__ __volatile__("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, + DESC_LDT, entries * sizeof(ldt) - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + asm volatile("str %0":"=r" (tr)); + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc); +extern int write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type); + +static inline void xen_load_tls(struct thread_struct *t, 
unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} +#endif + +#define _LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, pc->size); +} + +static inline void load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +#ifndef CONFIG_X86_NO_IDT +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. + */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_system_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); #ifdef CONFIG_X86_32 -# include "desc_32.h" + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +#else + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +#endif +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} +#endif + #else -# include "desc_64.h" +/* + * GET_DESC_BASE reads the descriptor base of the specified segment. 
+ * + * Args: + * idx - descriptor index + * gdt - GDT pointer + * base - 32bit register to which the base will be written + * lo_w - lo word of the "base" register + * lo_b - lo byte of the "base" register + * hi_b - hi byte of the low word of the "base" register + * + * Example: + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. + */ +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ + movb idx*8+4(gdt), lo_b; \ + movb idx*8+7(gdt), hi_b; \ + shll $16, base; \ + movw idx*8+2(gdt), lo_w; + + +#endif /* __ASSEMBLY__ */ + #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/desc_32.h 2009-11-06 10:49:47.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,262 +0,0 @@ -#ifndef __ARCH_DESC_H -#define __ARCH_DESC_H - -#include <asm/ldt.h> -#include <asm/segment.h> - -#ifndef __ASSEMBLY__ - -#include <linux/preempt.h> -#include <linux/smp.h> - -#include <asm/mmu.h> - -struct Xgt_desc_struct { - unsigned short size; - unsigned long address __attribute__((packed)); - unsigned short pad; -} __attribute__ ((packed)); - -struct gdt_page -{ - struct desc_struct gdt[GDT_ENTRIES]; -} __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU(struct gdt_page, gdt_page); - -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) -{ - return per_cpu(gdt_page, cpu).gdt; -} - -extern struct Xgt_desc_struct idt_descr; -extern struct desc_struct idt_table[]; -extern void set_intr_gate(unsigned int irq, void * addr); - -static inline void pack_descriptor(__u32 *a, __u32 *b, - unsigned long base, unsigned long limit, unsigned char type, unsigned char flags) -{ - *a = ((base & 0xffff) << 16) | (limit & 0xffff); - *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20); -} - -static inline void pack_gate(__u32 *a, __u32 *b, - unsigned long base, unsigned short seg, unsigned char type, unsigned char flags) -{ - *a = (seg << 16) | (base & 0xffff); - *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff); -} - -#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */ -#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */ -#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */ -#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */ -#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */ -#define DESCTYPE_DPL3 0x60 /* DPL-3 */ -#define DESCTYPE_S 0x10 /* !system */ - -#ifndef CONFIG_XEN -#define load_TR_desc() native_load_tr_desc() -#define load_gdt(dtr) native_load_gdt(dtr) -#define load_idt(dtr) native_load_idt(dtr) -#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) - -#define store_gdt(dtr) native_store_gdt(dtr) -#define store_idt(dtr) native_store_idt(dtr) -#define store_tr(tr) (tr = native_store_tr()) -#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) - -#define load_TLS(t, cpu) native_load_tls(t, cpu) -#define set_ldt native_set_ldt - -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) - -static inline void write_dt_entry(struct desc_struct *dt, - int entry, u32 entry_low, u32 entry_high) -{ - dt[entry].a = entry_low; - dt[entry].b = entry_high; -} - -static inline void native_set_ldt(const void 
*addr, unsigned int entries) -{ - if (likely(entries == 0)) - __asm__ __volatile__("lldt %w0"::"q" (0)); - else { - unsigned cpu = smp_processor_id(); - __u32 a, b; - - pack_descriptor(&a, &b, (unsigned long)addr, - entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b); - __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); - } -} - - -static inline void native_load_tr_desc(void) -{ - asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); -} - -static inline void native_load_gdt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lgdt %0"::"m" (*dtr)); -} - -static inline void native_load_idt(const struct Xgt_desc_struct *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} - -static inline void native_store_gdt(struct Xgt_desc_struct *dtr) -{ - asm ("sgdt %0":"=m" (*dtr)); -} - -static inline void native_store_idt(struct Xgt_desc_struct *dtr) -{ - asm ("sidt %0":"=m" (*dtr)); -} - -static inline unsigned long native_store_tr(void) -{ - unsigned long tr; - asm ("str %0":"=r" (tr)); - return tr; -} - -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ - unsigned int i; - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; -} -#else -#define load_TLS(t, cpu) xen_load_tls(t, cpu) -#define set_ldt xen_set_ldt - -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); - -static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) -{ - unsigned int i; - struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), - *(u64 *)&t->tls_array[i])) - BUG(); -} -#endif - -#ifndef CONFIG_X86_NO_IDT -static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg) -{ - __u32 a, b; - pack_gate(&a, &b, (unsigned long)addr, seg, type, 0); - write_idt_entry(idt_table, gate, a, b); -} -#endif - -#ifndef CONFIG_X86_NO_TSS -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr) -{ - __u32 a, b; - pack_descriptor(&a, &b, (unsigned long)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, - DESCTYPE_TSS, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b); -} -#endif - - -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 ) - -static inline void clear_LDT(void) -{ - set_ldt(NULL, 0); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc) -{ - set_ldt(pc->ldt, pc->size); -} - -static inline void 
load_LDT(mm_context_t *pc) -{ - preempt_disable(); - load_LDT_nolock(pc); - preempt_enable(); -} - -static inline unsigned long get_desc_base(unsigned long *desc) -{ - unsigned long base; - base = ((desc[0] >> 16) & 0x0000ffff) | - ((desc[1] << 16) & 0x00ff0000) | - (desc[1] & 0xff000000); - return base; -} - -#else /* __ASSEMBLY__ */ - -/* - * GET_DESC_BASE reads the descriptor base of the specified segment. - * - * Args: - * idx - descriptor index - * gdt - GDT pointer - * base - 32bit register to which the base will be written - * lo_w - lo word of the "base" register - * lo_b - lo byte of the "base" register - * hi_b - hi byte of the low word of the "base" register - * - * Example: - * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah) - * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax. - */ -#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \ - movb idx*8+4(gdt), lo_b; \ - movb idx*8+7(gdt), hi_b; \ - shll $16, base; \ - movw idx*8+2(gdt), lo_w; - -#endif /* !__ASSEMBLY__ */ - -#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/desc_64.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,228 +0,0 @@ -/* Written 2000 by Andi Kleen */ -#ifndef __ARCH_DESC_H -#define __ARCH_DESC_H - -#include <linux/threads.h> -#include <asm/ldt.h> - -#ifndef __ASSEMBLY__ - -#include <linux/string.h> -#include <linux/smp.h> -#include <asm/desc_defs.h> - -#include <asm/segment.h> -#include <asm/mmu.h> - -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS]; - -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; - -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8)) - -static inline void clear_LDT(void) -{ - int cpu = get_cpu(); - - /* - * NB. We load the default_ldt for lcall7/27 handling on demand, as - * it slows down context switching. Noone uses it anyway. - */ - cpu = cpu; /* XXX avoid compiler warning */ - xen_set_ldt(NULL, 0); - put_cpu(); -} - -#ifndef CONFIG_X86_NO_TSS -static inline unsigned long __store_tr(void) -{ - unsigned long tr; - - asm volatile ("str %w0":"=r" (tr)); - return tr; -} - -#define store_tr(tr) (tr) = __store_tr() -#endif - -/* - * This is the ldt that every process will get unless we need - * something other than this. 
- */ -extern struct desc_struct default_ldt[]; -#ifndef CONFIG_X86_NO_IDT -extern struct gate_struct idt_table[]; -#endif -extern struct desc_ptr cpu_gdt_descr[]; - -/* the cpu gdt accessor */ -#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) - -#ifndef CONFIG_XEN -static inline void load_gdt(const struct desc_ptr *ptr) -{ - asm volatile("lgdt %w0"::"m" (*ptr)); -} - -static inline void store_gdt(struct desc_ptr *ptr) -{ - asm("sgdt %w0":"=m" (*ptr)); -} -#endif - -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) -{ - struct gate_struct s; - s.offset_low = PTR_LOW(func); - s.segment = __KERNEL_CS; - s.ist = ist; - s.p = 1; - s.dpl = dpl; - s.zero0 = 0; - s.zero1 = 0; - s.type = type; - s.offset_middle = PTR_MIDDLE(func); - s.offset_high = PTR_HIGH(func); - /* does not need to be atomic because it is only done once at setup time */ - memcpy(adr, &s, 16); -} - -#ifndef CONFIG_X86_NO_IDT -static inline void set_intr_gate(int nr, void *func) -{ - BUG_ON((unsigned)nr > 0xFF); - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); -} - -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) -{ - BUG_ON((unsigned)nr > 0xFF); - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); -} - -static inline void set_system_gate(int nr, void *func) -{ - BUG_ON((unsigned)nr > 0xFF); - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); -} - -static inline void set_system_gate_ist(int nr, void *func, unsigned ist) -{ - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); -} - -static inline void load_idt(const struct desc_ptr *ptr) -{ - asm volatile("lidt %w0"::"m" (*ptr)); -} - -static inline void store_idt(struct desc_ptr *dtr) -{ - asm("sidt %w0":"=m" (*dtr)); -} -#endif - -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, - unsigned size) -{ - struct ldttss_desc d; - memset(&d,0,sizeof(d)); - d.limit0 = size & 0xFFFF; - d.base0 = PTR_LOW(tss); - d.base1 = PTR_MIDDLE(tss) & 0xFF; - d.type = type; - d.p = 1; - d.limit1 = (size >> 16) & 0xF; - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; - d.base3 = PTR_HIGH(tss); - memcpy(ptr, &d, 16); -} - -#ifndef CONFIG_X86_NO_TSS -static inline void set_tss_desc(unsigned cpu, void *addr) -{ - /* - * sizeof(unsigned long) coming from an extra "long" at the end - * of the iobitmap. See tss_struct definition in processor.h - * - * -1? seg base+limit should be pointing to the address of the - * last valid byte - */ - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], - (unsigned long)addr, DESC_TSS, - IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); -} -#endif - -static inline void set_ldt_desc(unsigned cpu, void *addr, int size) -{ - set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr, - DESC_LDT, size * 8 - 1); -} - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) -/* Don't allow setting of the lm bit. It is useless anyways because - 64bit system calls require __USER_CS. 
*/ -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - /* ((info)->lm << 21) | */ \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 && \ - (info)->lm == 0) - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ - unsigned int i; - u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), - t->tls_array[i])) - BUG(); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock (mm_context_t *pc, int cpu) -{ - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) - segments = NULL; - - xen_set_ldt(segments, count); -} - -static inline void load_LDT(mm_context_t *pc) -{ - int cpu = get_cpu(); - load_LDT_nolock(pc, cpu); - put_cpu(); -} - -extern struct desc_ptr idt_descr; - -#endif /* !__ASSEMBLY__ */ - -#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/fixmap_32.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/fixmap_32.h 2009-11-06 10:51:25.000000000 +0100 @@ -64,7 +64,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_VISWS_APIC FIX_CO_CPU, /* Cobalt timer */ - FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ FIX_LI_PCIA, /* Lithium PCI Bridge A */ FIX_LI_PCIB, /* Lithium PCI Bridge B */ #endif @@ -73,7 +73,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_CYCLONE_TIMER FIX_CYCLONE_TIMER, /*cyclone timer register*/ -#endif +#endif #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -93,11 +93,23 @@ enum fixed_addresses { FIX_ISAMAP_END, FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, __end_of_permanent_fixed_addresses, - /* temporary boot-time mappings, used before ioremap() is functional */ -#define NR_FIX_BTMAPS 16 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses, - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. 
+ * + * We round it up to the next 512 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_NESTING 4 + FIX_BTMAP_END = + __end_of_permanent_fixed_addresses + 512 - + (__end_of_permanent_fixed_addresses & 511), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, FIX_WP_TEST, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif __end_of_fixed_addresses }; --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/fixmap_64.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/fixmap_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -15,6 +15,7 @@ #include <asm/apicdef.h> #include <asm/page.h> #include <asm/vsyscall.h> +#include <asm/efi.h> #include <asm/acpi.h> /* @@ -46,6 +47,10 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, #endif +#ifdef CONFIG_EFI + FIX_EFI_IO_MAP_LAST_PAGE, + FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE+MAX_EFI_IO_PAGES-1, +#endif #ifdef CONFIG_ACPI FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -55,10 +60,22 @@ enum fixed_addresses { FIX_ISAMAP_END, FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, __end_of_permanent_fixed_addresses, - /* temporary boot-time mappings, used before ioremap() is functional */ -#define NR_FIX_BTMAPS 16 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses, - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * We round it up to the next 512 pages boundary so that we + * can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_NESTING 4 + FIX_BTMAP_END = + __end_of_permanent_fixed_addresses + 512 - + (__end_of_permanent_fixed_addresses & 511), + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_NESTING - 1, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif __end_of_fixed_addresses }; --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/highmem.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/highmem.h 2009-11-06 10:51:25.000000000 +0100 @@ -37,11 +37,6 @@ extern pte_t *pkmap_page_table; * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. 
*/ -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif /* * Ordering is: * @@ -57,13 +52,12 @@ extern pte_t *pkmap_page_table; * VMALLOC_START * high_memory */ -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/hypervisor.h 2009-11-23 10:32:11.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/hypervisor.h 2009-11-23 10:32:34.000000000 +0100 @@ -271,6 +271,25 @@ HYPERVISOR_poll( return rc; } +static inline int __must_check +HYPERVISOR_poll_no_timeout( + evtchn_port_t *ports, unsigned int nr_ports) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + #ifdef CONFIG_XEN static inline void @@ -310,4 +329,6 @@ MULTI_grant_table_op(multicall_entry_t * #endif +#define uvm_multi(cpumask) ((unsigned long)cpus_addr(cpumask) | UVMF_MULTI) + #endif /* __HYPERVISOR_H__ */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/irqflags.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/irqflags.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,5 +1,249 @@ -#ifdef CONFIG_X86_32 -# include "irqflags_32.h" +#ifndef _X86_IRQFLAGS_H_ +#define _X86_IRQFLAGS_H_ + +#include <asm/processor-flags.h> + +#ifndef __ASSEMBLY__ +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. 
+ */ + +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + current_vcpu_info()->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +void xen_safe_halt(void); + +void xen_halt(void); + +#define __raw_local_save_flags() xen_save_fl() + +#define raw_local_irq_restore(flags) xen_restore_fl(flags) + +#define raw_local_irq_disable() xen_irq_disable() + +#define raw_local_irq_enable() xen_irq_enable() + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +static inline void raw_safe_halt(void) +{ + xen_safe_halt(); +} + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +static inline void halt(void) +{ + xen_halt(); +} + +/* + * For spinlocks, etc: + */ +#define __raw_local_irq_save() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_local_irq_disable(); \ + \ + flags; \ +}) #else -# include "irqflags_64.h" + +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_X86_64 +# define __REG_si %rsi +# define __CPU_num %gs:pda_cpunumber +#else +# define __REG_si %esi +# define __CPU_num TI_cpu(%ebp) +#endif + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + add HYPERVISOR_shared_info,__REG_si +#else +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS + +#ifndef CONFIG_X86_64 +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSCALL_RET __ENABLE_INTERRUPTS ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + __TEST_PENDING ; \ + jnz 14f /* process more events if necessary... 
*/ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: __DISABLE_INTERRUPTS ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PERCPU, %ecx ; \ + push %esp ; \ + mov %ecx, %fs ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#endif + + +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) + +/* + * makes the traced hardirq state match with the machine state + * + * should be a rarely used function, only in places where its + * otherwise impossible to know the irq state, like in traps. + */ +static inline void trace_hardirqs_fixup_flags(unsigned long flags) +{ + if (raw_irqs_disabled_flags(flags)) + trace_hardirqs_off(); + else + trace_hardirqs_on(); +} + +#define trace_hardirqs_fixup() \ + trace_hardirqs_fixup_flags(__raw_local_save_flags()) + +#else + +#ifdef CONFIG_X86_64 +/* + * Currently paravirt can't handle swapgs nicely when we + * don't have a stack we can rely on (such as a user space + * stack). So we either find a way around these or just fault + * and emulate if a guest tries to call swapgs directly. + * + * Either way, this is a good way to document that we don't + * have a reliable stack. x86_64 only. + */ +#define SWAPGS_UNSAFE_STACK swapgs +#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk +#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + ENABLE_INTERRUPTS(CLBR_NONE); \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + __DISABLE_INTERRUPTS; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON +# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/irqflags_32.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,214 +0,0 @@ -/* - * include/asm-i386/irqflags.h - * - * IRQ flags handling - * - * This file gets included from lowlevel asm headers too, to provide - * wrapped versions of the local_irq_*() APIs, based on the - * raw_local_irq_*() functions from the lowlevel headers. 
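/*
 * Illustrative sketch, not part of the patch: with the unified Xen
 * definitions above, the saved "flags" value is the vCPU's
 * evtchn_upcall_mask rather than EFLAGS, so a non-zero value means event
 * delivery (i.e. "interrupts") is disabled.  A caller looks the same as on
 * bare metal:
 */
static void example_critical_section(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* sets evtchn_upcall_mask = 1 */
	/* ... work that must not be interrupted by event upcalls ... */
	raw_local_irq_restore(flags);	/* may end up in force_evtchn_callback() */
}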
- */ -#ifndef _ASM_IRQFLAGS_H -#define _ASM_IRQFLAGS_H - -#ifndef __ASSEMBLY__ -#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) - -#define xen_restore_fl(f) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ - barrier(); /* unmask then check (avoid races) */\ - if (unlikely(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ - } \ -} while (0) - -#define xen_irq_disable() \ -do { \ - current_vcpu_info()->evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - -#define xen_irq_enable() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if (unlikely(_vcpu->evtchn_upcall_pending)) \ - force_evtchn_callback(); \ -} while (0) - -void xen_safe_halt(void); - -void xen_halt(void); - -/* - * The use of 'barrier' in the following reflects their use as local-lock - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following - * critical operations are executed. All critical operations must complete - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also - * includes these barriers, for example. - */ - -#define __raw_local_save_flags() xen_save_fl() - -#define raw_local_irq_restore(flags) xen_restore_fl(flags) - -#define raw_local_irq_disable() xen_irq_disable() - -#define raw_local_irq_enable() xen_irq_enable() - -/* - * Used in the idle loop; sti takes one instruction cycle - * to complete: - */ -static inline void raw_safe_halt(void) -{ - xen_safe_halt(); -} - -/* - * Used when interrupts are already enabled or to - * shutdown the processor: - */ -static inline void halt(void) -{ - xen_halt(); -} - -/* - * For spinlocks, etc: - */ -#define __raw_local_irq_save() \ -({ \ - unsigned long flags = __raw_local_save_flags(); \ - \ - raw_local_irq_disable(); \ - \ - flags; \ -}) - -#else -/* Offsets into shared_info_t. */ -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 6 - -#ifdef CONFIG_SMP -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ - shl $sizeof_vcpu_shift,%esi ; \ - addl HYPERVISOR_shared_info,%esi -#else -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi -#endif - -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) -#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ - __DISABLE_INTERRUPTS -#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ - __ENABLE_INTERRUPTS -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \ -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ - __TEST_PENDING ; \ - jnz 14f /* process more events if necessary... 
*/ ; \ - movl PT_ESI(%esp), %esi ; \ - sysexit ; \ -14: __DISABLE_INTERRUPTS ; \ - TRACE_IRQS_OFF ; \ -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ - mov $__KERNEL_PERCPU, %ecx ; \ - push %esp ; \ - mov %ecx, %fs ; \ - call evtchn_do_upcall ; \ - add $4,%esp ; \ - jmp ret_from_intr -#define INTERRUPT_RETURN iret -#endif /* __ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) - -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return (flags != 0); -} - -#define raw_irqs_disabled() \ -({ \ - unsigned long flags = __raw_local_save_flags(); \ - \ - raw_irqs_disabled_flags(flags); \ -}) - -/* - * makes the traced hardirq state match with the machine state - * - * should be a rarely used function, only in places where its - * otherwise impossible to know the irq state, like in traps. - */ -static inline void trace_hardirqs_fixup_flags(unsigned long flags) -{ - if (raw_irqs_disabled_flags(flags)) - trace_hardirqs_off(); - else - trace_hardirqs_on(); -} - -#define trace_hardirqs_fixup() \ - trace_hardirqs_fixup_flags(__raw_local_save_flags()) -#endif /* __ASSEMBLY__ */ - -/* - * Do the CPU's IRQ-state tracing from assembly code. We call a - * C function, so save all the C-clobbered registers: - */ -#ifdef CONFIG_TRACE_IRQFLAGS - -# define TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -# define TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCKDEP_SYS_EXIT \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call lockdep_sys_exit; \ - popl %edx; \ - popl %ecx; \ - popl %eax; -#else -# define LOCKDEP_SYS_EXIT -#endif - -#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/irqflags_64.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,178 +0,0 @@ -/* - * include/asm-x86_64/irqflags.h - * - * IRQ flags handling - * - * This file gets included from lowlevel asm headers too, to provide - * wrapped versions of the local_irq_*() APIs, based on the - * raw_local_irq_*() functions from the lowlevel headers. - */ -#ifndef _ASM_IRQFLAGS_H -#define _ASM_IRQFLAGS_H -#include <asm/processor-flags.h> - -#ifndef __ASSEMBLY__ -/* - * Interrupt control: - */ - -/* - * The use of 'barrier' in the following reflects their use as local-lock - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following - * critical operations are executed. All critical operations must complete - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also - * includes these barriers, for example. 
- */ - -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) - -#define raw_local_save_flags(flags) \ - do { (flags) = __raw_local_save_flags(); } while (0) - -#define raw_local_irq_restore(x) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - } \ -} while (0) - -#ifdef CONFIG_X86_VSMP - -/* - * Interrupt control for the VSMP architecture: - */ - -static inline void raw_local_irq_disable(void) -{ - unsigned long flags = __raw_local_save_flags(); - - raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); -} - -static inline void raw_local_irq_enable(void) -{ - unsigned long flags = __raw_local_save_flags(); - - raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); -} - -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC); -} - -#else /* CONFIG_X86_VSMP */ - -#define raw_local_irq_disable() \ -do { \ - current_vcpu_info()->evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - -#define raw_local_irq_enable() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - _vcpu = current_vcpu_info(); \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ -} while (0) - -static inline int raw_irqs_disabled_flags(unsigned long flags) -{ - return (flags != 0); -} - -#endif - -/* - * For spinlocks, etc.: - */ - -#define __raw_local_irq_save() \ -({ \ - unsigned long flags = __raw_local_save_flags(); \ - \ - raw_local_irq_disable(); \ - \ - flags; \ -}) - -#define raw_local_irq_save(flags) \ - do { (flags) = __raw_local_irq_save(); } while (0) - -#define raw_irqs_disabled() \ -({ \ - unsigned long flags = __raw_local_save_flags(); \ - \ - raw_irqs_disabled_flags(flags); \ -}) - -/* - * makes the traced hardirq state match with the machine state - * - * should be a rarely used function, only in places where its - * otherwise impossible to know the irq state, like in traps. 
- */ -static inline void trace_hardirqs_fixup_flags(unsigned long flags) -{ - if (raw_irqs_disabled_flags(flags)) - trace_hardirqs_off(); - else - trace_hardirqs_on(); -} - -#define trace_hardirqs_fixup() \ - trace_hardirqs_fixup_flags(__raw_local_save_flags()) -/* - * Used in the idle loop; sti takes one instruction cycle - * to complete: - */ -void xen_safe_halt(void); -static inline void raw_safe_halt(void) -{ - xen_safe_halt(); -} - -/* - * Used when interrupts are already enabled or to - * shutdown the processor: - */ -void xen_halt(void); -static inline void halt(void) -{ - xen_halt(); -} - -#else /* __ASSEMBLY__: */ -# ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk -# else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -# endif -# ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -# define LOCKDEP_SYS_EXIT_IRQ \ - TRACE_IRQS_ON; \ - sti; \ - SAVE_REST; \ - LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ - cli; \ - TRACE_IRQS_OFF; -# else -# define LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ -# endif -#endif - -#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/maddr_32.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/maddr_32.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,6 +1,7 @@ #ifndef _I386_MADDR_H #define _I386_MADDR_H +#include <asm/bug.h> #include <xen/features.h> #include <xen/interface/xen.h> @@ -151,25 +152,9 @@ static inline paddr_t pte_machine_to_phy phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); return phys; } -#endif - -#ifdef CONFIG_X86_PAE -#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } ) -extern unsigned long long __supported_pte_mask; -static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) -{ - pte_t pte; - - pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \ - (pgprot_val(pgprot) >> 32); - pte.pte_high &= (__supported_pte_mask >> 32); - pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \ - __supported_pte_mask; - return pte; -} #else -#define __pte_ma(x) ((pte_t) { (x) } ) -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_phys_to_machine phys_to_machine +#define pte_machine_to_phys machine_to_phys #endif #else /* !CONFIG_XEN */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/maddr_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,6 +1,7 @@ #ifndef _X86_64_MADDR_H #define _X86_64_MADDR_H +#include <asm/bug.h> #include <xen/features.h> #include <xen/interface/xen.h> @@ -16,6 +17,7 @@ typedef unsigned long maddr_t; #ifdef CONFIG_XEN extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; #undef machine_to_phys_mapping extern unsigned long *machine_to_phys_mapping; @@ -25,7 +27,7 @@ static inline unsigned long pfn_to_mfn(u { if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - BUG_ON(end_pfn && pfn >= end_pfn); + BUG_ON(max_mapnr && pfn >= max_mapnr); return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; } @@ -33,7 +35,7 @@ static inline int phys_to_machine_mappin { if (xen_feature(XENFEAT_auto_translated_physmap)) return 1; - BUG_ON(end_pfn && pfn >= end_pfn); + BUG_ON(max_mapnr && pfn >= max_mapnr); return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); } @@ -45,7 +47,7 @@ static inline unsigned long mfn_to_pfn(u return mfn; if (unlikely((mfn >> 
machine_to_phys_order) != 0)) - return end_pfn; + return max_mapnr; /* The array access can fail (e.g., device space beyond end of RAM). */ asm ( @@ -60,7 +62,7 @@ static inline unsigned long mfn_to_pfn(u " .quad 1b,3b\n" ".previous" : "=r" (pfn) - : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) ); + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); return pfn; } @@ -88,16 +90,16 @@ static inline unsigned long mfn_to_pfn(u static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { unsigned long pfn = mfn_to_pfn(mfn); - if ((pfn < end_pfn) + if ((pfn < max_mapnr) && !xen_feature(XENFEAT_auto_translated_physmap) && (phys_to_machine_mapping[pfn] != mfn)) - return end_pfn; /* force !pfn_valid() */ + return max_mapnr; /* force !pfn_valid() */ return pfn; } static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - BUG_ON(end_pfn && pfn >= end_pfn); + BUG_ON(max_mapnr && pfn >= max_mapnr); if (xen_feature(XENFEAT_auto_translated_physmap)) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); return; @@ -135,9 +137,6 @@ static inline paddr_t pte_machine_to_phy return phys; } -#define __pte_ma(x) ((pte_t) { (x) } ) -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask) - #else /* !CONFIG_XEN */ #define pfn_to_mfn(pfn) (pfn) --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/mmu_context_32.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/mmu_context_32.h 2009-11-06 10:51:25.000000000 +0100 @@ -51,8 +51,6 @@ static inline void __prepare_arch_switch : : "r" (0) ); } -void leave_mm(unsigned long cpu); - static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/mmu_context_64.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/mmu_context_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -62,12 +62,6 @@ extern void mm_pin(struct mm_struct *mm) extern void mm_unpin(struct mm_struct *mm); void mm_pin_all(void); -static inline void load_cr3(pgd_t *pgd) -{ - asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) : - "memory"); -} - static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -97,7 +91,7 @@ static inline void switch_mm(struct mm_s op++; if (unlikely(next->context.ldt != prev->context.ldt)) { - /* load_LDT_nolock(&next->context, cpu) */ + /* load_LDT_nolock(&next->context) */ op->cmd = MMUEXT_SET_LDT; op->arg1.linear_addr = (unsigned long)next->context.ldt; op->arg2.nr_ents = next->context.size; @@ -110,7 +104,7 @@ static inline void switch_mm(struct mm_s else { write_pda(mmu_state, TLBSTATE_OK); if (read_pda(active_mm) != next) - out_of_line_bug(); + BUG(); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. 
We must reload CR3 @@ -118,7 +112,7 @@ static inline void switch_mm(struct mm_s */ load_cr3(next->pgd); xen_new_user_pt(__pa(__user_pgd(next->pgd))); - load_LDT_nolock(&next->context, cpu); + load_LDT_nolock(&next->context); } } #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pci.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pci.h 2009-11-06 10:51:25.000000000 +0100 @@ -71,6 +71,7 @@ extern int pci_mmap_page_range(struct pc #ifdef CONFIG_PCI +extern void early_quirks(void); static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, unsigned long *strategy_parameter) @@ -78,9 +79,10 @@ static inline void pci_dma_burst_advice( *strat = PCI_DMA_BURST_INFINITY; *strategy_parameter = ~0UL; } +#else +static inline void early_quirks(void) { } #endif - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 @@ -95,6 +97,19 @@ static inline void pci_dma_burst_advice( /* generic pci stuff */ #include <asm-generic/pci.h> +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + + return sd->node; +} +static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) +{ + return node_to_cpumask(__pcibus_to_node(bus)); +} +#endif #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgalloc_32.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgalloc_32.h 2009-11-06 10:51:25.000000000 +0100 @@ -3,69 +3,109 @@ #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ +#include <linux/pagemap.h> +#include <asm/tlb.h> +#include <asm-generic/tlb.h> #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ #define paravirt_alloc_pt(mm, pfn) do { } while (0) -#define paravirt_alloc_pd(pfn) do { } while (0) -#define paravirt_alloc_pd(pfn) do { } while (0) +#define paravirt_alloc_pd(mm, pfn) do { } while (0) #define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0) #define paravirt_release_pt(pfn) do { } while (0) #define paravirt_release_pd(pfn) do { } while (0) -#define pmd_populate_kernel(mm, pmd, pte) \ -do { \ - paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ -} while (0) - -#define pmd_populate(mm, pmd, pte) \ -do { \ - unsigned long pfn = page_to_pfn(pte); \ - paravirt_alloc_pt(mm, pfn); \ - if (PagePinned(virt_to_page((mm)->pgd))) { \ - if (!PageHighMem(pte)) \ - BUG_ON(HYPERVISOR_update_va_mapping( \ - (unsigned long)__va(pfn << PAGE_SHIFT), \ - pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \ - else if (!test_and_set_bit(PG_pinned, &pte->flags)) \ - kmap_flush_unused(); \ - set_pmd(pmd, \ - __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \ - } else \ - *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \ -} while (0) +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + + paravirt_alloc_pt(mm, pfn); + if (PagePinned(virt_to_page(mm->pgd))) { + if (!PageHighMem(pte)) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); + else if (!test_and_set_bit(PG_pinned, &pte->flags)) + kmap_flush_unused(); + set_pmd(pmd, __pmd(((pmdval_t)pfn << 
PAGE_SHIFT) | _PAGE_TABLE)); + } else + *pmd = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); +} +#define pmd_pgtable(pmd) pmd_page(pmd) /* * Allocate and free page tables. */ +extern void pgd_test_and_unpin(pgd_t *); extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); free_page((unsigned long)pte); } -extern void pte_free(struct page *pte); +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, pgtable_t pte) +{ + __pte_free(pte); +} + -#define __pte_free_tlb(tlb,pte) \ -do { \ - paravirt_release_pt(page_to_pfn(pte)); \ - tlb_remove_page((tlb),(pte)); \ -} while (0) +extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); #ifdef CONFIG_X86_PAE /* * In the PAE case we free the pmds as part of the pgd. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) -#define pud_populate(mm, pmd, pte) BUG() -#endif +extern pmd_t *pmd_alloc_one(struct mm_struct *, unsigned long); + +extern void __pmd_free(pgtable_t); +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + struct page *page = virt_to_page(pmd); + unsigned long pfn = page_to_pfn(page); + + paravirt_alloc_pd(mm, pfn); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + if (PagePinned(virt_to_page(mm->pgd))) { + BUG_ON(PageHighMem(page)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + } else + *pudp = __pud(__pa(pmd) | _PAGE_PRESENT); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... 
+ */ + if (mm == current->active_mm) + xen_tlb_flush(); +} +#endif /* CONFIG_X86_PAE */ #endif /* _I386_PGALLOC_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgalloc_64.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgalloc_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -6,30 +6,13 @@ #include <linux/mm.h> #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ -#include <xen/features.h> -void make_page_readonly(void *va, unsigned int feature); -void make_page_writable(void *va, unsigned int feature); -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); -void make_pages_writable(void *va, unsigned int nr, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +void early_make_page_readonly(void *va, unsigned int feature); #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) -{ - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) -{ - if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { - BUG_ON(HYPERVISOR_update_va_mapping( - (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); - } else { - *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); - } -} +#define pmd_populate_kernel(mm, pmd, pte) \ + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { @@ -63,53 +46,58 @@ static inline void pgd_populate(struct m } } -extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); -extern void pte_free(struct page *pte); +#define pmd_pgtable(pmd) pmd_page(pmd) -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - struct page *pg; - - pg = pte_alloc_one(mm, addr); - return pg ? page_address(pg) : NULL; + if (unlikely(PagePinned(virt_to_page((mm)->pgd)))) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + } else { + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); + } } -static inline void pmd_free(pmd_t *pmd) +extern void __pmd_free(pgtable_t); +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pte_free(virt_to_page(pmd)); + __pmd_free(virt_to_page(pmd)); } +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - struct page *pg; - - pg = pte_alloc_one(mm, addr); - return pg ? 
page_address(pg) : NULL; + return (pud_t *)pmd_alloc_one(mm, addr); } -static inline void pud_free(pud_t *pud) +static inline void pud_free(struct mm_struct *mm, pud_t *pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pte_free(virt_to_page(pud)); + __pmd_free(virt_to_page(pud)); } static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_add(&page->lru, &pgd_list); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } static inline void pgd_list_del(pgd_t *pgd) { struct page *page = virt_to_page(pgd); + unsigned long flags; - spin_lock(&pgd_lock); + spin_lock_irqsave(&pgd_lock, flags); list_del(&page->lru); - spin_unlock(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } extern void pgd_test_and_unpin(pgd_t *); @@ -145,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm return pgd; } -static inline void pgd_free(pgd_t *pgd) +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_test_and_unpin(pgd); pgd_list_del(pgd); @@ -161,17 +149,30 @@ static inline pte_t *pte_alloc_one_kerne return pte; } +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); + /* Should really implement gc for free page table pages. This could be done with a reference count in struct page. */ -static inline void pte_free_kernel(pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); make_page_writable(pte, XENFEAT_writable_page_tables); free_page((unsigned long)pte); } -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, pgtable_t pte) +{ + __pte_free(pte); +} + +#define __pte_free_tlb(tlb,pte) \ +do { \ + pgtable_page_dtor((pte)); \ + tlb_remove_page((tlb), (pte)); \ +} while (0) + #define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) #define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgtable.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,5 +1,467 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 +#define _PAGE_BIT_RW 1 +#define _PAGE_BIT_USER 2 +#define _PAGE_BIT_PWT 3 +#define _PAGE_BIT_PCD 4 +#define _PAGE_BIT_ACCESSED 5 +#define _PAGE_BIT_DIRTY 6 +#define _PAGE_BIT_FILE 6 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and + * has no associated page struct. 
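/*
 * Illustrative sketch, not part of the patch: _PAGE_BIT_IO (bit 9, described
 * above) marks a mapping of device or foreign memory with no struct page
 * behind it, so code that wants to touch the backing page struct could guard
 * itself roughly like this (the mask is equivalent to the _PAGE_IO define
 * that follows):
 */
static inline int example_pte_has_page_struct(pte_t pte)
{
	return pte_present(pte) && !(__pte_val(pte) & (1UL << _PAGE_BIT_IO));
}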
*/ +#define _PAGE_BIT_UNUSED2 10 /* available for programmer */ +#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* + * Note: we use _AC(1, L) instead of _AC(1, UL) so that we get a + * sign-extended value on 32-bit with all 1's in the upper word, + * which preserves the upper pte values on 64-bit ptes: + */ +#define _PAGE_PRESENT (_AC(1, L)<<_PAGE_BIT_PRESENT) +#define _PAGE_RW (_AC(1, L)<<_PAGE_BIT_RW) +#define _PAGE_USER (_AC(1, L)<<_PAGE_BIT_USER) +#define _PAGE_PWT (_AC(1, L)<<_PAGE_BIT_PWT) +#define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */ +#define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */ +#define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO) +#define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2) +#define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3) +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE) + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX) +#else +#define _PAGE_NX 0 +#endif + +/* If _PAGE_PRESENT is clear, we use these: */ +#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; + pte_present gives true */ + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) + +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) + +#ifdef CONFIG_X86_32 +#define _PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) +#define _PAGE_KERNEL (_PAGE_KERNEL_EXEC | _PAGE_NX) + +#ifndef __ASSEMBLY__ +extern pteval_t __PAGE_KERNEL, __PAGE_KERNEL_EXEC; +#endif /* __ASSEMBLY__ */ +#else +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) +#endif + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) 
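/*
 * Illustrative sketch, not part of the patch: the "_AC(1, L) instead of
 * _AC(1, UL)" note above matters when a 32-bit flag constant is inverted and
 * then widened to a 64-bit PAE pte value -- sign extension keeps the upper
 * word of the pte, zero extension would clear it.  A user-space check of
 * that effect:
 */
#include <stdint.h>
#include <assert.h>

int main(void)
{
	int32_t  sflag = 1 << 6;	/* signed stand-in for a 32-bit _PAGE_* flag */
	uint32_t uflag = 1u << 6;	/* unsigned variant */
	uint64_t pte   = 0xffffffff00000040ULL;	/* hypothetical 64-bit pte */

	/* ~signed sign-extends: bits 32..63 of the pte survive the mask */
	assert((pte & (uint64_t)(int64_t)~sflag) == 0xffffffff00000000ULL);
	/* ~unsigned zero-extends: the upper word would be lost */
	assert((pte & (uint64_t)~uflag) == 0);
	return 0;
}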
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +/* + * We don't support GLOBAL page in xenolinux64 + */ +#define MAKE_GLOBAL(x) __pgprot((x)) + +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX MAKE_GLOBAL(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS MAKE_GLOBAL(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_EXEC MAKE_GLOBAL(__PAGE_KERNEL_LARGE_EXEC) +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +#ifndef __ASSEMBLY__ + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. 
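/*
 * Illustrative note, not part of the patch: the __P000..__S111 entries above
 * are consumed through the generic protection_map[] table (mm/mmap.c), which
 * is conventionally indexed by the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits
 * of vma->vm_flags, with the shared variants in the upper half:
 */
pgprot_t example_protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};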
+ */ +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } +static inline int pte_global(pte_t pte) { return 0; } +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); } + +static inline int pmd_large(pmd_t pte) { + return (__pmd_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == + (_PAGE_PSE|_PAGE_PRESENT); +} + +static inline pte_t pte_mkclean(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_DIRTY); } +static inline pte_t pte_mkold(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_ACCESSED); } +static inline pte_t pte_wrprotect(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_RW); } +static inline pte_t pte_mkexec(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_NX); } +static inline pte_t pte_mkdirty(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_DIRTY); } +static inline pte_t pte_mkyoung(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_ACCESSED); } +static inline pte_t pte_mkwrite(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_RW); } +static inline pte_t pte_mkhuge(pte_t pte) { return __pte_ma(__pte_val(pte) | _PAGE_PSE); } +static inline pte_t pte_clrhuge(pte_t pte) { return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); } +static inline pte_t pte_mkglobal(pte_t pte) { return pte; } +static inline pte_t pte_clrglobal(pte_t pte) { return pte; } + +extern pteval_t __supported_pte_mask; + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte_ma((((phys_addr_t)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte); + + val &= _PAGE_CHG_MASK; + val |= pgprot_val(newprot) & __supported_pte_mask; + + return __pte(val); +} + +#define pte_pgprot(x) __pgprot(pte_val(x) & (0xfff | _PAGE_NX)) + +#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) + +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) + +#define set_pte_atomic(ptep, pte) \ + xen_set_pte_atomic(ptep, pte) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) + +#endif /* __ASSEMBLY__ */ + #ifdef CONFIG_X86_32 # include "pgtable_32.h" #else # include "pgtable_64.h" #endif + +#ifndef __ASSEMBLY__ + +enum { 
+ PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, +}; + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. + */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} + +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + __xen_pte_clear(ptep); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. 
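/*
 * Illustrative usage sketch, not part of the patch: lookup_address(),
 * declared above, walks the kernel page tables for a virtual address and
 * reports the mapping level, e.g. to check whether an address is backed by
 * a present 4K mapping:
 */
static int example_mapped_as_4k(unsigned long addr)
{
	unsigned int level;
	pte_t *ptep = lookup_address(addr, &level);

	return ptep && level == PG_LEVEL_4K && pte_present(*ptep);
}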
+ */ +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ +({ \ + int __changed = !pte_same(*(ptep), entry); \ + if (__changed && (dirty)) { \ + if ( likely((vma)->vm_mm == current->mm) ) { \ + BUG_ON(HYPERVISOR_update_va_mapping(address, \ + entry, \ + uvm_multi((vma)->vm_mm->cpu_vm_mask) | \ + UVMF_INVLPG)); \ + } else { \ + xen_l1_entry_update(ptep, entry); \ + flush_tlb_page(vma, address); \ + } \ + } \ + __changed; \ +}) + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ + int __ret = 0; \ + if (pte_young(*(ptep))) \ + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ + &(ptep)->pte); \ + if (__ret) \ + pte_update((vma)->vm_mm, addr, ptep); \ + __ret; \ +}) + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young(vma, address, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __young = pte_young(__pte); \ + __pte = pte_mkold(__pte); \ + if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + else if (__young) \ + (ptep)->pte_low = __pte.pte_low; \ + __young; \ +}) + +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + uvm_multi((vma)->vm_mm->cpu_vm_mask) | \ + UVMF_INVLPG))) { \ + __xen_pte_clear(__ptep); \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte) + && (mm != &init_mm + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { + pte = xen_ptep_get_and_clear(ptep, pte); + pte_update(mm, addr, ptep); + } + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ + ((full) ? 
({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!PagePinned(virt_to_page((mm)->pgd))) \ + __xen_pte_clear(__ptep); \ + else if (!pte_none(__res)) \ + xen_l1_entry_update(__ptep, __pte(0)); \ + __res; \ + }) : \ + ptep_get_and_clear(mm, addr, ptep)) + +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int); + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) + +#define arbitrary_virt_to_machine(va) \ +({ \ + unsigned int __lvl; \ + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ + (((maddr_t)pte_mfn(*__ptep) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))); \ +}) + +#ifdef CONFIG_HIGHPTE +#include <asm/io.h> +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#include <asm-generic/pgtable.h> + +#include <xen/features.h> +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); + +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgtable-3level.h 2009-11-06 10:51:07.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable-3level.h 2009-11-06 10:51:25.000000000 +0100 @@ -18,16 +18,18 @@ printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT) -#define pud_none(pud) 0 -#define pud_bad(pud) 0 -#define pud_present(pud) 1 -/* - * All present pages with !NX bit are kernel-executable: - */ -static inline int pte_exec_kernel(pte_t pte) +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} +static inline int pud_bad(pud_t pud) { - return !(__pte_val(pte) & _PAGE_NX); + return (__pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0; +} +static inline int pud_present(pud_t pud) +{ + return __pud_val(pud) & _PAGE_PRESENT; } /* Rules for using set_pte: the pte being assigned *must* be @@ -44,14 +46,6 @@ static inline void xen_set_pte(pte_t *pt ptep->pte_low = pte.pte_low; } -static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , 
pte_t pte) -{ - if ((mm != current->mm && mm != &init_mm) || - HYPERVISOR_update_va_mapping(addr, pte, 0)) - xen_set_pte(ptep, pte); -} - static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { set_64bit((unsigned long long *)(ptep),__pte_val(pte)); @@ -70,14 +64,11 @@ static inline void xen_set_pud(pud_t *pu * entry, so clear the bottom half first and enforce ordering with a compiler * barrier. */ -static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static inline void __xen_pte_clear(pte_t *ptep) { - if ((mm != current->mm && mm != &init_mm) - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { - ptep->pte_low = 0; - smp_wmb(); - ptep->pte_high = 0; - } + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; } static inline void xen_pmd_clear(pmd_t *pmd) @@ -85,21 +76,25 @@ static inline void xen_pmd_clear(pmd_t * xen_l2_entry_update(pmd, __pmd(0)); } -#define set_pte(ptep, pte) xen_set_pte(ptep, pte) -#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) -#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte) -#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) -#define set_pud(pudp, pud) xen_set_pud(pudp, pud) -#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) -#define pmd_clear(pmd) xen_pmd_clear(pmd) +static inline void pud_clear(pud_t *pudp) +{ + pgdval_t pgd; + + set_pud(pudp, __pud(0)); -/* - * Pentium-II erratum A13: in PAE mode we explicitly have to flush - * the TLB via cr3 if the top-level pgd is changed... - * We do not let the generic code free and clear pgd entries due to - * this erratum. - */ -static inline void pud_clear (pud_t * pud) { } + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Make sure the pud entry we're updating is within the + * current pgd to avoid unnecessary TLB flushes. + */ + pgd = read_cr3(); + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + xen_tlb_flush(); +} #define pud_page(pud) \ ((struct page *) __va(pud_val(pud) & PAGE_MASK)) @@ -128,24 +123,6 @@ static inline pte_t xen_ptep_get_and_cle #define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) #endif -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(vma, addr, ptep) \ -({ \ - pte_t *__ptep = (ptep); \ - pte_t __res = *__ptep; \ - if (!pte_none(__res) && \ - ((vma)->vm_mm != current->mm || \ - HYPERVISOR_update_va_mapping(addr, __pte(0), \ - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ - UVMF_INVLPG|UVMF_MULTI))) { \ - __ptep->pte_low = 0; \ - smp_wmb(); \ - __ptep->pte_high = 0; \ - flush_tlb_page(vma, addr); \ - } \ - __res; \ -}) - #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t a, pte_t b) { @@ -168,26 +145,12 @@ static inline int pte_none(pte_t pte) mfn_to_local_pfn(__pte_mfn(_pte)) : \ __pte_mfn(_pte)) -extern unsigned long long __supported_pte_mask; - -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) -{ - return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); -} - -static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) -{ - return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | - pgprot_val(pgprot)) & __supported_pte_mask); -} - /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. 
*/ #define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) +#define pgoff_to_pte(off) ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ @@ -195,8 +158,6 @@ static inline pmd_t pfn_pmd(unsigned lon #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) -#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) - -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) #endif /* _I386_PGTABLE_3LEVEL_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgtable_32.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable_32.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,8 +1,6 @@ #ifndef _I386_PGTABLE_H #define _I386_PGTABLE_H -#include <asm/hypervisor.h> - /* * The Linux memory management assumes a three-level page table setup. On * the i386, we use that, but "fold" the mid level into the top-level page @@ -25,20 +23,10 @@ struct vm_area_struct; -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -extern unsigned long empty_zero_page[1024]; extern pgd_t *swapper_pg_dir; -extern struct kmem_cache *pmd_cache; -extern spinlock_t pgd_lock; -extern struct page *pgd_list; -void check_pgt_cache(void); -void pmd_ctor(struct kmem_cache *, void *); -void pgtable_cache_init(void); +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } void paging_init(void); @@ -58,16 +46,9 @@ void paging_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0 - #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) -#define TWOLEVEL_PGDIR_SHIFT 22 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that @@ -78,121 +59,19 @@ void paging_init(void); #define VMALLOC_OFFSET (8*1024*1024) #define VMALLOC_START (((unsigned long) high_memory + \ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif - -/* - * _PAGE_PSE set in the page directory entry just means that - * the page directory entry points directly to a 4MB-aligned block of - * memory. - */ -#define _PAGE_BIT_PRESENT 0 -#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. 
*/ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */ -#define _PAGE_BIT_UNUSED2 10 -#define _PAGE_BIT_UNUSED3 11 -#define _PAGE_BIT_NX 63 - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */ -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ -/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */ -#define _PAGE_UNUSED2 0x400 -#define _PAGE_UNUSED3 0x800 - -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; - pte_present gives true */ #ifdef CONFIG_X86_PAE -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX) +#define LAST_PKMAP 512 #else -#define _PAGE_NX 0 +#define LAST_PKMAP 1024 #endif -/* Mapped page is I/O or foreign and has no associated page struct. */ -#define _PAGE_IO 0x200 +#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) - -#define PAGE_NONE \ - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED \ - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) - -#define PAGE_SHARED_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY \ - PAGE_COPY_NOEXEC -#define PAGE_READONLY \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) - -#define _PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) -#define _PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) - -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) -#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) - -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) - -/* - * The i386 can't do page protection for execute, and considers that - * the same are read. Also, write permissions imply read permissions. - * This is the closest we can get.. 
- */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC +#ifdef CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#endif /* * Define this if things work differently on an i386 and an i486: @@ -221,28 +100,6 @@ extern unsigned long pg0[]; #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } -static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } - -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } - -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } - #ifdef CONFIG_X86_PAE # include <asm/pgtable-3level.h> #else @@ -250,111 +107,6 @@ static inline pte_t pte_mkhuge(pte_t pte #endif /* - * Rules for using pte_update - it must be called after any PTE update which - * has not been done using the set_pte / clear_pte interfaces. It is used by - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE - * updates should either be sets, clears, or set_pte_atomic for P->P - * transitions, which means this hook should only be called for user PTEs. - * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. The notification can optionally be delayed - * until the TLB flush event by using the pte_update_defer form of the - * interface, but care must be taken to assure that the flush happens while - * still holding the same page table lock so that the shadow and primary pages - * do not become out of sync on SMP. - */ -#define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) - -/* local pte updates need not use xchg for locking */ -static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) -{ - xen_set_pte(ptep, __pte(0)); - return res; -} - -/* - * We only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware - * will do the accessed bit for us, and we don't want to - * race with other CPU's that might be updating the dirty - * bit at the same time. 
- */ -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ -({ \ - int __changed = !pte_same(*(ptep), entry); \ - if (__changed && (dirty)) { \ - if ( likely((vma)->vm_mm == current->mm) ) { \ - BUG_ON(HYPERVISOR_update_va_mapping(address, \ - entry, \ - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ - UVMF_INVLPG|UVMF_MULTI)); \ - } else { \ - xen_l1_entry_update(ptep, entry); \ - flush_tlb_page(vma, address); \ - } \ - } \ - __changed; \ -}) - -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ - int __ret = 0; \ - if (pte_young(*(ptep))) \ - __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ - &(ptep)->pte_low); \ - if (__ret) \ - pte_update((vma)->vm_mm, addr, ptep); \ - __ret; \ -}) - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(vma, address, ptep) \ -({ \ - pte_t __pte = *(ptep); \ - int __young = pte_young(__pte); \ - __pte = pte_mkold(__pte); \ - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ - else if (__young) \ - (ptep)->pte_low = __pte.pte_low; \ - __young; \ -}) - -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_none(pte) - && (mm != &init_mm - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { - pte = xen_ptep_get_and_clear(ptep, pte); - pte_update(mm, addr, ptep); - } - return pte; -} - -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(mm, addr, ptep, full) \ - ((full) ? ({ \ - pte_t __res = *(ptep); \ - if (PagePinned(virt_to_page((mm)->pgd))) \ - xen_l1_entry_update(ptep, __pte(0)); \ - else \ - *(ptep) = __pte(0); \ - __res; \ - }) : \ - ptep_get_and_clear(mm, addr, ptep)) - -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - if (pte_write(pte)) - set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); -} - -/* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * * dst - pointer to pgd range anwhere on a pgd page @@ -383,26 +135,6 @@ static inline void clone_pgd_range(pgd_t #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - /* - * Since this might change the present bit (which controls whether - * a pte_t object has undergone p2m translation), we must use - * pte_val() on the input pte and __pte() for the return value. - */ - paddr_t pteval = pte_val(pte); - - pteval &= _PAGE_CHG_MASK; - pteval |= pgprot_val(newprot); -#ifdef CONFIG_X86_PAE - pteval &= __supported_pte_mask; -#endif - return __pte(pteval); -} - -#define pmd_large(pmd) \ -((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) - /* * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] * @@ -424,6 +156,8 @@ static inline pte_t pte_modify(pte_t pte */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) +static inline int pud_large(pud_t pud) { return 0; } + /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] * @@ -449,26 +183,6 @@ static inline pte_t pte_modify(pte_t pte #define pmd_page_vaddr(pmd) \ ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) -/* - * Helper function that returns the kernel pagetable entry controlling - * the virtual address 'address'. NULL means no pagetable entry present. 
- * NOTE: the return type is pte_t but if the pmd is PSE then we return it - * as a pte too. - */ -extern pte_t *lookup_address(unsigned long address); - -/* - * Make a given kernel text page executable/non-executable. - * Returns the previous executability setting of that page (which - * is used to restore the previous state). Used by the SMP bootup code. - * NOTE: this is an __init function for security reasons. - */ -#ifdef CONFIG_X86_PAE - extern int set_kernel_exec(unsigned long vaddr, int enable); -#else - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} -#endif - #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) @@ -496,72 +210,22 @@ extern pte_t *lookup_address(unsigned lo */ #define update_mmu_cache(vma,address,pte) do { } while (0) -#include <xen/features.h> void make_lowmem_page_readonly(void *va, unsigned int feature); void make_lowmem_page_writable(void *va, unsigned int feature); -void make_page_readonly(void *va, unsigned int feature); -void make_page_writable(void *va, unsigned int feature); -void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); -void make_pages_writable(void *va, unsigned int nr, unsigned int feature); - -#define virt_to_ptep(va) \ -({ \ - pte_t *__ptep = lookup_address((unsigned long)(va)); \ - BUG_ON(!__ptep || !pte_present(*__ptep)); \ - __ptep; \ -}) - -#define arbitrary_virt_to_machine(va) \ - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ - | ((unsigned long)(va) & (PAGE_SIZE - 1))) - -#ifdef CONFIG_HIGHPTE -#include <asm/io.h> -struct page *kmap_atomic_to_page(void *); -#define ptep_to_machine(ptep) \ -({ \ - pte_t *__ptep = (ptep); \ - page_to_phys(kmap_atomic_to_page(__ptep)) \ - | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ -}) -#else -#define ptep_to_machine(ptep) virt_to_machine(ptep) -#endif #endif /* !__ASSEMBLY__ */ +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ #ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) -#endif /* CONFIG_FLATMEM */ - -int direct_remap_pfn_range(struct vm_area_struct *vma, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid); -int direct_kernel_remap_pfn_range(unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid); -int create_lookup_pte_addr(struct mm_struct *mm, - unsigned long address, - uint64_t *ptep); -int touch_pte_range(struct mm_struct *mm, - unsigned long address, - unsigned long size); - -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable); - -#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) +#else +#define kern_addr_valid(kaddr) (0) +#endif #define io_remap_pfn_range(vma,from,pfn,size,prot) \ direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) -#include <asm-generic/pgtable.h> - #endif /* _I386_PGTABLE_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/pgtable_64.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -13,49 +13,26 @@ #include <linux/threads.h> #include <linux/sched.h> #include <asm/pda.h> -#ifdef CONFIG_XEN -#include <asm/hypervisor.h> +#ifdef CONFIG_XEN extern pud_t level3_user_pgt[512]; extern void 
xen_init_pt(void); - -extern pte_t *lookup_address(unsigned long address); - -#define virt_to_ptep(va) \ -({ \ - pte_t *__ptep = lookup_address((unsigned long)(va)); \ - BUG_ON(!__ptep || !pte_present(*__ptep)); \ - __ptep; \ -}) - -#define arbitrary_virt_to_machine(va) \ - (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ - | ((unsigned long)(va) & (PAGE_SIZE - 1))) - -#define ptep_to_machine(ptep) virt_to_machine(ptep) #endif extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt extern void paging_init(void); -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) #endif /* !__ASSEMBLY__ */ +#define SHARED_KERNEL_PMD 1 + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ @@ -98,31 +75,63 @@ extern unsigned long empty_zero_page[PAG #define pgd_none(x) (!__pgd_val(x)) #define pud_none(x) (!__pud_val(x)) -static inline void set_pte(pte_t *dst, pte_t val) +struct mm_struct; + +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - *dst = val; + xen_set_pte(ptep, pte); } -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval)) +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) +{ + return __pte_ma(xchg(&xp->pte, 0)); +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif -static inline void pud_clear (pud_t * pud) +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) { - set_pud(pud, __pud(0)); + xen_l2_entry_update(pmdp, pmd); +} + +static inline void xen_pmd_clear(pmd_t *pmd) +{ + xen_set_pmd(pmd, xen_make_pmd(0)); +} + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +static inline void xen_pud_clear(pud_t *pud) +{ + xen_set_pud(pud, xen_make_pud(0)); } #define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) -static inline void pgd_clear (pgd_t * pgd) +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) { - set_pgd(pgd, __pgd(0)); - set_pgd(__user_pgd(pgd), __pgd(0)); + xen_l4_entry_update(pgdp, pgd); } -#define pte_same(a, b) ((a).pte == (b).pte) +static inline void xen_pgd_clear(pgd_t * pgd) +{ + xen_set_pgd(pgd, xen_make_pgd(0)); + xen_set_pgd(__user_pgd(pgd), xen_make_pgd(0)); +} -#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) +#define pte_same(a, b) ((a).pte == (b).pte) #endif /* !__ASSEMBLY__ */ @@ -133,8 +142,6 @@ static inline void pgd_clear (pgd_t * pg #define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) -#define FIRST_USER_ADDRESS 0 #define MAXMEM _AC(0x6fffffffff, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) @@ -144,105 +151,6 @@ static inline void pgd_clear (pgd_t * pg #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) -#define _PAGE_BIT_PRESENT 0 
-#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PSE 0x080 /* 2MB page */ -#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */ - -#define _PAGE_PROTNONE 0x080 /* If not present */ -#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) - -/* Mapped page is I/O or foreign and has no associated page struct. */ -#define _PAGE_IO 0x200 - -#ifndef __ASSEMBLY__ -#if CONFIG_XEN_COMPAT <= 0x030002 -extern unsigned int __kernel_page_user; -#else -#define __kernel_page_user 0 -#endif -#endif - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) - -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) - -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY PAGE_COPY_NOEXEC -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define __PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) -#define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) -#define __PAGE_KERNEL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) -#define __PAGE_KERNEL_RO \ - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) -#define __PAGE_KERNEL_VSYSCALL \ - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD) -#define __PAGE_KERNEL_LARGE \ - (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_EXEC \ - (__PAGE_KERNEL_EXEC | _PAGE_PSE) - -/* - * We don't support GLOBAL page in xenolinux64 - */ -#define MAKE_GLOBAL(x) __pgprot((x)) - -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) - -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC 
-#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - #ifndef __ASSEMBLY__ static inline unsigned long pgd_bad(pgd_t pgd) @@ -260,119 +168,26 @@ static inline unsigned long pmd_bad(pmd_ return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); } -#define set_pte_at(_mm,addr,ptep,pteval) do { \ - if (((_mm) != current->mm && (_mm) != &init_mm) || \ - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ - set_pte((ptep), (pteval)); \ -} while (0) - #define pte_none(x) (!(x).pte) #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this right? */ #define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) #define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) -#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \ +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? max_mapnr : \ (_pte).pte & _PAGE_PRESENT ? \ mfn_to_local_pfn(__pte_mfn(_pte)) : \ __pte_mfn(_pte)) #define pte_page(x) pfn_to_page(pte_pfn(x)) -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) -{ - unsigned long pte = page_nr << PAGE_SHIFT; - pte |= pgprot_val(pgprot); - pte &= __supported_pte_mask; - return __pte(pte); -} - -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_none(pte)) { - if ((mm != &init_mm) || - HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) - pte = __pte_ma(xchg(&ptep->pte, 0)); - } - return pte; -} - -static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) -{ - if (full) { - pte_t pte = *ptep; - if (PagePinned(virt_to_page(mm->pgd))) - xen_l1_entry_update(ptep, __pte(0)); - else - *ptep = __pte(0); - return pte; - } - return ptep_get_and_clear(mm, addr, ptep); -} - -#define ptep_clear_flush(vma, addr, ptep) \ -({ \ - pte_t *__ptep = (ptep); \ - pte_t __res = *__ptep; \ - if (!pte_none(__res) && \ - ((vma)->vm_mm != current->mm || \ - HYPERVISOR_update_va_mapping(addr, __pte(0), \ - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ - UVMF_INVLPG|UVMF_MULTI))) { \ - __ptep->pte = 0; \ - flush_tlb_page(vma, addr); \ - } \ - __res; \ -}) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. 
- */ -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) -static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } -static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } - -static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; } -static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } -static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } -static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } -static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; } - -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) -{ - if (!pte_young(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte); -} - -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - if (pte_write(pte)) - set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); -} - /* * Macro to mark a page protection value as "uncacheable". */ #define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) -static inline int pmd_large(pmd_t pte) { - return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; -} - /* * Conversion functions: convert a page and protection to a page entry, @@ -388,6 +203,7 @@ static inline int pmd_large(pmd_t pte) { #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) #define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) +static inline int pgd_large(pgd_t pgd) { return 0; } #define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) /* PUD - Level3 access */ @@ -398,6 +214,12 @@ static inline int pmd_large(pmd_t pte) { #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address)) #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) +static inline int pud_large(pud_t pte) +{ + return (__pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == + (_PAGE_PSE|_PAGE_PRESENT); +} + /* PMD - Level 2 access */ #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) @@ -413,36 +235,18 @@ static inline int pmd_large(pmd_t pte) { #else #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) #endif -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) #define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | _PAGE_FILE }) #define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. 
*/ /* page, protection -> pte */ #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) -/* Change flags of a PTE */ -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - /* - * Since this might change the present bit (which controls whether - * a pte_t object has undergone p2m translation), we must use - * pte_val() on the input pte and __pte() for the return value. - */ - unsigned long pteval = pte_val(pte); - - pteval &= _PAGE_CHG_MASK; - pteval |= pgprot_val(newprot); - pteval &= __supported_pte_mask; - return __pte(pteval); -} - #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \ @@ -456,101 +260,21 @@ static inline pte_t pte_modify(pte_t pte #define update_mmu_cache(vma,address,pte) do { } while (0) -/* - * Rules for using ptep_establish: the pte MUST be a user pte, and - * must be a present->present transition. - */ -#define __HAVE_ARCH_PTEP_ESTABLISH -#define ptep_establish(vma, address, ptep, pteval) \ - do { \ - if ( likely((vma)->vm_mm == current->mm) ) { \ - BUG_ON(HYPERVISOR_update_va_mapping(address, \ - pteval, \ - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ - UVMF_INVLPG|UVMF_MULTI)); \ - } else { \ - xen_l1_entry_update(ptep, pteval); \ - flush_tlb_page(vma, address); \ - } \ - } while (0) - -/* We only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware - * will do the accessed bit for us, and we don't want to - * race with other CPU's that might be updating the dirty - * bit at the same time. */ -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ -({ \ - int __changed = !pte_same(*(ptep), entry); \ - if (__changed && (dirty)) \ - ptep_establish(vma, address, ptep, entry); \ - __changed; \ -}) - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(vma, address, ptep) \ -({ \ - pte_t __pte = *(ptep); \ - int __young = pte_young(__pte); \ - __pte = pte_mkold(__pte); \ - if (PagePinned(virt_to_page((vma)->vm_mm->pgd))) \ - (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ - else if (__young) \ - set_pte(ptep, __pte); \ - __young; \ -}) - /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 1) & 0x3f) #define __swp_offset(x) ((x).val >> 8) #define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -extern spinlock_t pgd_lock; -extern struct list_head pgd_list; +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) extern int kern_addr_valid(unsigned long addr); - -#define DOMID_LOCAL (0xFFFFU) - -struct vm_area_struct; - -int direct_remap_pfn_range(struct vm_area_struct *vma, - unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid); - -int direct_kernel_remap_pfn_range(unsigned long address, - unsigned long mfn, - unsigned long size, - pgprot_t prot, - domid_t domid); - -int create_lookup_pte_addr(struct mm_struct *mm, - unsigned long address, - uint64_t *ptep); - -int touch_pte_range(struct mm_struct *mm, - unsigned long address, - unsigned long size); - -int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable); - -#define 
arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \ - xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) - -pte_t *lookup_address(unsigned long addr); +extern void cleanup_highmap(void); #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) #define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define pgtable_cache_init() do { } while (0) #define check_pgt_cache() do { } while (0) @@ -563,13 +287,7 @@ pte_t *lookup_address(unsigned long addr #define kc_offset_to_vaddr(o) \ (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define __HAVE_ARCH_PTEP_SET_WRPROTECT #define __HAVE_ARCH_PTE_SAME -#include <asm-generic/pgtable.h> #endif /* !__ASSEMBLY__ */ #endif /* _X86_64_PGTABLE_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/processor.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/processor.h 2010-03-17 14:34:36.000000000 +0100 @@ -1,5 +1,791 @@ +#ifndef __ASM_X86_PROCESSOR_H +#define __ASM_X86_PROCESSOR_H + +#include <asm/processor-flags.h> + +/* migration helpers, for KVM - will be removed in 2.6.25: */ +#include <asm/vm86.h> +#define Xgt_desc_struct desc_ptr + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include <asm/vm86.h> +#include <asm/math_emu.h> +#include <asm/segment.h> +#include <asm/types.h> +#include <asm/sigcontext.h> +#include <asm/current.h> +#include <asm/cpufeature.h> +#include <asm/system.h> +#include <asm/page.h> +#include <asm/percpu.h> +#include <asm/msr.h> +#include <asm/desc_defs.h> +#include <asm/nops.h> +#include <linux/personality.h> +#include <linux/cpumask.h> +#include <linux/cache.h> +#include <linux/threads.h> +#include <linux/init.h> +#include <xen/interface/physdev.h> + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + asm volatile("mov $1f,%0\n1:":"=r" (pc)); + return pc; +} + +#ifdef CONFIG_X86_VSMP +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +#define ARCH_MIN_TASKALIGN 16 +#define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. 
[mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ + char hard_math; + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#else + /* number of 4K pages in DTLB/ITLB combined(in pages)*/ + int x86_tlbsize; + __u8 x86_virt_bits, x86_phys_bits; + /* cpuid returned core id bits */ + __u8 x86_coreid_bits; + /* Max extended CPUID function supported */ + __u32 extended_cpuid_level; +#endif + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + int x86_cache_size; /* in KB - valid for CPUS which support this + call */ + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif + u16 x86_max_cores; /* cpuid returned max cores value */ + u16 apicid; + u16 x86_clflush_size; +#ifdef CONFIG_SMP + u16 booted_cores; /* number of cores as seen by OS */ + u16 phys_proc_id; /* Physical processor id. */ + u16 cpu_core_id; /* Core id */ + u16 cpu_index; /* index into per_cpu list */ +#endif +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_NEXGEN 4 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 +#define X86_VENDOR_UNKNOWN 0xff + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; +extern __u32 cleared_cpu_caps[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data cpu_data(smp_processor_id()) +#else +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data +#endif + +void cpu_detect(struct cpuinfo_x86 *c); + +extern void identify_cpu(struct cpuinfo_x86 *); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +#if defined(CONFIG_X86_HT) || defined(CONFIG_X86_64) +extern void detect_ht(struct cpuinfo_x86 *c); +#else +static inline void detect_ht(struct cpuinfo_x86 *c) {} +#endif + +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + __asm__(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifndef CONFIG_X86_NO_TSS +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. 
*/ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + unsigned short ss1, __ss1h; /* ss1 caches MSR_IA32_SYSENTER_CS */ + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax, cx, dx, bx; + unsigned long sp, bp, si, di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace, io_bitmap_base; +} __attribute__((packed)); +extern struct tss_struct doublefault_tss; +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; +} __attribute__((packed)) ____cacheline_aligned; +#endif +#endif /* CONFIG_X86_NO_TSS */ + +/* + * Size of io_bitmap. + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + /* + * Cache the current maximum and the last task that used the bitmap: + */ + unsigned long io_bitmap_max; + struct thread_struct *io_bitmap_owner; + /* + * pads the TSS to be cacheline-aligned (size is 0x100) + */ + unsigned long __cacheline_filler[35]; + /* + * .. 
and then another 0x100 bytes for emergency kernel stack + */ + unsigned long stack[64]; +} __attribute__((packed)); + +DECLARE_PER_CPU(struct tss_struct, init_tss); + +/* Save the original ist values for checking stack pointers during debugging */ +struct orig_ist { + unsigned long ist[7]; +}; +#endif /* CONFIG_X86_NO_TSS */ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + u32 status; /* software status information */ +}; + +struct i387_fxsave_struct { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + union { + struct { + u64 rip; + u64 rdp; + }; + struct { + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + }; + }; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + u8 ftop, changed, lookahead, no_update, rm, alimit; + struct info *info; + u32 entry_eip; +}; + +union i387_union { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; +}; + +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(u8, cpu_llc_id); +#elif !defined(CONFIG_X86_NO_TSS) +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +struct thread_struct { +/* cached TLS descriptors. */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned long usersp; /* Copy from PDA */ + unsigned short es, ds, fsindex, gsindex; +#endif + unsigned long ip; + unsigned long fs; + unsigned long gs; +/* Hardware debugging registers */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; +/* fault info */ + unsigned long cr2, trap_no, error_code; +/* floating point info */ + union i387_union i387 __attribute__((aligned(16)));; +#ifdef CONFIG_X86_32 +/* virtual 86 mode info */ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_sp0; + unsigned int saved_fs, saved_gs; +#endif +/* IO permissions */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; +/* max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ + unsigned long debugctlmsr; +/* Debug Store - if not 0 points to a DS Save Area configuration; + * goes into MSR_IA32_DS_AREA */ + unsigned long ds_area_msr; +}; + +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} + +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 
1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void native_load_sp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} +#else +#define xen_load_sp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ + BUG(); \ +} while (0) +#endif + +#define __cpuid xen_cpuid +#define paravirt_enabled() 0 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define load_sp0 xen_load_sp0 + +#define set_iopl_mask xen_set_iopl_mask + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. + */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned cr4; + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned cr4; + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +struct microcode_header { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode { + struct microcode_header hdr; + unsigned int bits[0]; +}; + +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy status */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. 
+ */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; +} +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ebx; +} +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; +} +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +/* Stop speculative execution */ +static inline void sync_core(void) +{ + int tmp; + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +#define cpu_relax() rep_nop() + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax,%ecx,%edx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc8;" + : :"a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax,%ecx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc9;" + : :"a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax,%ecx;" */ + asm volatile( + "sti; .byte 0x0f,0x01,0xc9;" + : :"a" (eax), "c" (ecx)); +} + +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); + +extern int force_mwait; + +extern void select_idle_routine(const struct cpuinfo_x86 *c); + +extern unsigned long boot_option_idle_override; + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(void); +extern void cpu_init(void); +extern void init_gdt(int cpu); + +/* from system description table in BIOS. Mostly for MCA use, but + * others may find it useful. */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header */ +extern int bootloader_type; + +extern char ignore_fpu_irq; +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +#define BASE_PREFETCH ASM_NOP4 +#define ARCH_HAS_PREFETCH +#else +#define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* Prefetch instructions for Pentium III and AMD Athlon */ +/* It's not worth to care about 3dnow! prefetches for the K6 + because they are microcoded there and very slow. + However we don't do prefetches for pre XP Athlons currently + That should be fixed. 
*/ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* 3dnow! prefetch to get an exclusive cache line. Useful for + spinlocks to avoid one state transition in the cache coherency protocol. */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +#define spin_lock_prefetch(x) prefetchw(x) #ifdef CONFIG_X86_32 -# include "processor_32.h" +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE (PAGE_OFFSET) +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ + .fs = __KERNEL_PERCPU, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +#define start_thread(regs, new_eip, new_esp) do { \ + __asm__("movl %0,%%gs": :"r" (0)); \ + regs->fs = 0; \ + set_fs(USER_DS); \ + regs->ds = __USER_DS; \ + regs->es = __USER_DS; \ + regs->ss = __USER_DS; \ + regs->cs = __USER_CS; \ + regs->ip = new_eip; \ + regs->sp = new_esp; \ +} while (0) + + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + #else -# include "processor_64.h" +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE64 (0x800000000000UL - 4096) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
\ + IA32_PAGE_OFFSET : TASK_SIZE64) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE64 + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define start_thread(regs, new_rip, new_rsp) do { \ + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ + load_gs_index(0); \ + (regs)->ip = (new_rip); \ + (regs)->sp = (new_rsp); \ + write_pda(oldrsp, (new_rsp)); \ + (regs)->cs = __USER_CS; \ + (regs)->ss = __USER_DS; \ + (regs)->flags = 0x200; \ + set_fs(USER_DS); \ +} while (0) + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#endif /* CONFIG_X86_64 */ + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/processor_32.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,751 +0,0 @@ -/* - * include/asm-i386/processor.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -#ifndef __ASM_I386_PROCESSOR_H -#define __ASM_I386_PROCESSOR_H - -#include <asm/vm86.h> -#include <asm/math_emu.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/sigcontext.h> -#include <asm/cpufeature.h> -#include <asm/msr.h> -#include <asm/system.h> -#include <linux/cache.h> -#include <linux/threads.h> -#include <asm/percpu.h> -#include <linux/cpumask.h> -#include <linux/init.h> -#include <asm/processor-flags.h> -#include <xen/interface/physdev.h> - -/* flag for disabling the tsc */ -#define tsc_disable 0 - -struct desc_struct { - unsigned long a,b; -}; - -#define desc_empty(desc) \ - (!((desc)->a | (desc)->b)) - -#define desc_equal(desc1, desc2) \ - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) -/* - * Default implementation of macro that returns current - * instruction pointer ("program counter"). - */ -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice - * before touching them. 
[mj] - */ - -struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; - char wp_works_ok; /* It doesn't on 386's */ - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ - char hard_math; - char rfu; - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - unsigned long x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB - valid for CPUS which support this - call */ - int x86_cache_alignment; /* In bytes */ - char fdiv_bug; - char f00f_bug; - char coma_bug; - char pad0; - int x86_power; - unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP - cpumask_t llc_shared_map; /* cpus sharing the last level cache */ -#endif - unsigned char x86_max_cores; /* cpuid returned max cores value */ - unsigned char apicid; - unsigned short x86_clflush_size; -#ifdef CONFIG_SMP - unsigned char booted_cores; /* number of cores as seen by OS */ - __u8 phys_proc_id; /* Physical processor id. */ - __u8 cpu_core_id; /* Core id */ - __u8 cpu_index; /* index into per_cpu list */ -#endif -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NSC 8 -#define X86_VENDOR_NUM 9 -#define X86_VENDOR_UNKNOWN 0xff - -/* - * capabilities of CPUs - */ - -extern struct cpuinfo_x86 boot_cpu_data; -extern struct cpuinfo_x86 new_cpu_data; -#ifndef CONFIG_X86_NO_TSS -extern struct tss_struct doublefault_tss; -DECLARE_PER_CPU(struct tss_struct, init_tss); -#endif - -#ifdef CONFIG_SMP -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); -#define cpu_data(cpu) per_cpu(cpu_info, cpu) -#define current_cpu_data cpu_data(smp_processor_id()) -#else -#define cpu_data(cpu) boot_cpu_data -#define current_cpu_data boot_cpu_data -#endif - -/* - * the following now lives in the per cpu area: - * extern int cpu_llc_id[NR_CPUS]; - */ -DECLARE_PER_CPU(u8, cpu_llc_id); -extern char ignore_fpu_irq; - -void __init cpu_detect(struct cpuinfo_x86 *c); - -extern void identify_boot_cpu(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); -extern void print_cpu_info(struct cpuinfo_x86 *); -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; - -#ifdef CONFIG_X86_HT -extern void detect_ht(struct cpuinfo_x86 *c); -#else -static inline void detect_ht(struct cpuinfo_x86 *c) {} -#endif - -static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - __asm__(XEN_CPUID - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -#define load_cr3(pgdir) write_cr3(__pa(pgdir)) - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -extern unsigned long mmu_cr4_features; - -static inline void set_in_cr4 (unsigned long mask) -{ - unsigned cr4; - mmu_cr4_features |= mask; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); -} - -static inline void clear_in_cr4 (unsigned long mask) -{ - unsigned cr4; - mmu_cr4_features &= ~mask; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); -} - -/* Stop speculative execution */ -static inline void sync_core(void) -{ - int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); -} - -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax,%ecx,%edx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc8;" - : :"a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax,%ecx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); -} - -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); - -/* from system description table in BIOS. Mostly for MCA use, but -others may find it useful. */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; -extern unsigned int mca_pentium_flag; - -/* Boot loader type from the setup header */ -extern int bootloader_type; - -/* - * User space process size: 3GB (default). - */ -#define TASK_SIZE (PAGE_OFFSET) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) - -#define HAVE_ARCH_PICK_MMAP_LAYOUT - -extern void hard_disable_TSC(void); -extern void disable_TSC(void); -extern void hard_enable_TSC(void); - -/* - * Size of io_bitmap. - */ -#define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#ifndef CONFIG_X86_NO_TSS -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) -#endif -#define INVALID_IO_BITMAP_OFFSET 0x8000 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 - -struct i387_fsave_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - long status; /* software status information */ -}; - -struct i387_fxsave_struct { - unsigned short cwd; - unsigned short swd; - unsigned short twd; - unsigned short fop; - long fip; - long fcs; - long foo; - long fos; - long mxcsr; - long mxcsr_mask; - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ - long padding[56]; -} __attribute__ ((aligned (16))); - -struct i387_soft_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; -}; - -union i387_union { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; -}; - -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct thread_struct; - -#ifndef CONFIG_X86_NO_TSS -/* This is the TSS defined by the hardware. 
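The set_in_cr4()/clear_in_cr4() helpers above keep the software copy in mmu_cr4_features in step with the hardware register, so that CPUs brought up later can adopt the same CR4 feature bits. A minimal stand-alone sketch of that pattern (read_cr4()/write_cr4() are stubbed here; the real accessors are the Xen-paravirtualized ones defined elsewhere in this patch):

static unsigned long fake_cr4;                  /* stand-in for the register */
static unsigned long cached_cr4_features;       /* mirrors mmu_cr4_features */

static unsigned long read_cr4(void)        { return fake_cr4; }
static void write_cr4(unsigned long val)   { fake_cr4 = val; }

static void example_set_in_cr4(unsigned long mask)
{
        cached_cr4_features |= mask;    /* remembered for later-booting CPUs */
        write_cr4(read_cr4() | mask);   /* read-modify-write the live value */
}

static void example_secondary_cpu_init(void)
{
        /* a secondary CPU simply replays the accumulated feature mask */
        write_cr4(read_cr4() | cached_cr4_features);
}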
*/ -struct i386_hw_tss { - unsigned short back_link,__blh; - unsigned long esp0; - unsigned short ss0,__ss0h; - unsigned long esp1; - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ - unsigned long esp2; - unsigned short ss2,__ss2h; - unsigned long __cr3; - unsigned long eip; - unsigned long eflags; - unsigned long eax,ecx,edx,ebx; - unsigned long esp; - unsigned long ebp; - unsigned long esi; - unsigned long edi; - unsigned short es, __esh; - unsigned short cs, __csh; - unsigned short ss, __ssh; - unsigned short ds, __dsh; - unsigned short fs, __fsh; - unsigned short gs, __gsh; - unsigned short ldt, __ldth; - unsigned short trace, io_bitmap_base; -} __attribute__((packed)); - -struct tss_struct { - struct i386_hw_tss x86_tss; - - /* - * The extra 1 is there because the CPU will access an - * additional byte beyond the end of the IO permission - * bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - /* - * Cache the current maximum and the last task that used the bitmap: - */ - unsigned long io_bitmap_max; - struct thread_struct *io_bitmap_owner; - /* - * pads the TSS to be cacheline-aligned (size is 0x100) - */ - unsigned long __cacheline_filler[35]; - /* - * .. and then another 0x100 bytes for emergency kernel stack - */ - unsigned long stack[64]; -} __attribute__((packed)); -#endif - -#define ARCH_MIN_TASKALIGN 16 - -struct thread_struct { -/* cached TLS descriptors. */ - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; - unsigned long esp0; - unsigned long sysenter_cs; - unsigned long eip; - unsigned long esp; - unsigned long fs; - unsigned long gs; -/* Hardware debugging registers */ - unsigned long debugreg[8]; /* %%db0-7 debug registers */ -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387; -/* virtual 86 mode info */ - struct vm86_struct __user * vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags, v86mask, saved_esp0; - unsigned int saved_fs, saved_gs; -/* IO permissions */ - unsigned long *io_bitmap_ptr; - unsigned long iopl; -/* max allowed port in the bitmap, in bytes: */ - unsigned long io_bitmap_max; -}; - -#define INIT_THREAD { \ - .esp0 = sizeof(init_stack) + (long)&init_stack, \ - .vm86_info = NULL, \ - .sysenter_cs = __KERNEL_CS, \ - .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ -} - -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .esp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - -#define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%gs": :"r" (0)); \ - regs->xfs = 0; \ - set_fs(USER_DS); \ - regs->xds = __USER_DS; \ - regs->xes = __USER_DS; \ - regs->xss = __USER_DS; \ - regs->xcs = __USER_CS; \ - regs->eip = new_eip; \ - regs->esp = new_esp; \ -} while (0) - -/* Forward declaration, a strange C thing */ -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. 
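As the comments around tss_struct and INIT_TSS explain, the I/O permission bitmap is allocated with one extra long of all-ones because the CPU reads one byte past the bitmap's end, and that byte must read as "access denied". A small self-contained illustration of the invariant (plain C with the GCC range initializer INIT_TSS also uses; constants copied from the header):

#include <assert.h>

#define IO_BITMAP_BITS   65536
#define IO_BITMAP_BYTES  (IO_BITMAP_BITS / 8)
#define IO_BITMAP_LONGS  (IO_BITMAP_BYTES / sizeof(long))

/* every long, including the trailing terminator, starts out as ~0 */
static unsigned long io_bitmap[IO_BITMAP_LONGS + 1] = {
        [0 ... IO_BITMAP_LONGS] = ~0UL,
};

int main(void)
{
        /* granting a port clears a bit inside the first IO_BITMAP_LONGS
         * longs only; the terminator stays all ones so the CPU's one-byte
         * over-read never grants anything */
        io_bitmap[0] &= ~1UL;                           /* allow port 0 */
        assert(io_bitmap[IO_BITMAP_LONGS] == ~0UL);     /* terminator intact */
        return 0;
}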
*/ -extern void release_thread(struct task_struct *); - -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - -/* - * create a kernel thread without removing it from tasklists - */ -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -extern unsigned long thread_saved_pc(struct task_struct *tsk); -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack); - -unsigned long get_wchan(struct task_struct *p); - -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - -/* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. - * This is necessary to guarantee that the entire "struct pt_regs" - * is accessable even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the xss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ - __regs__ - 1; \ -}) - -#define KSTK_EIP(task) (task_pt_regs(task)->eip) -#define KSTK_ESP(task) (task_pt_regs(task)->esp) - - -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - -#ifndef CONFIG_X86_NO_TSS -static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) -{ - tss->x86_tss.esp0 = thread->esp0; - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -} -#else -#define xen_load_esp0(tss, thread) do { \ - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \ - BUG(); \ -} while (0) -#endif - - -static inline unsigned long xen_get_debugreg(int regno) -{ - return HYPERVISOR_get_debugreg(regno); -} - -static inline void xen_set_debugreg(int regno, unsigned long value) -{ - WARN_ON(HYPERVISOR_set_debugreg(regno, value)); -} - -/* - * Set IOPL bits in EFLAGS from given mask - */ -static inline void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 
1 : (mask >> 12) & 3; - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); -} - - -#define paravirt_enabled() 0 -#define __cpuid xen_cpuid - -#define load_esp0 xen_load_esp0 - -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = xen_get_debugreg(register) -#define set_debugreg(value, register) \ - xen_set_debugreg(register, value) - -#define set_iopl_mask xen_set_iopl_mask - -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return eax; -} -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ebx; -} -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return ecx; -} -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - return edx; -} - -/* generic versions from gas */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 - -/* Opteron nops */ -#define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 - -/* K7 nops */ -/* uses eax dependencies (arbitary choice) */ -#define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 - -/* P6 nops */ -/* uses eax dependencies (Intel-recommended choice) */ -#define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" - -#ifdef CONFIG_MK8 -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 -#elif defined(CONFIG_MK7) 
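The cpuid()/cpuid_count() wrappers above follow one calling convention: the leaf goes in EAX, ECX is cleared (or set to the sub-leaf) before the instruction, and all four output registers come back through pointers. A hypothetical user-space sketch of the same pattern, reading the maximum standard leaf and the vendor string from leaf 0 (plain inline asm rather than the paravirtualized XEN_CPUID):

#include <stdio.h>
#include <string.h>

static void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
                  unsigned int *ecx, unsigned int *edx)
{
        *eax = op;
        *ecx = 0;               /* clear ECX, as the header's wrapper does */
        asm volatile("cpuid"
                     : "+a" (*eax), "=b" (*ebx), "+c" (*ecx), "=d" (*edx));
}

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        cpuid(0, &eax, &ebx, &ecx, &edx);
        memcpy(vendor + 0, &ebx, 4);    /* vendor string order: EBX, EDX, ECX */
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';
        printf("max standard leaf %u, vendor \"%s\"\n", eax, vendor);
        return 0;
}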
-#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 -#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ - defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ - defined(CONFIG_MCORE2) || defined(CONFIG_PENTIUM4) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 -#else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 -#endif - -#define ASM_NOP_MAX 8 - -/* Prefetch instructions for Pentium III and AMD Athlon */ -/* It's not worth to care about 3dnow! prefetches for the K6 - because they are microcoded there and very slow. - However we don't do prefetches for pre XP Athlons currently - That should be fixed. */ -#define ARCH_HAS_PREFETCH -static inline void prefetch(const void *x) -{ - alternative_input(ASM_NOP4, - "prefetchnta (%1)", - X86_FEATURE_XMM, - "r" (x)); -} - -#define ARCH_HAS_PREFETCH -#define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH - -/* 3dnow! prefetch to get an exclusive cache line. Useful for - spinlocks to avoid one state transition in the cache coherency protocol. */ -static inline void prefetchw(const void *x) -{ - alternative_input(ASM_NOP4, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); -} -#define spin_lock_prefetch(x) prefetchw(x) - -extern void select_idle_routine(const struct cpuinfo_x86 *c); - -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) - -extern unsigned long boot_option_idle_override; -extern void enable_sep_cpu(void); -extern int sysenter_setup(void); - -/* Defined in head.S */ -extern struct Xgt_desc_struct early_gdt_descr; - -extern void cpu_set_gdt(int); -extern void switch_to_new_gdt(void); -extern void cpu_init(void); -extern void init_gdt(int cpu); - -extern int force_mwait; - -#endif /* __ASM_I386_PROCESSOR_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/processor_64.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,461 +0,0 @@ -/* - * include/asm-x86_64/processor.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -#ifndef __ASM_X86_64_PROCESSOR_H -#define __ASM_X86_64_PROCESSOR_H - -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/sigcontext.h> -#include <asm/cpufeature.h> -#include <linux/threads.h> -#include <asm/msr.h> -#include <asm/current.h> -#include <asm/system.h> -#include <asm/mmsegment.h> -#include <asm/percpu.h> -#include <linux/personality.h> -#include <linux/cpumask.h> -#include <asm/processor-flags.h> - -#define TF_MASK 0x00000100 -#define IF_MASK 0x00000200 -#define IOPL_MASK 0x00003000 -#define NT_MASK 0x00004000 -#define VM_MASK 0x00020000 -#define AC_MASK 0x00040000 -#define VIF_MASK 0x00080000 /* virtual interrupt flag */ -#define VIP_MASK 0x00100000 /* virtual interrupt pending */ -#define ID_MASK 0x00200000 - -#define desc_empty(desc) \ - (!((desc)->a | (desc)->b)) - -#define desc_equal(desc1, desc2) \ - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) - -/* - * Default implementation of macro that returns current - * instruction pointer ("program counter"). 
- */ -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; }) - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. - */ - -struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - __u32 x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB */ - int x86_clflush_size; - int x86_cache_alignment; - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ - __u8 x86_virt_bits, x86_phys_bits; - __u8 x86_max_cores; /* cpuid returned max cores value */ - __u32 x86_power; - __u32 extended_cpuid_level; /* Max extended CPUID function supported */ - unsigned long loops_per_jiffy; -#ifdef CONFIG_SMP - cpumask_t llc_shared_map; /* cpus sharing the last level cache */ -#endif - __u8 apicid; -#ifdef CONFIG_SMP - __u8 booted_cores; /* number of cores as seen by OS */ - __u8 phys_proc_id; /* Physical Processor id. */ - __u8 cpu_core_id; /* Core id. */ - __u8 cpu_index; /* index into per_cpu list */ -#endif -} ____cacheline_aligned; - -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NUM 8 -#define X86_VENDOR_UNKNOWN 0xff - -#ifdef CONFIG_SMP -DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); -#define cpu_data(cpu) per_cpu(cpu_info, cpu) -#define current_cpu_data cpu_data(smp_processor_id()) -#else -#define cpu_data(cpu) boot_cpu_data -#define current_cpu_data boot_cpu_data -#endif - -extern char ignore_irq13; - -extern void identify_cpu(struct cpuinfo_x86 *); -extern void print_cpu_info(struct cpuinfo_x86 *); -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. - */ -extern unsigned long mmu_cr4_features; - -static inline void set_in_cr4 (unsigned long mask) -{ - mmu_cr4_features |= mask; - __asm__("movq %%cr4,%%rax\n\t" - "orq %0,%%rax\n\t" - "movq %%rax,%%cr4\n" - : : "irg" (mask) - :"ax"); -} - -static inline void clear_in_cr4 (unsigned long mask) -{ - mmu_cr4_features &= ~mask; - __asm__("movq %%cr4,%%rax\n\t" - "andq %0,%%rax\n\t" - "movq %%rax,%%cr4\n" - : : "irg" (~mask) - :"ax"); -} - - -/* - * User space process size. 47bits minus one guard page. - */ -#define TASK_SIZE64 (0x800000000000UL - 4096) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) - -#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) -#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) - -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) - -/* - * Size of io_bitmap. 
- */ -#define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#ifndef CONFIG_X86_NO_TSS -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) -#endif -#define INVALID_IO_BITMAP_OFFSET 0x8000 - -struct i387_fxsave_struct { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - u32 padding[24]; -} __attribute__ ((aligned (16))); - -union i387_union { - struct i387_fxsave_struct fxsave; -}; - -#ifndef CONFIG_X86_NO_TSS -struct tss_struct { - u32 reserved1; - u64 rsp0; - u64 rsp1; - u64 rsp2; - u64 reserved2; - u64 ist[7]; - u32 reserved3; - u32 reserved4; - u16 reserved5; - u16 io_bitmap_base; - /* - * The extra 1 is there because the CPU will access an - * additional byte beyond the end of the IO permission - * bitmap. The extra byte must be all 1 bits, and must - * be within the limit. Thus we have: - * - * 128 bytes, the bitmap itself, for ports 0..0x3ff - * 8 bytes, for an extra "long" of ~0UL - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; -} __attribute__((packed)) ____cacheline_aligned; - -DECLARE_PER_CPU(struct tss_struct,init_tss); -#endif - - -extern struct cpuinfo_x86 boot_cpu_data; -#ifndef CONFIG_X86_NO_TSS -/* Save the original ist values for checking stack pointers during debugging */ -struct orig_ist { - unsigned long ist[7]; -}; -DECLARE_PER_CPU(struct orig_ist, orig_ist); -#endif - -#ifdef CONFIG_X86_VSMP -#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) -#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) -#else -#define ARCH_MIN_TASKALIGN 16 -#define ARCH_MIN_MMSTRUCT_ALIGN 0 -#endif - -struct thread_struct { - unsigned long rsp0; - unsigned long rsp; - unsigned long userrsp; /* Copy from PDA */ - unsigned long fs; - unsigned long gs; - unsigned short es, ds, fsindex, gsindex; -/* Hardware debugging registers */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387 __attribute__((aligned(16))); -/* IO permissions. the bitmap could be moved into the GDT, that would make - switch faster for a limited number of ioperm using tasks. -AK */ - int ioperm; - unsigned long *io_bitmap_ptr; - unsigned io_bitmap_max; -/* cached TLS descriptors. 
*/ - u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; - unsigned int iopl; -} __attribute__((aligned(16))); - -#define INIT_THREAD { \ - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - -#ifndef CONFIG_X86_NO_TSS -#define INIT_TSS { \ - .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} -#endif - -#define INIT_MMAP \ -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } - -#define start_thread(regs,new_rip,new_rsp) do { \ - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ - load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - write_pda(oldrsp, (new_rsp)); \ - (regs)->cs = __USER_CS; \ - (regs)->ss = __USER_DS; \ - (regs)->eflags = 0x200; \ - set_fs(USER_DS); \ -} while(0) - -#define get_debugreg(var, register) \ - var = HYPERVISOR_get_debugreg(register) -#define set_debugreg(value, register) do { \ - if (HYPERVISOR_set_debugreg(register, value)) \ - BUG(); \ -} while (0) - -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - -/* - * create a kernel thread without removing it from tasklists - */ -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. - */ -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8)) - -extern unsigned long get_wchan(struct task_struct *p); -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1) -#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip) -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. 
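task_pt_regs() above recovers the saved user-mode register frame by stepping back exactly one struct pt_regs from the kernel stack top recorded in thread.rsp0, and KSTK_EIP() then reads the saved instruction pointer out of that frame. A hypothetical sketch of the same pointer arithmetic with a deliberately simplified frame layout:

struct pt_regs_sketch {
        unsigned long rip;      /* saved user instruction pointer */
        unsigned long rsp;      /* saved user stack pointer */
        /* the real struct pt_regs carries the full register set */
};

static inline unsigned long kstk_eip_sketch(unsigned long rsp0)
{
        /* the frame sits immediately below the recorded stack top */
        struct pt_regs_sketch *regs = (struct pt_regs_sketch *)rsp0 - 1;

        return regs->rip;
}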
*/ - - -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - - -#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 -#else -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 -#endif - -/* Opteron nops */ -#define K8_NOP1 ".byte 0x90\n" -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 - -/* P6 nops */ -/* uses eax dependencies (Intel-recommended choice) */ -#define P6_NOP1 ".byte 0x90\n" -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" - -#define ASM_NOP_MAX 8 - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -/* Stop speculative execution */ -static inline void sync_core(void) -{ - int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); -} - -#define ARCH_HAS_PREFETCHW 1 -static inline void prefetchw(void *x) -{ - alternative_input("prefetcht0 (%1)", - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); -} - -#define ARCH_HAS_SPINLOCK_PREFETCH 1 - -#define spin_lock_prefetch(x) prefetchw(x) - -#define cpu_relax() rep_nop() - -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax,%ecx,%edx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc8;" - : :"a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax,%ecx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax,%ecx;" */ - asm volatile( - "sti; .byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); -} - -extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); - -#define stack_current() \ -({ \ - struct thread_info *ti; \ - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->task; \ -}) - -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) - -extern unsigned long boot_option_idle_override; -/* Boot loader type from the setup header */ -extern int bootloader_type; - -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 - -#endif /* __ASM_X86_64_PROCESSOR_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/smp_32.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/smp_32.h 2009-11-20 11:12:22.000000000 +0100 @@ -1,56 +1,51 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H +#ifndef __ASSEMBLY__ +#include <linux/cpumask.h> +#include <linux/init.h> + /* * We need the APIC definitions automatically as part of 'smp.h' */ -#ifndef __ASSEMBLY__ -#include <linux/kernel.h> -#include <linux/threads.h> -#include <linux/cpumask.h> +#ifdef CONFIG_X86_LOCAL_APIC +# include <asm/mpspec.h> +# include <asm/apic.h> +# ifdef CONFIG_X86_IO_APIC +# include <asm/io_apic.h> +# endif #endif -#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) -#include <linux/bitops.h> -#include <asm/mpspec.h> -#include <asm/apic.h> -#ifdef CONFIG_X86_IO_APIC -#include <asm/io_apic.h> -#endif -#endif +#define cpu_callout_map cpu_possible_map +#define cpu_callin_map cpu_possible_map -#define BAD_APICID 0xFFu -#ifdef CONFIG_SMP -#ifndef __ASSEMBLY__ +extern int smp_num_siblings; +extern unsigned int num_processors; -/* - * Private routines/data - */ - extern void smp_alloc_memory(void); -extern int pic_mode; -extern int smp_num_siblings; -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); -DECLARE_PER_CPU(cpumask_t, cpu_core_map); +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); -extern void lock_ipi_call_lock(void); -extern void unlock_ipi_call_lock(void); -#define MAX_APICID 256 -extern u8 __initdata x86_cpu_to_apicid_init[]; -extern void *x86_cpu_to_apicid_ptr; +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(u8, cpu_llc_id); DECLARE_PER_CPU(u8, x86_cpu_to_apicid); -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) - #ifdef CONFIG_HOTPLUG_CPU extern void cpu_exit_clear(void); extern void cpu_uninit(void); 
#endif +#ifdef CONFIG_SMP + #ifndef CONFIG_XEN + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + struct smp_ops { void (*smp_prepare_boot_cpu)(void); @@ -104,11 +99,7 @@ void native_smp_prepare_cpus(unsigned in int native_cpu_up(unsigned int cpunum); void native_smp_cpus_done(unsigned int max_cpus); -#define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ -do { } while (0) - -#else - +#else /* CONFIG_XEN */ void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); @@ -120,7 +111,12 @@ int xen_smp_call_function_mask(cpumask_t #define smp_send_reschedule xen_smp_send_reschedule #define smp_call_function_mask xen_smp_call_function_mask -#endif +extern void prefill_possible_map(void); + +#endif /* CONFIG_XEN */ + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); /* * This function is needed by all SMP systems. It must _always_ be valid @@ -130,64 +126,49 @@ int xen_smp_call_function_mask(cpumask_t DECLARE_PER_CPU(int, cpu_number); #define raw_smp_processor_id() (x86_read_percpu(cpu_number)) -extern cpumask_t cpu_possible_map; -#define cpu_callin_map cpu_possible_map +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +#define safe_smp_processor_id() smp_processor_id() /* We don't mark CPUs online until __cpu_up(), so we need another measure */ static inline int num_booting_cpus(void) { - return cpus_weight(cpu_possible_map); + return cpus_weight(cpu_callout_map); } -#define safe_smp_processor_id() smp_processor_id() -extern int __cpu_disable(void); -extern void __cpu_die(unsigned int cpu); -extern void prefill_possible_map(void); -extern unsigned int num_processors; - -#endif /* !__ASSEMBLY__ */ - #else /* CONFIG_SMP */ #define safe_smp_processor_id() 0 #define cpu_physical_id(cpu) boot_cpu_physical_apicid -#define NO_PROC_ID 0xFF /* No processor magic marker */ - -#endif /* CONFIG_SMP */ - -#ifndef __ASSEMBLY__ +#endif /* !CONFIG_SMP */ #ifdef CONFIG_X86_LOCAL_APIC -#ifdef APIC_DEFINITION +static __inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); +} + +# ifdef APIC_DEFINITION extern int hard_smp_processor_id(void); -#else -#include <mach_apicdef.h> +# else +# include <mach_apicdef.h> static inline int hard_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); } -#endif /* APIC_DEFINITION */ +# endif /* APIC_DEFINITION */ #else /* CONFIG_X86_LOCAL_APIC */ -#ifndef CONFIG_SMP -#define hard_smp_processor_id() 0 -#endif +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif #endif /* CONFIG_X86_LOCAL_APIC */ -extern u8 apicid_2_node[]; - -#ifdef CONFIG_X86_LOCAL_APIC -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); -} -#endif -#endif - +#endif /* !ASSEMBLY */ #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/smp_64.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/smp_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,139 +1,103 @@ #ifndef __ASM_SMP_H #define __ASM_SMP_H -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/bitops.h> 
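The smp_32.h hunk above also narrows the APIC_ID and APIC_LDR dereferences to u32: local-APIC registers are 32 bits wide, so the access is made with an explicitly 32-bit type rather than unsigned long (which is 8 bytes on x86-64 and would read past the register window). A small sketch of that fixed-width MMIO read (offsets per the xAPIC layout; helper name hypothetical):

#include <stdint.h>

#define APIC_ID_OFFSET  0x20            /* local APIC ID register */
#define APIC_ID_SHIFT   24              /* physical ID lives in bits 31..24 */

static inline uint8_t read_apic_id(volatile void *apic_base)
{
        /* must be a 32-bit load: the register window is only 4 bytes wide */
        uint32_t v = *(volatile uint32_t *)
                     ((volatile uint8_t *)apic_base + APIC_ID_OFFSET);

        return (uint8_t)(v >> APIC_ID_SHIFT);
}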
#include <linux/init.h> -extern int disable_apic; #ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ #include <asm/apic.h> #ifdef CONFIG_X86_IO_APIC #include <asm/io_apic.h> #endif -#include <asm/thread_info.h> +#include <asm/mpspec.h> #endif - -#ifdef CONFIG_SMP - #include <asm/pda.h> +#include <asm/thread_info.h> -struct pt_regs; - -extern cpumask_t cpu_present_mask; -extern cpumask_t cpu_possible_map; -extern cpumask_t cpu_online_map; extern cpumask_t cpu_initialized; -/* - * Private routines/data - */ - +extern int smp_num_siblings; +extern unsigned int num_processors; + extern void smp_alloc_memory(void); -extern volatile unsigned long smp_invalidate_needed; extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); -extern int smp_num_siblings; -extern void smp_send_reschedule(int cpu); + extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); -/* - * cpu_sibling_map and cpu_core_map now live - * in the per cpu area - * - * extern cpumask_t cpu_sibling_map[NR_CPUS]; - * extern cpumask_t cpu_core_map[NR_CPUS]; - */ DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); -DECLARE_PER_CPU(u8, cpu_llc_id); - -#define SMP_TRAMPOLINE_BASE 0x6000 +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); -/* - * On x86 all CPUs are mapped 1:1 to the APIC space. - * This simplifies scheduling and IPI sending and - * compresses data structures. - */ - -static inline int num_booting_cpus(void) +#ifdef CONFIG_X86_LOCAL_APIC +static inline int cpu_present_to_apicid(int mps_cpu) { - return cpus_weight(cpu_possible_map); + if (cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; } +#endif -#define raw_smp_processor_id() read_pda(cpunumber) +#ifdef CONFIG_SMP + +#define SMP_TRAMPOLINE_BASE 0x6000 extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); -extern unsigned num_processors; extern unsigned __cpuinitdata disabled_cpus; -#define NO_PROC_ID 0xFF /* No processor magic marker */ - -#endif /* CONFIG_SMP */ +#define raw_smp_processor_id() read_pda(cpunumber) +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) -#define safe_smp_processor_id() smp_processor_id() - -#ifdef CONFIG_X86_LOCAL_APIC -static inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); -} -#endif +#define stack_smp_processor_id() \ + ({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) /* - * Some lowlevel functions might want to know about - * the real APIC ID <-> CPU # mapping. + * On x86 all CPUs are mapped 1:1 to the APIC space. This simplifies + * scheduling and IPI sending and compresses data structures. 
*/ -extern u8 __initdata x86_cpu_to_apicid_init[]; -extern void *x86_cpu_to_apicid_ptr; -DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */ -extern u8 bios_cpu_apicid[]; - -#ifdef CONFIG_X86_LOCAL_APIC -static inline int cpu_present_to_apicid(int mps_cpu) +static inline int num_booting_cpus(void) { - if (mps_cpu < NR_CPUS) - return (int)bios_cpu_apicid[mps_cpu]; - else - return BAD_APICID; + return cpus_weight(cpu_possible_map); } -#endif -#ifndef CONFIG_SMP +extern void smp_send_reschedule(int cpu); + +#else /* CONFIG_SMP */ + +extern unsigned int boot_cpu_id; +#define cpu_physical_id(cpu) boot_cpu_id #define stack_smp_processor_id() 0 -#define cpu_logical_map(x) (x) -#else -#include <asm/thread_info.h> -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) -#endif + +#endif /* !CONFIG_SMP */ + +#define safe_smp_processor_id() smp_processor_id() #ifdef CONFIG_X86_LOCAL_APIC static __inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); + return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); +} + +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID)); } #endif -#ifdef CONFIG_SMP -#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) -#else -extern unsigned int boot_cpu_id; -#define cpu_physical_id(cpu) boot_cpu_id -#endif /* !CONFIG_SMP */ #endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/spinlock.h 2010-02-23 14:14:26.000000000 +0100 @@ -0,0 +1,341 @@ +#ifndef _X86_SPINLOCK_H_ +#define _X86_SPINLOCK_H_ + +#include <asm/atomic.h> +#include <asm/rwlock.h> +#include <asm/page.h> +#include <asm/processor.h> +#include <linux/compiler.h> + +/* + * Your basic SMP spinlocks, allowing only a single CPU anywhere + * + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * These are fair FIFO ticket locks, which are currently limited to 256 + * CPUs. + * + * (the type definitions are in asm/spinlock_types.h) + */ + +#ifdef CONFIG_X86_32 +# define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" +#else +# define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" +#endif + +#if defined(CONFIG_X86_32) && \ + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)) +/* + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * (PPro errata 66, 92) + */ +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX +#else +# define UNLOCK_LOCK_PREFIX +#endif + +#include <asm/irqflags.h> + +int xen_spinlock_init(unsigned int cpu); +void xen_spinlock_cleanup(unsigned int cpu); +bool xen_spin_wait(raw_spinlock_t *, unsigned int *token, + unsigned int flags); +unsigned int xen_spin_adjust(const raw_spinlock_t *, unsigned int token); +void xen_spin_kick(raw_spinlock_t *, unsigned int token); + +/* + * Ticket locks are conceptually two parts, one indicating the current head of + * the queue, and the other indicating the current tail. The lock is acquired + * by atomically noting the tail and incrementing it by one (thus adding + * ourself to the queue and noting our position), then waiting until the head + * becomes equal to the the initial value of the tail. 
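The comment here describes the classic ticket-lock scheme: the lock word is split into a head (the ticket now being served) and a tail (the next ticket to hand out). A compact sketch of that idea using GCC atomic builtins, for illustration only; the real implementation that follows uses a single locked xadd on the combined word and falls back to Xen-assisted blocking (xen_spin_wait/xen_spin_kick) instead of spinning forever:

struct ticket_lock_sketch {
        unsigned short head;            /* ticket currently being served */
        unsigned short tail;            /* next ticket to hand out */
};

static void ticket_lock_sketch_acquire(struct ticket_lock_sketch *lk)
{
        /* take a ticket: atomically note the tail and advance it */
        unsigned short me = __atomic_fetch_add(&lk->tail, 1, __ATOMIC_RELAXED);

        /* wait until releases advance head to our ticket number */
        while (__atomic_load_n(&lk->head, __ATOMIC_ACQUIRE) != me)
                ;                       /* a real lock would cpu_relax() here */
}

static void ticket_lock_sketch_release(struct ticket_lock_sketch *lk)
{
        __atomic_fetch_add(&lk->head, 1, __ATOMIC_RELEASE);
}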
+ * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + * + * With fewer than 2^8 possible CPUs, we can use x86's partial registers to + * save some instructions and make the code more elegant. There really isn't + * much between them in performance though, especially as locks are out of line. + */ +#if TICKET_SHIFT == 8 +#define __raw_spin_lock_preamble \ + asm(LOCK_PREFIX "xaddw %w0, %2\n\t" \ + "cmpb %h0, %b0\n\t" \ + "sete %1" \ + : "=&Q" (token), "=qm" (free), "+m" (lock->slock) \ + : "0" (0x0100) \ + : "memory", "cc") +#define __raw_spin_lock_body \ + asm("1:\t" \ + "cmpb %h0, %b0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movb %2, %b0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+Q" (token), "+g" (count) \ + : "m" (lock->slock) \ + : "memory", "cc") +#define __raw_spin_unlock_body \ + asm(UNLOCK_LOCK_PREFIX "incb %2\n\t" \ + "movzwl %2, %0\n\t" \ + "cmpb %h0, %b0\n\t" \ + "setne %1" \ + : "=&Q" (token), "=qm" (kick), "+m" (lock->slock) \ + : \ + : "memory", "cc") + +static inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + int tmp, new; + + asm("movzwl %2, %0\n\t" + "cmpb %h0, %b0\n\t" + "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgw %w1, %2\n\t" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} +#elif TICKET_SHIFT == 16 +#define __raw_spin_lock_preamble \ + do { \ + unsigned int tmp; \ + asm(LOCK_PREFIX "xaddl %0, %2\n\t" \ + "shldl $16, %0, %3\n\t" \ + "cmpw %w3, %w0\n\t" \ + "sete %1" \ + : "=&r" (token), "=qm" (free), "+m" (lock->slock), \ + "=&g" (tmp) \ + : "0" (0x00010000) \ + : "memory", "cc"); \ + } while (0) +#define __raw_spin_lock_body \ + do { \ + unsigned int tmp; \ + asm("shldl $16, %0, %2\n" \ + "1:\t" \ + "cmpw %w2, %w0\n\t" \ + "je 2f\n\t" \ + "decl %1\n\t" \ + "jz 2f\n\t" \ + "rep ; nop\n\t" \ + "movw %3, %w0\n\t" \ + /* don't need lfence here, because loads are in-order */ \ + "jmp 1b\n" \ + "2:" \ + : "+r" (token), "+g" (count), "=&g" (tmp) \ + : "m" (lock->slock) \ + : "memory", "cc"); \ + } while (0) +#define __raw_spin_unlock_body \ + do { \ + unsigned int tmp; \ + asm(UNLOCK_LOCK_PREFIX "incw %2\n\t" \ + "movl %2, %0\n\t" \ + "shldl $16, %0, %3\n\t" \ + "cmpw %w3, %w0\n\t" \ + "setne %1" \ + : "=&r" (token), "=qm" (kick), "+m" (lock->slock), \ + "=&r" (tmp) \ + : \ + : "memory", "cc"); \ + } while (0) + +static inline int __raw_spin_trylock(raw_spinlock_t *lock) +{ + int tmp; + int new; + + asm("movl %2, %0\n\t" + "movl %0, %1\n\t" + "roll $16, %0\n\t" + "cmpl %0, %1\n\t" + "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" + "jne 1f\n\t" + LOCK_PREFIX "cmpxchgl %1, %2\n" + "1:\t" + "sete %b1\n\t" + "movzbl %b1, %0\n\t" + : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) + : + : "memory", "cc"); + + return tmp; +} +#endif + +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + int tmp = *(volatile signed int *)(&(lock)->slock); + + return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); +} + +static inline int __raw_spin_is_contended(raw_spinlock_t *lock) +{ + int tmp = *(volatile signed int *)(&(lock)->slock); + + return 
(((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; +} + +static inline void __raw_spin_lock(raw_spinlock_t *lock) +{ + unsigned int token, count; + unsigned int flags = __raw_local_irq_save(); + bool free; + + __raw_spin_lock_preamble; + if (likely(free)) { + raw_local_irq_restore(flags); + return; + } + token = xen_spin_adjust(lock, token); + raw_local_irq_restore(flags); + do { + count = 1 << 10; + __raw_spin_lock_body; + } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); +} + +static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, + unsigned long flags) +{ + unsigned int token, count; + bool free; + + __raw_spin_lock_preamble; + if (likely(free)) + return; + token = xen_spin_adjust(lock, token); + do { + count = 1 << 10; + __raw_spin_lock_body; + } while (unlikely(!count) && !xen_spin_wait(lock, &token, flags)); +} + +static inline void __raw_spin_unlock(raw_spinlock_t *lock) +{ + unsigned int token; + bool kick; + + __raw_spin_unlock_body; + if (kick) + xen_spin_kick(lock, token); +} + +#ifndef XEN_SPINLOCK_SOURCE +#undef __raw_spin_lock_preamble +#undef __raw_spin_lock_body +#undef __raw_spin_unlock_body +#endif + +static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) +{ + while (__raw_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int __raw_read_can_lock(raw_rwlock_t *lock) +{ + return (int)(lock)->lock > 0; +} + +/** + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. 
+ */ +static inline int __raw_write_can_lock(raw_rwlock_t *lock) +{ + return (lock)->lock == RW_LOCK_BIAS; +} + +static inline void __raw_read_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void __raw_write_lock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); +} + +static inline int __raw_read_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + atomic_dec(count); + if (atomic_read(count) >= 0) + return 1; + atomic_inc(count); + return 0; +} + +static inline int __raw_write_trylock(raw_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)lock; + + if (atomic_sub_and_test(RW_LOCK_BIAS, count)) + return 1; + atomic_add(RW_LOCK_BIAS, count); + return 0; +} + +static inline void __raw_read_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); +} + +static inline void __raw_write_unlock(raw_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX "addl %1, %0" + : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define _raw_spin_relax(lock) cpu_relax() +#define _raw_read_relax(lock) cpu_relax() +#define _raw_write_relax(lock) cpu_relax() + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/spinlock_types.h 2010-01-18 16:22:42.000000000 +0100 @@ -0,0 +1,34 @@ +#ifndef __ASM_SPINLOCK_TYPES_H +#define __ASM_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +typedef union { + unsigned int slock; + struct { +/* + * On Xen we support a single level of interrupt re-enabling per lock. Hence + * we can have twice as many outstanding tickets. Thus the cut-off for using + * byte register pairs must be at half the number of CPUs. + */ +#if 2 * CONFIG_NR_CPUS < 256 +# define TICKET_SHIFT 8 + u8 cur, seq; +#else +# define TICKET_SHIFT 16 + u16 cur, seq; +#endif + }; +} raw_spinlock_t; + +#define __RAW_SPIN_LOCK_UNLOCKED { 0 } + +typedef struct { + unsigned int lock; +} raw_rwlock_t; + +#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } + +#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/system.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/system.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,5 +1,393 @@ +#ifndef _ASM_X86_SYSTEM_H_ +#define _ASM_X86_SYSTEM_H_ + +#include <asm/asm.h> +#include <asm/segment.h> +#include <asm/cpufeature.h> +#include <asm/cmpxchg.h> +#include <asm/nops.h> +#include <asm/hypervisor.h> + +#include <linux/kernel.h> +#include <linux/irqflags.h> + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +#ifdef CONFIG_X86_32 + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. 
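Stepping back to the read-write lock primitives above: they implement a biased counter. The lock word starts at RW_LOCK_BIAS; each reader subtracts 1 and a writer subtracts the whole bias, so a writer can only succeed when no readers and no other writer hold the lock. A hypothetical plain-C sketch of the trylock arithmetic using GCC atomic builtins (the bias value here is illustrative):

#define RW_LOCK_BIAS_SKETCH 0x01000000

static int rw_counter = RW_LOCK_BIAS_SKETCH;

static int read_trylock_sketch(void)
{
        if (__atomic_sub_fetch(&rw_counter, 1, __ATOMIC_ACQUIRE) >= 0)
                return 1;                               /* got a read hold */
        __atomic_add_fetch(&rw_counter, 1, __ATOMIC_RELAXED);
        return 0;                                       /* a writer is in */
}

static int write_trylock_sketch(void)
{
        if (__atomic_sub_fetch(&rw_counter, RW_LOCK_BIAS_SKETCH,
                               __ATOMIC_ACQUIRE) == 0)
                return 1;                               /* no readers, no writer */
        __atomic_add_fetch(&rw_counter, RW_LOCK_BIAS_SKETCH, __ATOMIC_RELAXED);
        return 0;
}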
+ */ +#define switch_to(prev, next, last) do { \ + unsigned long esi, edi; \ + asm volatile("pushfl\n\t" /* Save flags */ \ + "pushl %%ebp\n\t" \ + "movl %%esp,%0\n\t" /* save ESP */ \ + "movl %5,%%esp\n\t" /* restore ESP */ \ + "movl $1f,%1\n\t" /* save EIP */ \ + "pushl %6\n\t" /* restore EIP */ \ + "jmp __switch_to\n" \ + "1:\t" \ + "popl %%ebp\n\t" \ + "popfl" \ + :"=m" (prev->thread.sp), "=m" (prev->thread.ip), \ + "=a" (last), "=S" (esi), "=D" (edi) \ + :"m" (next->thread.sp), "m" (next->thread.ip), \ + "2" (prev), "d" (next)); \ +} while (0) + +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#else +#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" +#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "jc ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [tif_fork] "i" (TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ +#define _set_base(addr, base) do { unsigned long __pr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %%dl,%2\n\t" \ + "movb %%dh,%3" \ + :"=&d" (__pr) \ + :"m" (*((addr)+2)), \ + "m" (*((addr)+4)), \ + "m" (*((addr)+7)), \ + "0" (base) \ + ); } while (0) + +#define _set_limit(addr, limit) do { unsigned long __lr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %2,%%dh\n\t" \ + "andb $0xf0,%%dh\n\t" \ + "orb %%dh,%%dl\n\t" \ + "movb %%dl,%2" \ + :"=&d" (__lr) \ + :"m" (*(addr)), \ + "m" (*((addr)+6)), \ + "0" (limit) \ + ); } while (0) + +#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) +#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) + +extern void load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. 
+ */ +#define loadsegment(seg, value) \ + asm volatile("\n" \ + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %k1, %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b,3b) \ + : :"r" (value), "r" (0)) + + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm volatile("mov %%" #seg ",%0":"=rm" (value)) + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + __asm__("lsll %1,%0" + :"=r" (__limit):"r" (segment)); + return __limit+1; +} + +static inline void xen_clts(void) +{ + HYPERVISOR_fpu_taskswitch(0); +} + +static inline void xen_stts(void) +{ + HYPERVISOR_fpu_taskswitch(1); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long xen_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" :"=r" (val), "=m" (__force_order)); + return val; +} + +static inline void xen_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": :"r" (val), "m" (__force_order)); +} + +#define xen_read_cr2() (current_vcpu_info()->arch.cr2) +#define xen_write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val))) + +static inline unsigned long xen_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" :"=r" (val), "=m" (__force_order)); +#ifdef CONFIG_X86_32 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; +#else + return machine_to_phys(val); +#endif +} + +static inline void xen_write_cr3(unsigned long val) +{ +#ifdef CONFIG_X86_32 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); +#else + val = phys_to_machine(val); +#endif + asm volatile("mov %0,%%cr3": :"r" (val), "m" (__force_order)); +} + +static inline unsigned long xen_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" :"=r" (val), "=m" (__force_order)); + return val; +} + +#define xen_read_cr4_safe() xen_read_cr4() + +static inline void xen_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": :"r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} + +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static inline void xen_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} +#define read_cr0() (xen_read_cr0()) +#define write_cr0(x) (xen_write_cr0(x)) +#define read_cr2() (xen_read_cr2()) +#define write_cr2(x) (xen_write_cr2(x)) +#define read_cr3() (xen_read_cr3()) +#define write_cr3(x) (xen_write_cr3(x)) +#define read_cr4() (xen_read_cr4()) +#define read_cr4_safe() (xen_read_cr4_safe()) +#define write_cr4(x) (xen_write_cr4(x)) +#define wbinvd() (xen_wbinvd()) +#ifdef CONFIG_X86_64 +#define read_cr8() (xen_read_cr8()) +#define write_cr8(x) (xen_write_cr8(x)) +#endif + +/* Clear the 'TS' bit */ +#define clts() (xen_clts()) +#define stts() (xen_stts()) + +#endif /* __KERNEL__ */ + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() __asm__ __volatile__ ("nop") + +void disable_hlt(void); +void enable_hlt(void); + +extern int es7000_plat; +void 
cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ #ifdef CONFIG_X86_32 -# include "system_32.h" +/* + * For now, "wmb()" doesn't actually do anything, as all + * Intel CPU's follow what Intel calls a *Processor Order*, + * in which all writes are seen in the program order even + * outside the CPU. + * + * I expect future Intel CPU's to have a weaker ordering, + * but I'd also expect them to finally get their act together + * and add some real memory barriers if so. + * + * Some non intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * <programlisting> + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * </programlisting> + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * <programlisting> + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * </programlisting> + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. 
+ **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() #else -# include "system_64.h" +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/system_32.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,312 +0,0 @@ -#ifndef __ASM_SYSTEM_H -#define __ASM_SYSTEM_H - -#include <linux/kernel.h> -#include <asm/segment.h> -#include <asm/cpufeature.h> -#include <asm/cmpxchg.h> -#include <asm/synch_bitops.h> -#include <asm/hypervisor.h> - -#ifdef __KERNEL__ -#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */ - -struct task_struct; /* one of the stranger aspects of C forward declarations.. */ -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); - -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ -#define switch_to(prev,next,last) do { \ - unsigned long esi,edi; \ - asm volatile("pushfl\n\t" /* Save flags */ \ - "pushl %%ebp\n\t" \ - "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %5,%%esp\n\t" /* restore ESP */ \ - "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %6\n\t" /* restore EIP */ \ - "jmp __switch_to\n" \ - "1:\t" \ - "popl %%ebp\n\t" \ - "popfl" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=a" (last),"=S" (esi),"=D" (edi) \ - :"m" (next->thread.esp),"m" (next->thread.eip), \ - "2" (prev), "d" (next)); \ -} while (0) - -#define _set_base(addr,base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while(0) - -#define _set_limit(addr,limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while(0) - -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) ) - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. 
- */ -#define loadsegment(seg,value) \ - asm volatile("\n" \ - "1:\t" \ - "mov %0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "pushl $0\n\t" \ - "popl %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,3b\n" \ - ".previous" \ - : :"rm" (value)) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm volatile("mov %%" #seg ",%0":"=rm" (value)) - -static inline void xen_clts(void) -{ - HYPERVISOR_fpu_taskswitch(0); -} - -static inline unsigned long xen_read_cr0(void) -{ - unsigned long val; - asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); - return val; -} - -static inline void xen_write_cr0(unsigned long val) -{ - asm volatile("movl %0,%%cr0": :"r" (val)); -} - -#define xen_read_cr2() (current_vcpu_info()->arch.cr2) - -static inline void xen_write_cr2(unsigned long val) -{ - asm volatile("movl %0,%%cr2": :"r" (val)); -} - -static inline unsigned long xen_read_cr3(void) -{ - unsigned long val; - asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); - return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; -} - -static inline void xen_write_cr3(unsigned long val) -{ - val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); - asm volatile("movl %0,%%cr3": :"r" (val)); -} - -static inline unsigned long xen_read_cr4(void) -{ - unsigned long val; - asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); - return val; -} - -static inline unsigned long xen_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist */ - asm volatile("1: movl %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (val): "0" (0)); - return val; -} - -static inline void xen_write_cr4(unsigned long val) -{ - asm volatile("movl %0,%%cr4": :"r" (val)); -} - -static inline void xen_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(char __force *)__p)); -} - -#define read_cr0() (xen_read_cr0()) -#define write_cr0(x) (xen_write_cr0(x)) -#define read_cr2() (xen_read_cr2()) -#define write_cr2(x) (xen_write_cr2(x)) -#define read_cr3() (xen_read_cr3()) -#define write_cr3(x) (xen_write_cr3(x)) -#define read_cr4() (xen_read_cr4()) -#define read_cr4_safe() (xen_read_cr4_safe()) -#define write_cr4(x) (xen_write_cr4(x)) -#define wbinvd() (xen_wbinvd()) - -/* Clear the 'TS' bit */ -#define clts() (xen_clts()) - -/* Set the 'TS' bit */ -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#endif /* __KERNEL__ */ - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - __asm__("lsll %1,%0" - :"=r" (__limit):"r" (segment)); - return __limit+1; -} - -#define nop() __asm__ __volatile__ ("nop") - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - * - * For now, "wmb()" doesn't actually do anything, as all - * Intel CPU's follow what Intel calls a *Processor Order*, - * in which all writes are seen in the program order even - * outside the CPU. - * - * I expect future Intel CPU's to have a weaker ordering, - * but I'd also expect them to finally get their act together - * and add some real memory barriers if so. - * - * Some non intel clones support out of order store. wmb() ceases to be a - * nop for these. 
- */ - - -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. 
- **/ - -#define read_barrier_depends() do { } while(0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -#include <linux/irqflags.h> - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -void disable_hlt(void); -void enable_hlt(void); - -extern int es7000_plat; -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - -void default_idle(void); -void __show_registers(struct pt_regs *, int all); - -#endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/system_64.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/system_64.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,122 +1,9 @@ #ifndef __ASM_SYSTEM_H #define __ASM_SYSTEM_H -#include <linux/kernel.h> #include <asm/segment.h> #include <asm/cmpxchg.h> -#include <asm/synch_bitops.h> -#include <asm/hypervisor.h> -#include <xen/interface/arch-x86_64.h> - -#ifdef __KERNEL__ - -/* entries in ARCH_DLINFO: */ -#ifdef CONFIG_IA32_EMULATION -# define AT_VECTOR_SIZE_ARCH 2 -#else -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t" - -#define __EXTRA_CLOBBER \ - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" - -/* Save restore flags to clear handle leaking NT */ -#define switch_to(prev,next,last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "jc ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)),\ - [tif_fork] "i" (TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ - : "memory", "cc" __EXTRA_CLOBBER) - -extern void load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. 
- */ -#define loadsegment(seg,value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %1,%%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 8\n\t" \ - ".quad 1b,3b\n" \ - ".previous" \ - : :"r" (value), "r" (0)) - -/* - * Clear and set 'TS' bit respectively - */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) - -static inline unsigned long read_cr0(void) -{ - unsigned long cr0; - asm volatile("movq %%cr0,%0" : "=r" (cr0)); - return cr0; -} - -static inline void write_cr0(unsigned long val) -{ - asm volatile("movq %0,%%cr0" :: "r" (val)); -} - -#define read_cr2() current_vcpu_info()->arch.cr2 - -#define write_cr2(val) ((void)(current_vcpu_info()->arch.cr2 = (val))) - -#define read_cr3() ({ \ - unsigned long __dummy; \ - asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \ - machine_to_phys(__dummy); \ -}) - -static inline void write_cr3(unsigned long val) -{ - val = phys_to_machine(val); - asm volatile("movq %0,%%cr3" :: "r" (val) : "memory"); -} - -static inline unsigned long read_cr4(void) -{ - unsigned long cr4; - asm volatile("movq %%cr4,%0" : "=r" (cr4)); - return cr4; -} - -static inline void write_cr4(unsigned long val) -{ - asm volatile("movq %0,%%cr4" :: "r" (val) : "memory"); -} static inline unsigned long read_cr8(void) { @@ -128,52 +15,6 @@ static inline void write_cr8(unsigned lo BUG_ON(val); } -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory") - -#endif /* __KERNEL__ */ - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(char __force *)__p)); -} - -#define nop() __asm__ __volatile__ ("nop") - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do {} while(0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do {} while(0) -#endif - - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. 
- */ -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") - -#define read_barrier_depends() do {} while(0) -#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) - -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) - #include <linux/irqflags.h> -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - #endif --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/tlbflush.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/tlbflush.h 2009-11-06 10:51:25.000000000 +0100 @@ -1,5 +1,106 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include <linux/mm.h> +#include <linux/sched.h> + +#include <asm/processor.h> +#include <asm/system.h> + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_single(addr) xen_invlpg(addr) +#define __flush_tlb_all() xen_tlb_flush() +#define __flush_tlb_one(addr) xen_invlpg(addr) + #ifdef CONFIG_X86_32 -# include "tlbflush_32.h" +# define TLB_FLUSH_ALL 0xffffffff #else -# include "tlbflush_64.h" +# define TLB_FLUSH_ALL -1ULL #endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. 
+ */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +#else /* SMP */ + +#include <asm/smp.h> + +#define local_flush_tlb() __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +#ifdef CONFIG_X86_32 +struct tlb_state +{ + struct mm_struct *active_mm; + int state; + char __cacheline_padding[L1_CACHE_BYTES-8]; +}; +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); +#endif + +#endif /* SMP */ + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +#endif /* _ASM_X86_TLBFLUSH_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/tlbflush_32.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,99 +0,0 @@ -#ifndef _I386_TLBFLUSH_H -#define _I386_TLBFLUSH_H - -#include <linux/mm.h> -#include <asm/processor.h> - -#define __flush_tlb() xen_tlb_flush() -#define __flush_tlb_global() xen_tlb_flush() -#define __flush_tlb_all() xen_tlb_flush() - -#define cpu_has_invlpg (boot_cpu_data.x86 > 3) - -#define __flush_tlb_single(addr) xen_invlpg(addr) - -#define __flush_tlb_one(addr) __flush_tlb_single(addr) - -/* - * TLB flushing: - * - * - flush_tlb() flushes the current mm struct TLBs - * - flush_tlb_all() flushes all processes TLBs - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - * ..but the i386 has somewhat limited tlb flushing capabilities, - * and page-granular flushes are available only on i486 and up.
- */ - -#define TLB_FLUSH_ALL 0xffffffff - - -#ifndef CONFIG_SMP - -#include <linux/sched.h> - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -#else /* SMP */ - -#include <asm/smp.h> - -#define local_flush_tlb() \ - __flush_tlb() - -#define flush_tlb_all xen_tlb_flush_all -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) - -#define flush_tlb() flush_tlb_current_task() - -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) -{ - flush_tlb_mm(vma->vm_mm); -} - -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - -struct tlb_state -{ - struct mm_struct *active_mm; - int state; - char __cacheline_padding[L1_CACHE_BYTES-8]; -}; -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); -#endif /* SMP */ - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#endif /* _I386_TLBFLUSH_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/asm/tlbflush_64.h 2009-11-06 10:51:17.000000000 +0100 +++ /dev/null 1970-01-01 00:00:00.000000000 +0000 @@ -1,97 +0,0 @@ -#ifndef _X8664_TLBFLUSH_H -#define _X8664_TLBFLUSH_H - -#include <linux/mm.h> -#include <linux/sched.h> -#include <asm/processor.h> -#include <asm/system.h> - -#define __flush_tlb() xen_tlb_flush() - -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. - */ -#define __flush_tlb_global() xen_tlb_flush() - -#define __flush_tlb_all() __flush_tlb_global() - -#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) - - -/* - * TLB flushing: - * - * - flush_tlb() flushes the current mm struct TLBs - * - flush_tlb_all() flushes all processes TLBs - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - * x86-64 can only flush individual pages or full VMs. For a range flush - * we always do the full VM. Might be worth trying if for a small - * range a few INVLPGs in a row are a win.
- */ - -#ifndef CONFIG_SMP - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -#else - -#include <asm/smp.h> - -#define local_flush_tlb() \ - __flush_tlb() - -#define flush_tlb_all xen_tlb_flush_all -#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) -#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) -#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) - -#define flush_tlb() flush_tlb_current_task() - -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) -{ - flush_tlb_mm(vma->vm_mm); -} - -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - -/* Roughly an IPI every 20MB with 4k pages for freeing page table - ranges. Cost is about 42k of memory for each CPU. */ -#define ARCH_FREE_PTE_NR 5350 - -#endif - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#endif /* _X8664_TLBFLUSH_H */ --- sle11sp1-2010-03-22.orig/arch/x86/include/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/irq_vectors.h 2009-11-06 10:51:25.000000000 +0100 @@ -82,7 +82,8 @@ #define RESCHEDULE_VECTOR 0 #define CALL_FUNCTION_VECTOR 1 -#define NR_IPIS 2 +#define SPIN_UNLOCK_VECTOR 2 +#define NR_IPIS 3 /* * The maximum number of vectors supported by i386 processors --- sle11sp1-2010-03-22.orig/arch/x86/include/asm/mmu.h 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/asm/mmu.h 2009-11-06 10:51:25.000000000 +0100 @@ -18,7 +18,7 @@ typedef struct { void *vdso; } mm_context_t; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) void leave_mm(int cpu); #else static inline void leave_mm(int cpu) --- sle11sp1-2010-03-22.orig/arch/x86/include/asm/ptrace.h 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/asm/ptrace.h 2009-11-06 10:51:25.000000000 +0100 @@ -247,7 +247,9 @@ extern void user_enable_single_step(stru extern void user_disable_single_step(struct task_struct *); extern void user_enable_block_step(struct task_struct *); -#ifdef CONFIG_X86_DEBUGCTLMSR +#if defined(CONFIG_XEN) +#define arch_has_block_step() (0) +#elif defined(CONFIG_X86_DEBUGCTLMSR) #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) --- sle11sp1-2010-03-22.orig/arch/x86/include/asm/thread_info.h 2010-02-09 16:51:41.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/asm/thread_info.h 2010-02-09 16:53:00.000000000 +0100 @@ -95,6 +95,9 @@ struct thread_info { #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#ifdef CONFIG_X86_XEN +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ +#endif #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -117,6 +120,7 @@ struct thread_info {
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_CSTAR (1 << TIF_CSTAR) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -147,12 +151,12 @@ struct thread_info { #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) -#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW -#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) #else -#define _TIF_WORK_CTXSW_NEXT (_TIF_NOTSC | _TIF_DEBUG) -#define _TIF_WORK_CTXSW_PREV (_TIF_NOTSC) +#define _TIF_WORK_CTXSW (_TIF_NOTSC \ + /*todo | _TIF_DEBUGCTLMSR | _TIF_DS_AREA_MSR | _TIF_BTS_TRACE_TS*/) #endif +#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW +#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) #define PREEMPT_ACTIVE 0x10000000 --- sle11sp1-2010-03-22.orig/arch/x86/include/asm/time.h 2010-03-22 12:07:54.000000000 +0100 +++ sle11sp1-2010-03-22/arch/x86/include/asm/time.h 2009-11-06 10:51:25.000000000 +0100 @@ -7,4 +7,10 @@ extern void hpet_time_init(void); extern void time_init(void); +#ifdef CONFIG_XEN +extern int xen_independent_wallclock(void); +extern unsigned long xen_read_persistent_clock(void); +extern int xen_update_persistent_clock(void); +#endif + #endif /* _ASM_X86_TIME_H */ --- sle11sp1-2010-03-22.orig/include/linux/page-flags.h 2010-02-17 14:49:38.000000000 +0100 +++ sle11sp1-2010-03-22/include/linux/page-flags.h 2009-12-16 11:50:48.000000000 +0100 @@ -130,8 +130,8 @@ enum pageflags { #ifdef CONFIG_XEN PG_pinned = PG_locked, /* Cannot alias with PG_owner_priv_1 since * bad_page() checks should include this bit. - * Also cannot use PG_arch_1 since that now - * has a different purpose on x86. */ + * Should not use PG_arch_1 as that may have + * a different purpose elsewhere. */ #else PG_pinned = PG_owner_priv_1, PG_savepinned = PG_dirty, --- sle11sp1-2010-03-22.orig/include/linux/pci.h 2009-11-06 10:49:47.000000000 +0100 +++ sle11sp1-2010-03-22/include/linux/pci.h 2010-03-11 09:18:51.000000000 +0100 @@ -747,6 +747,9 @@ int pci_reset_function(struct pci_dev *d void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_select_bars(struct pci_dev *dev, unsigned long flags); +#ifdef CONFIG_XEN +void pci_restore_bars(struct pci_dev *); +#endif /* ROM control related routines */ int pci_enable_rom(struct pci_dev *pdev); --- sle11sp1-2010-03-22.orig/include/xen/evtchn.h 2009-12-18 10:09:29.000000000 +0100 +++ sle11sp1-2010-03-22/include/xen/evtchn.h 2009-12-18 10:09:52.000000000 +0100 @@ -133,12 +133,37 @@ static inline void clear_evtchn(int port synch_clear_bit(port, s->evtchn_pending); } +static inline void set_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + synch_set_bit(port, s->evtchn_pending); +} + +static inline int test_evtchn(int port) +{ + shared_info_t *s = HYPERVISOR_shared_info; + return synch_test_bit(port, s->evtchn_pending); +} + static inline void notify_remote_via_evtchn(int port) { struct evtchn_send send = { .port = port }; VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &send)); } +/* Clear an irq's pending state, in preparation for polling on it. */ +void xen_clear_irq_pending(int irq); + +/* Set an irq's pending state, to avoid blocking on it. */ +void xen_set_irq_pending(int irq); + +/* Test an irq's pending state. */ +int xen_test_irq_pending(int irq); + +/* Poll waiting for an irq to become pending. 
In the usual case, the + irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq); + /* * Use these to access the event channel underlying the IRQ handle returned * by bind_*_to_irqhandler(). --- sle11sp1-2010-03-22.orig/kernel/sysctl_check.c 2010-02-09 16:52:27.000000000 +0100 +++ sle11sp1-2010-03-22/kernel/sysctl_check.c 2010-02-09 16:52:40.000000000 +0100 @@ -899,7 +899,7 @@ static const struct trans_ctl_table tran }; #ifdef CONFIG_XEN -static struct trans_ctl_table trans_xen_table[] = { +static const struct trans_ctl_table trans_xen_table[] = { { CTL_XEN_INDEPENDENT_WALLCLOCK, "independent_wallclock" }, { CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" }, {} --- sle11sp1-2010-03-22.orig/lib/swiotlb-xen.c 2009-11-06 10:51:17.000000000 +0100 +++ sle11sp1-2010-03-22/lib/swiotlb-xen.c 2009-11-06 10:51:25.000000000 +0100 @@ -30,7 +30,6 @@ #include <asm/gnttab_dma.h> int swiotlb; -EXPORT_SYMBOL(swiotlb); #define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) @@ -289,6 +288,15 @@ __sync_single(struct phys_addr buffer, c } } +static inline unsigned int is_span_boundary(unsigned int index, + unsigned int nslots, + unsigned long offset_slots, + unsigned long max_slots) +{ + unsigned long offset = (offset_slots + index) & (max_slots - 1); + return offset + nslots > max_slots; +} + /* * Allocates bounce buffer and returns its kernel virtual address. */ @@ -300,6 +308,15 @@ map_single(struct device *hwdev, struct unsigned int nslots, stride, index, wrap; struct phys_addr slot_buf; int i; + unsigned long mask; + unsigned long offset_slots; + unsigned long max_slots; + + mask = dma_get_seg_boundary(hwdev); + offset_slots = -IO_TLB_SEGSIZE; + max_slots = mask + 1 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than a page, we limit the stride (and @@ -319,12 +336,21 @@ map_single(struct device *hwdev, struct */ spin_lock_irqsave(&io_tlb_lock, flags); { - wrap = index = ALIGN(io_tlb_index, stride); - + index = ALIGN(io_tlb_index, stride); if (index >= iotlb_nslabs) - wrap = index = 0; + index = 0; + wrap = index; do { + while (is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= iotlb_nslabs) + index = 0; + if (index == wrap) + goto not_found; + } + /* * If we find a slot that indicates we have 'nslots' * number of contiguous buffers, we allocate the @@ -359,6 +385,7 @@ map_single(struct device *hwdev, struct index = 0; } while (index != wrap); + not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); return NULL; } --- sle11sp1-2010-03-22.orig/mm/memory.c 2010-03-01 14:27:31.000000000 +0100 +++ sle11sp1-2010-03-22/mm/memory.c 2010-03-11 09:18:45.000000000 +0100 @@ -2019,6 +2019,10 @@ int apply_to_page_range(struct mm_struct unsigned long start = addr, end = addr + size; int err; +#ifdef CONFIG_XEN + if (!mm) + mm = &init_mm; +#endif BUG_ON(addr >= end); mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr);
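
Editor's note, not part of the patch: the lib/swiotlb-xen.c hunk above adds an is_span_boundary() helper so that map_single() never hands out a run of bounce-buffer slots that straddles the device's DMA segment boundary. The stand-alone sketch below reproduces that arithmetic in user space so the wrap-around test is easier to follow. The IO_TLB_SHIFT value, the 64 KiB example mask, the slot counts, and the use of 0 for offset_slots are illustrative assumptions only; the kernel derives the real mask from dma_get_seg_boundary(hwdev) and biases the offset by -IO_TLB_SEGSIZE as shown in the patch.

/* Illustrative sketch of the segment-boundary check; constants below are
 * assumed example values, not taken from the patch. */
#include <stdio.h>

#define IO_TLB_SHIFT 11        /* assumed: 2 KiB swiotlb slots */

static unsigned int is_span_boundary(unsigned int index,
                                     unsigned int nslots,
                                     unsigned long offset_slots,
                                     unsigned long max_slots)
{
        /* Same test as the patch: would slots [index, index + nslots)
         * cross a max_slots-aligned boundary once biased by offset_slots? */
        unsigned long offset = (offset_slots + index) & (max_slots - 1);
        return offset + nslots > max_slots;
}

int main(void)
{
        unsigned long mask = 0xffff;                    /* example 64 KiB boundary */
        unsigned long max_slots = (mask + 1) >> IO_TLB_SHIFT;  /* 32 slots; already
                                                                   slot-aligned, so the
                                                                   ALIGN() step is skipped */
        unsigned int nslots = 4;                        /* example mapping size */
        unsigned int index;

        for (index = 24; index <= 36; index++)
                printf("index %2u: %s\n", index,
                       is_span_boundary(index, nslots, 0, max_slots)
                       ? "would cross boundary, skip" : "usable");
        return 0;
}

With these example values the check rejects indices 29-31 (a 4-slot run starting there would cross the 32-slot boundary) and accepts 32 onward, which is exactly why the new map_single() loop keeps stepping index forward by stride until the candidate range clears the boundary, giving up only when the search wraps back to its starting slot.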