Subject: xen3 xen-arch
From: http://xenbits.xensource.com/linux-2.6.18-xen.hg (tip 1011:11175e60d393)
Patch-mainline: obsolete
Acked-by: jbeulich@novell.com

List of files having Xen derivatives (perhaps created during the merging of
newer kernel versions), for xen-port-patches.py to pick up (i.e. this must be
retained here until the XenSource tree has these in the right places):
+++ linux/arch/x86/kernel/acpi/sleep-xen.c
+++ linux/arch/x86/kernel/apic/apic-xen.c
+++ linux/arch/x86/kernel/apic/io_apic-xen.c
+++ linux/arch/x86/kernel/apic/ipi-xen.c
+++ linux/arch/x86/kernel/apic/probe_32-xen.c
+++ linux/arch/x86/kernel/cpu/common_64-xen.c
+++ linux/arch/x86/kernel/e820-xen.c
+++ linux/arch/x86/kernel/head-xen.c
+++ linux/arch/x86/kernel/head32-xen.c
+++ linux/arch/x86/kernel/ioport-xen.c
+++ linux/arch/x86/kernel/io_apic-xen.c
+++ linux/arch/x86/kernel/ipi-xen.c
+++ linux/arch/x86/kernel/irq-xen.c
+++ linux/arch/x86/kernel/ldt-xen.c
+++ linux/arch/x86/kernel/microcode_core-xen.c
+++ linux/arch/x86/kernel/mpparse-xen.c
+++ linux/arch/x86/kernel/pci-nommu-xen.c
+++ linux/arch/x86/kernel/process-xen.c
+++ linux/arch/x86/kernel/setup-xen.c
+++ linux/arch/x86/kernel/smp-xen.c
+++ linux/arch/x86/kernel/traps-xen.c
+++ linux/arch/x86/kernel/x86_init-xen.c
+++ linux/arch/x86/mm/fault-xen.c
+++ linux/arch/x86/mm/init-xen.c
+++ linux/arch/x86/mm/iomap_32-xen.c
+++ linux/arch/x86/mm/ioremap-xen.c
+++ linux/arch/x86/mm/pageattr-xen.c
+++ linux/arch/x86/mm/pat-xen.c
+++ linux/arch/x86/mm/pgtable-xen.c
+++ linux/arch/x86/vdso/vdso32-setup-xen.c
+++ linux/drivers/char/mem-xen.c
+++ linux/arch/x86/include/mach-xen/asm/desc.h
+++ linux/arch/x86/include/mach-xen/asm/dma-mapping.h
+++ linux/arch/x86/include/mach-xen/asm/fixmap.h
+++ linux/arch/x86/include/mach-xen/asm/io.h
+++ linux/arch/x86/include/mach-xen/asm/ipi.h
+++ linux/arch/x86/include/mach-xen/asm/irq_vectors.h
+++ linux/arch/x86/include/mach-xen/asm/irqflags.h
+++ linux/arch/x86/include/mach-xen/asm/mmu_context.h
+++ linux/arch/x86/include/mach-xen/asm/pci.h
+++ linux/arch/x86/include/mach-xen/asm/pgalloc.h
+++ linux/arch/x86/include/mach-xen/asm/pgtable.h
+++ linux/arch/x86/include/mach-xen/asm/pgtable-3level_types.h
+++ linux/arch/x86/include/mach-xen/asm/pgtable_64_types.h
+++ linux/arch/x86/include/mach-xen/asm/pgtable_types.h
+++ linux/arch/x86/include/mach-xen/asm/processor.h
+++ linux/arch/x86/include/mach-xen/asm/smp.h
+++ linux/arch/x86/include/mach-xen/asm/spinlock.h
+++ linux/arch/x86/include/mach-xen/asm/spinlock_types.h
+++ linux/arch/x86/include/mach-xen/asm/swiotlb.h
+++ linux/arch/x86/include/mach-xen/asm/system.h
+++ linux/arch/x86/include/mach-xen/asm/tlbflush.h
+++ linux/arch/x86/include/mach-xen/asm/xor.h

List of files folded into their native counterparts (and hence removed from
this patch so that xen-port-patches.py does not needlessly pick them up; for
reference, prefixed with the version in which the removal occurred):
2.6.18/arch/x86/include/mach-xen/asm/pgtable-2level.h
2.6.18/arch/x86/include/mach-xen/asm/pgtable-2level-defs.h
2.6.19/arch/x86/include/mach-xen/asm/ptrace.h
2.6.23/arch/x86/include/mach-xen/asm/ptrace_64.h
2.6.23/arch/x86/kernel/vsyscall-note_32-xen.S
2.6.24/arch/x86/include/mach-xen/asm/arch_hooks_64.h
2.6.24/arch/x86/include/mach-xen/asm/bootsetup_64.h
2.6.24/arch/x86/include/mach-xen/asm/mmu_32.h
2.6.24/arch/x86/include/mach-xen/asm/mmu_64.h
2.6.24/arch/x86/include/mach-xen/asm/nmi_64.h
2.6.24/arch/x86/include/mach-xen/asm/setup.h
2.6.24/arch/x86/include/mach-xen/asm/time_64.h (added in 2.6.20)
2.6.24/arch/x86/include/mach-xen/mach_timer.h
2.6.24/arch/x86/kernel/early_printk_32-xen.c
2.6.25/arch/x86/ia32/syscall32-xen.c
2.6.25/arch/x86/ia32/syscall32_syscall-xen.S
2.6.25/arch/x86/ia32/vsyscall-int80.S
2.6.25/arch/x86/include/mach-xen/asm/msr.h
2.6.25/arch/x86/include/mach-xen/asm/page_32.h
2.6.25/arch/x86/include/mach-xen/asm/spinlock_32.h
2.6.25/arch/x86/include/mach-xen/asm/timer.h (added in 2.6.24)
2.6.25/arch/x86/include/mach-xen/asm/timer_64.h
2.6.25/arch/x86/include/mach-xen/mach_time.h
2.6.25/arch/x86/kernel/acpi/boot-xen.c
2.6.26/arch/x86/include/mach-xen/asm/dma-mapping_32.h
2.6.26/arch/x86/include/mach-xen/asm/dma-mapping_64.h
2.6.26/arch/x86/include/mach-xen/asm/nmi.h (added in 2.6.24)
2.6.26/arch/x86/include/mach-xen/asm/scatterlist.h (added in 2.6.24)
2.6.26/arch/x86/include/mach-xen/asm/scatterlist_32.h
2.6.26/arch/x86/include/mach-xen/asm/swiotlb_32.h
2.6.26/arch/x86/kernel/pci-dma_32-xen.c
2.6.26/arch/x86/kernel/pci-swiotlb_64-xen.c
2.6.26/include/xen/xencomm.h
2.6.27/arch/x86/include/mach-xen/asm/e820.h (added in 2.6.24)
2.6.27/arch/x86/include/mach-xen/asm/e820_64.h
2.6.27/arch/x86/include/mach-xen/asm/hw_irq.h (added in 2.6.24)
2.6.27/arch/x86/include/mach-xen/asm/hw_irq_32.h
2.6.27/arch/x86/include/mach-xen/asm/hw_irq_64.h
2.6.27/arch/x86/include/mach-xen/asm/io_32.h
2.6.27/arch/x86/include/mach-xen/asm/io_64.h
2.6.27/arch/x86/include/mach-xen/asm/irq.h (added in 2.6.24)
2.6.27/arch/x86/include/mach-xen/asm/irq_64.h
2.6.27/arch/x86/kernel/e820_32-xen.c
2.6.28/arch/x86/include/mach-xen/asm/pci_64.h
2.6.28/arch/x86/include/mach-xen/asm/segment.h (added in 2.6.24)
2.6.28/arch/x86/include/mach-xen/asm/segment_32.h
2.6.30/arch/x86/include/mach-xen/asm/page.h (added in 2.6.24)
2.6.30/arch/x86/include/mach-xen/asm/page_64.h
2.6.30/arch/x86/include/mach-xen/asm/pci_32.h
2.6.30/arch/x86/kernel/apic/apic_xen_64.c
2.6.30/arch/x86/kernel/apic/probe_64-xen.c
2.6.30/arch/x86/kernel/setup_percpu-xen.c (added in 2.6.27)
2.6.31/arch/x86/kernel/init_task-xen.c
2.6.32/arch/x86/include/mach-xen/asm/setup_arch.h

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sle11sp1-2010-03-22/arch/x86/kernel/acpi/processor_extcntl_xen.c	2010-03-22 12:00:53.000000000 +0100
@@ -0,0 +1,208 @@
+/*
+ * processor_extcntl_xen.c - interface to notify Xen
+ *
+ * Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <linux/pm.h> +#include <linux/cpu.h> + +#include <linux/cpufreq.h> +#include <acpi/processor.h> +#include <asm/hypercall.h> + +static int xen_cx_notifier(struct acpi_processor *pr, int action) +{ + int ret, count = 0, i; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *data, *buf; + struct acpi_processor_cx *cx; + + /* Convert to Xen defined structure and hypercall */ + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + + data = buf; + for (i = 1; i <= pr->power.count; i++) { + cx = &pr->power.states[i]; + /* Skip invalid cstate entry */ + if (!cx->valid) + continue; + + data->type = cx->type; + data->latency = cx->latency; + data->power = cx->power; + data->reg.space_id = cx->reg.space_id; + data->reg.bit_width = cx->reg.bit_width; + data->reg.bit_offset = cx->reg.bit_offset; + data->reg.access_size = cx->reg.reserved; + data->reg.address = cx->reg.address; + + /* Get dependency relationships */ + if (cx->csd_count) { + printk("Wow! _CSD is found. Not support for now!\n"); + kfree(buf); + return -EINVAL; + } else { + data->dpcnt = 0; + set_xen_guest_handle(data->dp, NULL); + } + + data++; + count++; + } + + if (!count) { + printk("No available Cx info for cpu %d\n", pr->acpi_id); + kfree(buf); + return -EINVAL; + } + + op.u.set_pminfo.u.power.count = count; + op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control; + op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check; + op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst; + op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf); + ret = HYPERVISOR_platform_op(&op); + kfree(buf); + return ret; +} + +static int xen_px_notifier(struct acpi_processor *pr, int action) +{ + int ret = -EINVAL; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *perf; + struct xen_processor_px *states = NULL; + struct acpi_processor_performance *px; + struct acpi_psd_package *pdomain; + + if (!pr) + return -EINVAL; + + perf = &op.u.set_pminfo.u.perf; + px = pr->performance; + if (!px) + return -EINVAL; + + switch(action) { + case PROCESSOR_PM_CHANGE: + /* ppc dynamic handle */ + perf->flags = XEN_PX_PPC; + perf->platform_limit = pr->performance_platform_limit; + + ret = HYPERVISOR_platform_op(&op); + break; + + case PROCESSOR_PM_INIT: + /* px normal init */ + perf->flags = XEN_PX_PPC | + XEN_PX_PCT | + XEN_PX_PSS | + XEN_PX_PSD; + + /* ppc */ + perf->platform_limit = pr->performance_platform_limit; + + /* pct */ + xen_convert_pct_reg(&perf->control_register, &px->control_register); + xen_convert_pct_reg(&perf->status_register, &px->status_register); + + /* pss */ + perf->state_count = px->state_count; + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL); + if (!states) + return -ENOMEM; + xen_convert_pss_states(states, px->states, px->state_count); + set_xen_guest_handle(perf->states, states); + + /* psd */ + pdomain = &px->domain_info; + xen_convert_psd_pack(&perf->domain_info, pdomain); + if (pdomain->coord_type == 
DOMAIN_COORD_TYPE_SW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_HW; + else { + ret = -ENODEV; + kfree(states); + break; + } + + ret = HYPERVISOR_platform_op(&op); + kfree(states); + break; + + default: + break; + } + + return ret; +} + +static int xen_tx_notifier(struct acpi_processor *pr, int action) +{ + return -EINVAL; +} +static int xen_hotplug_notifier(struct acpi_processor *pr, int event) +{ + return -EINVAL; +} + +static struct processor_extcntl_ops xen_extcntl_ops = { + .hotplug = xen_hotplug_notifier, +}; + +void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops) +{ + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; + + if (!pmbits) + return; + if (pmbits & XEN_PROCESSOR_PM_CX) + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; + if (pmbits & XEN_PROCESSOR_PM_PX) + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; + if (pmbits & XEN_PROCESSOR_PM_TX) + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; + + *ops = &xen_extcntl_ops; +} +EXPORT_SYMBOL(arch_acpi_processor_init_extcntl); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/acpi/sleep_32-xen.c 2008-04-15 09:29:41.000000000 +0200 @@ -0,0 +1,113 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> + +#include <asm/smp.h> + +#ifndef CONFIG_ACPI_PV_SLEEP +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_video_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); +#endif + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if (!acpi_wakeup_address) + return 1; + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); +#endif + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. 
+ */ +void __init acpi_reserve_bootmem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +#endif +} + +#ifndef CONFIG_ACPI_PV_SLEEP +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_video_flags = 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_video_flags |= 2; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); + +static __init int reset_videomode_after_s3(struct dmi_system_id *d) +{ + acpi_video_flags |= 2; + return 0; +} + +static __initdata struct dmi_system_id acpisleep_dmi_table[] = { + { /* Reset video mode after returning from ACPI S3 sleep */ + .callback = reset_videomode_after_s3, + .ident = "Toshiba Satellite 4030cdt", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), + }, + }, + {} +}; + +static int __init acpisleep_dmi_init(void) +{ + dmi_check_system(acpisleep_dmi_table); + return 0; +} + +core_initcall(acpisleep_dmi_init); +#endif /* CONFIG_ACPI_PV_SLEEP */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/apic_32-xen.c 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,155 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/atomic.h> +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> +#include <asm/hpet.h> +#include <asm/i8253.h> +#include <asm/nmi.h> + +#include <mach_apic.h> +#include <mach_apicdef.h> +#include <mach_ipi.h> + +#include "io_ports.h" + +#ifndef CONFIG_XEN +/* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers + */ +static cpumask_t timer_bcast_ipi; +#endif + +/* + * Knob to control our willingness to enable the local APIC. + */ +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +/* + * Debug level + */ +int apic_verbosity; + +#ifndef CONFIG_XEN +static int modern_apic(void) +{ + unsigned int lvr, version; + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + lvr = apic_read(APIC_LVR); + version = GET_APIC_VERSION(lvr); + return version >= 0x14; +} +#endif /* !CONFIG_XEN */ + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. 
+ */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +} + +int get_physical_broadcast(void) +{ + return 0xff; +} + +#ifndef CONFIG_XEN +#ifndef CONFIG_SMP +static void up_apic_timer_interrupt_call(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + smp_local_timer_interrupt(regs); +} +#endif + +void smp_send_timer_broadcast_ipi(struct pt_regs *regs) +{ + cpumask_t mask; + + cpus_and(mask, cpu_online_map, timer_bcast_ipi); + if (!cpus_empty(mask)) { +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#else + /* + * We can directly call the apic timer interrupt handler + * in UP case. Minus all irq related functions + */ + up_apic_timer_interrupt_call(regs); +#endif + } +} +#endif + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/cpu/common-xen.c 2009-05-19 09:16:41.000000000 +0200 @@ -0,0 +1,745 @@ +#include <linux/init.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/bootmem.h> +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/mtrr.h> +#include <asm/mce.h> +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <mach_apic.h> +#else +#ifdef CONFIG_XEN +#define phys_pkg_id(a,b) a +#endif +#endif +#include <asm/hypervisor.h> + +#include "cpu.h" + +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); + +#ifndef CONFIG_XEN +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); +#endif + +static int cachesize_override __cpuinitdata = -1; +static int disable_x86_fxsr __cpuinitdata; +static int disable_x86_serial_nr __cpuinitdata = 1; +static int disable_x86_sep __cpuinitdata; + +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; + +extern int disable_pse; + +static void default_init(struct cpuinfo_x86 * c) +{ + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. 
It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +} + +static struct cpu_dev default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", +}; +static struct cpu_dev * this_cpu = &default_cpu; + +static int __init cachesize_setup(char *str) +{ + get_option (&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +int __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (cpuid_eax(0x80000000) < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* Intel chips right-justify this string for some dumb reason; + undo that brain damage */ + p = q = &c->x86_model_id[0]; + while ( *p == ' ' ) + p++; + if ( p != q ) { + while ( *p ) + *q++ = *p++; + while ( q <= &c->x86_model_id[48] ) + *q++ = '\0'; /* Zero-pad the rest */ + } + + return 1; +} + + +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ecx, edx, l2size; + + n = cpuid_eax(0x80000000); + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size=(ecx>>24)+(edx>>24); + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + ecx = cpuid_ecx(0x80000006); + l2size = ecx >> 16; + + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c,l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if ( l2size == 0 ) + return; /* Again, no L2 cache is possible */ + + c->x86_cache_size = l2size; + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + l2size, ecx & 0xFF); +} + +/* Naming convention should be: <Name> [(<Codename>)] */ +/* This table only is used unless init_<vendor>() below doesn't set it; */ +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ + +/* Look up CPU names by table lookup. 
*/ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if ( c->x86_model >= 16 ) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +{ + char *v = c->x86_vendor_id; + int i; + static int printed; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v,cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v,cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + if (!early) + this_cpu = cpu_devs[i]; + return; + } + } + } + if (!printed) { + printed++; + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + + +static int __init x86_fxsr_setup(char * s) +{ + disable_x86_fxsr = 1; + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + + +static int __init x86_sep_setup(char * s) +{ + disable_x86_sep = 1; + return 1; +} +__setup("nosep", x86_sep_setup); + + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + asm("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. 
*/ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 1); + + c->x86 = 4; + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + c->x86_mask = tfms & 15; + if (cap0 & (1<<19)) + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; + } +} + +void __cpuinit generic_identify(struct cpuinfo_x86 * c) +{ + u32 tfms, xlvl; + int ebx; + + if (have_cpuid_p()) { + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 0); + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if ( c->cpuid_level >= 0x00000001 ) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + c->x86_mask = tfms & 15; +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); +#else + c->apicid = (ebx >> 24) & 0xFF; +#endif +#endif + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ( (xlvl & 0xffff0000) == 0x80000000 ) { + if ( xlvl >= 0x80000001 ) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if ( xlvl >= 0x80000004 ) + get_model_name(c); /* Default name */ + } + } + + early_intel_workaround(c); + +#ifdef CONFIG_X86_HT + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; +#endif +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { + /* Disable processor serial number */ + unsigned long lo,hi; + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_bit(X86_FEATURE_PN, c->x86_capability); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); + + + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_model = c->x86_mask = 0; /* So far unknown... 
*/ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_max_cores = 1; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + if (!have_cpuid_p()) { + /* First of all, decide if this is a 486 or higher */ + /* It's a 486 if we can modify the AC flag */ + if ( flag_is_changeable_p(X86_EFLAGS_AC) ) + c->x86 = 4; + else + c->x86 = 3; + } + + generic_identify(c); + + printk(KERN_DEBUG "CPU: After generic identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + if (this_cpu->c_identify) { + this_cpu->c_identify(c); + + printk(KERN_DEBUG "CPU: After vendor identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + } + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. Now + * we do "generic changes." + */ + + /* TSC disabled? */ + if ( tsc_disable ) + clear_bit(X86_FEATURE_TSC, c->x86_capability); + + /* FXSR disabled? */ + if (disable_x86_fxsr) { + clear_bit(X86_FEATURE_FXSR, c->x86_capability); + clear_bit(X86_FEATURE_XMM, c->x86_capability); + } + + /* SEP disabled? */ + if (disable_x86_sep) + clear_bit(X86_FEATURE_SEP, c->x86_capability); + + if (disable_pse) + clear_bit(X86_FEATURE_PSE, c->x86_capability); + + /* If the model name is still unset, do table lookup. */ + if ( !c->x86_model_id[0] ) { + char *p; + p = table_lookup_model(c); + if ( p ) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + + /* Now the feature flags better reflect actual CPU features! */ + + printk(KERN_DEBUG "CPU: After all inits, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if ( c != &boot_cpu_data ) { + /* AND the already accumulated flags with these */ + for ( i = 0 ; i < NCAPINTS ; i++ ) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Init Machine Check Exception if available. 
*/ + mcheck_init(c); + + if (c == &boot_cpu_data) + sysenter_setup(); + enable_sep_cpu(); + + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); +} + +#ifdef CONFIG_X86_HT +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + return; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1 ) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of the " + "siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + ((1 << core_bits) - 1); + + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +} +#endif + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) + vendor = this_cpu->c_vendor; + else if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + printk("%s ", vendor); + + if (!c->x86_model_id[0]) + printk("%d86", c->x86); + else + printk("%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(" stepping %02x\n", c->x86_mask); + else + printk("\n"); +} + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +/* This is hacky. :) + * We're emulating future behavior. + * In the future, the cpu-specific init functions will be called implicitly + * via the magic of initcalls. + * They will insert themselves into the cpu_devs structure. + * Then, when cpu_init() is called, we can just iterate over that array. + */ + +extern int intel_cpu_init(void); +extern int cyrix_init_cpu(void); +extern int nsc_init_cpu(void); +extern int amd_init_cpu(void); +extern int centaur_init_cpu(void); +extern int transmeta_init_cpu(void); +extern int rise_init_cpu(void); +extern int nexgen_init_cpu(void); +extern int umc_init_cpu(void); + +void __init early_cpu_init(void) +{ + intel_cpu_init(); + cyrix_init_cpu(); + nsc_init_cpu(); + amd_init_cpu(); + centaur_init_cpu(); + transmeta_init_cpu(); + rise_init_cpu(); + nexgen_init_cpu(); + umc_init_cpu(); + early_cpu_detect(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif +} + +static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr) +{ + unsigned long frames[16]; + unsigned long va; + int f; + + for (va = gdt_descr->address, f = 0; + va < gdt_descr->address + gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8)) + BUG(); +} + +/* + * cpu_init() initializes state that is per-CPU. 
Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct * t = &per_cpu(init_tss, cpu); +#endif + struct thread_struct *thread = ¤t->thread; + struct desc_struct *gdt; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + +#ifndef CONFIG_XEN + /* The CPU hotplug case */ + if (cpu_gdt_descr->address) { + gdt = (struct desc_struct *)cpu_gdt_descr->address; + memset(gdt, 0, PAGE_SIZE); + goto old_gdt; + } + /* + * This is a horrible hack to allocate the GDT. The problem + * is that cpu_init() is called really early for the boot CPU + * (and hence needs bootmem) but much later for the secondary + * CPUs, when bootmem will have gone away + */ + if (NODE_DATA(0)->bdata->node_bootmem_map) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + } else { + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + if (unlikely(!gdt)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + for (;;) + local_irq_enable(); + } + } +old_gdt: + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + /* Set up GDT entry for 16bit stack */ + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | + (CPU_16BIT_STACK_SIZE - 1); + + cpu_gdt_descr->size = GDT_SIZE - 1; + cpu_gdt_descr->address = (unsigned long)gdt; +#else + if (cpu == 0 && cpu_gdt_descr->address == 0) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + cpu_gdt_descr->size = GDT_SIZE; + cpu_gdt_descr->address = (unsigned long)gdt; + } +#endif + + cpu_gdt_init(cpu_gdt_descr); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + if (current->mm) + BUG(); + enter_lazy_tlb(&init_mm, current); + + load_esp0(t, thread); + + load_LDT(&init_mm.context); + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + /* Clear %fs and %gs. 
*/ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); + + /* + * Force FPU initialization: + */ + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); +} + +#ifdef CONFIG_HOTPLUG_CPU +void __cpuinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/cpu/mcheck/mce_dom0.c 2009-10-01 11:00:47.000000000 +0200 @@ -0,0 +1,134 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <xen/interface/xen.h> +#include <xen/evtchn.h> +#include <xen/interface/vcpu.h> +#include <asm/hypercall.h> +#include <asm/mce.h> + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic = NULL; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct mce m; + + x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); + if (mic == NULL) + { + printk(KERN_ERR "DOM0_MCE_LOG: global data is NULL\n"); + return -1; + } + + mc_global = (struct mcinfo_global*)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.cpu = mc_global->mc_coreid;/*for test*/ + x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); + do + { + if (mic == NULL || mic->size == 0) + break; + if (mic->type == MC_TYPE_BANK) + { + mc_bank = (struct mcinfo_bank*)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.res1 = mc_bank->mc_ctrl2; + m.bank = mc_bank->mc_bank; + printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n", + m.bank, m.cpu, m.addr, m.status); + /*log this record*/ + mce_log(&m); + } + mic = x86_mcinfo_next(mic); + }while (1); + + return 0; +} + +static struct mc_info *g_mi; + +/*dom0 mce virq handler, logging physical mce error info*/ + +static irqreturn_t mce_dom0_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + xen_mc_t mc_op; + int result = 0; + + printk(KERN_DEBUG "MCE_DOM0_LOG: enter dom0 mce vIRQ handler\n"); + mc_op.cmd = XEN_MC_fetch; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi); +urgent: + mc_op.u.mc_fetch.flags = XEN_MC_URGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more urgent data\n"); + goto nonurgent; + } + else + { + result = convert_log(g_mi); + if (result) { + printk(KERN_ERR "MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. 
+ */ + mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto urgent; + } +nonurgent: + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more nonurgent data\n"); + goto end; + } + else + { + result = convert_log(g_mi); + if (result) { + printk(KERN_ERR "MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. + */ + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto nonurgent; + } +end: + return IRQ_HANDLED; +} + +void bind_virq_for_mce(void) +{ + int ret; + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + mce_dom0_interrupt, 0, "mce", NULL); + + g_mi = kmalloc(sizeof(struct mc_info), GFP_KERNEL); + if (ret < 0) + printk(KERN_ERR "MCE_DOM0_LOG: bind_virq for DOM0 failed\n"); + + /* Log the machine checks left over from the previous reset. */ + mce_dom0_interrupt(VIRQ_MCA, NULL, NULL); +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/cpu/mtrr/main-xen.c 2008-01-28 12:24:18.000000000 +0100 @@ -0,0 +1,198 @@ +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> +#include <linux/mutex.h> + +#include <asm/mtrr.h> +#include "mtrr.h" + +static DEFINE_MUTEX(mtrr_mutex); + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + struct xen_platform_op op; + + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = reg; + if (unlikely(HYPERVISOR_platform_op(&op))) + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); + + *size = op.u.read_memtype.nr_mfns; + *base = op.u.read_memtype.mfn; + *type = op.u.read_memtype.type; +} + +struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int *usage_table; + +static void __init set_num_var_ranges(void) +{ + struct xen_platform_op op; + + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_platform_op(&op) != 0) + break; + } +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) + == NULL) { + printk(KERN_ERR "mtrr: could not allocate\n"); + return; + } + for (i = 0; i < max; i++) + usage_table[i] = 0; +} + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, char increment) +{ + int error; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + op.cmd = XENPF_add_memtype; + op.u.add_memtype.mfn = base; + op.u.add_memtype.nr_mfns = size; + op.u.add_memtype.type = type; + error = HYPERVISOR_platform_op(&op); + if (error) { + mutex_unlock(&mtrr_mutex); + BUG_ON(error > 0); + return error; + } + + if (increment) + ++usage_table[op.u.add_memtype.reg]; + + mutex_unlock(&mtrr_mutex); + + return op.u.add_memtype.reg; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING + "mtrr: size and base must be 
multiples of 4 kiB\n"); + printk(KERN_DEBUG + "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +int +mtrr_add(unsigned long base, unsigned long size, unsigned int type, + char increment) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + unsigned i; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + int error = -EINVAL; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, + size); + goto out; + } + } + if (usage_table[reg] < 1) { + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--usage_table[reg] < 1) { + op.cmd = XENPF_del_memtype; + op.u.del_memtype.handle = 0; + op.u.del_memtype.reg = reg; + error = HYPERVISOR_platform_op(&op); + if (error) { + BUG_ON(error > 0); + goto out; + } + } + error = reg; + out: + mutex_unlock(&mtrr_mutex); + return error; +} + +int +mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +EXPORT_SYMBOL(mtrr_add); +EXPORT_SYMBOL(mtrr_del); + +void __init mtrr_bp_init(void) +{ +} + +void mtrr_ap_init(void) +{ +} + +static int __init mtrr_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!is_initial_xendomain()) + return -ENODEV; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + set_num_var_ranges(); + init_table(); + + return 0; +} + +subsys_initcall(mtrr_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/entry_32-xen.S 2009-05-19 09:16:41.000000000 +0200 @@ -0,0 +1,1242 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. 
+ */ + +#include <linux/linkage.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page.h> +#include <asm/desc.h> +#include <asm/dwarf2.h> +#include "irq_vectors.h" +#include <xen/interface/xen.h> + +#define nr_syscalls ((syscall_table_size)/4) + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 +/* Pseudo-eflags. */ +NMI_MASK = 0x80000000 + +#ifndef CONFIG_XEN +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#else +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + addl HYPERVISOR_shared_info,%esi +#else +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) +#endif + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli; TRACE_IRQS_OFF +#else +#define preempt_stop +#define resume_kernel restore_nocheck +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ + pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ + pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ + pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ + pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ + pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ + pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ + pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ + popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ + popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ + popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ + popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ + popl %ebp; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ +2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + 
CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, OLDESP-EBX;\ + /*CFI_OFFSET cs, CS-OLDESP;*/\ + CFI_OFFSET eip, EIP-OLDESP;\ + /*CFI_OFFSET es, ES-OLDESP;*/\ + /*CFI_OFFSET ds, DS-OLDESP;*/\ + CFI_OFFSET eax, EAX-OLDESP;\ + CFI_OFFSET ebp, EBP-OLDESP;\ + CFI_OFFSET edi, EDI-OLDESP;\ + CFI_OFFSET esi, ESI-OLDESP;\ + CFI_OFFSET edx, EDX-OLDESP;\ + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 2), %eax + jz resume_kernel +ENTRY(resume_userspace) + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + cli + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_esp0(%esp),%esp +sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ + sti + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + DISABLE_INTERRUPTS + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + movl ESI(%esp), %esi + sysexit +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ + push %esp + call evtchn_do_upcall + add $4,%esp + jmp ret_from_intr +#else + TRACE_IRQS_ON + sti + sysexit +#endif /* !CONFIG_XEN */ + CFI_ENDPROC + + # pv sysenter call handler stub +ENTRY(sysenter_entry_pv) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,12(%esp) + movl $__USER_CS,4(%esp) + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + /* fall through */ + CFI_ENDPROC +ENDPROC(sysenter_entry_pv) + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $TF_MASK,EFLAGS(%esp) + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + +restore_all: +#ifndef CONFIG_XEN + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. 
+ movb OLDSS(%esp), %ah + movb CS(%esp), %al + andl $(VM_MASK | (4 << 8) | 3), %eax + cmpl $((4 << 8) | 3), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: +#else +restore_nocheck: + movl EFLAGS(%esp), %eax + testl $(VM_MASK|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + TRACE_IRQS_IRET +restore_nocheck_notrace: + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 +1: iret +.section .fixup,"ax" +iret_exc: +#ifndef CONFIG_XEN + TRACE_IRQS_ON + sti +#endif + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + + CFI_RESTORE_STATE +#ifndef CONFIG_XEN +ldt_ss: + larl OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + /* If returning to userspace with 16bit stack, + * try to fix the higher word of ESP, as the CPU + * won't restore it. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu and wine happy. */ + subl $8, %esp # reserve space for switch16 pointer + CFI_ADJUST_CFA_OFFSET 8 + cli + TRACE_IRQS_OFF + movl %esp, %eax + /* Set up the 16bit stack frame with switch32 pointer on top, + * and a switch16 pointer on top of the current frame. */ + call setup_x86_bogus_stack + CFI_ADJUST_CFA_OFFSET -8 # frame has moved + TRACE_IRQS_IRET + RESTORE_REGS + lss 20+4(%esp), %esp # switch to 16bit stack +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +#else + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +ecrit: /**** END OF CRITICAL REGION ****/ + jmp .Ldo_upcall + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, EFLAGS(%esp) + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: +#ifdef CONFIG_VM86 + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +#endif + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + cmpl $0, %eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + CFI_ENDPROC + +#ifndef CONFIG_XEN +#define FIXUP_ESPFIX_STACK \ + movl %esp, %eax; \ + /* switch to 32bit stack using the pointer on top of 16bit stack */ \ + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ + /* copy data from 16bit stack to 32bit stack */ \ + call fixup_x86_bogus_stack; \ + /* put ESP to the proper location */ \ + movl %eax, %esp; +#define UNWIND_ESPFIX_STACK \ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ + movl %ss, %eax; \ + /* see if on 16bit stack */ \ + cmpw $__ESPFIX_SS, %ax; \ + je 28f; \ +27: popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28: movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ + /* switch to 32bit stack */ \ + FIXUP_ESPFIX_STACK; \ + jmp 27b; \ +.previous + +/* + * Build the entry stubs and pointer table with + * some assembler magic. + */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) + RING0_INT_FRAME +.rept NR_IRQS + ALIGN + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $~(vector) + CFI_ADJUST_CFA_OFFSET 4 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + ALIGN +common_interrupt: + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call smp_/**/name; \ + jmp ret_from_intr; \ + CFI_ENDPROC + +/* The include is where all of the SMP etc. 
interrupts come from */ +#include "entry_arch.h" +#else +#define UNWIND_ESPFIX_STACK +#endif + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + xorl %eax, %eax + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + decl %eax # eax = -1 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl ES(%esp), %edi # get the function address + movl ORIG_EAX(%esp), %edx # get the error code + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + /*CFI_REL_OFFSET es, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC + +#ifdef CONFIG_XEN +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + testb $2,CS(%esp) + movl EIP(%esp),%eax + jnz .Ldo_upcall + cmpl $scrit,%eax + jb 0f + cmpl $ecrit,%eax + jb critical_region_fixup +0: +#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL + cmpl $sysexit_scrit,%eax + jb .Ldo_upcall + cmpl $sysexit_ecrit,%eax + ja .Ldo_upcall + addl $OLDESP,%esp # Remove eflags...ebx from stack frame. +#endif +.Ldo_upcall: + push %esp + CFI_ADJUST_CFA_OFFSET 4 + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. 
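+# As an illustration: if the upcall arrives while the interrupted handler is
+# sitting at its "pop %edx" instruction, the table entry is 2 -- %ebx and %ecx
+# had already been popped there (and were immediately re-saved by SAVE_ALL in
+# the new frame).  The fixup below copies those two freshly saved slots to
+# just below the unpopped remainder of the old frame and points %esp at the
+# result, so a single, complete frame is what finally gets restored.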
+critical_region_fixup: + movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped + testl %ecx,%ecx + leal (%esp,%ecx,4),%esi # %esi points at end of src region + leal OLDESP(%esp),%edi # %edi points at end of dst region + jle 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp .Ldo_upcall + +.section .rodata,"a" +critical_fixup_table: + .byte -1,-1,-1 # testb $0xff,(%esi) = __TEST_PENDING + .byte -1,-1 # jnz 14f + .byte 0 # pop %ebx + .byte 1 # pop %ecx + .byte 2 # pop %edx + .byte 3 # pop %esi + .byte 4 # pop %edi + .byte 5 # pop %ebp + .byte 6 # pop %eax + .byte 7 # pop %ds + .byte 8 # pop %es + .byte 9,9,9 # add $4,%esp + .byte 10 # iret + .byte -1,-1,-1,-1 # movb $1,1(%esi) = __DISABLE_INTERRUPTS +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL +#ifndef CONFIG_XEN + movl %cr0, %eax + testl $0x4, %eax # EM (math emulation bit) + je device_available_emulate + pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 + call math_emulate + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_exception +device_available_emulate: +#endif + preempt_stop + call math_state_restore + jmp ret_from_exception + CFI_ENDPROC + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. 
+ * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp +#endif /* CONFIG_XEN */ + +KPROBE_ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC + .previous .text +#ifndef CONFIG_XEN +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_16bit_stack + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_16bit_stack: + RING0_INT_FRAME + /* create the pointer to lss back */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + movzwl %sp, %esp + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to 16bit stack +1: iret + CFI_ENDPROC +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +#else +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, EFLAGS(%esp) + jmp restore_all + CFI_ENDPROC +#endif + +KPROBE_ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC + .previous .text + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + 
CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, EBX(%edx) + xorl %ebx, %ebx + movl %ebx, ECX(%edx) + movl %ebx, EDX(%edx) + movl %esi, ESI(%edx) + movl %edi, EDI(%edx) + movl %ebp, EBP(%edx) + movl %ebx, EAX(%edx) + movl $__USER_DS, DS(%edx) + movl $__USER_DS, ES(%edx) + movl %ebx, ORIG_EAX(%edx) + movl %ecx, EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, CS(%edx) + movl %ebx, EFLAGS(%edx) + movl %eax, OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl EBX(%edx), %ebx + movl $__KERNEL_DS, OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl $do_fixup_4gb_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +.section .rodata,"a" +#include "syscall_table.S" + +syscall_table_size=(.-sys_call_table) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/fixup.c 2008-01-28 12:24:18.000000000 +0100 @@ -0,0 +1,88 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. 
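+ * (In this form the handler below simply turns the notification off again,
+ * prints the warning banner once, and pauses a few seconds so that the
+ * message is hard to miss; see do_fixup_4gb_segment() further down.)
+ *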
+ * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/version.h> + +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) + +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify)); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... %d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments_notify)); + return 0; +} +__initcall(fixup_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/head_32-xen.S 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,207 @@ + + +.text +#include <linux/elfnote.h> +#include <linux/threads.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/cache.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf2.h> +#include <xen/interface/xen.h> +#include <xen/interface/elfnote.h> + +/* + * References to members of the new_cpu_data structure. 
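+ * (The CPUINFO_* offsets used below come from <asm/asm-offsets.h>, which is
+ * generated at build time, so this file does not hard-code the layout of
+ * struct cpuinfo_x86.)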
+ */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + + movb $1,X86_HARD_MATH + + xorl %eax,%eax # Clear FS/GS and LDT + movl %eax,%fs + movl %eax,%gs + cld # gcc2 wants the direction flag cleared at all times + + pushl %eax # fake return address + jmp start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) + +/* + * BSS section + */ +.section ".bss.page_aligned","w" +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. + */ +.data + +/* + * The Global Descriptor Table contains 28 quadwords, per-CPU. + */ + .align L1_CACHE_BYTES +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* 0x0b reserved */ + .quad 0x0000000000000000 /* 0x13 reserved */ + .quad 0x0000000000000000 /* 0x1b reserved */ + .quad 0x0000000000000000 /* 0x20 unused */ + .quad 0x0000000000000000 /* 0x28 unused */ + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ + .quad 0x0000000000000000 /* 0x4b reserved */ + .quad 0x0000000000000000 /* 0x53 reserved */ + .quad 0x0000000000000000 /* 0x5b reserved */ + + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ + + .quad 0x0000000000000000 /* 0x80 TSS descriptor */ + .quad 0x0000000000000000 /* 0x88 LDT descriptor */ + + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + .quad 0x0000000000000000 /* 0x90 32-bit code */ + .quad 0x0000000000000000 /* 0x98 16-bit code */ + .quad 0x0000000000000000 /* 0xa0 16-bit data */ + .quad 0x0000000000000000 /* 0xa8 16-bit data */ + .quad 0x0000000000000000 /* 0xb0 16-bit data */ + + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. 
+ */ + .quad 0x0000000000000000 /* 0xb8 APM CS code */ + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ + .quad 0x0000000000000000 /* 0xc8 APM DS data */ + + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xe0 - unused */ + .quad 0x0000000000000000 /* 0xe8 - unused */ + .quad 0x0000000000000000 /* 0xf0 - unused */ + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/io_apic_32-xen.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,2786 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. 
Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/compiler.h> +#include <linux/acpi.h> +#include <linux/module.h> +#include <linux/sysdev.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/timer.h> +#include <asm/i8259.h> +#include <asm/nmi.h> + +#include <mach_apic.h> + +#include "io_ports.h" + +#ifdef CONFIG_XEN + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/evtchn.h> + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +} + +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct physdev_apic apic_op; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +} + +#define io_apic_read(a,r) xen_io_apic_read(a,r) +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) + +#endif /* CONFIG_XEN */ + +int (*ioapic_renumber_irq)(int ioapic, int irq); +atomic_t irq_mis_count; + +#ifndef CONFIG_XEN +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +#endif + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int timer_over_8254 __initdata = 1; + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +int disable_timer_pin_1 __initdata; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + int apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. 
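+ *
+ * Layout: the first NR_IRQS entries of irq_2_pin[] are list heads indexed
+ * directly by IRQ number; when an IRQ is routed to more than one pin, the
+ * extra entries are taken from the overflow area above NR_IRQS
+ * (first_free_entry) and linked through ->next.  add_pin_to_irq() below
+ * panics if PIN_MAP_SIZE is ever exceeded.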
+ */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +#ifdef CONFIG_XEN +#define clear_IO_APIC() ((void)0) +#else +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin, reg; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00010000); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) +{ + unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int apicid_value; + cpumask_t tmp; + + cpus_and(tmp, cpumask, cpu_online_map); 
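+	/* an affinity mask that contains no online CPU falls back to TARGET_CPUS */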
+ if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(cpumask, tmp, CPU_MASK_ALL); + + apicid_value = cpu_mask_to_apicid(cpumask); + /* Prepare to do the io_apic_write */ + apicid_value = apicid_value << 24; + spin_lock_irqsave(&ioapic_lock, flags); + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + set_irq_info(irq, cpumask); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#if defined(CONFIG_IRQBALANCE) +# include <asm/processor.h> /* kernel_thread() */ +# include <linux/kernel_stat.h> /* kstat */ +# include <linux/slab.h> /* kmalloc() */ +# include <linux/timer.h> /* time_after() */ + +#ifdef CONFIG_BALANCED_IRQ_DEBUG +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) +# define Dprintk(x...) do { TDprintk(x); } while (0) +# else +# define TDprintk(x...) +# define Dprintk(x...) +# endif + +#define IRQBALANCE_CHECK_ARCH -999 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; +static int physical_balance __read_mostly; +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; + +static struct irq_cpu_info { + unsigned long * last_irq; + unsigned long * irq_delta; + unsigned long irq; +} irq_cpu_data[NR_CPUS]; + +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) + +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) + +static cpumask_t balance_irq_affinity[NR_IRQS] = { + [0 ... NR_IRQS-1] = CPU_MASK_ALL +}; + +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ + balance_irq_affinity[irq] = mask; +} + +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, + unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu >= NR_CPUS) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = NR_CPUS-1; + } + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int cpu, int irq) +{ + unsigned long now = jiffies; + cpumask_t allowed_mask; + unsigned int new_cpu; + + if (irqbalance_disabled) + return; + + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); + new_cpu = move(cpu, allowed_mask, now, 1); + if (cpu != new_cpu) { + set_pending_irq(irq, cpumask_of_cpu(new_cpu)); + } +} + +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) +{ + int i, j; + Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { + for (j = 0; j < NR_IRQS; j++) { + if (!irq_desc[j].action) + continue; + /* Is it a significant load ? 
*/ + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + useful_load_threshold) + continue; + balance_irq(i, j); + } + } + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; +} + +static void do_irq_balance(void) +{ + int i, j; + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); + unsigned long move_this_load = 0; + int max_loaded = 0, min_loaded = 0; + int load; + unsigned long useful_load_threshold = balanced_irq_interval + 10; + int selected_irq; + int tmp_loaded, first_attempt = 1; + unsigned long tmp_cpu_irq; + unsigned long imbalance = 0; + cpumask_t allowed_mask, target_cpu_mask, tmp; + + for_each_possible_cpu(i) { + int package_index; + CPU_IRQ(i) = 0; + if (!cpu_online(i)) + continue; + package_index = CPU_TO_PACKAGEINDEX(i); + for (j = 0; j < NR_IRQS; j++) { + unsigned long value_now, delta; + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if ( package_index == i ) + IRQ_DELTA(package_index,j) = 0; + /* Determine the total count per processor per IRQ */ + value_now = (unsigned long) kstat_cpu(i).irqs[j]; + + /* Determine the activity per processor per IRQ */ + delta = value_now - LAST_CPU_IRQ(i,j); + + /* Update last_cpu_irq[][] for the next time */ + LAST_CPU_IRQ(i,j) = value_now; + + /* Ignore IRQs whose rate is less than the clock */ + if (delta < useful_load_threshold) + continue; + /* update the load for the processor or package total */ + IRQ_DELTA(package_index,j) += delta; + + /* Keep track of the higher numbered sibling as well */ + if (i != package_index) + CPU_IRQ(i) += delta; + /* + * We have sibling A and sibling B in the package + * + * cpu_irq[A] = load for cpu A + load for cpu B + * cpu_irq[B] = load for cpu B + */ + CPU_IRQ(package_index) += delta; + } + } + /* Find the least loaded processor package */ + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (min_cpu_irq > CPU_IRQ(i)) { + min_cpu_irq = CPU_IRQ(i); + min_loaded = i; + } + } + max_cpu_irq = ULONG_MAX; + +tryanothercpu: + /* Look for heaviest loaded processor. + * We may come back to get the next heaviest loaded processor. + * Skip processors with trivial loads. + */ + tmp_cpu_irq = 0; + tmp_loaded = -1; + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (max_cpu_irq <= CPU_IRQ(i)) + continue; + if (tmp_cpu_irq < CPU_IRQ(i)) { + tmp_cpu_irq = CPU_IRQ(i); + tmp_loaded = i; + } + } + + if (tmp_loaded == -1) { + /* In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original + * approach to rotate them around. + */ + if (!first_attempt && imbalance >= useful_load_threshold) { + rotate_irqs_among_cpus(useful_load_threshold); + return; + } + goto not_worth_the_effort; + } + + first_attempt = 0; /* heaviest search */ + max_cpu_irq = tmp_cpu_irq; /* load */ + max_loaded = tmp_loaded; /* processor */ + imbalance = (max_cpu_irq - min_cpu_irq) / 2; + + Dprintk("max_loaded cpu = %d\n", max_loaded); + Dprintk("min_loaded cpu = %d\n", min_loaded); + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); + Dprintk("load imbalance = %lu\n", imbalance); + + /* if imbalance is less than approx 10% of max load, then + * observe diminishing returns action. 
- quit + */ + if (imbalance < (max_cpu_irq >> 3)) { + Dprintk("Imbalance too trivial\n"); + goto not_worth_the_effort; + } + +tryanotherirq: + /* if we select an IRQ to move that can't go where we want, then + * see if there is another one to try. + */ + move_this_load = 0; + selected_irq = -1; + for (j = 0; j < NR_IRQS; j++) { + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if (imbalance <= IRQ_DELTA(max_loaded,j)) + continue; + /* Try to find the IRQ that is closest to the imbalance + * without going over. + */ + if (move_this_load < IRQ_DELTA(max_loaded,j)) { + move_this_load = IRQ_DELTA(max_loaded,j); + selected_irq = j; + } + } + if (selected_irq == -1) { + goto tryanothercpu; + } + + imbalance = move_this_load; + + /* For physical_balance case, we accumlated both load + * values in the one of the siblings cpu_irq[], + * to use the same code for physical and logical processors + * as much as possible. + * + * NOTE: the cpu_irq[] array holds the sum of the load for + * sibling A and sibling B in the slot for the lowest numbered + * sibling (A), _AND_ the load for sibling B in the slot for + * the higher numbered sibling. + * + * We seek the least loaded sibling by making the comparison + * (A+B)/2 vs B + */ + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } + + cpus_and(allowed_mask, + cpu_online_map, + balance_irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); + + if (!cpus_empty(tmp)) { + + Dprintk("irq = %d moved to cpu = %d\n", + selected_irq, min_loaded); + /* mark for change destination */ + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); + + /* Since we made a change, come back sooner to + * check for more variation. + */ + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; + } + goto tryanotherirq; + +not_worth_the_effort: + /* + * if we did not find an IRQ to move, then adjust the time interval + * upward + */ + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + Dprintk("IRQ worth rotating not found\n"); + return; +} + +static int balanced_irq(void *unused) +{ + int i; + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + + daemonize("kirqd"); + + /* push everything to CPU 0 to give us a starting point. */ + for (i = 0 ; i < NR_IRQS ; i++) { + irq_desc[i].pending_mask = cpumask_of_cpu(0); + set_pending_irq(i, cpumask_of_cpu(0)); + } + + for ( ; ; ) { + time_remaining = schedule_timeout_interruptible(time_remaining); + try_to_freeze(); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { + preempt_disable(); + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; + preempt_enable(); + } + } + return 0; +} + +static int __init balanced_irq_init(void) +{ + int i; + struct cpuinfo_x86 *c; + cpumask_t tmp; + + cpus_shift_right(tmp, cpu_online_map, 2); + c = &boot_cpu_data; + /* When not overwritten by the command line ask subarchitecture. 
*/ + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) + irqbalance_disabled = NO_BALANCE_IRQ; + if (irqbalance_disabled) + return 0; + + /* disable irqbalance completely if there is only one processor online */ + if (num_online_cpus() < 2) { + irqbalance_disabled = 1; + return 0; + } + /* + * Enable physical balance only if more than 1 physical processor + * is present + */ + if (smp_num_siblings > 1 && !cpus_empty(tmp)) + physical_balance = 1; + + for_each_online_cpu(i) { + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { + printk(KERN_ERR "balanced_irq_init: out of memory"); + goto failed; + } + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + } + + printk(KERN_INFO "Starting balanced_irq\n"); + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + return 0; + else + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); +failed: + for_each_possible_cpu(i) { + kfree(irq_cpu_data[i].irq_delta); + irq_cpu_data[i].irq_delta = NULL; + kfree(irq_cpu_data[i].last_irq); + irq_cpu_data[i].last_irq = NULL; + } + return 0; +} + +int __init irqbalance_disable(char *str) +{ + irqbalance_disabled = 1; + return 1; +} + +__setup("noirqbalance", irqbalance_disable); + +late_initcall(balanced_irq_init); +#endif /* CONFIG_IRQBALANCE */ +#endif /* CONFIG_SMP */ +#endif + +#ifndef CONFIG_SMP +void fastcall send_IPI_self(int vector) +{ +#ifndef CONFIG_XEN + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +#endif +} +#endif /* !CONFIG_SMP */ + + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; + +static int __init ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +__setup("noapic", ioapic_setup); + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. 
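+ *
+ * Scans the MP-table interrupt entries for one of the given type whose
+ * destination matches this I/O APIC (or MP_APIC_ALL) and pin; returns the
+ * mp_irqs[] index, or -1 if the pin is not described in the table.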
+ */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} +#endif + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " + "slot:%d, pin:%d.\n", bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. 
+ */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +#ifndef CONFIG_XEN +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif /* !CONFIG_XEN */ +#endif + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +/* NEC98 interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_NEC98_trigger(idx) (0) +#define default_NEC98_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. 
bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + polarity = default_NEC98_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + trigger = default_NEC98_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + case MP_BUS_NEC98: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. 
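+ * Only pins 16-23 can be redirected, matching the eight PIRQ slots filled in
+ * by the "pirq=" option (see ioapic_pirq_setup() above); an entry of 0
+ * reports the PIRQ as disabled, any other value overrides the computed IRQ
+ * number.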
+ */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ + +int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + struct physdev_irq irq_op; + + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + return -EINVAL; + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); + return IO_APIC_VECTOR(irq); + } + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + spin_unlock_irqrestore(&vector_lock, flags); + return -ENOSPC; + } + + vector = irq_op.vector; + vector_irq[vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = vector; + + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; +} + +#ifndef CONFIG_XEN +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + unsigned idx; + + idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].chip = &ioapic_level_type; + else + irq_desc[idx].chip = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); +} +#else +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) +#endif + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, + pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", + mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + } + + irq = pin_2_irq(idx, apic, pin); + /* + * skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum + */ + if (multi_timer_check(apic, irq)) + continue; + else + add_pin_to_irq(irq, apic, pin); + + if (/*!apic &&*/ !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the 8259A-master output pin: + */ +#ifndef CONFIG_XEN +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... 
+ */ + irq_desc[0].chip = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +static inline void UNEXPECTED_IO_APIC(void) +{ +} + +void __init print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + if (reg_00.bits.ID >= get_physical_broadcast()) + UNEXPECTED_IO_APIC(); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... 
: arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + if (reg_03.bits.__reserved_1) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void /*__init*/ print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... 
APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void /*__init*/ print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} +#endif /* !CONFIG_XEN */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; +#ifndef CONFIG_XEN + int i8259_apic, i8259_pin; +#endif + int i, apic; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } +#ifndef CONFIG_XEN + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. 
+ */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } +#endif + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + +#ifndef CONFIG_XEN + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = + GET_APIC_ID(apic_read(APIC_ID)); + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, + *(((int *)&entry)+1)); + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, + *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +#endif +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. 
+ */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mpc_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mpc_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mpc_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mpc_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +#ifndef CONFIG_XEN +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + if (jiffies - t1 > 4) + return 1; + + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. 
These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs. + */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + unsigned long v; + int i; + + move_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. 
--macro + */ + i = IO_APIC_VECTOR(irq); + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_native_irq_info(vector, cpu_mask); + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif +#endif + +static int ioapic_retrigger(unsigned int irq) +{ + send_IPI_self(IO_APIC_VECTOR(irq)); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; + +static struct hw_interrupt_type ioapic_level_type __read_mostly = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. 
We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); +#ifndef CONFIG_XEN + else + /* Strange. Oh, well.. */ + irq_desc[irq].chip = &no_irq_type; +#endif + } + } +} + +#ifndef CONFIG_XEN +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type __read_mostly = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. 
--macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + apic = find_isa_irq_apic(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +int timer_uses_ioapic_pin_0; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void check_timer(void) +{ + int apic1, pin1, apic2, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + timer_ack = 1; + if (timer_over_8254 > 0) + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + return; + } + clear_IO_APIC_pin(apic1, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " + "IO-APIC\n"); + } + + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... 
"); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (pin1 != -1) + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + else + add_pin_to_irq(0, apic2, pin2); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(apic2, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].chip = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + printk(" failed.\n"); + + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + timer_ack = 0; + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option"); +} +#else +int timer_uses_ioapic_pin_0 = 0; +#define check_timer() ((void)0) +#endif + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + printk("ENABLING IO-APIC IRQs\n"); + + /* + * Set up IO-APIC IRQ routing. + */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#ifndef CONFIG_XEN + sync_Arb_IDs(); +#endif + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if(sis_apic_bug == -1) + sis_apic_bug = 0; + if (is_initial_xendomain()) { + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; + op.u.platform_quirk.quirk_id = sis_apic_bug ? 
+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; + VOID(HYPERVISOR_platform_op(&op)); + } + return 0; +} + +late_initcall(io_apic_bug_finalize); + +#ifndef CONFIG_XEN + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +#endif /* CONFIG_XEN */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ +#ifndef CONFIG_XEN + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. 
+ */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); +#endif /* !CONFIG_XEN */ + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /* CONFIG_ACPI */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ioport_32-xen.c 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,123 @@ +/* + * linux/arch/i386/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <xen/interface/physdev.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + unsigned long mask; + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); + unsigned int low_index = base & (BITS_PER_LONG-1); + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0UL << low_index); + if (length < BITS_PER_LONG) + mask &= ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= BITS_PER_LONG; + } + + mask = (new_value ? ~0UL : 0UL); + while (length >= BITS_PER_LONG) { + *bitmap_base++ = mask; + length -= BITS_PER_LONG; + } + + if (length > 0) { + mask = ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = &current->thread; + unsigned long *bitmap; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned long unused) +{ + volatile struct pt_regs * regs = (struct pt_regs *) &unused; + unsigned int level = regs->ebx; + struct thread_struct *t = &current->thread; + unsigned int old = (t->iopl >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges?
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + t->iopl = level << 12; + set_iopl_mask(t->iopl); + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/irq_32-xen.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,324 @@ +/* + * linux/arch/i386/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> + +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +EXPORT_PER_CPU_SYMBOL(irq_stat); + +#ifndef CONFIG_X86_LOCAL_APIC +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} +#endif + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +#endif + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +fastcall unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bit used in ret_from_ code */ + int irq = ~regs->orig_eax; +#ifdef CONFIG_4KSTACKS + union irq_ctx *curctx, *irqctx; + u32 *isp; +#endif + + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } + + /*irq_enter();*/ +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + +#ifdef CONFIG_4KSTACKS + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (curctx != irqctx) { + int arg1, arg2, ebx; + + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. 
+ */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_IRQ \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (ebx) + : "0" (irq), "1" (regs), "2" (isp) + : "memory", "cc", "ecx" + ); + } else +#endif + __do_IRQ(irq, regs); + + /*irq_exit();*/ + + return 1; +} + +#ifdef CONFIG_4KSTACKS + +/* + * These should really be __section__(".bss.page_aligned") as well, but + * gcc's 3.0 and earlier don't handle that correctly. + */ +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + /* + * Shouldnt happen, we returned above if in_interrupt(): + */ + WARN_ON_ONCE(softirq_count()); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + 
seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_desc[irq].affinity, map); + if (any_online_cpu(mask) == NR_CPUS) { + /*printk("Breaking affinity for irq %i\n", irq);*/ + mask = map; + } + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ldt_32-xen.c 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,270 @@ +/* + * linux/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/mmu_context.h> + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(&current->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt; + void *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); +#endif + make_pages_readonly( + pc->ldt, + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable( + oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int
copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + make_pages_readonly( + new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + mm->context.has_foreign_mappings = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable( + mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + void *address; + + err = 0; + address = &default_ldt[0]; + size = 5*sizeof(struct desc_struct); + if (size > bytecount) + size = bytecount; + + err = size; + if (copy_to_user(ptr, address, size)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ...
*/ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, + entry_1, entry_2); + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/microcode-xen.c 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,144 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2004 Tigran Aivazian + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, + * Order Number 245472 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/245472.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +//#define DEBUG /* pr_debug */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/syscalls.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> + +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>"); +MODULE_LICENSE("GPL"); + +static int verbose; +module_param(verbose, int, 0644); + +#define MICROCODE_VERSION "1.14a-xen" + +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; +} + + +static int do_microcode_update (const void __user *ubuf, size_t len) +{ + int err; + void *kbuf; + + kbuf = vmalloc(len); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, ubuf, len) == 0) { + struct xen_platform_op op; + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, kbuf); + op.u.microcode.length = len; + err = HYPERVISOR_platform_op(&op); + } else + err = -EFAULT; + + vfree(kbuf); + + return err; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if (len < MC_HEADER_SIZE) { + printk(KERN_ERR "microcode: not enough data\n"); + return -EINVAL; + } + + mutex_lock(&microcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(&microcode_mutex); + + return ret; +} + +static struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = &microcode_fops, +}; + +static int __init microcode_init (void) +{ + int error; + + error = misc_register(&microcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n"); + return 0; +} + +static void __exit microcode_exit (void) +{ + misc_deregister(&microcode_dev); +} + +module_init(microcode_init) +module_exit(microcode_exit) +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/mpparse_32-xen.c 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,1185 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> + +#include <asm/smp.h> +#include <asm/acpi.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> + +#include <mach_apic.h> +#include <mach_mpparse.h> +#include <bios_ebda.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +int mp_bus_id_to_type [MAX_MP_BUSSES]; +int mp_bus_id_to_node [MAX_MP_BUSSES]; +int mp_bus_id_to_local [MAX_MP_BUSSES]; +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ...
MAX_MP_BUSSES-1] = -1 }; +static int mp_current_pci_id; + +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; + +int pic_mode; +unsigned long mp_lapic_addr; + +unsigned int def_to_bigsmp = 0; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +/* Internal processor count */ +static unsigned int __devinitdata num_processors; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* + * Intel MP BIOS table parsing routines: + */ + + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; + +#ifndef CONFIG_XEN +static void __devinit MP_processor_info (struct mpc_config_processor *m) +{ + int ver, apicid; + physid_mask_t phys_cpu; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (m->mpc_featureflag&(1<<0)) + Dprintk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + Dprintk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + Dprintk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + Dprintk(" Internal APIC present.\n"); + if (m->mpc_featureflag&(1<<11)) + Dprintk(" SEP present.\n"); + if (m->mpc_featureflag&(1<<12)) + Dprintk(" MTRR present.\n"); + if (m->mpc_featureflag&(1<<13)) + Dprintk(" PGE present.\n"); + if (m->mpc_featureflag&(1<<14)) + Dprintk(" MCA present.\n"); + if (m->mpc_featureflag&(1<<15)) + Dprintk(" CMOV present.\n"); + if (m->mpc_featureflag&(1<<16)) + Dprintk(" PAT present.\n"); + if (m->mpc_featureflag&(1<<17)) + Dprintk(" PSE present.\n"); + if (m->mpc_featureflag&(1<<18)) + Dprintk(" PSN present.\n"); + if (m->mpc_featureflag&(1<<19)) + Dprintk(" Cache Line Flush Instruction present.\n"); + /* 20 Reserved */ + if (m->mpc_featureflag&(1<<21)) + Dprintk(" Debug Trace and EMON Store present.\n"); + if (m->mpc_featureflag&(1<<22)) + Dprintk(" ACPI Thermal Throttle Registers present.\n"); + if (m->mpc_featureflag&(1<<23)) + Dprintk(" MMX present.\n"); + if (m->mpc_featureflag&(1<<24)) + Dprintk(" FXSR present.\n"); + if (m->mpc_featureflag&(1<<25)) + Dprintk(" XMM present.\n"); + if (m->mpc_featureflag&(1<<26)) + Dprintk(" Willamette New Instructions present.\n"); + if (m->mpc_featureflag&(1<<27)) + Dprintk(" Self Snoop present.\n"); + if (m->mpc_featureflag&(1<<28)) + Dprintk(" HT present.\n"); + if (m->mpc_featureflag&(1<<29)) + Dprintk(" Thermal Monitor present.\n"); + /* 30, 31 Reserved */ + + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_physical_apicid = m->mpc_apicid; + } + + ver = m->mpc_apicver; + + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. 
(tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + + phys_cpu = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + cpu_set(num_processors, cpu_possible_map); + num_processors++; + + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj <ashok.raj@intel.com> + */ + if (num_processors > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(ver)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} +#else +void __init MP_processor_info (struct mpc_config_processor *m) +{ + num_processors++; +} +#endif /* CONFIG_XEN */ + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + + mpc_oem_bus_info(m, str, translation_table[mpc_record]); + + if (m->mpc_busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->mpc_busid, str, MAX_MP_BUSSES - 1); + return; + } + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { + mpc_oem_pci_bus(m, translation_table[mpc_record]); + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; + } else { + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init 
MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumption is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info (struct mpc_config_translation *m) +{ + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ + unsigned short oemsize) +{ + int count = sizeof (*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable)+count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) + { + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], + oemtable->oem_signature[1], + oemtable->oem_signature[2], + oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) + { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m= + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning!
May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + char oem[10]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", + *(u32 *)mpc->mpc_signature); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk(KERN_ERR "SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(oem,mpc->mpc_oem,8); + oem[8]=0; + printk(KERN_INFO "OEM ID: %s ",oem); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + mps_oem_check(mpc, oem, str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* + * Save the local APIC address (it might be non-default) -- but only + * if we're not using ACPI. + */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + mpc_record = 0; + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + default: + { + count = mpc->mpc_length; + break; + } + } + ++mpc_record; + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... 
falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk("???\n"); + printk(KERN_ERR "Unknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. 
+ */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + unsigned long *bp = isa_bus_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + printk("Error: MPF size\n"); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at %08lx\n", + virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) { + /* + * We cannot access to MPC table to compute + * table size yet, as only few megabytes from + * the bottom is mapped now. + * PC-9800's MPC table places on the very last + * of physical memory; so that simply reserving + * PAGE_SIZE from mpg->mpf_physptr yields BUG() + * in reserve_bootmem. + */ + unsigned long size = PAGE_SIZE; + unsigned long end = max_low_pfn * PAGE_SIZE; + if (mpf->mpf_physptr + size > end) + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size); + } +#else + printk(KERN_INFO "found SMP MP-table at %08lx\n", + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); +#endif + + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_smp_config (void) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... 
+ * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +#endif +} + +int es7000_plat; + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +void __init mp_register_lapic_address ( + u64 address) +{ +#ifndef CONFIG_XEN + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +#endif +} + + +void __devinit mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + +#ifndef CONFIG_XEN + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; +#endif + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_base; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. 
*/ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + int tmpid; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + tmpid = io_apic_get_unique_id(idx, id); + else + tmpid = id; + if (tmpid == -1) { + nr_ioapics--; + return; + } + mp_ioapics[idx].mpc_apicid = tmpid; + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). 
+ */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +#define MAX_GSI_NUM 4096 + +int mp_register_gsi (u32 gsi, int triggering, int polarity) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi_to_irq[gsi]; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision.
+ */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_fadt.sci_int) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/pci-dma-xen.c 2009-11-06 10:23:23.000000000 +0100 @@ -0,0 +1,406 @@ +/* + * Dynamic DMA mapping support. + * + * On i386 there is no hardware dynamic DMA address translation, + * so consistent alloc/free are merely page allocation/freeing. + * The rest of the dynamic DMA mapping interface is implemented + * in asm/pci.h. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <linux/version.h> +#include <asm/io.h> +#include <xen/balloon.h> +#include <xen/gnttab.h> +#include <asm/swiotlb.h> +#include <asm/tlbflush.h> +#include <asm-i386/mach-xen/asm/swiotlb.h> +#include <asm-i386/mach-xen/asm/gnttab_dma.h> +#include <asm/bug.h> + +#ifdef __x86_64__ +#include <asm/proto.h> + +int iommu_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address __read_mostly; +EXPORT_SYMBOL(bad_dma_address); + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +int force_iommu __read_mostly= 0; + +__init int iommu_setup(char *p) +{ + return 1; +} + +void __init pci_iommu_alloc(void) +{ +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ + no_iommu_init(); + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); +#endif + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! 
" \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +int range_straddles_page_boundary(paddr_t p, size_t size) +{ + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; + + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); +} + +int +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int i, rc; + + if (direction == DMA_NONE) + BUG(); + WARN_ON(nents == 0 || sg[0].length == 0); + + if (swiotlb) { + rc = swiotlb_map_sg(hwdev, sg, nents, direction); + } else { + for (i = 0; i < nents; i++ ) { + BUG_ON(!sg[i].page); + sg[i].dma_address = + gnttab_dma_map_page(sg[i].page) + sg[i].offset; + sg[i].dma_length = sg[i].length; + IOMMU_BUG_ON(address_needs_mapping( + hwdev, sg[i].dma_address)); + IOMMU_BUG_ON(range_straddles_page_boundary( + page_to_pseudophys(sg[i].page) + sg[i].offset, + sg[i].length)); + } + rc = nents; + } + + flush_write_buffers(); + return rc; +} +EXPORT_SYMBOL(dma_map_sg); + +void +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int i; + + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_sg(hwdev, sg, nents, direction); + else { + for (i = 0; i < nents; i++ ) + gnttab_dma_unmap_page(sg[i].dma_address); + } +} +EXPORT_SYMBOL(dma_unmap_sg); + +#ifdef CONFIG_HIGHMEM +dma_addr_t +dma_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_addr; + + BUG_ON(direction == DMA_NONE); + + if (swiotlb) { + dma_addr = swiotlb_map_page( + dev, page, offset, size, direction); + } else { + dma_addr = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_map_page); + +void +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_page(dev, dma_address, size, direction); + else + gnttab_dma_unmap_page(dma_address); +} +EXPORT_SYMBOL(dma_unmap_page); +#endif /* CONFIG_HIGHMEM */ + +int +dma_mapping_error(dma_addr_t dma_addr) +{ + if (swiotlb) + return swiotlb_dma_mapping_error(dma_addr); + return 0; +} +EXPORT_SYMBOL(dma_mapping_error); + +int +dma_supported(struct device *dev, u64 mask) +{ + if (swiotlb) + return swiotlb_dma_supported(dev, mask); + /* + * By default we'll BUG when an infeasible DMA is requested, and + * request swiotlb=force (see IOMMU_BUG_ON). + */ + return 1; +} +EXPORT_SYMBOL(dma_supported); + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + void *ret; + struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; + unsigned int order = get_order(size); + unsigned long vstart; + u64 mask; + + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + ret = mem->virt_base + (page << PAGE_SHIFT); + memset(ret, 0, size); + return ret; + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + return NULL; + } + + vstart = __get_free_pages(gfp, order); + ret = (void *)vstart; + + if (dev != NULL && dev->coherent_dma_mask) + mask = dev->coherent_dma_mask; + else + mask = 0xffffffff; + + if (ret != NULL) { + if (xen_create_contiguous_region(vstart, order, + fls64(mask)) != 0) { + free_pages(vstart, order); + return NULL; + } + memset(ret, 0, size); + *dma_handle = virt_to_bus(ret); + } + return ret; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + } else { + xen_destroy_contiguous_region((unsigned long)vaddr, order); + free_pages((unsigned long)vaddr, order); + } +} +EXPORT_SYMBOL(dma_free_coherent); + +#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base; + int pages = size >> PAGE_SHIFT; + int bitmap_size = (pages + 31)/32; + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + memset(dev->dma_mem->bitmap, 0, bitmap_size); + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem->bitmap); + out: + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if(!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + int pos, err; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); +#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ + 
+dma_addr_t +dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction direction) +{ + dma_addr_t dma; + + if (direction == DMA_NONE) + BUG(); + WARN_ON(size == 0); + + if (swiotlb) { + dma = swiotlb_map_single(dev, ptr, size, direction); + } else { + dma = gnttab_dma_map_page(virt_to_page(ptr)) + + offset_in_page(ptr); + IOMMU_BUG_ON(range_straddles_page_boundary(__pa(ptr), size)); + IOMMU_BUG_ON(address_needs_mapping(dev, dma)); + } + + flush_write_buffers(); + return dma; +} +EXPORT_SYMBOL(dma_map_single); + +void +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + if (direction == DMA_NONE) + BUG(); + if (swiotlb) + swiotlb_unmap_single(dev, dma_addr, size, direction); + else + gnttab_dma_unmap_page(dma_addr); +} +EXPORT_SYMBOL(dma_unmap_single); + +void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + if (swiotlb) + swiotlb_sync_single_for_device(dev, dma_handle, size, direction); +} +EXPORT_SYMBOL(dma_sync_single_for_device); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/process_32-xen.c 2008-07-21 11:00:32.000000000 +0200 @@ -0,0 +1,877 @@ +/* + * linux/arch/i386/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include <stdarg.h> + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/interrupt.h> +#include <linux/utsname.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/random.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/desc.h> +#include <asm/vm86.h> +#ifdef CONFIG_MATH_EMULATION +#include <asm/math_emu.h> +#endif + +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/cpu_hotplug.h> + +#include <linux/err.h> + +#include <asm/tlbflush.h> +#include <asm/cpu.h> + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + +static int hlt_counter; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.esp)[3]; +} + +/* + * Powermanagement idle function, if any.. 
+ */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + hlt_counter++; +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} + +EXPORT_SYMBOL(enable_hlt); + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + local_irq_enable(); + + asm volatile( + "2:" + "testl %0, %1;" + "rep; nop;" + "je 2b;" + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + +#ifdef CONFIG_HOTPLUG_CPU +extern cpumask_t cpu_initialized; +static inline void play_dead(void) +{ + idle_task_exit(); + local_irq_disable(); + cpu_clear(smp_processor_id(), cpu_initialized); + preempt_enable_no_resched(); + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); + cpu_bringup(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + int cpu = smp_processor_id(); + + current_thread_info()->status |= TS_POLLING; + + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + idle = xen_idle; /* no alternatives */ + + if (cpu_is_offline(cpu)) + play_dead(); + + __get_cpu_var(irq_stat).idle_timestamp = jiffies; + idle(); + } + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) +{ +} + +static int __init idle_setup (char *str) +{ + if (!strncmp(str, "poll", 4)) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } + + boot_option_idle_override = 1; + return 1; +} + +__setup("idle=", idle_setup); + +void show_regs(struct pt_regs * regs) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + + printk("\n"); + printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + print_symbol("EIP is at %s\n", regs->eip); + + if (user_mode_vm(regs)) + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); + printk(" EFLAGS: %08lx %s (%s %.*s)\n", + regs->eflags, print_tainted(), system_utsname.release,
(int)strcspn(system_utsname.version, " "), + system_utsname.version); + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->eax,regs->ebx,regs->ecx,regs->edx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx", + regs->esi, regs->edi, regs->ebp); + printk(" DS: %04x ES: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes); + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4_safe(); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + show_trace(NULL, regs, &regs->esp); +} + +/* + * This gets run with %ebx containing the + * function to call, and %edx containing + * the "args". + */ +extern void kernel_thread_helper(void); +__asm__(".section .text\n" + ".align 4\n" + "kernel_thread_helper:\n\t" + "movl %edx,%eax\n\t" + "pushl %edx\n\t" + "call *%ebx\n\t" + "pushl %eax\n\t" + "call do_exit\n" + ".previous"); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(&regs, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __USER_DS; + regs.xes = __USER_DS; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = GET_KERNEL_CS(); + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + /* The process may have allocated an io port bitmap... nuke it. */ + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { + struct task_struct *tsk = current; + struct thread_struct *t = &tsk->thread; + struct physdev_set_iobitmap set_iobitmap; + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + clear_tsk_thread_flag(tsk, TIF_DEBUG); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + struct pt_regs * childregs; + struct task_struct *tsk; + int err; + + childregs = task_pt_regs(p); + *childregs = *regs; + childregs->eax = 0; + childregs->esp = esp; + + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); + + p->thread.eip = (unsigned long) ret_from_fork; + + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); + + tsk = current; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread?
+ */ + if (clone_flags & CLONE_SETTLS) { + struct desc_struct *desc; + struct user_desc info; + int idx; + + err = -EFAULT; + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) + goto out; + err = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + goto out; + + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + p->thread.iopl = current->thread.iopl; + + err = 0; + out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * fill in the user structure for a core dump.. + */ +void dump_thread(struct pt_regs * regs, struct user * dump) +{ + int i; + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + for (i = 0; i < 8; i++) + dump->u_debugreg[i] = current->thread.debugreg[i]; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; + + dump->regs.ebx = regs->ebx; + dump->regs.ecx = regs->ecx; + dump->regs.edx = regs->edx; + dump->regs.esi = regs->esi; + dump->regs.edi = regs->edi; + dump->regs.ebp = regs->ebp; + dump->regs.eax = regs->eax; + dump->regs.ds = regs->xds; + dump->regs.es = regs->xes; + savesegment(fs,dump->regs.fs); + savesegment(gs,dump->regs.gs); + dump->regs.orig_eax = regs->orig_eax; + dump->regs.eip = regs->eip; + dump->regs.cs = regs->xcs; + dump->regs.eflags = regs->eflags; + dump->regs.esp = regs->esp; + dump->regs.ss = regs->xss; + + dump->u_fpvalid = dump_fpu (regs, &dump->i387); +} +EXPORT_SYMBOL(dump_thread); + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs ptregs = *task_pt_regs(tsk); + ptregs.xcs &= 0xffff; + ptregs.xds &= 0xffff; + ptregs.xes &= 0xffff; + ptregs.xss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +static noinline void __switch_to_xtra(struct task_struct *next_p) +{ + struct thread_struct *next; + + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg[6], 6); + set_debugreg(next->debugreg[7], 7); + } +} + +/* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). 
+ */ + prev = task_thread_info(prev_p); + next = task_thread_info(next_p); + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* + * switch_to(x,yn) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %eax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ + + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + */ + if (prev_p->thread_info->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } +#if 0 /* lazy fpu sanity check */ + else BUG_ON(!(read_cr0() & 8)); +#endif + + /* + * Reload esp0. + * This is load_esp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->esp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + *(u64 *)&mcl->args[0] = virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 
1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Restore %fs and %gs if needed. + * + * Glibc normally makes %fs be zero, and %gs is one of + * the TLS segments. + */ + if (unlikely(next->fs)) + loadsegment(fs, next->fs); + + if (next->gs) + loadsegment(gs, next->gs); + + /* + * Now maybe handle debug registers + */ + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + __switch_to_xtra(next_p); + + disable_tsc(prev_p, next_p); + + return prev_p; +} + +asmlinkage int sys_fork(struct pt_regs regs) +{ + return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +asmlinkage int sys_clone(struct pt_regs regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs.ebx; + newsp = regs.ecx; + parent_tidptr = (int __user *)regs.edx; + child_tidptr = (int __user *)regs.edi; + if (!newsp) + newsp = regs.esp; + return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage int sys_vfork(struct pt_regs regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage int sys_execve(struct pt_regs regs) +{ + int error; + char * filename; + + filename = getname((char __user *) regs.ebx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, + (char __user * __user *) regs.ecx, + (char __user * __user *) regs.edx, + ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + /* Make sure we don't return using sysenter.. 
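+		   TIF_IRET forces the exit-to-user path to use iret, which
+		   restores the complete register frame the new program
+		   expects, rather than the faster sysexit path.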
*/ + set_thread_flag(TIF_IRET); + } + putname(filename); +out: + return error; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ebp, esp, eip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + esp = p->thread.esp; + if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + ebp = *(unsigned long *) esp; + do { + if (ebp < stack_page || ebp > top_ebp+stack_page) + return 0; + eip = *(unsigned long *) (ebp+4); + if (!in_sched_functions(eip)) + return eip; + ebp = *(unsigned long *) ebp; + } while (count++ < 16); + return 0; +} + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(t->tls_array + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + */ +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + struct thread_struct *t = ¤t->thread; + struct user_desc info; + struct desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; + + /* + * We must not get preempted while modifying the TLS. 
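+	 * get_cpu() below disables preemption, so the per-CPU GDT updated
+	 * by load_TLS() is guaranteed to belong to the CPU we are running on.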
+ */ + cpu = get_cpu(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + load_TLS(t, cpu); + + put_cpu(); + + return 0; +} + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + struct user_desc info; + struct desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/quirks-xen.c 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,47 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include <linux/pci.h> +#include <linux/irq.h> + +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u32 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. 
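+	 * In the Xen build the quirk is reported to the hypervisor with a
+	 * XENPF_platform_quirk platform op (QUIRK_NOIRQBALANCING) rather
+	 * than being applied by the kernel itself.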
+ */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { + struct xen_platform_op op; + printk(KERN_INFO "Disabling irq balancing and affinity\n"); + op.cmd = XENPF_platform_quirk; + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; + WARN_ON(HYPERVISOR_platform_op(&op)); + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup_32-xen.c 2008-04-22 15:41:51.000000000 +0200 @@ -0,0 +1,1919 @@ +/* + * linux/arch/i386/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons <orc@pell.chi.il.us>, July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle <bmoyle@mvista.com>, February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel <mochel@osdl.org>, March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/screen_info.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/apm_bios.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/seq_file.h> +#include <linux/platform_device.h> +#include <linux/console.h> +#include <linux/mca.h> +#include <linux/root_dev.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/nodemask.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> +#include <linux/dmi.h> +#include <linux/pfn.h> + +#include <video/edid.h> + +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/arch_hooks.h> +#include <asm/sections.h> +#include <asm/io_apic.h> +#include <asm/ist.h> +#include <asm/io.h> +#include <asm/hypervisor.h> +#include <xen/interface/physdev.h> +#include <xen/interface/memory.h> +#include <xen/features.h> +#include <xen/firmware.h> +#include <xen/xencons.h> +#include <setup_arch.h> +#include <bios_ebda.h> + +#ifdef CONFIG_XEN +#include <xen/interface/kexec.h> +#endif + +/* Forward Declaration. 
*/ +void __init find_max_pfn(void); + +static int xen_panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block xen_panic_block = { + xen_panic_event, NULL, 0 /* try to go last */ +}; + +extern char hypercall_page[PAGE_SIZE]; +EXPORT_SYMBOL(hypercall_page); + +int disable_pse __devinitdata = 0; + +/* + * Machine setup.. + */ + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +/* cpu data as detected by the assembly code in head.S */ +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +/* common cpu data for all cpus */ +struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +EXPORT_SYMBOL(boot_cpu_data); + +unsigned long mmu_cr4_features; + +#ifdef CONFIG_ACPI + int acpi_disabled = 0; +#else + int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_ACPI +int __initdata acpi_force = 0; +extern acpi_interrupt_flags acpi_sci_flags; +#endif + +/* for MCA, but anyone else can use it if they want */ +unsigned int machine_id; +#ifdef CONFIG_MCA +EXPORT_SYMBOL(machine_id); +#endif +unsigned int machine_submodel_id; +unsigned int BIOS_revision; +unsigned int mca_pentium_flag; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ +int bootloader_type; + +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * Setup options + */ +struct drive_info_struct { char dummy[32]; } drive_info; +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \ + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +EXPORT_SYMBOL(drive_info); +#endif +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); +struct sys_desc_table_struct { + unsigned short length; + unsigned char table[0]; +}; +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); +#ifndef CONFIG_XEN +#define copy_edid() (edid_info = EDID_INFO) +#endif +struct ist_info ist_info; +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +EXPORT_SYMBOL(ist_info); +#endif +struct e820map e820; +#ifdef CONFIG_XEN +struct e820map machine_e820; +#endif + +extern void early_cpu_init(void); +extern void generic_apic_probe(char *); +extern int root_mountflags; + +unsigned long saved_videomode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char command_line[COMMAND_LINE_SIZE]; + +unsigned char __initdata boot_params[PARAM_SIZE]; + +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | 
IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define STANDARD_IO_RESOURCES \ + (sizeof standard_io_resources / sizeof standard_io_resources[0]) + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + +#ifdef CONFIG_XEN + /* Nothing to do if not running in dom0. */ + if (!is_initial_xendomain()) + return; +#endif + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +/* + * Point at the empty zero page to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +unsigned long *phys_to_machine_mapping; +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16]; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/* Raw start-of-day parameters from the hypervisor. */ +start_info_t *xen_start_info; +EXPORT_SYMBOL(xen_start_info); + +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +static void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + if (efi_enabled) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + md->num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. + */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + return; + } +#ifdef CONFIG_XEN + if (i==e820.nr_map && current_addr < size) { + /* + * The e820 map finished before our requested size so + * extend the final entry to the requested address. 
+ */ + --i; + if (e820.map[i].type == E820_RAM) + e820.map[i].size -= current_addr - size; + else + add_memory_region(current_addr, size - current_addr, E820_RAM); + } +#endif +} + +#define E820_DEBUG 1 + +static void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; + +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + 
still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)? 
Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + +#ifndef CONFIG_XEN + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } +#endif + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + struct xen_memory_map memmap; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } else + machine_e820 = e820; +#endif + + return 0; +} + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +#ifndef CONFIG_XEN +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); + edd.mbr_signature_nr = EDD_MBR_SIG_NR; + edd.edd_info_nr = EDD_NR; +} +#endif +#else +static inline void copy_edd(void) +{ +} +#endif + +static void __init parse_cmdline_early (char ** cmdline_p) +{ + char c = ' ', *to = command_line, *from = saved_command_line; + int len = 0, max_cmdline; + int userdef = 0; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline); + /* Save unparsed command line copy for /proc/cmdline */ + saved_command_line[max_cmdline-1] = '\0'; + + for (;;) { + if (c != ' ') + goto next_char; + /* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ + if (!memcmp(from, "mem=", 4)) { + if (to != command_line) + to--; + if (!memcmp(from+4, "nopentium", 9)) { + from += 9+4; + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(from+4, &from); + limit_regions(mem_size); + userdef=1; + } + } + + else if (!memcmp(from, "memmap=", 7)) { + if (to != command_line) + to--; + if (!memcmp(from+7, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. 
+ */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + from += 8+7; + e820.nr_map = 0; + userdef = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(from+7, &from); + if (*from == '@') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*from == '#') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*from == '$') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + userdef=1; + } + } + } + + else if (!memcmp(from, "noexec=", 7)) + noexec_setup(from + 7); + + +#ifdef CONFIG_X86_MPPARSE + /* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ + else if (!memcmp(from, "maxcpus=", 8)) { + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(from + 8, NULL, 0); + } +#endif + +#ifdef CONFIG_ACPI + /* "acpi=off" disables both ACPI table parsing and interpreter */ + else if (!memcmp(from, "acpi=off", 8)) { + disable_acpi(); + } + + /* acpi=force to over-ride black-list */ + else if (!memcmp(from, "acpi=force", 10)) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + + /* acpi=strict disables out-of-spec workarounds */ + else if (!memcmp(from, "acpi=strict", 11)) { + acpi_strict = 1; + } + + /* Limit ACPI just to boot-time to enable HT */ + else if (!memcmp(from, "acpi=ht", 7)) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ + else if (!memcmp(from, "pci=noacpi", 10)) { + acpi_disable_pci(); + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (!memcmp(from, "acpi=noirq", 10)) { + acpi_noirq_set(); + } + + else if (!memcmp(from, "acpi_sci=edge", 13)) + acpi_sci_flags.trigger = 1; + + else if (!memcmp(from, "acpi_sci=level", 14)) + acpi_sci_flags.trigger = 3; + + else if (!memcmp(from, "acpi_sci=high", 13)) + acpi_sci_flags.polarity = 1; + + else if (!memcmp(from, "acpi_sci=low", 12)) + acpi_sci_flags.polarity = 3; + +#ifdef CONFIG_X86_IO_APIC + else if (!memcmp(from, "acpi_skip_timer_override", 24)) + acpi_skip_timer_override = 1; + + if (!memcmp(from, "disable_timer_pin_1", 19)) + disable_timer_pin_1 = 1; + if (!memcmp(from, "enable_timer_pin_1", 18)) + disable_timer_pin_1 = -1; + + /* disable IO-APIC */ + else if (!memcmp(from, "noapic", 6)) + disable_ioapic_setup(); +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ + +#ifdef CONFIG_X86_LOCAL_APIC + /* enable local APIC */ + else if (!memcmp(from, "lapic", 5)) + lapic_enable(); + + /* disable local APIC */ + else if (!memcmp(from, "nolapic", 6)) + lapic_disable(); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? 
+ */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif + } +#endif +#ifdef CONFIG_PROC_VMCORE + /* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ + else if (!memcmp(from, "elfcorehdr=", 11)) + elfcorehdr_addr = memparse(from+11, &from); +#endif + + /* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ + else if (!memcmp(from, "highmem=", 8)) + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; + + /* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ + else if (!memcmp(from, "vmalloc=", 8)) + __VMALLOC_RESERVE = memparse(from+8, &from); + + next_char: + c = *(from++); + if (!c) + break; + if (COMMAND_LINE_SIZE <= ++len) + break; + *(to++) = c; + } + *to = '\0'; + *cmdline_p = command_line; + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } +} + +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, start, end); + return 0; +} + +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + + /* + * This function checks if the entire range <start,end> is mapped with type. 
+ * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} + +/* + * Determine low and high memory ranges: + */ +unsigned long __init find_max_low_pfn(void) +{ + unsigned long max_low_pfn; + + max_low_pfn = max_pfn; + if (max_low_pfn > MAXMEM_PFN) { + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", + MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_X86_PAE + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING "Warning only 4GB will be used.\n"); + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + } +#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM */ + } else { + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif + } + return max_low_pfn; +} + +/* + * Free all available memory for boot time allocation. 
Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +static void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + +#ifdef CONFIG_XEN + /* + * Truncate to the number of actual pages currently + * present. + */ + if (last_pfn > xen_start_info->nr_pages) + last_pfn = xen_start_info->nr_pages; +#endif + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? + */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +#ifndef CONFIG_XEN +/* + * workaround for Dell systems that neglect to reserve EBDA + */ +static void __init reserve_ebda_region(void) +{ + unsigned int addr; + addr = get_bios_ebda(); + if (addr) + reserve_bootmem(addr, PAGE_SIZE); +} +#endif + +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init setup_bootmem_allocator(void); +static unsigned long __init setup_memory(void) +{ + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(__pa(xen_start_info->pt_base)) + + xen_start_info->nr_pt_frames; + + find_max_pfn(); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) { + highstart_pfn = max_low_pfn; + } + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned int max_dma, low; + + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + low = max_low_pfn; + + if (low < max_dma) + zones_size[ZONE_DMA] = low; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = low - max_dma; +#ifdef CONFIG_HIGHMEM + zones_size[ZONE_HIGHMEM] = highend_pfn - low; +#endif + } + free_area_init(zones_size); +} +#else +extern unsigned long __init setup_memory(void); +extern void zone_sizes_init(void); +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +void __init setup_bootmem_allocator(void) +{ + unsigned long bootmap_size; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); + + register_bootmem_low_pages(max_low_pfn); + + /* + * Reserve the bootmem bitmap itself as well. 
We do this in two + * steps (first step was init_bootmem()) because this catches + * the (very unlikely) case of us accidentally initializing the + * bootmem allocator with an invalid RAM area. + */ + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); + +#ifndef CONFIG_XEN + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, + * enabling clean reboots, SMP operation, laptop functions. + */ + reserve_bootmem(0, PAGE_SIZE); + + /* reserve EBDA region, it's a 4K region */ + reserve_ebda_region(); + + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent + PCI prefetch into it (errata #56). Usually the page is reserved anyways, + unless you have no PS/2 mouse plugged in. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 6) + reserve_bootmem(0xa0000 - 4096, 4096); + +#ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_bootmem(PAGE_SIZE, PAGE_SIZE); +#endif +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + */ + acpi_reserve_bootmem(); +#endif +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_BLK_DEV_INITRD + if (xen_start_info->mod_start) { + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/ + initrd_start = INITRD_START + PAGE_OFFSET; + initrd_end = initrd_start+INITRD_SIZE; + initrd_below_start_ok = 1; + } + else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + INITRD_START + INITRD_SIZE, + max_low_pfn << PAGE_SHIFT); + initrd_start = 0; + } + } +#endif +#ifdef CONFIG_KEXEC +#ifdef CONFIG_XEN + xen_machine_kexec_setup_resources(); +#else + if (crashk_res.start != crashk_res.end) + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); +#endif +#endif +} + +/* + * The node 0 pgdat is initialized before all of these because + * it's needed for bootmem. node>0 pgdats have their virtual + * space allocated before the pagetables are in place to access + * them, so they can't be cleared then. + * + * This should all compile down to nothing when NUMA is off. + */ +void __init remapped_pgdat_init(void) +{ + int nid; + + for_each_online_node(nid) { + if (nid != 0) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + } +} + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. 
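+ * The caller passes either the BIOS-provided e820 map or, for dom0
+ * under Xen, the machine (host) e820 map obtained from the hypervisor.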
+ */ +static void __init +legacy_init_iomem_resources(struct e820entry *e820, int nr_map, + struct resource *code_resource, + struct resource *data_resource) +{ + int i; + + probe_roms(); + + for (i = 0; i < nr_map; i++) { + struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT + if (e820[i].addr + e820[i].size > 0x100000000ULL) + continue; +#endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (e820[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820[i].addr; + res->end = res->start + e820[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } + if (e820[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ +#ifndef CONFIG_XEN + request_resource(res, code_resource); + request_resource(res, data_resource); +#endif +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); +#ifdef CONFIG_XEN + xen_machine_kexec_register_resources(res); +#endif +#endif + } + } +} + +/* + * Locate a unused range of the physical address space below 4G which + * can be used for PCI mappings. + */ +static void __init +e820_setup_gap(struct e820entry *e820, int nr_map) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = nr_map; + while (--i >= 0) { + unsigned long long start = e820[i].addr; + unsigned long long end = start + e820[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + /* Nothing to do if not running in dom0. 
*/ + if (!is_initial_xendomain()) + return 0; + + printk("Setting up standard PCI resources\n"); +#ifdef CONFIG_XEN + legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map, + &code_resource, &data_resource); +#else + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(e820.map, e820.nr_map, + &code_resource, &data_resource); +#endif + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); + +static void __init register_memory(void) +{ +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + e820_setup_gap(machine_e820.map, machine_e820.nr_map); + else +#endif + e820_setup_gap(e820.map, e820.nr_map); +} + +#ifdef CONFIG_MCA +static void set_mca_bus(int x) +{ + MCA_bus = x; +} +#else +static void set_mca_bus(int x) { } +#endif + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +void __init setup_arch(char **cmdline_p) +{ + int i, j, k, fpp; + struct physdev_set_iopl set_iopl; + unsigned long max_low_pfn; + unsigned long p2m_pages; + + /* Force a quick death if the kernel panics (not domain 0). */ + extern int panic_timeout; + if (!panic_timeout && !is_initial_xendomain()) + panic_timeout = 1; + + /* Register a call for panic conditions. */ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments)); + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + pre_setup_arch_hook(); + early_cpu_init(); +#ifdef CONFIG_SMP + prefill_possible_map(); +#endif + + /* + * FIXME: This isn't an official loader_type right + * now but does currently work with elilo. + * If we were configured as an EFI kernel, check to make + * sure that we were loaded correctly from elilo and that + * the system table is valid. If not, then initialize normally. + */ +#ifdef CONFIG_EFI + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) + efi_enabled = 1; +#endif + + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd. 
+ */ + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0); + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + copy_edid(); + apm_info.bios = APM_BIOS_INFO; + ist_info = IST_INFO; + saved_videomode = VIDEO_MODE; + if( SYS_DESC_TABLE.length != 0 ) { + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); + machine_id = SYS_DESC_TABLE.table[0]; + machine_submodel_id = SYS_DESC_TABLE.table[1]; + BIOS_revision = SYS_DESC_TABLE.table[2]; + } + bootloader_type = LOADER_TYPE; + + if (is_initial_xendomain()) { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + + dom0_init_screen_info(info, + xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + } else + screen_info.orig_video_isVGA = 0; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + + ARCH_SETUP + if (efi_enabled) + efi_init(); + else { + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(machine_specific_memory_setup()); + } + + copy_edd(); + + if (!MOUNT_ROOT_RDONLY) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = (PFN_UP(__pa(xen_start_info->pt_base)) + + xen_start_info->nr_pt_frames) << PAGE_SHIFT; + + code_resource.start = virt_to_phys(_text); + code_resource.end = virt_to_phys(_etext)-1; + data_resource.start = virt_to_phys(_etext); + data_resource.end = virt_to_phys(_edata)-1; + + parse_cmdline_early(cmdline_p); + +#ifdef CONFIG_EARLY_PRINTK + { + char *s = strstr(*cmdline_p, "earlyprintk="); + if (s) { + setup_early_printk(strchr(s, '=') + 1); + printk("early console enabled\n"); + } + } +#endif + + max_low_pfn = setup_memory(); + + /* + * NOTE: before this point _nobody_ is allowed to allocate + * any memory using the bootmem allocator. Although the + * alloctor is now initialised only the first 8Mb of the kernel + * virtual address space has been mapped. All allocations before + * paging_init() has completed must use the alloc_bootmem_low_pages() + * variant (which allocates DMA'able memory) and care must be taken + * not to exceed the 8Mb limit. + */ + +#ifdef CONFIG_SMP + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ +#endif + paging_init(); + remapped_pgdat_init(); + sparse_init(); + zone_sizes_init(); + +#ifdef CONFIG_X86_FIND_SMP_CONFIG + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + + p2m_pages = max_pfn; + if (xen_start_info->nr_pages > max_pfn) { + /* + * the max_pfn was shrunk (probably by mem= or highmem= + * kernel parameter); shrink reservation with the HV + */ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned int difference; + int ret; + + difference = xen_start_info->nr_pages - max_pfn; + + set_xen_guest_handle(reservation.extent_start, + ((unsigned long *)xen_start_info->mfn_list) + max_pfn); + reservation.nr_extents = difference; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + BUG_ON (ret != difference); + } + else if (max_pfn > xen_start_info->nr_pages) + p2m_pages = xen_start_info->nr_pages; + + /* Make sure we have a correctly sized P->M table. 
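+	   The table is allocated for max_pfn entries, filled with ~0
+	   (invalid) and then seeded with the p2m_pages entries handed to
+	   us by Xen in the start_info mfn_list.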
*/ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + phys_to_machine_mapping = alloc_bootmem_low_pages( + max_pfn * sizeof(unsigned long)); + memset(phys_to_machine_mapping, ~0, + max_pfn * sizeof(unsigned long)); + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info->mfn_list, + p2m_pages * sizeof(unsigned long)); + free_bootmem( + __pa(xen_start_info->mfn_list), + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * + sizeof(unsigned long)))); + + /* + * Initialise the list of the frames that specify the list of + * frames that make up the p2m table. Used by save/restore + */ + pfn_to_mfn_frame_list_list = alloc_bootmem_low_pages(PAGE_SIZE); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++) { + if ((j % fpp) == 0) { + k++; + BUG_ON(k>=16); + pfn_to_mfn_frame_list[k] = + alloc_bootmem_low_pages(PAGE_SIZE); + pfn_to_mfn_frame_list_list[k] = + virt_to_mfn(pfn_to_mfn_frame_list[k]); + j=0; + } + pfn_to_mfn_frame_list[k][j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = max_pfn; + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(pfn_to_mfn_frame_list_list); + } + + /* Mark all ISA DMA channels in-use - using them wouldn't work. */ + for (i = 0; i < MAX_DMA_CHANNELS; ++i) + if (i != 4 && request_dma(i, "xen") != 0) + BUG(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + + if (is_initial_xendomain()) + dmi_scan_machine(); + +#ifdef CONFIG_X86_GENERICARCH + generic_apic_probe(*cmdline_p); +#endif + if (efi_enabled) + efi_map_memmap(); + + set_iopl.iopl = 1; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); + +#ifdef CONFIG_ACPI + if (!is_initial_xendomain()) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + acpi_disabled = 1; + acpi_ht = 0; + } + + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); +#endif + +#ifdef CONFIG_X86_IO_APIC + check_acpi_pci(); /* Checks more than just ACPI actually */ +#endif + +#ifdef CONFIG_ACPI + acpi_boot_init(); + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) + if (def_to_bigsmp) + printk(KERN_WARNING "More than 8 CPUs detected and " + "CONFIG_X86_PC cannot handle it.\nUse " + "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); +#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + if (smp_found_config) + get_smp_config(); +#endif + + register_memory(); + + if (is_initial_xendomain()) { +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + if (!efi_enabled || + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + } else { +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif + } + tsc_init(); +} + +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + HYPERVISOR_shutdown(SHUTDOWN_crash); + /* we're never actually going to get here... 
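A quick capacity check of the frame-list structure built above, assuming the common i386 case of 4 KB pages and 4-byte entries: fpp = PAGE_SIZE / sizeof(unsigned long) = 4096 / 4 = 1024, so each pfn_to_mfn_frame_list[k] page references 1024 pages of the p2m array, and each of those p2m pages covers 1024 PFNs. With k limited to 16 by the BUG_ON, the structure can describe 16 * 1024 * 1024 PFNs, i.e. up to 64 GB of guest pseudo-physical memory, which is ample for a 32-bit domain.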
*/ + return NOTIFY_DONE; +} + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + if (!is_initial_xendomain()) + return 0; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); + +/* + * Local Variables: + * mode:c + * c-file-style:"k&r" + * c-basic-offset:8 + * End: + */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/smp_32-xen.c 2007-12-10 08:47:31.000000000 +0100 @@ -0,0 +1,605 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#if 0 +#include <mach_apic.h> +#endif +#include <xen/evtchn.h> + +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically thats so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. 
Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; + +/* + * the following functions deal with sending IPIs between CPUs. + * + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. + */ + +static inline int __prepare_ICR (unsigned int shortcut, int vector) +{ + unsigned int icr = shortcut | APIC_DEST_LOGICAL; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +static inline int __prepare_ICR2 (unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); + +static inline void __send_IPI_one(unsigned int cpu, int vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + +void __send_IPI_shortcut(unsigned int shortcut, int vector) +{ + int cpu; + + switch (shortcut) { + case APIC_DEST_SELF: + __send_IPI_one(smp_processor_id(), vector); + break; + case APIC_DEST_ALLBUT: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + default: + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, + vector); + break; + } +} + +void fastcall send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector); +} + +/* + * This is only used on smaller machines. + */ +void send_IPI_mask_bitmask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned int cpu; + + local_irq_save(flags); + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, mask)) { + __send_IPI_one(cpu, vector); + } + } + + local_irq_restore(flags); +} + +void send_IPI_mask_sequence(cpumask_t mask, int vector) +{ + + send_IPI_mask_bitmask(mask, vector); +} + +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ + +#if 0 /* XEN */ +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + */ + +static cpumask_t flush_cpumask; +static struct mm_struct * flush_mm; +static unsigned long flush_va; +static DEFINE_SPINLOCK(tlbstate_lock); +#define FLUSH_ALL 0xffffffff + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + * + * We need to reload %cr3 since the page tables may be going + * away from under us.. 
+ */ +static inline void leave_mm (unsigned long cpu) +{ + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superflous + * tlb flush. + * 1a2) set cpu_tlbstate to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu_tlbstate[].active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu_tlbstate[].active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu_tlbstate is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + */ + +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + unsigned long cpu; + + cpu = get_cpu(); + + if (!cpu_isset(cpu, flush_cpumask)) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_va == FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(flush_va); + } else + leave_mm(cpu); + } + smp_mb__before_clear_bit(); + cpu_clear(cpu, flush_cpumask); + smp_mb__after_clear_bit(); +out: + put_cpu_no_resched(); + + return IRQ_HANDLED; +} + +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + /* + * A couple of (to be removed) sanity checks: + * + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + + /* + * i'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. + * Temporarily this turns IRQs off, so that lockups are + * detected by the NMI watchdog. 
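The ordering contract documented above is easier to follow as code. A simplified paraphrase of the context-switch side (this is not the tree's mmu_context.h, only an illustration of steps 1a/1b under the stated assumptions):

    static inline void example_switch_mm(struct mm_struct *prev,
                                         struct mm_struct *next, int cpu)
    {
            if (likely(prev != next)) {
                    cpu_clear(cpu, prev->cpu_vm_mask);              /* 1a1: stop old-mm flush IPIs */
                    per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; /* 1a2 */
                    per_cpu(cpu_tlbstate, cpu).active_mm = next;    /* 1a3 */
                    cpu_set(cpu, next->cpu_vm_mask);                /* 1a4: accept new-mm flush IPIs */
                    load_cr3(next->pgd);                            /* switch page tables last */
            } else {
                    per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; /* 1b1 */
                    if (!cpu_test_and_set(cpu, next->cpu_vm_mask))  /* 1b2 */
                            load_cr3(next->pgd);                    /* 1b3: leave_mm() ran, reload/flush */
            }
    }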
+ */ + spin_lock(&tlbstate_lock); + + flush_mm = mm; + flush_va = va; +#if NR_CPUS <= BITS_PER_LONG + atomic_set_mask(cpumask, &flush_cpumask); +#else + { + int k; + unsigned long *flush_mask = (unsigned long *)&flush_cpumask; + unsigned long *cpu_mask = (unsigned long *)&cpumask; + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) + atomic_set_mask(cpu_mask[k], &flush_mask[k]); + } +#endif + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + + while (!cpus_empty(flush_cpumask)) + /* nothing. lockup detection does not belong here */ + mb(); + + flush_mm = NULL; + flush_va = 0; + spin_unlock(&tlbstate_lock); +} + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + local_flush_tlb(); + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm (struct mm_struct * mm) +{ + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if(current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, va); + + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_page); + +static void do_flush_tlb_all(void* info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); +} + +#endif /* XEN */ + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void smp_send_reschedule(int cpu) +{ + WARN_ON(cpu_is_offline(cpu)); + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +} + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + +static struct call_data_struct *call_data; + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
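A usage sketch for the interface documented above, with hypothetical names (example_drain is not part of this patch). The callback runs in interrupt context on each remote CPU and must not sleep; note that smp_call_function() only targets the other CPUs, so the caller runs the function locally if the current CPU should participate:

    static void example_drain(void *info)
    {
            /* runs in interrupt context on each remote CPU; keep it brief */
    }

    static void example_drain_all(void)
    {
            /* process context, interrupts enabled, no locks the callback needs */
            smp_call_function(example_drain, NULL, 0, 1);   /* wait for completion */
            example_drain(NULL);                            /* and run it locally too */
    }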
+ */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + struct call_data_struct data; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + cpus = num_online_cpus() - 1; + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + spin_unlock(&call_lock); + + return 0; +} +EXPORT_SYMBOL(smp_call_function); + +static void stop_this_cpu (void * dummy) +{ + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + local_irq_disable(); + disable_all_local_evtchn(); + if (cpu_data[smp_processor_id()].hlt_works_ok) + for(;;) halt(); + for (;;); +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +void smp_send_stop(void) +{ + smp_call_function(stop_this_cpu, NULL, 1, 0); + + local_irq_disable(); + disable_all_local_evtchn(); + local_irq_enable(); +} + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + + return IRQ_HANDLED; +} + +#include <linux/kallsyms.h> +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/time-xen.c 2010-03-01 14:03:37.000000000 +0100 @@ -0,0 +1,1224 @@ +/* + * linux/arch/i386/kernel/time.c + * + * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * + * This file contains the PC-specific time handling details: + * reading the RTC at bootup, etc.. + * 1994-07-02 Alan Modra + * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime + * 1995-03-26 Markus Kuhn + * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 + * precision CMOS clock update + * 1996-05-03 Ingo Molnar + * fixed time warps in do_[slow|fast]_gettimeoffset() + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-09-05 (Various) + * More robust do_fast_gettimeoffset() algorithm implemented + * (works with APM, Cyrix 6x86MX and Centaur C6), + * monotonic gettimeofday() with fast_get_timeoffset(), + * drift-proof precision TSC calibration on boot + * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D. + * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>; + * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>). 
+ * 1998-12-16 Andrea Arcangeli + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy + * because was not accounting lost_ticks. + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/bcd.h> +#include <linux/efi.h> +#include <linux/mca.h> +#include <linux/sysctl.h> +#include <linux/percpu.h> +#include <linux/kernel_stat.h> +#include <linux/posix-timers.h> +#include <linux/cpufreq.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/msr.h> +#include <asm/delay.h> +#include <asm/mpspec.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/timer.h> +#include <asm/sections.h> + +#include "mach_time.h" + +#include <linux/timex.h> + +#include <asm/hpet.h> + +#include <asm/arch_hooks.h> + +#include <xen/evtchn.h> +#include <xen/interface/vcpu.h> + +#if defined (__i386__) +#include <asm/i8259.h> +#endif + +int pit_latch_buggy; /* extern */ + +#if defined(__x86_64__) +unsigned long vxtime_hz = PIT_TICK_RATE; +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; +struct timespec __xtime __section_xtime; +struct timezone __sys_tz __section_sys_tz; +#endif + +unsigned int cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + +extern unsigned long wall_jiffies; + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +extern struct init_timer_opts timer_tsc_init; +extern struct timer_opts timer_tsc; +#define timer_none timer_tsc + +/* These are peridically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + u32 tsc_to_usec_mul; + int tsc_shift; + u32 version; +}; +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); +static struct timespec shadow_tv; +static u32 shadow_tv_version; + +static struct timeval monotonic_tv; +static spinlock_t monotonic_lock = SPIN_LOCK_UNLOCKED; + +/* Keep track of last time we did processing/updating of jiffies and xtime. */ +static u64 processed_system_time; /* System time (ns) at last processing. */ +static DEFINE_PER_CPU(u64, processed_system_time); + +/* How much CPU time was spent blocked and how much was 'stolen'? */ +static DEFINE_PER_CPU(u64, processed_stolen_time); +static DEFINE_PER_CPU(u64, processed_blocked_time); + +/* Current runstate of each CPU (updated automatically by the hypervisor). */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* Must be signed, as it's compared with s64 quantities which can be -ve. */ +#define NS_PER_TICK (1000000000LL/HZ) + +static void __clock_was_set(void *unused) +{ + clock_was_set(); +} +static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL); + +/* + * GCC 4.3 can turn loops over an induction variable into division. We do + * not support arbitrary 64-bit division, and so must break the induction. 
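Concretely, the transformation being defended against looks like the sketch below (illustration only). On i386 these 64-bit operations become __divdi3/__moddi3 calls into libgcc, which the kernel does not link against:

    /* Roughly what the optimizer may turn the first loop of __normalize_time()
     * into for non-negative *nsec: */
    static void example_normalize_by_division(time_t *sec, s64 *nsec)
    {
            *sec  += *nsec / NSEC_PER_SEC;  /* 64-bit divide -> __divdi3 */
            *nsec  = *nsec % NSEC_PER_SEC;  /* 64-bit modulo -> __moddi3 */
    }

The empty asm with a "+r" operand makes the induction variable opaque to the optimizer on every iteration, so the loop stays a loop and no libgcc helper is needed.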
+ */ +#define clobber_induction_variable(v) asm ( "" : "+r" (v) ) + +static inline void __normalize_time(time_t *sec, s64 *nsec) +{ + while (*nsec >= NSEC_PER_SEC) { + clobber_induction_variable(*nsec); + (*nsec) -= NSEC_PER_SEC; + (*sec)++; + } + while (*nsec < 0) { + clobber_induction_variable(*nsec); + (*nsec) += NSEC_PER_SEC; + (*sec)--; + } +} + +/* Does this guest OS track Xen time, or set its wall clock independently? */ +static int independent_wallclock = 0; +static int __init __independent_wallclock(char *str) +{ + independent_wallclock = 1; + return 1; +} +__setup("independent_wallclock", __independent_wallclock); + +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */ +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */ +static int __init __permitted_clock_jitter(char *str) +{ + permitted_clock_jitter = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("permitted_clock_jitter=", __permitted_clock_jitter); + +#if 0 +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do { + rep_nop(); + rdtscl(now); + } while ((now - bclock) < loops); +} + +struct timer_opts timer_tsc = { + .name = "tsc", + .delay = delay_tsc, +}; +#endif + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#else + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#endif + + return product; +} + +#if 0 /* defined (__i386__) */ +int read_current_timer(unsigned long *timer_val) +{ + rdtscl(*timer_val); + return 0; +} +#endif + +void init_cpu_khz(void) +{ + u64 __cpu_khz = 1000000ULL << 32; + struct vcpu_time_info *info = &vcpu_info(0)->time; + do_div(__cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static unsigned long get_usec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift); +} + +static void __update_wallclock(time_t sec, long nsec) +{ + long wtm_nsec, xtime_nsec; + time_t wtm_sec, xtime_sec; + u64 tmp, wc_nsec; + + /* Adjust wall-clock time base based on wall_jiffies ticks. */ + wc_nsec = processed_system_time; + wc_nsec += sec * (u64)NSEC_PER_SEC; + wc_nsec += nsec; + wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK; + + /* Split wallclock base into seconds and nanoseconds. 
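The inline assembly in scale_delta() below implements a 64x32 fixed-point multiply. A portable restatement for readers who prefer plain C (illustration only; it assumes a compiler that provides a 128-bit integer type, which the 32-bit build here cannot rely on):

    static inline u64 example_scale_delta(u64 delta, u32 mul_frac, int shift)
    {
            if (shift < 0)
                    delta >>= -shift;
            else
                    delta <<= shift;

            /* the full product is up to 96 bits; the result is bits 32..95 */
            return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
    }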
*/ + tmp = wc_nsec; + xtime_nsec = do_div(tmp, 1000000000); + xtime_sec = (time_t)tmp; + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec); + + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_tv_version = s->wc_version; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_nsec = s->wc_nsec; + rmb(); + } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); + + if (!independent_wallclock) + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec); +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static void get_time_values_from_xen(unsigned int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + unsigned long flags; + u32 pre_version, post_version; + + src = &vcpu_info(cpu)->time; + dst = &per_cpu(shadow_time, cpu); + + local_irq_save(flags); + + do { + pre_version = dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + post_version = src->version; + } while ((pre_version & 1) | (pre_version ^ post_version)); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; + + local_irq_restore(flags); +} + +static inline int time_values_up_to_date(unsigned int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &vcpu_info(cpu)->time; + dst = &per_cpu(shadow_time, cpu); + + rmb(); + return (dst->version == src->version); +} + +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +/* + * This version of gettimeofday has microsecond resolution + * and better than microsecond precision on fast x86 machines with TSC. 
+ */ +void do_gettimeofday(struct timeval *tv) +{ + unsigned long seq; + unsigned long usec, sec; + unsigned long flags; + s64 nsec; + unsigned int cpu; + struct shadow_time_info *shadow; + u32 local_time_version; + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + + do { + unsigned long lost; + + local_time_version = shadow->version; + seq = read_seqbegin(&xtime_lock); + + usec = get_usec_offset(shadow); + lost = jiffies - wall_jiffies; + + if (unlikely(lost)) + usec += lost * (USEC_PER_SEC / HZ); + + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / NSEC_PER_USEC); + + nsec = shadow->system_timestamp - processed_system_time; + __normalize_time(&sec, &nsec); + usec += (long)nsec / NSEC_PER_USEC; + + if (unlikely(!time_values_up_to_date(cpu))) { + /* + * We may have blocked for a long time, + * rendering our calculations invalid + * (e.g. the time delta may have + * overflowed). Detect that and recalculate + * with fresh values. + */ + get_time_values_from_xen(cpu); + continue; + } + } while (read_seqretry(&xtime_lock, seq) || + (local_time_version != shadow->version)); + + put_cpu(); + + while (usec >= USEC_PER_SEC) { + usec -= USEC_PER_SEC; + sec++; + } + + spin_lock_irqsave(&monotonic_lock, flags); + if ((sec > monotonic_tv.tv_sec) || + ((sec == monotonic_tv.tv_sec) && (usec > monotonic_tv.tv_usec))) + { + monotonic_tv.tv_sec = sec; + monotonic_tv.tv_usec = usec; + } else { + sec = monotonic_tv.tv_sec; + usec = monotonic_tv.tv_usec; + } + spin_unlock_irqrestore(&monotonic_lock, flags); + + tv->tv_sec = sec; + tv->tv_usec = usec; +} + +EXPORT_SYMBOL(do_gettimeofday); + +int do_settimeofday(struct timespec *tv) +{ + time_t sec; + s64 nsec; + unsigned int cpu; + struct shadow_time_info *shadow; + struct xen_platform_op op; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + if (!is_initial_xendomain() && !independent_wallclock) + return -EPERM; + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + + write_seqlock_irq(&xtime_lock); + + /* + * Ensure we don't get blocked for a long time so that our time delta + * overflows. If that were to happen then our shadow time values would + * be stale, so we can retry with fresh ones. + */ + for (;;) { + nsec = tv->tv_nsec - get_nsec_offset(shadow); + if (time_values_up_to_date(cpu)) + break; + get_time_values_from_xen(cpu); + } + sec = tv->tv_sec; + __normalize_time(&sec, &nsec); + + if (is_initial_xendomain() && !independent_wallclock) { + op.cmd = XENPF_settime; + op.u.settime.secs = sec; + op.u.settime.nsecs = nsec; + op.u.settime.system_time = shadow->system_timestamp; + WARN_ON(HYPERVISOR_platform_op(&op)); + update_wallclock(); + } else if (independent_wallclock) { + nsec -= shadow->system_timestamp; + __normalize_time(&sec, &nsec); + __update_wallclock(sec, nsec); + } + ntp_clear(); + + /* Reset monotonic gettimeofday() timeval. 
*/ + spin_lock(&monotonic_lock); + monotonic_tv.tv_sec = 0; + monotonic_tv.tv_usec = 0; + spin_unlock(&monotonic_lock); + + write_sequnlock_irq(&xtime_lock); + + put_cpu(); + + clock_was_set(); + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +static void sync_xen_wallclock(unsigned long dummy); +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0); +static void sync_xen_wallclock(unsigned long dummy) +{ + time_t sec; + s64 nsec; + struct xen_platform_op op; + + if (!ntp_synced() || independent_wallclock || !is_initial_xendomain()) + return; + + write_seqlock_irq(&xtime_lock); + + sec = xtime.tv_sec; + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK); + __normalize_time(&sec, &nsec); + + op.cmd = XENPF_settime; + op.u.settime.secs = sec; + op.u.settime.nsecs = nsec; + op.u.settime.system_time = processed_system_time; + WARN_ON(HYPERVISOR_platform_op(&op)); + + update_wallclock(); + + write_sequnlock_irq(&xtime_lock); + + /* Once per minute. */ + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ); +} + +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + unsigned long flags; + + if (independent_wallclock || !is_initial_xendomain()) + return 0; + + /* gets recalled with irq locally disabled */ + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); + if (efi_enabled) + retval = efi_set_rtc_mmss(nowtime); + else + retval = mach_set_rtc_mmss(nowtime); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. + */ +unsigned long long monotonic_clock(void) +{ + unsigned int cpu = get_cpu(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + u64 time; + u32 local_time_version; + + do { + local_time_version = shadow->version; + barrier(); + time = shadow->system_timestamp + get_nsec_offset(shadow); + if (!time_values_up_to_date(cpu)) + get_time_values_from_xen(cpu); + barrier(); + } while (local_time_version != shadow->version); + + put_cpu(); + + return time; +} +EXPORT_SYMBOL(monotonic_clock); + +#ifdef __x86_64__ +unsigned long long sched_clock(void) +{ + return monotonic_clock(); +} +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + +#ifdef __x86_64__ + /* Assume the lock function has either no stack frame or only a single word. + This checks if the address on the stack looks like a kernel text address. + There is a small window for false hits, but in that case the tick + is just accounted to the spinlock function. + Better would be to write these functions in assembler again + and check exactly. */ + if (!user_mode_vm(regs) && in_lock_functions(pc)) { + char *v = *(char **)regs->rsp; + if ((v >= _stext && v <= _etext) || + (v >= _sinittext && v <= _einittext) || + (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) + return (unsigned long)v; + return ((unsigned long *)regs->rsp)[1]; + } +#else + if (!user_mode_vm(regs) && in_lock_functions(pc)) + return *(unsigned long *)(regs->ebp + 4); +#endif + + return pc; +} +EXPORT_SYMBOL(profile_pc); +#endif + +/* + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. 
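A small usage sketch for monotonic_clock() above (do_something() is a hypothetical placeholder): the return value is nanoseconds since time_init() and is not stepped by wall-clock adjustments, so it suits simple duration measurements:

    static void example_time_something(void)
    {
            u64 t0 = monotonic_clock();

            do_something();         /* hypothetical work being measured */

            printk(KERN_DEBUG "took %llu ns\n",
                   (unsigned long long)(monotonic_clock() - t0));
    }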
+ */ +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + s64 delta, delta_cpu, stolen, blocked; + u64 sched_time; + unsigned int i, cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. -arca + */ + write_seqlock(&xtime_lock); + + do { + get_time_values_from_xen(cpu); + + /* Obtain a consistent snapshot of elapsed wallclock cycles. */ + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); + delta -= processed_system_time; + delta_cpu -= per_cpu(processed_system_time, cpu); + + /* + * Obtain a consistent snapshot of stolen/blocked cycles. We + * can use state_entry_time to detect if we get preempted here. + */ + do { + sched_time = runstate->state_entry_time; + barrier(); + stolen = runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline] - + per_cpu(processed_stolen_time, cpu); + blocked = runstate->time[RUNSTATE_blocked] - + per_cpu(processed_blocked_time, cpu); + barrier(); + } while (sched_time != runstate->state_entry_time); + } while (!time_values_up_to_date(cpu)); + + if ((unlikely(delta < -(s64)permitted_clock_jitter) || + unlikely(delta_cpu < -(s64)permitted_clock_jitter)) + && printk_ratelimit()) { + printk("Timer ISR/%u: Time went backwards: " + "delta=%lld delta_cpu=%lld shadow=%lld " + "off=%lld processed=%lld cpu_processed=%lld\n", + cpu, delta, delta_cpu, shadow->system_timestamp, + (s64)get_nsec_offset(shadow), + processed_system_time, + per_cpu(processed_system_time, cpu)); + for (i = 0; i < num_online_cpus(); i++) + printk(" %d: %lld\n", i, + per_cpu(processed_system_time, i)); + } + + /* System-wide jiffy work. */ + while (delta >= NS_PER_TICK) { + delta -= NS_PER_TICK; + processed_system_time += NS_PER_TICK; + do_timer(regs); + } + + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { + update_wallclock(); + if (keventd_up()) + schedule_work(&clock_was_set_work); + } + + write_sequnlock(&xtime_lock); + + /* + * Account stolen ticks. + * HACK: Passing NULL to account_steal_time() + * ensures that the ticks are accounted as stolen. + */ + if ((stolen > 0) && (delta_cpu > 0)) { + delta_cpu -= stolen; + if (unlikely(delta_cpu < 0)) + stolen += delta_cpu; /* clamp local-time progress */ + do_div(stolen, NS_PER_TICK); + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; + account_steal_time(NULL, (cputime_t)stolen); + } + + /* + * Account blocked ticks. + * HACK: Passing idle_task to account_steal_time() + * ensures that the ticks are accounted as idle/wait. + */ + if ((blocked > 0) && (delta_cpu > 0)) { + delta_cpu -= blocked; + if (unlikely(delta_cpu < 0)) + blocked += delta_cpu; /* clamp local-time progress */ + do_div(blocked, NS_PER_TICK); + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; + account_steal_time(idle_task(cpu), (cputime_t)blocked); + } + + /* Account user/system ticks. 
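A quick worked example of the stolen-time accounting above, assuming HZ=100 so NS_PER_TICK is 10,000,000 ns: if the runstate shows 25,000,000 ns newly stolen, do_div(stolen, NS_PER_TICK) reduces stolen to 2 whole ticks; account_steal_time() is charged those 2 ticks, processed_stolen_time and the per-CPU processed_system_time each advance by 2 * NS_PER_TICK, and the remaining 5,000,000 ns are simply picked up by a later timer interrupt rather than being lost.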
*/ + if (delta_cpu > 0) { + do_div(delta_cpu, NS_PER_TICK); + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; + if (user_mode_vm(regs)) + account_user_time(current, (cputime_t)delta_cpu); + else + account_system_time(current, HARDIRQ_OFFSET, + (cputime_t)delta_cpu); + } + + /* Offlined for more than a few seconds? Avoid lockup warnings. */ + if (stolen > 5*HZ) + touch_softlockup_watchdog(); + + /* Local timer processing (see update_process_times()). */ + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_mode_vm(regs)); + scheduler_tick(); + run_posix_cpu_timers(current); + profile_tick(CPU_PROFILING, regs); + + return IRQ_HANDLED; +} + +static void init_missing_ticks_accounting(unsigned int cpu) +{ + struct vcpu_register_runstate_memory_area area; + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + int rc; + + memset(runstate, 0, sizeof(*runstate)); + + area.addr.v = runstate; + rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area); + WARN_ON(rc && rc != -ENOSYS); + + per_cpu(processed_blocked_time, cpu) = + runstate->time[RUNSTATE_blocked]; + per_cpu(processed_stolen_time, cpu) = + runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline]; +} + +/* not static: needed by APM */ +unsigned long get_cmos_time(void) +{ + unsigned long retval; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + + if (efi_enabled) + retval = efi_get_time(); + else + retval = mach_get_cmos_time(); + + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} +EXPORT_SYMBOL(get_cmos_time); + +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timeval now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). 
+ */ + return; + + do_gettimeofday(&now); + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) + fail = set_rtc_mmss(now.tv_sec); + + next.tv_usec = USEC_AFTER - now.tv_usec; + if (next.tv_usec <= 0) + next.tv_usec += USEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_usec >= USEC_PER_SEC) { + next.tv_sec++; + next.tv_usec -= USEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); +} + +void notify_arch_cmos_timer(void) +{ + mod_timer(&sync_cmos_timer, jiffies + 1); + mod_timer(&sync_xen_wallclock_timer, jiffies + 1); +} + +static int timer_resume(struct sys_device *dev) +{ + extern void time_resume(void); + time_resume(); + return 0; +} + +static struct sysdev_class timer_sysclass = { + .resume = timer_resume, + set_kset_name("timer"), +}; + + +/* XXX this driverfs stuff should probably go elsewhere later -john */ +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int time_init_device(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(time_init_device); + +#ifdef CONFIG_HPET_TIMER +extern void (*late_time_init)(void); +/* Duplicate of time_init() below, with hpet_enable part added */ +static void __init hpet_time_init(void) +{ + xtime.tv_sec = get_cmos_time(); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + if ((hpet_enable() >= 0) && hpet_use_timer) { + printk("Using HPET for base-timer\n"); + } + + time_init_hook(); +} +#endif + +/* Dynamically-mapped IRQ. */ +DEFINE_PER_CPU(int, timer_irq); + +extern void (*late_time_init)(void); +static void setup_cpu0_timer_irq(void) +{ + per_cpu(timer_irq, 0) = + bind_virq_to_irqhandler( + VIRQ_TIMER, + 0, + timer_interrupt, + SA_INTERRUPT, + "timer0", + NULL); + BUG_ON(per_cpu(timer_irq, 0) < 0); +} + +static struct vcpu_set_periodic_timer xen_set_periodic_tick = { + .period_ns = NS_PER_TICK +}; + +void __init time_init(void) +{ +#ifdef CONFIG_HPET_TIMER + if (is_hpet_capable()) { + /* + * HPET initialization needs to do memory-mapped io. So, let + * us do a late initialization after mem_init(). + */ + late_time_init = hpet_time_init; + return; + } +#endif + + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0, + &xen_set_periodic_tick)) { + case 0: +#if CONFIG_XEN_COMPAT <= 0x030004 + case -ENOSYS: +#endif + break; + default: + BUG(); + } + + get_time_values_from_xen(0); + + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; + init_missing_ticks_accounting(0); + + update_wallclock(); + + init_cpu_khz(); + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + +#if defined(__x86_64__) + vxtime.mode = VXTIME_TSC; + vxtime.quot = (1000000L << 32) / vxtime_hz; + vxtime.tsc_quot = (1000L << 32) / cpu_khz; + sync_core(); + rdtscll(vxtime.last_tsc); +#endif + + /* Cannot request_irq() until kmem is initialised. */ + late_time_init = setup_cpu0_timer_irq; +} + +/* Convert jiffies to system time. 
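For illustration, the event-channel binding used for the timer above works for any virtual IRQ. VIRQ_DEBUG and the handler below are hypothetical examples, not part of this patch; the signatures match the bind_virq_to_irqhandler()/unbind_from_irqhandler() calls already used in this file:

    static irqreturn_t example_debug_interrupt(int irq, void *dev_id,
                                               struct pt_regs *regs)
    {
            printk(KERN_DEBUG "debug VIRQ on CPU%d\n", smp_processor_id());
            return IRQ_HANDLED;
    }

    static int __init example_bind_debug_virq(void)
    {
            int irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0 /* cpu */,
                                              example_debug_interrupt,
                                              SA_INTERRUPT, "debug", NULL);
            if (irq < 0)
                    return irq;
            /* later: unbind_from_irqhandler(irq, NULL); */
            return 0;
    }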
*/ +u64 jiffies_to_st(unsigned long j) +{ + unsigned long seq; + long delta; + u64 st; + + do { + seq = read_seqbegin(&xtime_lock); + delta = j - jiffies; + if (delta < 1) { + /* Triggers in some wrap-around cases, but that's okay: + * we just end up with a shorter timeout. */ + st = processed_system_time + NS_PER_TICK; + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) { + /* Very long timeout means there is no pending timer. + * We indicate this to Xen by passing zero timeout. */ + st = 0; + } else { + st = processed_system_time + delta * (u64)NS_PER_TICK; + } + } while (read_seqretry(&xtime_lock, seq)); + + return st; +} +EXPORT_SYMBOL(jiffies_to_st); + +/* + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu + * These functions are based on implementations from arch/s390/kernel/time.c + */ +static void stop_hz_timer(void) +{ + struct vcpu_set_singleshot_timer singleshot; + unsigned int cpu = smp_processor_id(); + unsigned long j; + int rc; + + cpu_set(cpu, nohz_cpu_mask); + + /* See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs */ + /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a */ + /* value of rcp->cur that matches rdp->quiescbatch and allows us to */ + /* stop the hz timer then the cpumasks created for subsequent values */ + /* of cur in rcu_start_batch are guaranteed to pick up the updated */ + /* nohz_cpu_mask and so will not depend on this cpu. */ + + smp_mb(); + + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */ + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + j = jiffies + 1; + } + + singleshot.timeout_abs_ns = jiffies_to_st(j); + if (!singleshot.timeout_abs_ns) + return; + singleshot.timeout_abs_ns += NS_PER_TICK / 2; + singleshot.flags = 0; + rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot); +#if CONFIG_XEN_COMPAT <= 0x030004 + if (rc) { + BUG_ON(rc != -ENOSYS); + rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns); + } +#endif + BUG_ON(rc); +} + +static void start_hz_timer(void) +{ + unsigned int cpu = smp_processor_id(); + int rc = HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030004 + if (rc) { + BUG_ON(rc != -ENOSYS); + rc = HYPERVISOR_set_timer_op(0); + } +#endif + BUG_ON(rc); + cpu_clear(cpu, nohz_cpu_mask); +} + +void raw_safe_halt(void) +{ + stop_hz_timer(); + /* Blocking includes an implicit local_irq_enable(). */ + HYPERVISOR_block(); + start_hz_timer(); +} +EXPORT_SYMBOL(raw_safe_halt); + +void halt(void) +{ + if (irqs_disabled()) + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); +} +EXPORT_SYMBOL(halt); + +/* No locking required. Interrupts are disabled on all CPUs. 
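A sketch of how an idle path is expected to use raw_safe_halt() above (illustration only, not this file's idle loop): it is entered with interrupts disabled, and HYPERVISOR_block() re-enables them implicitly when the VCPU is woken by the next event:

    static void example_idle_once(void)
    {
            local_irq_disable();
            if (need_resched())
                    local_irq_enable();     /* work pending: don't block */
            else
                    raw_safe_halt();        /* tickless block until the next event */
    }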
*/ +void time_resume(void) +{ + unsigned int cpu; + + init_cpu_khz(); + + for_each_online_cpu(cpu) { + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, + &xen_set_periodic_tick)) { + case 0: +#if CONFIG_XEN_COMPAT <= 0x030004 + case -ENOSYS: +#endif + break; + default: + BUG(); + } + get_time_values_from_xen(cpu); + per_cpu(processed_system_time, cpu) = + per_cpu(shadow_time, 0).system_timestamp; + init_missing_ticks_accounting(cpu); + } + + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + + update_wallclock(); +} + +#ifdef CONFIG_SMP +static char timer_name[NR_CPUS][15]; + +int __cpuinit local_setup_timer(unsigned int cpu) +{ + int seq, irq; + + BUG_ON(cpu == 0); + + switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu, + &xen_set_periodic_tick)) { + case 0: +#if CONFIG_XEN_COMPAT <= 0x030004 + case -ENOSYS: +#endif + break; + default: + BUG(); + } + + do { + seq = read_seqbegin(&xtime_lock); + /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */ + per_cpu(processed_system_time, cpu) = + per_cpu(shadow_time, 0).system_timestamp; + init_missing_ticks_accounting(cpu); + } while (read_seqretry(&xtime_lock, seq)); + + sprintf(timer_name[cpu], "timer%u", cpu); + irq = bind_virq_to_irqhandler(VIRQ_TIMER, + cpu, + timer_interrupt, + SA_INTERRUPT, + timer_name[cpu], + NULL); + if (irq < 0) + return irq; + per_cpu(timer_irq, cpu) = irq; + + return 0; +} + +void __cpuexit local_teardown_timer(unsigned int cpu) +{ + BUG_ON(cpu == 0); + unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); +} +#endif + +#ifdef CONFIG_CPU_FREQ +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + struct xen_platform_op op; + + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + if (val == CPUFREQ_PRECHANGE) + return 0; + + op.cmd = XENPF_change_freq; + op.u.change_freq.flags = 0; + op.u.change_freq.cpu = freq->cpu; + op.u.change_freq.freq = (u64)freq->new * 1000; + WARN_ON(HYPERVISOR_platform_op(&op)); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_time_setup(void) +{ + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) { + printk(KERN_ERR "failed to set up cpufreq notifier\n"); + return -ENODEV; + } + return 0; +} + +core_initcall(cpufreq_time_setup); +#endif + +/* + * /proc/sys/xen: This really belongs in another file. It can stay here for + * now however. 
+ */ +static ctl_table xen_subtable[] = { + { + .ctl_name = 1, + .procname = "independent_wallclock", + .data = &independent_wallclock, + .maxlen = sizeof(independent_wallclock), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .ctl_name = 2, + .procname = "permitted_clock_jitter", + .data = &permitted_clock_jitter, + .maxlen = sizeof(permitted_clock_jitter), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, + { 0 } +}; +static ctl_table xen_table[] = { + { + .ctl_name = 123, + .procname = "xen", + .mode = 0555, + .child = xen_subtable}, + { 0 } +}; +static int __init xen_sysctl_init(void) +{ + (void)register_sysctl_table(xen_table, 0); + return 0; +} +__initcall(xen_sysctl_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/traps_32-xen.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,1190 @@ +/* + * linux/arch/i386/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. + */ +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/highmem.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/utsname.h> +#include <linux/kprobes.h> +#include <linux/kexec.h> +#include <linux/unwind.h> + +#ifdef CONFIG_EISA +#include <linux/ioport.h> +#include <linux/eisa.h> +#endif + +#ifdef CONFIG_MCA +#include <linux/mca.h> +#endif + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/nmi.h> +#include <asm/unwind.h> +#include <asm/smp.h> +#include <asm/arch_hooks.h> +#include <asm/kdebug.h> + +#include <linux/module.h> + +#include "mach_traps.h" + +asmlinkage int system_call(void); + +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 } }; + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq = 0; + +#ifndef CONFIG_X86_NO_IDT +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. 
+ */ +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +#endif + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); +asmlinkage void alignment_check(void); +#ifndef CONFIG_XEN +asmlinkage void spurious_interrupt_bug(void); +#else +asmlinkage void fixup_4gb_segment(void); +#endif +asmlinkage void machine_check(void); + +static int kstack_depth_to_print = 24; +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif +ATOMIC_NOTIFIER_HEAD(i386die_chain); + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&i386die_chain, nb); +} +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&i386die_chain, nb); +} +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} + +/* + * Print one address/symbol entries per line. + */ +static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl) +{ + printk(" [<%08lx>] ", addr); + + print_symbol("%s\n", addr); +} + +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long ebp, + char *log_lvl) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + while (valid_stack_ptr(tinfo, (void *)ebp)) { + addr = *(unsigned long *)(ebp + 4); + print_addr_and_symbol(addr, log_lvl); + /* + * break out of recursive entries (such as + * end_of_stack_stop_unwind_function): + */ + if (ebp == *(unsigned long *)ebp) + break; + ebp = *(unsigned long *)ebp; + } +#else + while (valid_stack_ptr(tinfo, stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) + print_addr_and_symbol(addr, log_lvl); + } +#endif + return ebp; +} + +static asmlinkage int +show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +{ + int n = 0; + + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + print_addr_and_symbol(UNW_PC(info), log_lvl); + if (arch_unw_user_mode(info)) + break; + } + return n; +} + +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) +{ + unsigned long ebp; + + if (!task) + task = current; + + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + + if (regs) { + if (unwind_init_frame_info(&info, task, regs) == 0) + unw_ret = show_trace_unwind(&info, log_lvl); + } else if (task == current) + unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); + else { + if (unwind_init_blocked(&info, task) == 0) + unw_ret = show_trace_unwind(&info, log_lvl); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + print_symbol("DWARF2 unwinder stuck at %s\n", + UNW_PC(&info)); + if (UNW_SP(&info) >= PAGE_OFFSET) { + 
printk("Leftover inexact backtrace:\n"); + stack = (void *)UNW_SP(&info); + } else + printk("Full inexact backtrace again:\n"); + } else if (call_trace >= 1) + return; + else + printk("Full inexact backtrace again:\n"); + } else + printk("Inexact backtrace:\n"); + } + + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } + + while (1) { + struct thread_info *context; + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + ebp = print_context_stack(context, stack, ebp, log_lvl); + stack = (unsigned long*)context->previous_esp; + if (!stack) + break; + printk("%s =======================\n", log_lvl); + } +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) +{ + show_trace_log_lvl(task, regs, stack, ""); +} + +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *esp, char *log_lvl) +{ + unsigned long *stack; + int i; + + if (esp == NULL) { + if (task) + esp = (unsigned long*)task->thread.esp; + else + esp = (unsigned long *)&esp; + } + + stack = esp; + for(i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % 8) == 0)) + printk("\n%s ", log_lvl); + printk("%08lx ", *stack++); + } + printk("\n%sCall Trace:\n", log_lvl); + show_trace_log_lvl(task, regs, esp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *esp) +{ + printk(" "); + show_stack_log_lvl(task, NULL, esp, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long stack; + + show_trace(current, NULL, &stack); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + int in_kernel = 1; + unsigned long esp; + unsigned short ss; + + esp = (unsigned long) (®s->esp); + savesegment(ss, ss); + if (user_mode_vm(regs)) { + in_kernel = 0; + esp = regs->esp; + ss = regs->xss & 0xffff; + } + print_modules(); + printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" + "EFLAGS: %08lx (%s %.*s) \n", + smp_processor_id(), 0xffff & regs->xcs, regs->eip, + print_tainted(), regs->eflags, system_utsname.release, + (int)strcspn(system_utsname.version, " "), + system_utsname.version); + print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); + printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + TASK_COMM_LEN, current->comm, current->pid, + current_thread_info(), current, current->thread_info); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
+ */ + if (in_kernel) { + u8 __user *eip; + + printk("\n" KERN_EMERG "Stack: "); + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + eip = (u8 __user *)regs->eip - 43; + for (i = 0; i < 64; i++, eip++) { + unsigned char c; + + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + printk(" Bad EIP value."); + break; + } + if (eip == (u8 __user *)regs->eip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +static void handle_BUG(struct pt_regs *regs) +{ + unsigned long eip = regs->eip; + unsigned short ud2; + + if (eip < PAGE_OFFSET) + return; + if (__get_user(ud2, (unsigned short __user *)eip)) + return; + if (ud2 != 0x0b0f) + return; + + printk(KERN_EMERG "------------[ cut here ]------------\n"); + +#ifdef CONFIG_DEBUG_BUGVERBOSE + do { + unsigned short line; + char *file; + char c; + + if (__get_user(line, (unsigned short __user *)(eip + 2))) + break; + if (__get_user(file, (char * __user *)(eip + 4)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = "<bad filename>"; + + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); + return; + } while (0); +#endif + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); +} + +/* This is gone through when something in the kernel + * has done something bad and is about to be terminated. +*/ +void die(const char * str, struct pt_regs * regs, long err) +{ + static struct { + spinlock_t lock; + u32 lock_owner; + int lock_owner_depth; + } die = { + .lock = SPIN_LOCK_UNLOCKED, + .lock_owner = -1, + .lock_owner_depth = 0 + }; + static int die_counter; + unsigned long flags; + + oops_enter(); + + if (die.lock_owner != raw_smp_processor_id()) { + console_verbose(); + spin_lock_irqsave(&die.lock, flags); + die.lock_owner = smp_processor_id(); + die.lock_owner_depth = 0; + bust_spinlocks(1); + } + else + local_save_flags(flags); + + if (++die.lock_owner_depth < 3) { + int nl = 0; + unsigned long esp; + unsigned short ss; + + handle_BUG(regs); + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk(KERN_EMERG "PREEMPT "); + nl = 1; +#endif +#ifdef CONFIG_SMP + if (!nl) + printk(KERN_EMERG); + printk("SMP "); + nl = 1; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!nl) + printk(KERN_EMERG); + printk("DEBUG_PAGEALLOC"); + nl = 1; +#endif + if (nl) + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != + NOTIFY_STOP) { + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + esp = (unsigned long) (&regs->esp); + savesegment(ss, ss); + if (user_mode(regs)) { + esp = regs->esp; + ss = regs->xss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); + print_symbol("%s", regs->eip); + printk(" SS:ESP %04x:%08lx\n", ss, esp); + } + else + regs = NULL; + } else + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irqrestore(&die.lock, flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(SIGSEGV); +} + +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, + struct pt_regs * regs, long error_code,
+ siginfo_t *info) +{ + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + + if (regs->eflags & VM_MASK) { + if (vm86) + goto vm86_trap; + goto trap_signal; + } + + if (!user_mode(regs)) + goto kernel_trap; + + trap_signal: { + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + } + + kernel_trap: { + if (!fixup_exception(regs)) + die(str, regs, error_code); + return; + } + + vm86_trap: { + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); + if (ret) goto trap_signal; + return; + } +} + +#define DO_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +} + +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +#ifndef CONFIG_KPROBES +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +#endif +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) + +fastcall void __kprobes do_general_protection(struct pt_regs * regs, + long error_code) +{ + current->thread.error_code = error_code; + current->thread.trap_no = 13; + + if (regs->eflags & VM_MASK) + goto gp_in_vm86; + + if (!user_mode(regs)) + goto gp_in_kernel; + + current->thread.error_code = error_code; + current->thread.trap_no = 13; + force_sig(SIGSEGV, current); + return; + +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; + +gp_in_kernel: + if (!fixup_exception(regs)) { + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); + } +} + +static void 
mem_parity_error(unsigned char reason, struct pt_regs * regs) +{ + printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " + "to continue\n"); + printk(KERN_EMERG "You probably have a hardware problem with your RAM " + "chips\n"); + + /* Clear and disable the memory parity error line. */ + clear_mem_error(reason); +} + +static void io_check_error(unsigned char reason, struct pt_regs * regs) +{ + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + clear_io_check_error(reason); +} + +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +{ +#ifdef CONFIG_MCA + /* Might actually be able to figure out what the guilty party + * is. */ + if( MCA_bus ) { + mca_handle_nmi(); + return; + } +#endif + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void die_nmi (struct pt_regs *regs, const char *msg) +{ + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == + NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", msg); + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); + printk(KERN_EMERG "console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can. + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} + +static void default_do_nmi(struct pt_regs * regs) +{ + unsigned char reason = 0; + + /* Only the BSP gets external NMIs from the system. */ + if (!smp_processor_id()) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog) { + nmi_watchdog_tick(regs); + return; + } +#endif + unknown_nmi_error(reason, regs); + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered. 
+ */ + reassert_nmi(); +} + +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + +fastcall void do_nmi(struct pt_regs * regs, long error_code) +{ + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + + ++nmi_count(cpu); + + if (!rcu_dereference(nmi_callback)(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); +} + +void set_nmi_callback(nmi_callback_t callback) +{ + vmalloc_sync_all(); + rcu_assign_pointer(nmi_callback, callback); +} +EXPORT_SYMBOL_GPL(set_nmi_callback); + +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} +EXPORT_SYMBOL_GPL(unset_nmi_callback); + +#ifdef CONFIG_KPROBES +fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + */ +fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) +{ + unsigned int condition; + struct task_struct *tsk = current; + + get_debugreg(condition, 6); + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + /* It's safe to allow irq's after DR6 has been saved */ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg[7]) + goto clear_dr7; + } + + if (regs->eflags & VM_MASK) + goto debug_vm86; + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg[6] = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + /* + * We already checked v86 mode above, so we can + * check for kernel mode by just checking the CPL + * of CS. + */ + if (!user_mode(regs)) + goto clear_TF_reenable; + } + + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code); + + /* Disable additional traps. They'll be re-enabled when + * the signal is delivered. 
+ */ +clear_dr7: + set_debugreg(0, 7); + return; + +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + return; + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->eflags &= ~TF_MASK; + return; +} + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +void math_error(void __user *eip) +{ + struct task_struct * task; + siginfo_t info; + unsigned short cwd, swd; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't syncronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (swd & ~cwd & 0x3f) { + case 0x000: /* No unmasked exception */ + return; + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +{ + ignore_fpu_irq = 1; + math_error((void __user *)regs->eip); +} + +static void simd_math_error(void __user *eip) +{ + struct task_struct * task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. 
+ */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +fastcall void do_simd_coprocessor_error(struct pt_regs * regs, + long error_code) +{ + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->eip); + } else { + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->eflags & VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, + error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); + } +} + +#ifndef CONFIG_XEN +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, + long error_code) +{ +#if 0 + /* No need to warn about this any longer. */ + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +fastcall void setup_x86_bogus_stack(unsigned char * stk) +{ + unsigned long *switch16_ptr, *switch32_ptr; + struct pt_regs *regs; + unsigned long stack_top, stack_bot; + unsigned short iret_frame16_off; + int cpu = smp_processor_id(); + /* reserve the space on 32bit stack for the magic switch16 pointer */ + memmove(stk, stk + 8, sizeof(struct pt_regs)); + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); + regs = (struct pt_regs *)stk; + /* now the switch32 on 16bit stack */ + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; + /* copy iret frame on 16bit stack */ + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); + /* fill in the switch pointers */ + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; + switch16_ptr[1] = __ESPFIX_SS; + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + + 8 - CPU_16BIT_STACK_SIZE; + switch32_ptr[1] = __KERNEL_DS; +} + +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) +{ + unsigned long *switch32_ptr; + unsigned char *stack16, *stack32; + unsigned long stack_top, stack_bot; + int len; + int cpu = smp_processor_id(); + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + /* copy the data from 16bit stack to 32bit stack */ + len = CPU_16BIT_STACK_SIZE - 8 - sp; + stack16 = (unsigned char *)(stack_bot + sp); + stack32 = (unsigned char *) + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); + memcpy(stack32, stack16, len); + return stack32; +} +#endif + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works.
+ * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). + */ +asmlinkage void math_state_restore(struct pt_regs regs) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* NB. 'clts' is done for us by Xen during virtual trap. */ + if (!tsk_used_math(tsk)) + init_fpu(tsk); + restore_fpu(tsk); + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ +} + +#ifndef CONFIG_MATH_EMULATION + +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n",current->comm); + force_sig(SIGFPE,current); + schedule(); +} + +#endif /* CONFIG_MATH_EMULATION */ + +#ifdef CONFIG_X86_F00F_BUG +void __init trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif + + +/* + * NB. All these are "trap gates" (i.e. events_mask isn't set) except + * for those that specify <dpl>|4 in the second field. + */ +static trap_info_t __cpuinitdata trap_table[] = { + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, + { 5, 0, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, +#ifdef CONFIG_X86_MCE + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, +#endif + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, + { 0, 0, 0, 0 } +}; + +void __init trap_init(void) +{ + int ret; + + ret = HYPERVISOR_set_trap_table(trap_table); + if (ret) + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); + + if (cpu_has_fxsr) { + /* + * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. + * Generates a compile-time "error: zero width for bit-field" if + * the alignment is wrong. + */ + struct fxsrAlignAssert { + int _:!(offsetof(struct task_struct, + thread.i387.fxsave) & 15); + }; + + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO "Enabling unmasked SIMD FPU exception " + "support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + + /* + * Should be a barrier for any external CPU state. 
+ */ + cpu_init(); +} + +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 1; +} +__setup("kstack=", kstack_setup); + +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; + return 1; +} +__setup("call_trace=", call_trace_setup); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mach-xen/Makefile 2007-06-12 13:12:48.000000000 +0200 @@ -0,0 +1,5 @@ +# +# Makefile for the linux kernel. +# + +obj-y := setup.o --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mach-xen/setup.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,158 @@ +/* + * Machine specific setup for generic + */ + +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <asm/acpi.h> +#include <asm/arch_hooks.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/fixmap.h> + +#include <xen/interface/callback.h> +#include <xen/interface/memory.h> + +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +int no_broadcast=DEFAULT_SEND_IPI; + +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : + "IPI Broadcast"); + return 1; +} + +__setup("no_ipi_broadcast", no_ipi_broadcast); + +static int __init print_ipi_mode(void) +{ + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : + "Shortcut"); + return 0; +} + +late_initcall(print_ipi_mode); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + * + * Description: + * This is included late in kernel/setup.c so that it can make + * use of all of the static functions. + **/ + +char * __init machine_specific_memory_setup(void) +{ + int rc; + struct xen_memory_map memmap; + /* + * This is rather large for a stack variable but this early in + * the boot process we know we have plenty slack space. + */ + struct e820entry map[E820MAX]; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if ( rc == -ENOSYS ) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages); + /* 8MB slack (to balance backend allocations).
*/ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + sanitize_e820_map(map, (char *)&memmap.nr_entries); + + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0); + + return "Xen"; +} + + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + +void __init pre_setup_arch_hook(void) +{ + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; + struct xen_platform_parameters pp; + + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; + + setup_xen_features(); + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + set_fixaddr_top(pp.virt_start); + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } else + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; +} + +void __init machine_specific_arch_setup(void) +{ + int ret; + static struct callback_register __initdata event = { + .type = CALLBACKTYPE_event, + .address = { __KERNEL_CS, (unsigned long)hypervisor_callback }, + }; + static struct callback_register __initdata failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = { __KERNEL_CS, (unsigned long)failsafe_callback }, + }; + static struct callback_register __initdata nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = { __KERNEL_CS, (unsigned long)nmi }, + }; + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address.cs, event.address.eip, + failsafe.address.cs, failsafe.address.eip); +#endif + BUG_ON(ret); + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/lib/scrub.c 2008-02-08 12:30:51.000000000 +0100 @@ -0,0 +1,21 @@ +#include <asm/cpufeature.h> +#include <asm/page.h> +#include <asm/processor.h> + +void scrub_pages(void *v, unsigned int count) +{ + if (likely(cpu_has_xmm2)) { + unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4); + + for (; n--; v += sizeof(long) * 4) + asm("movnti %1,(%0)\n\t" + "movnti %1,%c2(%0)\n\t" + "movnti %1,2*%c2(%0)\n\t" + "movnti %1,3*%c2(%0)\n\t" + : : "r" (v), "r" (0L), "i" (sizeof(long)) + : "memory"); + asm volatile("sfence" : : : "memory"); + } else + for (; count--; v += PAGE_SIZE) + clear_page(v); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/fault_32-xen.c 2007-12-10 08:47:31.000000000 +0100 @@ -0,0 +1,779 @@ +/* + * linux/arch/i386/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include 
<linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/kprobes.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/kdebug.h> + +extern void die(const char *,struct pt_regs *,long); + +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&notify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + + +/* + * Unlock any spinlocks which will prevent us from getting the + * message out + */ +void bust_spinlocks(int yes) +{ + int loglevel_save = console_loglevel; + + if (yes) { + oops_in_progress = 1; + return; + } +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; +} + +/* + * Return EIP plus the CS segment base. The segment limit is also + * adjusted, clamped to the kernel/user address space (whichever is + * appropriate), and returned in *eip_limit. + * + * The segment is checked, because it might have been changed by another + * task between the original faulting instruction and here. + * + * If CS is no longer a valid code segment, or if EIP is beyond the + * limit, or if it is a kernel address when CS is not a kernel segment, + * then the returned value will be greater than *eip_limit. + * + * This is slow, but is very rarely executed. + */ +static inline unsigned long get_segment_eip(struct pt_regs *regs, + unsigned long *eip_limit) +{ + unsigned long eip = regs->eip; + unsigned seg = regs->xcs & 0xffff; + u32 seg_ar, seg_limit, base, *desc; + + /* Unlikely, but must come before segment checks. */ + if (unlikely(regs->eflags & VM_MASK)) { + base = seg << 4; + *eip_limit = base + 0xffff; + return base + (eip & 0xffff); + } + + /* The standard kernel/user address space limit. */ + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; + + /* By far the most common cases. */ + if (likely(seg == __USER_CS || seg == GET_KERNEL_CS())) + return eip; + + /* Check the segment exists, is within the current LDT/GDT size, + that kernel/user (ring 0..3) has the appropriate privilege, + that it's a code segment, and get the limit. */ + __asm__ ("larl %3,%0; lsll %3,%1" + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); + if ((~seg_ar & 0x9800) || eip > seg_limit) { + *eip_limit = 0; + return 1; /* So that returned eip > *eip_limit.
*/ + } + + /* Get the GDT/LDT descriptor base. + When you look for races in this code remember that + LDT and other horrors are only used in user space. */ + if (seg & (1<<2)) { + /* Must lock the LDT while reading it. */ + down(&current->mm->context.sem); + desc = current->mm->context.ldt; + desc = (void *)desc + (seg & ~7); + } else { + /* Must disable preemption while reading the GDT. */ + desc = (u32 *)get_cpu_gdt_table(get_cpu()); + desc = (void *)desc + (seg & ~7); + } + + /* Decode the code segment base from the descriptor */ + base = get_desc_base((unsigned long *)desc); + + if (seg & (1<<2)) { + up(&current->mm->context.sem); + } else + put_cpu(); + + /* Adjust EIP and segment limit, and clamp at the kernel limit. + It's legitimate for segments to wrap at 0xffffffff. */ + seg_limit += base; + if (seg_limit < *eip_limit && seg_limit >= base) + *eip_limit = seg_limit; + return eip + base; +} + +/* + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + */ +static int __is_prefetch(struct pt_regs *regs, unsigned long addr) +{ + unsigned long limit; + unsigned long instr = get_segment_eip (regs, &limit); + int scan_more = 1; + int prefetch = 0; + int i; + + for (i = 0; scan_more && i < 15; i++) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char __user *) instr)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char __user *) instr)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 6)) { + /* Catch an obscure case of prefetch inside an NX page.
*/ + if (nx_enabled && (error_code & 16)) + return 0; + return __is_prefetch(regs, addr); + } + return 0; +} + +static noinline void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + +fastcall void do_invalid_op(struct pt_regs *, unsigned long); + +#ifdef CONFIG_X86_PAE +static void dump_fault_path(unsigned long address) +{ + unsigned long *p, page; + unsigned long mfn; + + page = read_cr3(); + p = (unsigned long *)__va(page); + p += (address >> 30) * 2; + printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); + if (p[0] & _PAGE_PRESENT) { + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); + page = mfn_to_pfn(mfn) << PAGE_SHIFT; + p = (unsigned long *)__va(page); + address &= 0x3fffffff; + p += (address >> 21) * 2; + printk(KERN_ALERT "%08lx -> *pme = %08lx:%08lx\n", + page, p[1], p[0]); + mfn = (p[0] >> PAGE_SHIFT) | (p[1] << 20); +#ifdef CONFIG_HIGHPTE + if (mfn_to_pfn(mfn) >= highstart_pfn) + return; +#endif + if (p[0] & _PAGE_PRESENT) { + page = mfn_to_pfn(mfn) << PAGE_SHIFT; + p = (unsigned long *) __va(page); + address &= 0x001fffff; + p += (address >> 12) * 2; + printk(KERN_ALERT "%08lx -> *pte = %08lx:%08lx\n", + page, p[1], p[0]); + } + } +} +#else +static void dump_fault_path(unsigned long address) +{ + unsigned long page; + + page = read_cr3(); + page = ((unsigned long *) __va(page))[address >> 22]; + if (oops_may_print()) + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And lets rather not kmap-atomic the pte, just in case + * it's allocated already. + */ +#ifdef CONFIG_HIGHPTE + if ((page >> PAGE_SHIFT) >= highstart_pfn) + return; +#endif + if ((page & 1) && oops_may_print()) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = machine_to_phys(page); + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); + } +} +#endif + +static int spurious_fault(struct pt_regs *regs, + unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & 0x0c) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + if ((error_code & 0x02) && !pte_write(*pte)) + return 0; +#ifdef CONFIG_X86_PAE + if ((error_code & 0x10) && (__pte_val(*pte) & _PAGE_NX)) + return 0; +#endif + + return 1; +} + +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
+ */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) +#if CONFIG_XEN_COMPAT > 0x030002 + set_pmd(pmd, *pmd_k); +#else + /* + * When running on older Xen we must launder *pmd_k through + * pmd_val() to ensure that _PAGE_PRESENT is correctly set. + */ + set_pmd(pmd, __pmd(pmd_val(*pmd_k))); +#endif + else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} + +/* + * Handle a fault on the vmalloc or module mapping area + * + * This assumes no large pages in there. + */ +static inline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + * + * error_code: + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch + */ +fastcall void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long address; + int write, si_code; + + /* get the address */ + address = read_cr2(); + + /* Set the "privileged fault" bit to something sane. */ + error_code &= ~4; + error_code |= (regs->xcs & 2) << 1; + if (regs->eflags & X86_EFLAGS_VM) + error_code |= 4; + + tsk = current; + + si_code = SEGV_MAPERR; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(address >= TASK_SIZE)) { +#ifdef CONFIG_XEN + /* Faults in hypervisor area can never be patched up. */ + if (address >= hypervisor_virt_start) + goto bad_area_nosemaphore; +#endif + if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) + return; + /* Can take a spurious fault if mapping changes R/O -> R/W. */ + if (spurious_fault(regs, address, error_code)) + return; + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. 
*/ + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + mm = tsk->mm; + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault.. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; + + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibilty of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & 4) == 0 && + !search_exception_tables(regs->eip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & 4) { + /* + * Accessing the stack below %esp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535,$31" pushes + * 32 pointers and then decrements %esp by 65535.) + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + si_code = SEGV_ACCERR; + write = 0; + switch (error_code & 3) { + default: /* 3: write, present */ +#ifdef TEST_VERIFY_AREA + if (regs->cs == GET_KERNEL_CS()) + printk("WP fault at %08lx\n", regs->eip); +#endif + /* fall through */ + case 2: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case 1: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + + survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + switch (handle_mm_fault(mm, vma, address, write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + goto do_sigbus; + case VM_FAULT_OOM: + goto out_of_memory; + default: + BUG(); + } + + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & 4) { + /* + * Valid to do another page fault here because this one came + * from user space. 
+ */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + +#ifdef CONFIG_X86_F00F_BUG + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + unsigned long nr; + + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return; + } + } +#endif + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + */ + if (is_prefetch(regs, address, error_code)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + bust_spinlocks(1); + + if (oops_may_print()) { + #ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); + + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } + #endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "BUG: unable to handle kernel NULL " + "pointer dereference"); + else + printk(KERN_ALERT "BUG: unable to handle kernel paging" + " request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); + } + dump_fault_path(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (tsk->pid == 1) { + yield(); + down_read(&mm->mmap_sem); + goto survive; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & 4)) + goto no_context; + + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +#if !HAVE_SHARED_KERNEL_PMD +void vmalloc_sync_all(void) +{ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + * XEN: To work on PAE, we need to iterate over PMDs rather than PGDs. + * This change works just fine with 2-level paging too. + */ +#define sync_index(a) ((a) >> PMD_SHIFT) + static DECLARE_BITMAP(insync, PTRS_PER_PGD*PTRS_PER_PMD); + static unsigned long start = TASK_SIZE; + unsigned long address; + + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; + address >= TASK_SIZE && address < hypervisor_virt_start; + address += 1UL << PMD_SHIFT) { + if (!test_bit(sync_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + /* XEN: failure path assumes non-empty pgd_list. 
*/ + if (unlikely(!pgd_list)) { + spin_unlock_irqrestore(&pgd_lock, flags); + return; + } + for (page = pgd_list; page; page = + (struct page *)page->index) + if (!vmalloc_sync_one(page_address(page), + address)) { + BUG_ON(page != pgd_list); + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(sync_index(address), insync); + } + if (address == start && test_bit(sync_index(address), insync)) + start = address + (1UL << PMD_SHIFT); + } +} +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/highmem_32-xen.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,183 @@ +#include <linux/highmem.h> +#include <linux/module.h> + +void *kmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_high(page); +} + +void kunmap(struct page *page) +{ + if (in_interrupt()) + BUG(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + inc_preempt_count(); + if (!PageHighMem(page)) + return page_address(page); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#ifdef CONFIG_DEBUG_HIGHMEM + if (!pte_none(*(kmap_pte-idx))) + BUG(); +#endif + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); + + return (void*) vaddr; +} + +void *kmap_atomic(struct page *page, enum km_type type) +{ + return __kmap_atomic(page, type, kmap_prot); +} + +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ +void *kmap_atomic_pte(struct page *page, enum km_type type) +{ + return __kmap_atomic(page, type, + test_bit(PG_pinned, &page->flags) + ? PAGE_KERNEL_RO : kmap_prot); +} + +void kunmap_atomic(void *kvaddr, enum km_type type) +{ +#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN) + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + if (vaddr < FIXADDR_START) { // FIXME + dec_preempt_count(); + preempt_check_resched(); + return; + } +#endif + +#if defined(CONFIG_DEBUG_HIGHMEM) + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) + BUG(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + __flush_tlb_one(vaddr); +#elif defined(CONFIG_XEN) + /* + * We must ensure there are no dangling pagetable references when + * returning memory to Xen (decrease_reservation). + * XXX TODO: We could make this faster by only zapping when + * kmap_flush_unused is called but that is trickier and more invasive. + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); +#endif + + dec_preempt_count(); + preempt_check_resched(); +} + +/* This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. 
+ */ +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + inc_preempt_count(); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} + +void clear_highpage(struct page *page) +{ + void *kaddr; + + if (likely(xen_feature(XENFEAT_highmem_assist)) + && PageHighMem(page)) { + struct mmuext_op meo; + + meo.cmd = MMUEXT_CLEAR_PAGE; + meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page)); + if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) + return; + } + + kaddr = kmap_atomic(page, KM_USER0); + clear_page(kaddr); + kunmap_atomic(kaddr, KM_USER0); +} + +void copy_highpage(struct page *to, struct page *from) +{ + void *vfrom, *vto; + + if (likely(xen_feature(XENFEAT_highmem_assist)) + && (PageHighMem(from) || PageHighMem(to))) { + unsigned long from_pfn = page_to_pfn(from); + unsigned long to_pfn = page_to_pfn(to); + struct mmuext_op meo; + + meo.cmd = MMUEXT_COPY_PAGE; + meo.arg1.mfn = pfn_to_mfn(to_pfn); + meo.arg2.src_mfn = pfn_to_mfn(from_pfn); + if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn + && mfn_to_pfn(meo.arg1.mfn) == to_pfn + && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0) + return; + } + + vfrom = kmap_atomic(from, KM_USER0); + vto = kmap_atomic(to, KM_USER1); + copy_page(vto, vfrom); + kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vto, KM_USER1); +} + +EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_pte); +EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(clear_highpage); +EXPORT_SYMBOL(copy_highpage); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/hypervisor.c 2009-06-09 15:01:37.000000000 +0200 @@ -0,0 +1,579 @@ +/****************************************************************************** + * mm/hypervisor.c + * + * Update page tables via the hypervisor. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/hypervisor.h> +#include <xen/balloon.h> +#include <xen/features.h> +#include <xen/interface/memory.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <asm/tlbflush.h> +#include <linux/highmem.h> + +void xen_l1_entry_update(pte_t *ptr, pte_t val) +{ + mmu_update_t u; + u.ptr = ptep_to_machine(ptr); + u.val = __pte_val(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_l1_entry_update); + +void xen_l2_entry_update(pmd_t *ptr, pmd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = __pmd_val(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = __pud_val(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif + +#ifdef CONFIG_X86_64 +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = __pgd_val(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif /* CONFIG_X86_64 */ + +void xen_pt_switch(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_BASEPTR; + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_new_user_pt(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_USER_BASEPTR; + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL(xen_tlb_flush); + +void xen_invlpg(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_LOCAL; + op.arg1.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL(xen_invlpg); + +#ifdef CONFIG_SMP + +void xen_tlb_flush_all(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_ALL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_tlb_flush_all); + +void xen_tlb_flush_mask(cpumask_t *mask) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + set_xen_guest_handle(op.arg2.vcpumask, mask->bits); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_tlb_flush_mask); + +void xen_invlpg_all(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.arg1.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_invlpg_all); + +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_INVLPG_MULTI; + op.arg1.linear_addr = ptr & PAGE_MASK; + set_xen_guest_handle(op.arg2.vcpumask, mask->bits); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +EXPORT_SYMBOL_GPL(xen_invlpg_mask); + +#endif /* CONFIG_SMP */ + +void 
xen_pgd_pin(unsigned long ptr) +{ + struct mmuext_op op; +#ifdef CONFIG_X86_64 + op.cmd = MMUEXT_PIN_L4_TABLE; +#elif defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pgd_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_set_ldt(const void *ptr, unsigned int ents) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.arg1.linear_addr = (unsigned long)ptr; + op.arg2.nr_ents = ents; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +/* Protected by balloon_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; +static unsigned long limited_frames[1<<MAX_CONTIG_ORDER]; +static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER]; + +/* Ensure multi-page extents are contiguous in machine memory. */ +int xen_create_contiguous_region( + unsigned long vstart, unsigned int order, unsigned int address_bits) +{ + unsigned long *in_frames = discontig_frames, out_frame; + unsigned long frame, flags; + unsigned int i; + int rc, success; + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = 1UL << order, + .extent_order = 0, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = 1, + .extent_order = order, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + /* + * Currently an auto-translated guest will not perform I/O, nor will + * it require PAE page directories below 4GB. Therefore any calls to + * this function are redundant and can be ignored. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + set_xen_guest_handle(exchange.in.extent_start, in_frames); + set_xen_guest_handle(exchange.out.extent_start, &out_frame); + + scrub_pages((void *)vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + for (i = 0; i < (1U<<order); i++) { + in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i); + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), + __pte_ma(0), 0); + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, + INVALID_P2M_ENTRY); + } + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + /* 2. Get a new contiguous memory extent. */ + out_frame = __pa(vstart) >> PAGE_SHIFT; + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == (1UL << order)); + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unsupported. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != (1UL << order)) + BUG(); + success = (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) == 1); + if (!success) { + /* Couldn't get special memory: fall back to normal. */ + for (i = 0; i < (1U<<order); i++) + in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i; + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.in) != (1UL<<order)) + BUG(); + } + } +#endif + + /* 3. Map the new extent in place of old pages. */ + for (i = 0; i < (1U<<order); i++) { + frame = success ? 
(out_frame + i) : in_frames[i]; + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), + pfn_pte_ma(frame, PAGE_KERNEL), 0); + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); + } + + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order + ? UVMF_TLB_FLUSH|UVMF_ALL + : UVMF_INVLPG|UVMF_ALL; + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + balloon_unlock(flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long frame, flags; + unsigned int i; + int rc, success; + struct xen_memory_exchange exchange = { + .in = { + .nr_extents = 1, + .extent_order = order, + .domid = DOMID_SELF + }, + .out = { + .nr_extents = 1UL << order, + .extent_order = 0, + .domid = DOMID_SELF + } + }; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + set_xen_guest_handle(exchange.in.extent_start, &in_frame); + set_xen_guest_handle(exchange.out.extent_start, out_frames); + + scrub_pages((void *)vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT); + + /* 2. Zap current PTEs. */ + for (i = 0; i < (1U<<order); i++) { + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), + __pte_ma(0), 0); + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, + INVALID_P2M_ENTRY); + out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i; + } + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + /* 3. Do the exchange for non-contiguous MFNs. */ + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == 1); + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unsupported. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != 1) + BUG(); + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) != (1UL << order)) + BUG(); + success = 1; + } +#endif + + /* 4. Map new pages in place of old pages. */ + for (i = 0; i < (1U<<order); i++) { + frame = success ? out_frames[i] : (in_frame + i); + MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE), + pfn_pte_ma(frame, PAGE_KERNEL), 0); + set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame); + } + + cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order + ? 
UVMF_TLB_FLUSH|UVMF_ALL + : UVMF_INVLPG|UVMF_ALL; + if (HYPERVISOR_multicall_check(cr_mcl, i, NULL)) + BUG(); + + balloon_unlock(flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + +static void undo_limit_pages(struct page *pages, unsigned int order) +{ + BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); + BUG_ON(order > MAX_CONTIG_ORDER); + xen_limit_pages_to_max_mfn(pages, order, 0); + ClearPageForeign(pages); + __free_pages(pages, order); +} + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits) +{ + unsigned long flags, frame; + unsigned long *in_frames = discontig_frames, *out_frames = limited_frames; + struct page *page; + unsigned int i, n, nr_mcl; + int rc, success; + DECLARE_BITMAP(limit_map, 1 << MAX_CONTIG_ORDER); + + struct xen_memory_exchange exchange = { + .in = { + .extent_order = 0, + .domid = DOMID_SELF + }, + .out = { + .extent_order = 0, + .address_bits = address_bits, + .domid = DOMID_SELF + } + }; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return -ENOMEM; + + if (address_bits) { + if (address_bits < PAGE_SHIFT) + return -EINVAL; + bitmap_zero(limit_map, 1U << order); + } else if (order) { + BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map)); + for (i = 0; i < BITS_TO_LONGS(1U << order); ++i) + limit_map[i] = pages[i + 1].index; + } else + __set_bit(0, limit_map); + + set_xen_guest_handle(exchange.in.extent_start, in_frames); + set_xen_guest_handle(exchange.out.extent_start, out_frames); + + /* 0. Scrub the pages. */ + for (i = 0, n = 0; i < 1U<<order ; i++) { + page = &pages[i]; + if (address_bits) { + if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT))) + continue; + __set_bit(i, limit_map); + } + + if (!PageHighMem(page)) + scrub_pages(page_address(page), 1); +#ifdef CONFIG_XEN_SCRUB_PAGES + else { + scrub_pages(kmap(page), 1); + kunmap(page); + ++n; + } +#endif + } + if (bitmap_empty(limit_map, 1U << order)) + return 0; + + if (n) + kmap_flush_unused(); + + balloon_lock(flags); + + /* 1. Zap current PTEs (if any), remembering MFNs. */ + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) { + if(!test_bit(i, limit_map)) + continue; + page = &pages[i]; + + out_frames[n] = page_to_pfn(page); + in_frames[n] = pfn_to_mfn(out_frames[n]); + + if (!PageHighMem(page)) + MULTI_update_va_mapping(cr_mcl + nr_mcl++, + (unsigned long)page_address(page), + __pte_ma(0), 0); + + set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY); + ++n; + } + if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL)) + BUG(); + + /* 2. Get new memory below the required limit. */ + exchange.in.nr_extents = n; + exchange.out.nr_extents = n; + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); + success = (exchange.nr_exchanged == n); + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); + BUG_ON(success && (rc != 0)); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + /* Compatibility when XENMEM_exchange is unsupported. */ + if (HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &exchange.in) != n) + BUG(); + if (HYPERVISOR_memory_op(XENMEM_populate_physmap, + &exchange.out) != n) + BUG(); + success = 1; + } +#endif + + /* 3. Map the new pages in place of old pages. */ + for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) { + if(!test_bit(i, limit_map)) + continue; + page = &pages[i]; + + frame = success ? 
out_frames[n] : in_frames[n]; + + if (!PageHighMem(page)) + MULTI_update_va_mapping(cr_mcl + nr_mcl++, + (unsigned long)page_address(page), + pfn_pte_ma(frame, PAGE_KERNEL), 0); + + set_phys_to_machine(page_to_pfn(page), frame); + ++n; + } + if (nr_mcl) { + cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order + ? UVMF_TLB_FLUSH|UVMF_ALL + : UVMF_INVLPG|UVMF_ALL; + if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL)) + BUG(); + } + + balloon_unlock(flags); + + if (!success) + return -ENOMEM; + + if (address_bits) { + if (order) { + BUILD_BUG_ON(sizeof(*limit_map) != sizeof(pages->index)); + for (i = 0; i < BITS_TO_LONGS(1U << order); ++i) + pages[i + 1].index = limit_map[i]; + } + SetPageForeign(pages, undo_limit_pages); + } + + return 0; +} +EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn); + +#ifdef __i386__ +int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b) +{ + __u32 *lp = (__u32 *)((char *)ldt + entry * 8); + maddr_t mach_lp = arbitrary_virt_to_machine(lp); + return HYPERVISOR_update_descriptor( + mach_lp, (u64)entry_a | ((u64)entry_b<<32)); +} +#endif + +#define MAX_BATCHED_FULL_PTES 32 + +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot) +{ + int rc = 0, i = 0; + mmu_update_t u[MAX_BATCHED_FULL_PTES]; + pte_t *pte; + spinlock_t *ptl; + + if (!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) + return 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { + if (pte_present(*pte)) { + u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK) + | ((unsigned long)pte & ~PAGE_MASK) + | MMU_PT_UPDATE_PRESERVE_AD; + u[i].val = __pte_val(pte_modify(*pte, newprot)); + if (++i == MAX_BATCHED_FULL_PTES) { + if ((rc = HYPERVISOR_mmu_update( + &u[0], i, NULL, DOMID_SELF)) != 0) + break; + i = 0; + } + } + } while (pte++, addr += PAGE_SIZE, addr != end); + if (i) + rc = HYPERVISOR_mmu_update( &u[0], i, NULL, DOMID_SELF); + pte_unmap_unlock(pte - 1, ptl); + BUG_ON(rc && rc != -ENOSYS); + return !rc; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/init_32-xen.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,840 @@ +/* + * linux/arch/i386/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/poison.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/efi.h> +#include <linux/memory_hotplug.h> +#include <linux/initrd.h> +#include <linux/cpumask.h> +#include <linux/dma-mapping.h> +#include <linux/scatterlist.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/hypervisor.h> +#include <asm/swiotlb.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +unsigned long highstart_pfn, highend_pfn; + +static int noinline do_test_wp_bit(void); + +/* + 
* Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); +#else + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +#endif + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry. + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_lowmem_page_readonly(page_table, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + if (page_table != pte_offset_kernel(pmd, 0)) + BUG(); + + return page_table; + } + + return pte_offset_kernel(pmd, 0); +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + */ + +/* + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. + */ +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + if (pgd_none(*pgd)) + one_md_table_init(pgd); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) + one_page_table_init(pmd); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET. + */ +static void __init kernel_physical_mapping_init(pgd_t *pgd_base) +{ + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int pgd_idx, pmd_idx, pte_ofs; + + unsigned long max_ram_pfn = xen_start_info->nr_pages; + if (max_ram_pfn > max_low_pfn) + max_ram_pfn = max_low_pfn; + + pgd_idx = pgd_index(PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + pfn = 0; + pmd_idx = pmd_index(PAGE_OFFSET); + pte_ofs = pte_index(PAGE_OFFSET); + + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { +#ifdef CONFIG_XEN + /* + * Native linux hasn't PAE-paging enabled yet at this + * point. When running as xen domain we are in PAE + * mode already, thus we can't simply hook a empty + * pmd. That would kill the mappings we are currently + * using ... 
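+ * Instead, walk the pud/pmd entries that the hypervisor-supplied page
+ * tables already contain for PAGE_OFFSET and reuse them.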
+ */ + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); +#else + pmd = one_md_table_init(pgd); +#endif + if (pfn >= max_low_pfn) + continue; + pmd += pmd_idx; + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; + if (address >= hypervisor_virt_start) + continue; + + /* Map with big pages if possible, otherwise create normal page tables. */ + if (cpu_has_pse) { + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; + } else { + pte = one_page_table_init(pmd); + + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + /* XEN: Only map initial RAM allocation. */ + if ((pfn >= max_ram_pfn) || pte_present(*pte)) + continue; + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } + pte_ofs = 0; + } + } + pmd_idx = 0; + } +} + +#ifndef CONFIG_XEN + +static inline int page_kills_ppro(unsigned long pagenr) +{ + if (pagenr >= 0x70000 && pagenr <= 0x7003F) + return 1; + return 0; +} + +#else + +#define page_kills_ppro(p) 0 + +#endif + +extern int is_available_memory(efi_memory_desc_t *); + +int page_is_ram(unsigned long pagenr) +{ + int i; + unsigned long addr, end; + + if (efi_enabled) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; + } + + for (i = 0; i < e820.nr_map; i++) { + + if (e820.map[i].type != E820_RAM) /* not usable memory */ + continue; + /* + * !!!FIXME!!! Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area. We need a sanity + * check here. 
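+ * (The start is rounded up and the end rounded down below, so a page
+ * only partially covered by an E820 RAM range is not reported as RAM.)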
+ */ + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} + +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +static void __meminit free_new_highpage(struct page *page, int pfn) +{ + init_page_count(page); + if (pfn < xen_start_info->nr_pages) + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +{ + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { + ClearPageReserved(page); + free_new_highpage(page, pfn); + } else + SetPageReserved(page); +} + +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) +{ + free_new_highpage(page, pfn); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + +#ifdef CONFIG_NUMA +extern void set_highmem_pages_init(int); +#else +static void __init set_highmem_pages_init(int bad_ppro) +{ + int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + totalram_pages += totalhigh_pages; +} +#endif /* CONFIG_FLATMEM */ + +#else +#define kmap_init() do { } while (0) +#define permanent_kmaps_init(pgd_base) do { } while (0) +#define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif /* CONFIG_HIGHMEM */ + +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +EXPORT_SYMBOL(__PAGE_KERNEL); +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; + +#ifdef CONFIG_NUMA +extern void __init remap_numa_kva(void); +#else +#define remap_numa_kva() do {} while (0) +#endif + +pgd_t *swapper_pg_dir; + +static void __init pagetable_init (void) +{ + unsigned long vaddr; + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; + + /* Enable PSE if available */ + if (cpu_has_pse) { + set_in_cr4(X86_CR4_PSE); + } + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + } + + kernel_physical_mapping_init(pgd_base); + remap_numa_kva(); + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); + + permanent_kmaps_init(pgd_base); +} + +#if 
defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) +/* + * Swap suspend & friends need this for resume because things like the intel-agp + * driver might have split up a kernel 4MB mapping. + */ +char __nosavedata swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +static inline void save_pg_dir(void) +{ + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); +} +#else +static inline void save_pg_dir(void) +{ +} +#endif + +void zap_low_mappings (void) +{ + int i; + + save_pg_dir(); + + /* + * Zap initial low-memory mappings. + * + * Note that "pgd_clear()" doesn't do it for + * us, because pgd_clear() is a no-op on i386. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); +#else + set_pgd(swapper_pg_dir+i, __pgd(0)); +#endif + flush_tlb_all(); +} + +static int disable_nx __initdata = 0; +u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL(__supported_pte_mask); + +/* + * noexec = on|off + * + * Control non executable mappings. + * + * on Enable + * off Disable + */ +void __init noexec_setup(const char *str) +{ + if (!strncmp(str, "on",2) && cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str,"off",3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } +} + +int nx_enabled = 0; +#ifdef CONFIG_X86_PAE + +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} + +/* + * Enables/disables executability of a given kernel page and + * returns the previous setting. + */ +int __init set_kernel_exec(unsigned long vaddr, int enable) +{ + pte_t *pte; + int ret = 1; + + if (!nx_enabled) + goto out; + + pte = lookup_address(vaddr); + BUG_ON(!pte); + + if (!pte_exec_kernel(*pte)) + ret = 0; + + if (enable) + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); + else + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); + __flush_tlb_all(); +out: + return ret; +} + +#endif + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ + int i; + +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); +#endif + + pagetable_init(); + +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) + /* + * We will bail out later - printk doesn't work right now so + * the user would just see a hanging kernel. + * when running as xen domain we are already in PAE mode at + * this point. + */ + if (cpu_has_pae) + set_in_cr4(X86_CR4_PAE); +#endif + __flush_tlb_all(); + + kmap_init(); + + /* Switch to the real shared_info page, and clear the + * dummy page. 
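+ * The fixmap slot is pointed at the machine frame named in
+ * xen_start_info, and the statically allocated dummy page
+ * (empty_zero_page) is cleared afterwards.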
*/ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, sizeof(empty_zero_page)); + + /* Setup mapping of lower 1st MB */ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This + * used to involve black magic jumps to work around some nasty CPU bugs, + * but fortunately the switch to using exceptions got rid of all that. + */ + +static void __init test_wp_bit(void) +{ + printk("Checking if this processor honours the WP bit even in supervisor mode... "); + + /* Any page-aligned address will do, the test is non-destructive */ + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + boot_cpu_data.wp_works_ok = do_test_wp_bit(); + clear_fixmap(FIX_WP_TEST); + + if (!boot_cpu_data.wp_works_ok) { + printk("No.\n"); +#ifdef CONFIG_X86_WP_WORKS_OK + panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); +#endif + } else { + printk("Ok.\n"); + } +} + +static void __init set_max_mapnr_init(void) +{ +#ifdef CONFIG_HIGHMEM + num_physpages = highend_pfn; +#else + num_physpages = max_low_pfn; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif +} + +static struct kcore_list kcore_mem, kcore_vmalloc; + +void __init mem_init(void) +{ + extern int ppro_with_ram_bug(void); + int codesize, reservedpages, datasize, initsize; + int tmp; + int bad_ppro; + unsigned long pfn; + +#if defined(CONFIG_SWIOTLB) + swiotlb_init(); +#endif + +#ifdef CONFIG_FLATMEM + if (!mem_map) + BUG(); +#endif + + bad_ppro = ppro_with_ram_bug(); + +#ifdef CONFIG_HIGHMEM + /* check that fixmap and pkmap do not overlap */ + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + BUG(); + } +#endif + + set_max_mapnr_init(); + +#ifdef CONFIG_HIGHMEM + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk("vmalloc area: %lx-%lx, maxmem %lx\n", + VMALLOC_START,VMALLOC_END,MAXMEM); + BUG_ON(VMALLOC_START > VMALLOC_END); + + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + /* XEN: init and count low-mem pages outside initial allocation. 
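+ * These pfns lie beyond the domain's initial allocation
+ * (xen_start_info->nr_pages); clear their reserved bit and give them a
+ * valid page count so they are included in totalram_pages.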
*/ + for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) { + ClearPageReserved(pfn_to_page(pfn)); + init_page_count(pfn_to_page(pfn)); + totalram_pages++; + } + + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* + * Only count reserved RAM pages + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; + + set_highmem_pages_init(bad_ppro); + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10, + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) + ); + +#ifdef CONFIG_X86_PAE + if (!cpu_has_pae) + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); +#endif + if (boot_cpu_data.wp_works_ok < 0) + test_wp_bit(); + + /* + * Subtle. SMP is doing it's boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif + + set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); +} + +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifdef CONFIG_MEMORY_HOTPLUG +#ifndef CONFIG_NEED_MULTIPLE_NODES +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif +#endif + +kmem_cache_t *pgd_cache; +kmem_cache_t *pmd_cache; + +void __init pgtable_cache_init(void) +{ + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + 0, + pmd_ctor, + NULL); + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + pgd_cache = kmem_cache_create("pgd", +#ifndef CONFIG_XEN + PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), +#else + PAGE_SIZE, + PAGE_SIZE, +#endif + 0, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); +} + +/* + * This function cannot be __init, since exceptions don't work in that + * section. Put this after the callers, so that it cannot be inlined. 
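+ * It pokes a read-only fixmap page (FIX_WP_TEST) and relies on the
+ * exception table entry below to recover from the fault.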
+ */ +static int noinline do_test_wp_bit(void) +{ + char tmp_reg; + int flag; + + __asm__ __volatile__( + " movb %0,%1 \n" + "1: movb %1,%0 \n" + " xorl %2,%2 \n" + "2: \n" + ".section __ex_table,\"a\"\n" + " .align 4 \n" + " .long 1b,2b \n" + ".previous \n" + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), + "=q" (tmp_reg), + "=r" (flag) + :"2" (1) + :"memory"); + + return flag; +} + +#ifdef CONFIG_DEBUG_RODATA + +void mark_rodata_ro(void) +{ + unsigned long addr = (unsigned long)__start_rodata; + + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); + + printk("Write protecting the kernel read-only data: %uk\n", + (__end_rodata - __start_rodata) >> 10); + + /* + * change_page_attr() requires a global_flush_tlb() call after it. + * We do this after the printk so that if something went wrong in the + * change, the printk gets out at least to give a better debug hint + * of who is the culprit. + */ + global_flush_tlb(); +} +#endif + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/ioremap_32-xen.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,443 @@ +/* + * arch/i386/mm/ioremap.c + * + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <asm/io.h> +#include <asm/fixmap.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> + +#define ISA_START_ADDRESS 0x0 +#define ISA_END_ADDRESS 0x100000 + +static int direct_remap_area_pte_fn(pte_t *pte, + struct page *pmd_page, + unsigned long address, + void *data) +{ + mmu_update_t **v = (mmu_update_t **)data; + + BUG_ON(!pte_none(*pte)); + + (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + (*v)++; + + return 0; +} + +static int __direct_remap_pfn_range(struct mm_struct *mm, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int rc; + unsigned long i, start_address; + mmu_update_t *u, *v, *w; + + u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + if (u == NULL) + return -ENOMEM; + + start_address = address; + + flush_cache_all(); + + for (i = 0; i < size; i += PAGE_SIZE) { + if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) { + /* Flush a full batch after filling in the PTE ptrs. 
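+ * (apply_to_page_range() fills in the machine pointer of each pending
+ * entry via direct_remap_area_pte_fn before the whole batch is handed
+ * to HYPERVISOR_mmu_update() below.)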
*/ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) + goto out; + v = w = u; + start_address = address; + } + + /* + * Fill in the machine address: PTE ptr is done later by + * apply_to_page_range(). + */ + v->val = __pte_val(pfn_pte_ma(mfn, prot)) | _PAGE_IO; + + mfn++; + address += PAGE_SIZE; + v++; + } + + if (v != u) { + /* Final batch. */ + rc = apply_to_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + if (rc) + goto out; + rc = -EFAULT; + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) + goto out; + } + + rc = 0; + + out: + flush_tlb_all(); + + free_page((unsigned long)u); + + return rc; +} + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return remap_pfn_range(vma, address, mfn, size, prot); + + if (domid == DOMID_SELF) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED; + + vma->vm_mm->context.has_foreign_mappings = 1; + + return __direct_remap_pfn_range( + vma->vm_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_remap_pfn_range); + +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + return __direct_remap_pfn_range( + &init_mm, address, mfn, size, prot, domid); +} +EXPORT_SYMBOL(direct_kernel_remap_pfn_range); + +static int lookup_pte_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + uint64_t *ptep = (uint64_t *)data; + if (ptep) + *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + return 0; +} + +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep) +{ + return apply_to_page_range(mm, address, PAGE_SIZE, + lookup_pte_fn, ptep); +} + +EXPORT_SYMBOL(create_lookup_pte_addr); + +static int noop_fn( + pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) +{ + return 0; +} + +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size) +{ + return apply_to_page_range(mm, address, size, noop_fn, NULL); +} + +EXPORT_SYMBOL(touch_pte_range); + +/* + * Does @address reside within a non-highmem page that is local to this virtual + * machine (i.e., not an I/O page, nor a memory page belonging to another VM). + * See the comment that accompanies mfn_to_local_pfn() in page.h to understand + * why this works. + */ +static inline int is_local_lowmem(unsigned long address) +{ + extern unsigned long max_low_pfn; + return (mfn_to_local_pfn(address >> PAGE_SHIFT) < max_low_pfn); +} + +/* + * Generic mapping function (not visible outside): + */ + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. 
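+ * On Xen the actual mapping is made with machine frames: the request
+ * is issued with DOMID_IO by default, and with DOMID_SELF when the
+ * target turns out to be local lowmem RAM.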
+ */ +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +{ + void __iomem * addr; + struct vm_struct * area; + unsigned long offset, last_addr; + domid_t domid = DOMID_IO; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (is_initial_xendomain() && + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (void __iomem *) isa_bus_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (is_local_lowmem(phys_addr)) { + char *t_addr, *t_end; + struct page *page; + + t_addr = bus_to_virt(phys_addr); + t_end = t_addr + (size - 1); + + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) + if(!PageReserved(page)) + return NULL; + + domid = DOMID_SELF; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + flags |= _KERNPG_TABLE; + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, + phys_addr>>PAGE_SHIFT, + size, __pgprot(flags), domid)) { + vunmap((void __force *) addr); + return NULL; + } + return (void __iomem *) (offset + (char __iomem *)addr); +} +EXPORT_SYMBOL(__ioremap); + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ + +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + unsigned long last_addr; + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); + if (!p) + return p; + + /* Guaranteed to be > phys_addr, as per __ioremap() */ + last_addr = phys_addr + size - 1; + + if (is_local_lowmem(last_addr)) { + struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); + unsigned long npages; + + phys_addr &= PAGE_MASK; + + /* This might overflow and become zero.. */ + last_addr = PAGE_ALIGN(last_addr); + + /* .. but that's ok, because modulo-2**n arithmetic will make + * the page-aligned "last - first" come out right. + */ + npages = (last_addr - phys_addr) >> PAGE_SHIFT; + + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { + iounmap(p); + p = NULL; + } + global_flush_tlb(); + } + + return p; +} +EXPORT_SYMBOL(ioremap_nocache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. 
+ */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; + + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk("iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { + /* p->size includes the guard page, but cpa doesn't like that */ + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), + (p->size - PAGE_SIZE) >> PAGE_SHIFT, + PAGE_KERNEL); + global_flush_tlb(); + } + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (is_initial_xendomain() && + phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return isa_bus_to_virt(phys_addr); + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) + return NULL; + + /* + * Ok, go for it.. 
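+ * The boot-time mapping is assembled from fixmap slots, one page at a
+ * time, starting at FIX_BTMAP_BEGIN and stepping down through the
+ * indices.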
+ */ + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); +} + +void __init bt_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) + return; + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + clear_fixmap(idx); + --idx; + --nrpages; + } +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/pgtable_32-xen.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,731 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/mmu_context.h> + +#include <xen/features.h> +#include <asm/hypervisor.h> + +static void pgd_test_and_unpin(pgd_t *pgd); + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + unsigned long flags; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_online_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pgdat_page_nr(pgdat, i); + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + pgdat_resize_unlock(pgdat, &flags); + } + printk(KERN_INFO "%d pages of RAM\n", total); + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); + printk(KERN_INFO "%d reserved pages\n", reserved); + printk(KERN_INFO "%d pages shared\n", shared); + printk(KERN_INFO "%d pages swap cached\n", cached); + + printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); + printk(KERN_INFO "%lu pages writeback\n", + global_page_state(NR_WRITEBACK)); + printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); + printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB)); + printk(KERN_INFO "%lu pages pagetables\n", + global_page_state(NR_PAGETABLE)); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. 
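+ * Misaligned arguments or a missing pgd entry only produce a warning
+ * and an early return instead of a BUG().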
+ */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +static int nr_fixmaps = 0; +unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START; +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE); +EXPORT_SYMBOL(__FIXADDR_TOP); + +void __init set_fixaddr_top(unsigned long top) +{ + BUG_ON(nr_fixmaps > 0); + hypervisor_virt_start = top; + __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE; +} + +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + pte_t pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + switch (idx) { + case FIX_WP_TEST: + case FIX_VDSO: + pte = pfn_pte(phys >> PAGE_SHIFT, flags); + break; + default: + pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags); + break; + } + if (HYPERVISOR_update_va_mapping(address, pte, + UVMF_INVLPG|UVMF_ALL)) + BUG(); + nr_fixmaps++; +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + if (pte) + make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables); + return pte; +} + +static void _pte_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + pte_free(page); +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) { + SetPageForeign(pte, _pte_free); + init_page_count(pte); + } + return pte; +} + +void pte_free(struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + + if (!PageHighMem(pte)) { + unsigned long va = (unsigned long)__va(pfn << PAGE_SHIFT); + + if (!pte_write(*virt_to_ptep(va))) + if (HYPERVISOR_update_va_mapping( + va, pfn_pte(pfn, PAGE_KERNEL), 0)) + BUG(); + } else + clear_bit(PG_pinned, &pte->flags); + + ClearPageForeign(pte); + init_page_count(pte); + + __free_page(pte); +} + +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +{ + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * The locking scheme was chosen on the basis of manfred's + * recommendations and having no core impact whatsoever. 
+ * -- wli + */ +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + page->index = (unsigned long)pgd_list; + if (pgd_list) + set_page_private(pgd_list, (unsigned long)&page->index); + pgd_list = page; + set_page_private(page, (unsigned long)&pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + next = (struct page *)page->index; + pprev = (struct page **)page_private(page); + *pprev = next; + if (next) + set_page_private(next, (unsigned long)pprev); +} + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + if (PTRS_PER_PMD > 1) { + if (HAVE_SHARED_KERNEL_PMD) + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } else { + spin_lock_irqsave(&pgd_lock, flags); + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* never called when PTRS_PER_PMD > 1 */ +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; /* can be called from interrupt context */ + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + + pgd_test_and_unpin(pgd); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + pmd_t **pmd; + unsigned long flags; + + pgd_test_and_unpin(pgd); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + if (HAVE_SHARED_KERNEL_PMD) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } + return pgd; + } + + /* + * We can race save/restore (if we sleep during a GFP_KERNEL memory + * allocation). We therefore store virtual addresses of pmds as they + * do not change across save/restore, and poke the machine addresses + * into the pgdir under the pgd_lock. + */ + pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); + if (!pmd) { + kmem_cache_free(pgd_cache, pgd); + return NULL; + } + + /* Allocate pmds, remember virtual addresses. */ + for (i = 0; i < PTRS_PER_PGD; ++i) { + pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd[i]) + goto out_oom; + } + + spin_lock_irqsave(&pgd_lock, flags); + + /* Protect against save/restore: move below 4GB under pgd_lock. */ + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) { + int rc = xen_create_contiguous_region( + (unsigned long)pgd, 0, 32); + if (rc) { + spin_unlock_irqrestore(&pgd_lock, flags); + goto out_oom; + } + } + + /* Copy kernel pmd contents and write-protect the new pmds. */ + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { + unsigned long v = (unsigned long)i << PGDIR_SHIFT; + pgd_t *kpgd = pgd_offset_k(v); + pud_t *kpud = pud_offset(kpgd, v); + pmd_t *kpmd = pmd_offset(kpud, v); + memcpy(pmd[i], kpmd, PAGE_SIZE); + make_lowmem_page_readonly( + pmd[i], XENFEAT_writable_page_tables); + } + + /* It is safe to poke machine addresses of pmds under the pmd_lock. */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); + + /* Ensure this pgd gets picked up and pinned on save/restore. 
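+ * (mm_pin_all() walks pgd_list under pgd_lock, so adding the pgd
+ * before the lock is dropped ensures a concurrent save/restore cannot
+ * miss it.)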
*/ + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); + + kfree(pmd); + + return pgd; + +out_oom: + if (HAVE_SHARED_KERNEL_PMD) { + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, + (void *)__va(pgd_val(pgd[i])-1)); + } else { + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, pmd[i]); + kfree(pmd); + } + kmem_cache_free(pgd_cache, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + + /* + * After this the pgd should not be pinned for the duration of this + * function's execution. We should never sleep and thus never race: + * 1. User pmds will not become write-protected under our feet due + * to a concurrent mm_pin_all(). + * 2. The machine addresses in PGD entries will not become invalid + * due to a concurrent save/restore. + */ + pgd_test_and_unpin(pgd); + + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); + kmem_cache_free(pmd_cache, pmd); + } + + if (!HAVE_SHARED_KERNEL_PMD) { + unsigned long flags; + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + + for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); + make_lowmem_page_writable( + pmd, XENFEAT_writable_page_tables); + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + kmem_cache_free(pmd_cache, pmd); + } + + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) + xen_destroy_contiguous_region( + (unsigned long)pgd, 0); + } + } + + /* in the non-PAE case, free_pgtables() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); +} + +void make_lowmem_page_readonly(void *va, unsigned int feature) +{ + pte_t *pte; + int rc; + + if (xen_feature(feature)) + return; + + pte = virt_to_ptep(va); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_wrprotect(*pte), 0); + BUG_ON(rc); +} + +void make_lowmem_page_writable(void *va, unsigned int feature) +{ + pte_t *pte; + int rc; + + if (xen_feature(feature)) + return; + + pte = virt_to_ptep(va); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_mkwrite(*pte), 0); + BUG_ON(rc); +} + +void make_page_readonly(void *va, unsigned int feature) +{ + pte_t *pte; + int rc; + + if (xen_feature(feature)) + return; + + pte = virt_to_ptep(va); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_wrprotect(*pte), 0); + if (rc) /* fallback? */ + xen_l1_entry_update(pte, pte_wrprotect(*pte)); + if ((unsigned long)va >= (unsigned long)high_memory) { + unsigned long pfn = pte_pfn(*pte); +#ifdef CONFIG_HIGHMEM + if (pfn >= highstart_pfn) + kmap_flush_unused(); /* flush stale writable kmaps */ + else +#endif + make_lowmem_page_readonly( + phys_to_virt(pfn << PAGE_SHIFT), feature); + } +} + +void make_page_writable(void *va, unsigned int feature) +{ + pte_t *pte; + int rc; + + if (xen_feature(feature)) + return; + + pte = virt_to_ptep(va); + rc = HYPERVISOR_update_va_mapping( + (unsigned long)va, pte_mkwrite(*pte), 0); + if (rc) /* fallback? 
*/ + xen_l1_entry_update(pte, pte_mkwrite(*pte)); + if ((unsigned long)va >= (unsigned long)high_memory) { + unsigned long pfn = pte_pfn(*pte); +#ifdef CONFIG_HIGHMEM + if (pfn < highstart_pfn) +#endif + make_lowmem_page_writable( + phys_to_virt(pfn << PAGE_SHIFT), feature); + } +} + +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature) +{ + if (xen_feature(feature)) + return; + + while (nr-- != 0) { + make_page_readonly(va, feature); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} + +void make_pages_writable(void *va, unsigned int nr, unsigned int feature) +{ + if (xen_feature(feature)) + return; + + while (nr-- != 0) { + make_page_writable(va, feature); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} + +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in dead locks, and the order of acquires + * doesn't matter. + */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + +#define PIN_BATCH 4 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); + +static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags, + unsigned int cpu, unsigned seq) +{ + unsigned long pfn = page_to_pfn(page); + + if (PageHighMem(page)) { + if (pgprot_val(flags) & _PAGE_RW) + clear_bit(PG_pinned, &page->flags); + else + set_bit(PG_pinned, &page->flags); + } else { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); + if (unlikely(++seq == PIN_BATCH)) { + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + PIN_BATCH, NULL))) + BUG(); + seq = 0; + } + } + + return seq; +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + int g, u, m; + unsigned int cpu, seq; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + cpu = get_cpu(); + + for (g = 0, seq = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + seq = 
pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq); + } + } + } + + if (likely(seq != 0)) { + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + seq + 1, NULL))) + BUG(); + } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH)) + BUG(); + + put_cpu(); +} + +static void __pgd_pin(pgd_t *pgd) +{ + pgd_walk(pgd, PAGE_KERNEL_RO); + kmap_flush_unused(); + xen_pgd_pin(__pa(pgd)); + set_bit(PG_pinned, &virt_to_page(pgd)->flags); +} + +static void __pgd_unpin(pgd_t *pgd) +{ + xen_pgd_unpin(__pa(pgd)); + pgd_walk(pgd, PAGE_KERNEL); + clear_bit(PG_pinned, &virt_to_page(pgd)->flags); +} + +static void pgd_test_and_unpin(pgd_t *pgd) +{ + if (test_bit(PG_pinned, &virt_to_page(pgd)->flags)) + __pgd_unpin(pgd); +} + +void mm_pin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + pin_lock(mm); + __pgd_pin(mm->pgd); + pin_unlock(mm); +} + +void mm_unpin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + pin_lock(mm); + __pgd_unpin(mm->pgd); + pin_unlock(mm); +} + +void mm_pin_all(void) +{ + struct page *page; + unsigned long flags; + + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + /* + * Allow uninterrupted access to the pgd_list. Also protects + * __pgd_pin() by disabling preemption. + * All other CPUs must be at a safe point (e.g., in stop_machine + * or offlined entirely). + */ + spin_lock_irqsave(&pgd_lock, flags); + for (page = pgd_list; page; page = (struct page *)page->index) { + if (!test_bit(PG_pinned, &page->flags)) + __pgd_pin((pgd_t *)page_address(page)); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} + +void _arch_dup_mmap(struct mm_struct *mm) +{ + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) + mm_pin(mm); +} + +void _arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) && + (atomic_read(&mm->mm_count) == 1) && + !mm->context.has_foreign_mappings) + mm_unpin(mm); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/oprofile/xenoprof.c 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,179 @@ +/** + * @file xenoprof.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + * + * Modified by Aravind Menon and Jose Renato Santos for Xen + * These modifications are: + * Copyright (C) 2005 Hewlett-Packard Co. 
+ * + * x86-specific part + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + */ + +#include <linux/init.h> +#include <linux/oprofile.h> +#include <linux/sched.h> +#include <asm/pgtable.h> + +#include <xen/driver_util.h> +#include <xen/interface/xen.h> +#include <xen/interface/xenoprof.h> +#include <xen/xenoprof.h> +#include "op_counter.h" + +static unsigned int num_events = 0; + +void __init xenoprof_arch_init_counter(struct xenoprof_init *init) +{ + num_events = init->num_events; + /* just in case - make sure we do not overflow event list + (i.e. counter_config list) */ + if (num_events > OP_MAX_COUNTER) { + num_events = OP_MAX_COUNTER; + init->num_events = num_events; + } +} + +void xenoprof_arch_counter(void) +{ + int i; + struct xenoprof_counter counter; + + for (i=0; i<num_events; i++) { + counter.ind = i; + counter.count = (uint64_t)counter_config[i].count; + counter.enabled = (uint32_t)counter_config[i].enabled; + counter.event = (uint32_t)counter_config[i].event; + counter.kernel = (uint32_t)counter_config[i].kernel; + counter.user = (uint32_t)counter_config[i].user; + counter.unit_mask = (uint64_t)counter_config[i].unit_mask; + WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter, + &counter)); + } +} + +void xenoprof_arch_start(void) +{ + /* nothing */ +} + +void xenoprof_arch_stop(void) +{ + /* nothing */ +} + +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf) +{ + if (sbuf->buffer) { + vunmap(sbuf->buffer); + sbuf->buffer = NULL; + } +} + +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer, + struct xenoprof_shared_buffer * sbuf) +{ + int npages, ret; + struct vm_struct *area; + + sbuf->buffer = NULL; + if ( (ret = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer)) ) + return ret; + + npages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1; + + area = alloc_vm_area(npages * PAGE_SIZE); + if (area == NULL) + return -ENOMEM; + + if ( (ret = direct_kernel_remap_pfn_range( + (unsigned long)area->addr, + get_buffer->buf_gmaddr >> PAGE_SHIFT, + npages * PAGE_SIZE, __pgprot(_KERNPG_TABLE), + DOMID_SELF)) ) { + vunmap(area->addr); + return ret; + } + + sbuf->buffer = area->addr; + return ret; +} + +int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain, + struct xenoprof_shared_buffer * sbuf) +{ + int ret; + int npages; + struct vm_struct *area; + pgprot_t prot = __pgprot(_KERNPG_TABLE); + + sbuf->buffer = NULL; + ret = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain); + if (ret) + goto out; + + npages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1; + + area = alloc_vm_area(npages * PAGE_SIZE); + if (area == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = direct_kernel_remap_pfn_range( + (unsigned long)area->addr, + pdomain->buf_gmaddr >> PAGE_SHIFT, + npages * PAGE_SIZE, prot, DOMID_SELF); + if (ret) { + vunmap(area->addr); + goto out; + } + sbuf->buffer = area->addr; + +out: + return ret; +} + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +int xenoprof_create_files(struct super_block * sb, struct dentry * root) +{ + unsigned int i; + + for (i = 0; i < num_events; ++i) { + struct dentry * dir; + char buf[2]; + + snprintf(buf, 2, "%d", i); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enabled", + &counter_config[i].enabled); + oprofilefs_create_ulong(sb, dir, "event", + &counter_config[i].event); + oprofilefs_create_ulong(sb, dir, "count", + &counter_config[i].count); + oprofilefs_create_ulong(sb, 
dir, "unit_mask", + &counter_config[i].unit_mask); + oprofilefs_create_ulong(sb, dir, "kernel", + &counter_config[i].kernel); + oprofilefs_create_ulong(sb, dir, "user", + &counter_config[i].user); + } + + return 0; +} + +int __init oprofile_arch_init(struct oprofile_operations * ops) +{ + return xenoprofile_init(ops); +} + +void oprofile_arch_exit(void) +{ + xenoprofile_exit(); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/pci/irq-xen.c 2008-03-06 08:54:32.000000000 +0100 @@ -0,0 +1,1211 @@ +/* + * Low-Level PCI Support for PC -- Routing of Interrupts + * + * (c) 1999--2000 Martin Mares <mj@ucw.cz> + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/dmi.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/io_apic.h> +#include <linux/irq.h> +#include <linux/acpi.h> + +#include "pci.h" + +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) +#define PIRQ_VERSION 0x0100 + +static int broken_hp_bios_irq9; +static int acer_tm360_irqrouting; + +static struct irq_routing_table *pirq_table; + +static int pirq_enable_irq(struct pci_dev *dev); + +/* + * Never use: 0, 1, 2 (timer, keyboard, and cascade) + * Avoid using: 13, 14 and 15 (FP error and IDE). + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse) + */ +unsigned int pcibios_irq_mask = 0xfff8; + +static int pirq_penalty[16] = { + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000, + 0, 0, 0, 0, 1000, 100000, 100000, 100000 +}; + +struct irq_router { + char *name; + u16 vendor, device; + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); +}; + +struct irq_router_handler { + u16 vendor; + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); +}; + +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; + +/* + * Check passed address for the PCI IRQ Routing Table signature + * and perform checksum verification. + */ + +static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) +{ + struct irq_routing_table *rt; + int i; + u8 sum; + + rt = (struct irq_routing_table *) addr; + if (rt->signature != PIRQ_SIGNATURE || + rt->version != PIRQ_VERSION || + rt->size % 16 || + rt->size < sizeof(struct irq_routing_table)) + return NULL; + sum = 0; + for (i=0; i < rt->size; i++) + sum += addr[i]; + if (!sum) { + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); + return rt; + } + return NULL; +} + + + +/* + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. + */ + +static struct irq_routing_table * __init pirq_find_routing_table(void) +{ + u8 *addr; + struct irq_routing_table *rt; + +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return NULL; +#endif + if (pirq_table_addr) { + rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr)); + if (rt) + return rt; + printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); + } + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) { + rt = pirq_check_routing_table(addr); + if (rt) + return rt; + } + return NULL; +} + +/* + * If we have a IRQ routing table, use it to search for peer host + * bridges. It's a gross hack, but since there are no other known + * ways how to get a list of buses, we have to go this way. 
+ */ + +static void __init pirq_peer_trick(void) +{ + struct irq_routing_table *rt = pirq_table; + u8 busmap[256]; + int i; + struct irq_info *e; + + memset(busmap, 0, sizeof(busmap)); + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + e = &rt->slots[i]; +#ifdef DEBUG + { + int j; + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); + for(j=0; j<4; j++) + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); + DBG("\n"); + } +#endif + busmap[e->bus] = 1; + } + for(i = 1; i < 256; i++) { + if (!busmap[i] || pci_find_bus(0, i)) + continue; + if (pci_scan_bus(i, &pci_root_ops, NULL)) + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i); + } + pcibios_last_bus = -1; +} + +/* + * Code for querying and setting of IRQ routes on various interrupt routers. + */ + +void eisa_set_level_irq(unsigned int irq) +{ + unsigned char mask = 1 << (irq & 7); + unsigned int port = 0x4d0 + (irq >> 3); + unsigned char val; + static u16 eisa_irq_mask; + + if (irq >= 16 || (1 << irq) & eisa_irq_mask) + return; + + eisa_irq_mask |= (1 << irq); + printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); + val = inb(port); + if (!(val & mask)) { + DBG(KERN_DEBUG " -> edge"); + outb(val | mask, port); + } +} + +/* + * Common IRQ routing practice: nybbles in config space, + * offset by some magic constant. + */ +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + return (nr & 1) ? (x >> 4) : (x & 0xf); +} + +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val); + pci_write_config_byte(router, reg, x); +} + +/* + * ALI pirq entries are damn ugly, and completely undocumented. + * This has been figured out from pirq tables, and it's not a pretty + * picture. + */ +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + + return irqmap[read_config_nybble(router, 0x48, pirq-1)]; +} + +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; + unsigned int val = irqmap[irq]; + + if (val) { + write_config_nybble(router, 0x48, pirq-1, val); + return 1; + } + return 0; +} + +/* + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is + * just a pointer to the config space. + */ +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + + pci_read_config_byte(router, pirq, &x); + return (x < 16) ? x : 0; +} + +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + pci_write_config_byte(router, pirq, irq); + return 1; +} + +/* + * The VIA pirq rules are nibble-based, like ALI, + * but without the ugly irq number munging. + * However, PIRQD is in the upper instead of lower 4 bits. + */ +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); +} + +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x55, pirq == 4 ? 
5 : pirq, irq); + return 1; +} + +/* + * The VIA pirq rules are nibble-based, like ALI, + * but without the ugly irq number munging. + * However, for 82C586, nibble map is different . + */ +static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + return read_config_nybble(router, 0x55, pirqmap[pirq-1]); +} + +static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); + return 1; +} + +/* + * ITE 8330G pirq rules are nibble-based + * FIXME: pirqmap may be { 1, 0, 3, 2 }, + * 2+3 are both mapped to irq 9 on my system + */ +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + return read_config_nybble(router,0x43, pirqmap[pirq-1]); +} + +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); + return 1; +} + +/* + * OPTI: high four bits are nibble pointer.. + * I wonder what the low bits do? + */ +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0xb8, pirq >> 4); +} + +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0xb8, pirq >> 4, irq); + return 1; +} + +/* + * Cyrix: nibble offset 0x5C + * 0x5C bits 7:4 is INTB bits 3:0 is INTA + * 0x5D bits 7:4 is INTD bits 3:0 is INTC + */ +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x5C, (pirq-1)^1); +} + +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x5C, (pirq-1)^1, irq); + return 1; +} + +/* + * PIRQ routing for SiS 85C503 router used in several SiS chipsets. + * We have to deal with the following issues here: + * - vendors have different ideas about the meaning of link values + * - some onboard devices (integrated in the chipset) have special + * links and are thus routed differently (i.e. not via PCI INTA-INTD) + * - different revision of the router have a different layout for + * the routing registers, particularly for the onchip devices + * + * For all routing registers the common thing is we have one byte + * per routeable link which is defined as: + * bit 7 IRQ mapping enabled (0) or disabled (1) + * bits [6:4] reserved (sometimes used for onchip devices) + * bits [3:0] IRQ to map to + * allowed: 3-7, 9-12, 14-15 + * reserved: 0, 1, 2, 8, 13 + * + * The config-space registers located at 0x41/0x42/0x43/0x44 are + * always used to route the normal PCI INT A/B/C/D respectively. + * Apparently there are systems implementing PCI routing table using + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. + * We try our best to handle both link mappings. + * + * Currently (2003-05-21) it appears most SiS chipsets follow the + * definition of routing registers from the SiS-5595 southbridge. + * According to the SiS 5595 datasheets the revision id's of the + * router (ISA-bridge) should be 0x01 or 0xb0. + * + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1. + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets. 
+ * They seem to work with the current routing code. However there is + * some concern because of the two USB-OHCI HCs (original SiS 5595 + * had only one). YMMV. + * + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1: + * + * 0x61: IDEIRQ: + * bits [6:5] must be written 01 + * bit 4 channel-select primary (0), secondary (1) + * + * 0x62: USBIRQ: + * bit 6 OHCI function disabled (0), enabled (1) + * + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved + * + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved + * + * We support USBIRQ (in addition to INTA-INTD) and keep the + * IDE, ACPI and DAQ routing untouched as set by the BIOS. + * + * Currently the only reported exception is the new SiS 65x chipset + * which includes the SiS 69x southbridge. Here we have the 85C503 + * router revision 0x04 and there are changes in the register layout + * mostly related to the different USB HCs with USB 2.0 support. + * + * Onchip routing for router rev-id 0x04 (try-and-error observation) + * + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs + * bit 6-4 are probably unused, not like 5595 + */ + +#define PIRQ_SIS_IRQ_MASK 0x0f +#define PIRQ_SIS_IRQ_DISABLE 0x80 +#define PIRQ_SIS_USB_ENABLE 0x40 + +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); +} + +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; + pci_write_config_byte(router, reg, x); + return 1; +} + + +/* + * VLSI: nibble offset 0x74 - educated guess due to routing table and + * config space of VLSI 82C534 PCI-bridge/router (1004:0102) + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6 + * for the busbridge to the docking station. + */ + +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + return read_config_nybble(router, 0x74, pirq-1); +} + +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + write_config_nybble(router, 0x74, pirq-1, irq); + return 1; +} + +/* + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect + * register is a straight binary coding of desired PIC IRQ (low nibble). + * + * The 'link' value in the PIRQ table is already in the correct format + * for the Index register. There are some special index values: + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, + * and 0x03 for SMBus. 
+ */ +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + outb_p(pirq, 0xc00); + return inb(0xc01) & 0xf; +} + +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + outb_p(pirq, 0xc00); + outb_p(irq, 0xc01); + return 1; +} + +/* Support for AMD756 PCI IRQ Routing + * Jhon H. Caicedo <jhcaiced@osso.org.co> + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced) + * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced) + * The AMD756 pirq rules are nibble-based + * offset 0x56 0-3 PIRQA 4-7 PIRQB + * offset 0x57 0-3 PIRQC 4-7 PIRQD + */ +static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 irq; + irq = 0; + if (pirq <= 4) + { + irq = read_config_nybble(router, 0x56, pirq - 1); + } + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", + dev->vendor, dev->device, pirq, irq); + return irq; +} + +static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", + dev->vendor, dev->device, pirq, irq); + if (pirq <= 4) + { + write_config_nybble(router, 0x56, pirq - 1, irq); + } + return 1; +} + +#ifdef CONFIG_PCI_BIOS + +static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + struct pci_dev *bridge; + int pin = pci_get_interrupt_pin(dev, &bridge); + return pcibios_set_irq_routing(bridge, pin, irq); +} + +#endif + +static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + static struct pci_device_id __initdata pirq_440gx[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, + { }, + }; + + /* 440GX has a proprietary PIRQ router -- don't use it */ + if (pci_dev_present(pirq_440gx)) + return 0; + + switch(device) + { + case PCI_DEVICE_ID_INTEL_82371FB_0: + case PCI_DEVICE_ID_INTEL_82371SB_0: + case PCI_DEVICE_ID_INTEL_82371AB_0: + case PCI_DEVICE_ID_INTEL_82371MX: + case PCI_DEVICE_ID_INTEL_82443MX_0: + case PCI_DEVICE_ID_INTEL_82801AA_0: + case PCI_DEVICE_ID_INTEL_82801AB_0: + case PCI_DEVICE_ID_INTEL_82801BA_0: + case PCI_DEVICE_ID_INTEL_82801BA_10: + case PCI_DEVICE_ID_INTEL_82801CA_0: + case PCI_DEVICE_ID_INTEL_82801CA_12: + case PCI_DEVICE_ID_INTEL_82801DB_0: + case PCI_DEVICE_ID_INTEL_82801E_0: + case PCI_DEVICE_ID_INTEL_82801EB_0: + case PCI_DEVICE_ID_INTEL_ESB_1: + case PCI_DEVICE_ID_INTEL_ICH6_0: + case PCI_DEVICE_ID_INTEL_ICH6_1: + case PCI_DEVICE_ID_INTEL_ICH7_0: + case PCI_DEVICE_ID_INTEL_ICH7_1: + case PCI_DEVICE_ID_INTEL_ICH7_30: + case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_ESB2_0: + case PCI_DEVICE_ID_INTEL_ICH8_0: + case PCI_DEVICE_ID_INTEL_ICH8_1: + case PCI_DEVICE_ID_INTEL_ICH8_2: + case PCI_DEVICE_ID_INTEL_ICH8_3: + case PCI_DEVICE_ID_INTEL_ICH8_4: + case PCI_DEVICE_ID_INTEL_ICH9_0: + case PCI_DEVICE_ID_INTEL_ICH9_1: + case PCI_DEVICE_ID_INTEL_ICH9_2: + case PCI_DEVICE_ID_INTEL_ICH9_3: + case PCI_DEVICE_ID_INTEL_ICH9_4: + case PCI_DEVICE_ID_INTEL_ICH9_5: + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; +} + +static __init int via_router_probe(struct irq_router *r, + struct pci_dev *router, u16 device) +{ + /* FIXME: We should move some of the quirk fixup stuff here */ + + /* + * work arounds for some buggy BIOSes + */ + if (device == PCI_DEVICE_ID_VIA_82C586_0) { + 
switch(router->device) { + case PCI_DEVICE_ID_VIA_82C686: + /* + * Asus k7m bios wrongly reports 82C686A + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_82C686; + break; + case PCI_DEVICE_ID_VIA_8235: + /** + * Asus a7v-x bios wrongly reports 8235 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8235; + break; + } + } + + switch(device) { + case PCI_DEVICE_ID_VIA_82C586_0: + r->name = "VIA"; + r->get = pirq_via586_get; + r->set = pirq_via586_set; + return 1; + case PCI_DEVICE_ID_VIA_82C596: + case PCI_DEVICE_ID_VIA_82C686: + case PCI_DEVICE_ID_VIA_8231: + case PCI_DEVICE_ID_VIA_8233A: + case PCI_DEVICE_ID_VIA_8235: + case PCI_DEVICE_ID_VIA_8237: + /* FIXME: add new ones for 8233/5 */ + r->name = "VIA"; + r->get = pirq_via_get; + r->set = pirq_via_set; + return 1; + } + return 0; +} + +static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_VLSI_82C534: + r->name = "VLSI 82C534"; + r->get = pirq_vlsi_get; + r->set = pirq_vlsi_set; + return 1; + } + return 0; +} + + +static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_SERVERWORKS_OSB4: + case PCI_DEVICE_ID_SERVERWORKS_CSB5: + r->name = "ServerWorks"; + r->get = pirq_serverworks_get; + r->set = pirq_serverworks_set; + return 1; + } + return 0; +} + +static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + if (device != PCI_DEVICE_ID_SI_503) + return 0; + + r->name = "SIS"; + r->get = pirq_sis_get; + r->set = pirq_sis_set; + return 1; +} + +static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_CYRIX_5520: + r->name = "NatSemi"; + r->get = pirq_cyrix_get; + r->set = pirq_cyrix_set; + return 1; + } + return 0; +} + +static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_OPTI_82C700: + r->name = "OPTI"; + r->get = pirq_opti_get; + r->set = pirq_opti_set; + return 1; + } + return 0; +} + +static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_ITE_IT8330G_0: + r->name = "ITE"; + r->get = pirq_ite_get; + r->set = pirq_ite_set; + return 1; + } + return 0; +} + +static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_AL_M1533: + case PCI_DEVICE_ID_AL_M1563: + printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); + r->name = "ALI"; + r->get = pirq_ali_get; + r->set = pirq_ali_set; + return 1; + } + return 0; +} + +static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch(device) + { + case PCI_DEVICE_ID_AMD_VIPER_740B: + r->name = "AMD756"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7413: + r->name = "AMD766"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7443: + r->name = "AMD768"; + break; + default: + return 0; + } + r->get = pirq_amd756_get; + r->set = pirq_amd756_set; + return 1; +} + +static __initdata struct irq_router_handler pirq_routers[] = { + { PCI_VENDOR_ID_INTEL, intel_router_probe }, + { PCI_VENDOR_ID_AL, ali_router_probe }, + { PCI_VENDOR_ID_ITE, ite_router_probe }, + { PCI_VENDOR_ID_VIA, via_router_probe }, + { PCI_VENDOR_ID_OPTI, opti_router_probe }, + { PCI_VENDOR_ID_SI, sis_router_probe }, + { PCI_VENDOR_ID_CYRIX, cyrix_router_probe }, + { 
PCI_VENDOR_ID_VLSI, vlsi_router_probe }, + { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe }, + { PCI_VENDOR_ID_AMD, amd_router_probe }, + /* Someone with docs needs to add the ATI Radeon IGP */ + { 0, NULL } +}; +static struct irq_router pirq_router; +static struct pci_dev *pirq_router_dev; + + +/* + * FIXME: should we have an option to say "generic for + * chipset" ? + */ + +static void __init pirq_find_router(struct irq_router *r) +{ + struct irq_routing_table *rt = pirq_table; + struct irq_router_handler *h; + +#ifdef CONFIG_PCI_BIOS + if (!rt->signature) { + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n"); + r->set = pirq_bios_set; + r->name = "BIOS"; + return; + } +#endif + + /* Default unless a driver reloads it */ + r->name = "default"; + r->get = NULL; + r->set = NULL; + + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + rt->rtr_vendor, rt->rtr_device); + + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); + if (!pirq_router_dev) { + DBG(KERN_DEBUG "PCI: Interrupt router not found at " + "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); + return; + } + + for( h = pirq_routers; h->vendor; h++) { + /* First look for a router match */ + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) + break; + /* Fall back to a device match */ + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) + break; + } + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", + pirq_router.name, + pirq_router_dev->vendor, + pirq_router_dev->device, + pci_name(pirq_router_dev)); +} + +static struct irq_info *pirq_get_info(struct pci_dev *dev) +{ + struct irq_routing_table *rt = pirq_table; + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + struct irq_info *info; + + for (info = rt->slots; entries--; info++) + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + return info; + return NULL; +} + +static int pcibios_lookup_irq(struct pci_dev *dev, int assign) +{ + u8 pin; + struct irq_info *info; + int i, pirq, newirq; + int irq = 0; + u32 mask; + struct irq_router *r = &pirq_router; + struct pci_dev *dev2 = NULL; + char *msg = NULL; + + /* Find IRQ pin */ + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (!pin) { + DBG(KERN_DEBUG " -> no interrupt pin\n"); + return 0; + } + pin = pin - 1; + + /* Find IRQ routing entry */ + + if (!pirq_table) + return 0; + + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); + info = pirq_get_info(dev); + if (!info) { + DBG(" -> not found in routing table\n" KERN_DEBUG); + return 0; + } + pirq = info->irq[pin].link; + mask = info->irq[pin].bitmap; + if (!pirq) { + DBG(" -> not routed\n" KERN_DEBUG); + return 0; + } + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); + mask &= pcibios_irq_mask; + + /* Work around broken HP Pavilion Notebooks which assign USB to + IRQ 9 even though it is actually wired to IRQ 11 */ + + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) { + dev->irq = 11; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11); + r->set(pirq_router_dev, dev, pirq, 11); + } + + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { + pirq = 0x68; + mask = 0x400; + dev->irq = r->get(pirq_router_dev, dev, pirq); + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); + } + + /* + * Find the best IRQ to assign: use the 
one + * reported by the device if possible. + */ + newirq = dev->irq; + if (newirq && !((1 << newirq) & mask)) { + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; + else printk("\n" KERN_WARNING + "PCI: IRQ %i for device %s doesn't match PIRQ mask " + "- try pci=usepirqmask\n" KERN_DEBUG, newirq, + pci_name(dev)); + } + if (!newirq && assign) { + for (i = 0; i < 16; i++) { + if (!(mask & (1 << i))) + continue; + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) + newirq = i; + } + } + DBG(" -> newirq=%d", newirq); + + /* Check if it is hardcoded */ + if ((pirq & 0xf0) == 0xf0) { + irq = pirq & 0xf; + DBG(" -> hardcoded IRQ %d\n", irq); + msg = "Hardcoded"; + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { + DBG(" -> got IRQ %d\n", irq); + msg = "Found"; + eisa_set_level_irq(irq); + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { + DBG(" -> assigning IRQ %d", newirq); + if (r->set(pirq_router_dev, dev, pirq, newirq)) { + eisa_set_level_irq(newirq); + DBG(" ... OK\n"); + msg = "Assigned"; + irq = newirq; + } + } + + if (!irq) { + DBG(" ... failed\n"); + if (newirq && mask == (1 << newirq)) { + msg = "Guessed"; + irq = newirq; + } else + return 0; + } + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); + + /* Update IRQ for all devices with the same pirq value */ + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); + if (!pin) + continue; + pin--; + info = pirq_get_info(dev2); + if (!info) + continue; + if (info->irq[pin].link == pirq) { + /* We refuse to override the dev->irq information. Give a warning! */ + if ( dev2->irq && dev2->irq != irq && \ + (!(pci_probe & PCI_USE_PIRQ_MASK) || \ + ((1 << dev2->irq) & mask)) ) { +#ifndef CONFIG_PCI_MSI + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", + pci_name(dev2), dev2->irq, irq); +#endif + continue; + } + dev2->irq = irq; + pirq_penalty[irq]++; + if (dev != dev2) + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); + } + } + return 1; +} + +static void __init pcibios_fixup_irqs(void) +{ + struct pci_dev *dev = NULL; + u8 pin; + + DBG(KERN_DEBUG "PCI: IRQ fixup\n"); + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + /* + * If the BIOS has set an out of range IRQ number, just ignore it. + * Also keep track of which IRQ's are already in use. + */ + if (dev->irq >= 16) { + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); + dev->irq = 0; + } + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) + pirq_penalty[dev->irq] = 0; + pirq_penalty[dev->irq]++; + } + + dev = NULL; + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); +#ifdef CONFIG_X86_IO_APIC + /* + * Recalculate IRQ numbers if we use the I/O APIC. + */ + if (io_apic_assign_pci_irqs) + { + int irq; + + if (pin) { + pin--; /* interrupt pins are numbered starting from 1 */ + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. 
The SMP code detects such bridged + * busses itself so we should get into this branch reliably. + */ + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + } + if (irq >= 0) { + if (use_pci_vector() && + !platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); + + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + } + } + } +#endif + /* + * Still no IRQ? Try to lookup one... + */ + if (pin && !dev->irq) + pcibios_lookup_irq(dev, 0); + } +} + +/* + * Work around broken HP Pavilion Notebooks which assign USB to + * IRQ 9 even though it is actually wired to IRQ 11 + */ +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) +{ + if (!broken_hp_bios_irq9) { + broken_hp_bios_irq9 = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +/* + * Work around broken Acer TravelMate 360 Notebooks which assign + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 + */ +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) +{ + if (!acer_tm360_irqrouting) { + acer_tm360_irqrouting = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pciirq_dmi_table[] = { + { + .callback = fix_broken_hp_bios_irq9, + .ident = "HP Pavilion N5400 Series Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), + }, + }, + { + .callback = fix_acer_tm360_irqrouting, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + { } +}; + +static int __init pcibios_irq_init(void) +{ + DBG(KERN_DEBUG "PCI: IRQ init\n"); + + if (pcibios_enable_irq || raw_pci_ops == NULL) + return 0; + + dmi_check_system(pciirq_dmi_table); + + pirq_table = pirq_find_routing_table(); + +#ifdef CONFIG_PCI_BIOS + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + pirq_table = pcibios_get_irq_routing_table(); +#endif + if (pirq_table) { + pirq_peer_trick(); + pirq_find_router(&pirq_router); + if (pirq_table->exclusive_irqs) { + int i; + for (i=0; i<16; i++) + if (!(pirq_table->exclusive_irqs & (1 << i))) + pirq_penalty[i] += 100; + } + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ + if (io_apic_assign_pci_irqs) + pirq_table = NULL; + } + + pcibios_enable_irq = pirq_enable_irq; + + pcibios_fixup_irqs(); + return 0; +} + +subsys_initcall(pcibios_irq_init); + + +static void pirq_penalize_isa_irq(int irq, int active) +{ + /* + * If any ISAPnP device reports an IRQ in its list of possible + * IRQ's, we try to avoid assigning it to PCI devices. 
+ */ + if (irq < 16) { + if (active) + pirq_penalty[irq] += 1000; + else + pirq_penalty[irq] += 100; + } +} + +void pcibios_penalize_isa_irq(int irq, int active) +{ +#ifdef CONFIG_ACPI + if (!acpi_noirq) + acpi_penalize_isa_irq(irq, active); + else +#endif + pirq_penalize_isa_irq(irq, active); +} + +static int pirq_enable_irq(struct pci_dev *dev) +{ + u8 pin; + struct pci_dev *temp_dev; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + char *msg = ""; + + pin--; /* interrupt pins are numbered starting from 1 */ + + if (io_apic_assign_pci_irqs) { + int irq; + + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. + */ + temp_dev = dev; + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + dev = bridge; + } + dev = temp_dev; + if (irq >= 0) { +#ifdef CONFIG_PCI_MSI + if (!platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); +#endif + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + return 0; + } else + msg = " Probably buggy MP table."; + } else if (pci_probe & PCI_BIOS_IRQ_SCAN) + msg = ""; + else + msg = " Please try using pci=biosirq."; + + /* With IDE legacy devices the IRQ lookup failure is not a problem.. 
*/ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) + return 0; + + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", + 'A' + pin, pci_name(dev), msg); + } + return 0; +} + +int pci_vector_resources(int last, int nr_released) +{ + int count = nr_released; + + int next = last; + int offset = (last % 8); + + while (next < FIRST_SYSTEM_VECTOR) { + next += 8; +#ifdef CONFIG_X86_64 + if (next == IA32_SYSCALL_VECTOR) + continue; +#else + if (next == SYSCALL_VECTOR) + continue; +#endif + count++; + if (next >= FIRST_SYSTEM_VECTOR) { + if (offset%8) { + next = FIRST_DEVICE_VECTOR + offset; + offset++; + continue; + } + count--; + } + } + + return count; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/pci/pcifront.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,57 @@ +/* + * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core + * to support the Xen PCI Frontend's operation + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <asm/acpi.h> +#include <xen/evtchn.h> +#include "pci.h" + +static int pcifront_enable_irq(struct pci_dev *dev) +{ + u8 irq; + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); + evtchn_register_pirq(irq); + dev->irq = irq; + + return 0; +} + +extern u8 pci_cache_line_size; + +static int __init pcifront_x86_stub_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* Only install our method if we haven't found real hardware already */ + if (raw_pci_ops) + return 0; + + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); + + /* Copied from arch/i386/pci/common.c */ + pci_cache_line_size = 32 >> 2; + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) + pci_cache_line_size = 64 >> 2; /* K7 & K8 */ + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) + pci_cache_line_size = 128 >> 2; /* P4 */ + + /* On x86, we need to disable the normal IRQ routing table and + * just ask the backend + */ + pcibios_enable_irq = pcifront_enable_irq; + pcibios_disable_irq = NULL; + +#ifdef CONFIG_ACPI + /* Keep ACPI out of the picture */ + acpi_noirq = 1; +#endif + + return 0; +} + +arch_initcall(pcifront_x86_stub_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/ia32/ia32entry-xen.S 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,666 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
+ */ + +#include <asm/dwarf2.h> +#include <asm/calling.h> +#include <asm/asm-offsets.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/ia32_unistd.h> +#include <asm/thread_info.h> +#include <asm/segment.h> +#include <asm/vsyscall32.h> +#include <asm/irqflags.h> +#include <linux/linkage.h> + +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) + + .macro IA32_ARG_FIXUP noebp=0 + movl %edi,%r8d + .if \noebp + .else + movl %ebp,%r9d + .endif + xchg %ecx,%esi + movl %ebx,%edi + movl %edx,%edx /* zero extension */ + .endm + + /* clobbers %eax */ + .macro CLEAR_RREGS + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %rax,R9(%rsp) + movq %rax,R8(%rsp) + .endm + + .macro LOAD_ARGS32 offset + movl \offset(%rsp),%r11d + movl \offset+8(%rsp),%r10d + movl \offset+16(%rsp),%r9d + movl \offset+24(%rsp),%r8d + movl \offset+40(%rsp),%ecx + movl \offset+48(%rsp),%edx + movl \offset+56(%rsp),%esi + movl \offset+64(%rsp),%edi + movl \offset+72(%rsp),%eax + .endm + + .macro CFI_STARTPROC32 simple + CFI_STARTPROC \simple + CFI_UNDEFINED r8 + CFI_UNDEFINED r9 + CFI_UNDEFINED r10 + CFI_UNDEFINED r11 + CFI_UNDEFINED r12 + CFI_UNDEFINED r13 + CFI_UNDEFINED r14 + CFI_UNDEFINED r15 + .endm + +/* + * 32bit SYSENTER instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp user stack + * 0(%ebp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC32 simple + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + movl %ebp,%ebp /* zero extension */ + movl %eax,%eax + movl $__USER32_DS,40(%rsp) + movq %rbp,32(%rsp) + movl $__USER32_CS,16(%rsp) + movl $VSYSCALL32_SYSEXIT,8(%rsp) + movq %rax,(%rsp) + cld + SAVE_ARGS 0,0,0 + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,threadinfo_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz sysenter_tracesys +sysenter_do_call: + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys + IA32_ARG_FIXUP 1 + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +sysenter_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + movl %ebp, %ebp + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + jmp sysenter_do_call + CFI_ENDPROC +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * Arguments: + * %eax System call number. 
+ * %ebx Arg1 + * %ecx return EIP + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] + * %esp user stack + * 0(%esp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + movl %eax,%eax /* zero extension */ + movl RSP-RIP+16(%rsp),%r8d + SAVE_ARGS -8,1,1 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movl %ebp,%ecx + movl $__USER32_CS,CS-ARGOFFSET(%rsp) + movl $__USER32_DS,SS-ARGOFFSET(%rsp) + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ + /* hardware stack frame is complete now */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,threadinfo_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz cstar_tracesys +cstar_do_call: + cmpl $IA32_NR_syscalls-1,%eax + ja ia32_badsys + IA32_ARG_FIXUP 1 + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +cstar_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + movl RSP-ARGOFFSET(%rsp), %r8d + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except %eax must be saved (but ptrace may violate that) + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts on. + */ + +ENTRY(ia32_syscall) + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + movl %eax,%eax + movq %rax,(%rsp) + cld + /* note the registers are not zero extended to the sf. + this could be a problem. 
*/ + SAVE_ARGS 0,0,1 + GET_THREAD_INFO(%r10) + orl $TS_COMPAT,threadinfo_status(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz ia32_tracesys +ia32_do_syscall: + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys + IA32_ARG_FIXUP + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_REST + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp ia32_do_syscall +END(ia32_syscall) + +ia32_badsys: + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +quiet_ni_syscall: + movq $-ENOSYS,%rax + ret + CFI_ENDPROC + + .macro PTREGSCALL label, func, arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ia32_ptregs_common + .endm + + CFI_STARTPROC32 + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx + PTREGSCALL stub32_execve, sys32_execve, %rcx + PTREGSCALL stub32_fork, sys_fork, %rdi + PTREGSCALL stub32_clone, sys32_clone, %rdx + PTREGSCALL stub32_vfork, sys_vfork, %rdi + PTREGSCALL stub32_iopl, sys_iopl, %rsi + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx + +ENTRY(ia32_ptregs_common) + popq %r11 + CFI_ENDPROC + CFI_STARTPROC32 simple + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rip,RIP-ARGOFFSET +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + SAVE_REST + call *%rax + RESTORE_REST + jmp ia32_sysret /* misbalances the return cache */ + CFI_ENDPROC +END(ia32_ptregs_common) + + .section .rodata,"a" + .align 8 +ia32_sys_call_table: + .quad sys_restart_syscall + .quad sys_exit + .quad stub32_fork + .quad sys_read + .quad sys_write + .quad compat_sys_open /* 5 */ + .quad sys_close + .quad sys32_waitpid + .quad sys_creat + .quad sys_link + .quad sys_unlink /* 10 */ + .quad stub32_execve + .quad sys_chdir + .quad compat_sys_time + .quad sys_mknod + .quad sys_chmod /* 15 */ + .quad sys_lchown16 + .quad quiet_ni_syscall /* old break syscall holder */ + .quad sys_stat + .quad sys32_lseek + .quad sys_getpid /* 20 */ + .quad compat_sys_mount /* mount */ + .quad sys_oldumount /* old_umount */ + .quad sys_setuid16 + .quad sys_getuid16 + .quad compat_sys_stime /* stime */ /* 25 */ + .quad sys32_ptrace /* ptrace */ + .quad sys_alarm + .quad sys_fstat /* (old)fstat */ + .quad sys_pause + .quad compat_sys_utime /* 30 */ + .quad quiet_ni_syscall /* old stty syscall holder */ + .quad quiet_ni_syscall /* old gtty syscall holder */ + .quad sys_access + .quad sys_nice + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ + .quad sys_sync + .quad sys32_kill + .quad sys_rename + .quad sys_mkdir + .quad sys_rmdir /* 40 */ + .quad sys_dup + .quad sys32_pipe + .quad compat_sys_times + .quad quiet_ni_syscall /* old prof syscall holder */ + .quad sys_brk /* 45 */ + .quad sys_setgid16 + .quad sys_getgid16 + .quad sys_signal + .quad sys_geteuid16 
+ .quad sys_getegid16 /* 50 */ + .quad sys_acct + .quad sys_umount /* new_umount */ + .quad quiet_ni_syscall /* old lock syscall holder */ + .quad compat_sys_ioctl + .quad compat_sys_fcntl64 /* 55 */ + .quad quiet_ni_syscall /* old mpx syscall holder */ + .quad sys_setpgid + .quad quiet_ni_syscall /* old ulimit syscall holder */ + .quad sys32_olduname + .quad sys_umask /* 60 */ + .quad sys_chroot + .quad sys32_ustat + .quad sys_dup2 + .quad sys_getppid + .quad sys_getpgrp /* 65 */ + .quad sys_setsid + .quad sys32_sigaction + .quad sys_sgetmask + .quad sys_ssetmask + .quad sys_setreuid16 /* 70 */ + .quad sys_setregid16 + .quad stub32_sigsuspend + .quad compat_sys_sigpending + .quad sys_sethostname + .quad compat_sys_setrlimit /* 75 */ + .quad compat_sys_old_getrlimit /* old_getrlimit */ + .quad compat_sys_getrusage + .quad sys32_gettimeofday + .quad sys32_settimeofday + .quad sys_getgroups16 /* 80 */ + .quad sys_setgroups16 + .quad sys32_old_select + .quad sys_symlink + .quad sys_lstat + .quad sys_readlink /* 85 */ +#ifdef CONFIG_IA32_AOUT + .quad sys_uselib +#else + .quad quiet_ni_syscall +#endif + .quad sys_swapon + .quad sys_reboot + .quad compat_sys_old_readdir + .quad sys32_mmap /* 90 */ + .quad sys_munmap + .quad sys_truncate + .quad sys_ftruncate + .quad sys_fchmod + .quad sys_fchown16 /* 95 */ + .quad sys_getpriority + .quad sys_setpriority + .quad quiet_ni_syscall /* old profil syscall holder */ + .quad compat_sys_statfs + .quad compat_sys_fstatfs /* 100 */ + .quad sys_ioperm + .quad compat_sys_socketcall + .quad sys_syslog + .quad compat_sys_setitimer + .quad compat_sys_getitimer /* 105 */ + .quad compat_sys_newstat + .quad compat_sys_newlstat + .quad compat_sys_newfstat + .quad sys32_uname + .quad stub32_iopl /* 110 */ + .quad sys_vhangup + .quad quiet_ni_syscall /* old "idle" system call */ + .quad sys32_vm86_warning /* vm86old */ + .quad compat_sys_wait4 + .quad sys_swapoff /* 115 */ + .quad sys32_sysinfo + .quad sys32_ipc + .quad sys_fsync + .quad stub32_sigreturn + .quad stub32_clone /* 120 */ + .quad sys_setdomainname + .quad sys_uname + .quad sys_modify_ldt + .quad compat_sys_adjtimex + .quad sys32_mprotect /* 125 */ + .quad compat_sys_sigprocmask + .quad quiet_ni_syscall /* create_module */ + .quad sys_init_module + .quad sys_delete_module + .quad quiet_ni_syscall /* 130 get_kernel_syms */ + .quad sys_quotactl + .quad sys_getpgid + .quad sys_fchdir + .quad quiet_ni_syscall /* bdflush */ + .quad sys_sysfs /* 135 */ + .quad sys_personality + .quad quiet_ni_syscall /* for afs_syscall */ + .quad sys_setfsuid16 + .quad sys_setfsgid16 + .quad sys_llseek /* 140 */ + .quad compat_sys_getdents + .quad compat_sys_select + .quad sys_flock + .quad sys_msync + .quad compat_sys_readv /* 145 */ + .quad compat_sys_writev + .quad sys_getsid + .quad sys_fdatasync + .quad sys32_sysctl /* sysctl */ + .quad sys_mlock /* 150 */ + .quad sys_munlock + .quad sys_mlockall + .quad sys_munlockall + .quad sys_sched_setparam + .quad sys_sched_getparam /* 155 */ + .quad sys_sched_setscheduler + .quad sys_sched_getscheduler + .quad sys_sched_yield + .quad sys_sched_get_priority_max + .quad sys_sched_get_priority_min /* 160 */ + .quad sys_sched_rr_get_interval + .quad compat_sys_nanosleep + .quad sys_mremap + .quad sys_setresuid16 + .quad sys_getresuid16 /* 165 */ + .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* query_module */ + .quad sys_poll + .quad compat_sys_nfsservctl + .quad sys_setresgid16 /* 170 */ + .quad sys_getresgid16 + .quad sys_prctl + .quad stub32_rt_sigreturn + .quad 
sys32_rt_sigaction + .quad sys32_rt_sigprocmask /* 175 */ + .quad sys32_rt_sigpending + .quad compat_sys_rt_sigtimedwait + .quad sys32_rt_sigqueueinfo + .quad stub32_rt_sigsuspend + .quad sys32_pread /* 180 */ + .quad sys32_pwrite + .quad sys_chown16 + .quad sys_getcwd + .quad sys_capget + .quad sys_capset + .quad stub32_sigaltstack + .quad sys32_sendfile + .quad quiet_ni_syscall /* streams1 */ + .quad quiet_ni_syscall /* streams2 */ + .quad stub32_vfork /* 190 */ + .quad compat_sys_getrlimit + .quad sys32_mmap2 + .quad sys32_truncate64 + .quad sys32_ftruncate64 + .quad sys32_stat64 /* 195 */ + .quad sys32_lstat64 + .quad sys32_fstat64 + .quad sys_lchown + .quad sys_getuid + .quad sys_getgid /* 200 */ + .quad sys_geteuid + .quad sys_getegid + .quad sys_setreuid + .quad sys_setregid + .quad sys_getgroups /* 205 */ + .quad sys_setgroups + .quad sys_fchown + .quad sys_setresuid + .quad sys_getresuid + .quad sys_setresgid /* 210 */ + .quad sys_getresgid + .quad sys_chown + .quad sys_setuid + .quad sys_setgid + .quad sys_setfsuid /* 215 */ + .quad sys_setfsgid + .quad sys_pivot_root + .quad sys_mincore + .quad sys_madvise + .quad compat_sys_getdents64 /* 220 getdents64 */ + .quad compat_sys_fcntl64 + .quad quiet_ni_syscall /* tux */ + .quad quiet_ni_syscall /* security */ + .quad sys_gettid + .quad sys_readahead /* 225 */ + .quad sys_setxattr + .quad sys_lsetxattr + .quad sys_fsetxattr + .quad sys_getxattr + .quad sys_lgetxattr /* 230 */ + .quad sys_fgetxattr + .quad sys_listxattr + .quad sys_llistxattr + .quad sys_flistxattr + .quad sys_removexattr /* 235 */ + .quad sys_lremovexattr + .quad sys_fremovexattr + .quad sys_tkill + .quad sys_sendfile64 + .quad compat_sys_futex /* 240 */ + .quad compat_sys_sched_setaffinity + .quad compat_sys_sched_getaffinity + .quad sys32_set_thread_area + .quad sys32_get_thread_area + .quad compat_sys_io_setup /* 245 */ + .quad sys_io_destroy + .quad compat_sys_io_getevents + .quad compat_sys_io_submit + .quad sys_io_cancel + .quad sys_fadvise64 /* 250 */ + .quad quiet_ni_syscall /* free_huge_pages */ + .quad sys_exit_group + .quad sys32_lookup_dcookie + .quad sys_epoll_create + .quad sys_epoll_ctl /* 255 */ + .quad sys_epoll_wait + .quad sys_remap_file_pages + .quad sys_set_tid_address + .quad compat_sys_timer_create + .quad compat_sys_timer_settime /* 260 */ + .quad compat_sys_timer_gettime + .quad sys_timer_getoverrun + .quad sys_timer_delete + .quad compat_sys_clock_settime + .quad compat_sys_clock_gettime /* 265 */ + .quad compat_sys_clock_getres + .quad compat_sys_clock_nanosleep + .quad compat_sys_statfs64 + .quad compat_sys_fstatfs64 + .quad sys_tgkill /* 270 */ + .quad compat_sys_utimes + .quad sys32_fadvise64_64 + .quad quiet_ni_syscall /* sys_vserver */ + .quad sys_mbind + .quad compat_sys_get_mempolicy /* 275 */ + .quad sys_set_mempolicy + .quad compat_sys_mq_open + .quad sys_mq_unlink + .quad compat_sys_mq_timedsend + .quad compat_sys_mq_timedreceive /* 280 */ + .quad compat_sys_mq_notify + .quad compat_sys_mq_getsetattr + .quad compat_sys_kexec_load /* reserved for kexec */ + .quad compat_sys_waitid + .quad quiet_ni_syscall /* 285: sys_altroot */ + .quad sys_add_key + .quad sys_request_key + .quad sys_keyctl + .quad sys_ioprio_set + .quad sys_ioprio_get /* 290 */ + .quad sys_inotify_init + .quad sys_inotify_add_watch + .quad sys_inotify_rm_watch + .quad sys_migrate_pages + .quad compat_sys_openat /* 295 */ + .quad sys_mkdirat + .quad sys_mknodat + .quad sys_fchownat + .quad compat_sys_futimesat + .quad sys32_fstatat /* 300 */ + .quad sys_unlinkat + 
.quad sys_renameat + .quad sys_linkat + .quad sys_symlinkat + .quad sys_readlinkat /* 305 */ + .quad sys_fchmodat + .quad sys_faccessat + .quad quiet_ni_syscall /* pselect6 for now */ + .quad quiet_ni_syscall /* ppoll for now */ + .quad sys_unshare /* 310 */ + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list + .quad sys_splice + .quad sys_sync_file_range + .quad sys_tee + .quad compat_sys_vmsplice + .quad compat_sys_move_pages +ia32_syscall_end: --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/acpi/sleep_64-xen.c 2008-04-15 09:29:41.000000000 +0200 @@ -0,0 +1,146 @@ +/* + * acpi.c - Architecture-Specific Low-Level ACPI Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> + * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) + * Copyright (C) 2003 Pavel Machek, SuSE Labs + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/cpumask.h> + +#include <asm/mpspec.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/io_apic.h> +#include <asm/proto.h> +#include <asm/tlbflush.h> + +/* -------------------------------------------------------------------------- + Low-Level Sleep Support + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_SLEEP + +#ifndef CONFIG_ACPI_PV_SLEEP +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_video_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); + +static pgd_t low_ptr; + +static void init_low_mapping(void) +{ + pgd_t *slot0 = pgd_offset(current->mm, 0UL); + low_ptr = *slot0; + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); + WARN_ON(num_online_cpus() != 1); + local_flush_tlb(); +} +#endif + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. 
+ */ +int acpi_save_state_mem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + init_low_mapping(); + + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); +#endif + return 0; +} + +/* + * acpi_restore_state + */ +void acpi_restore_state_mem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + set_pgd(pgd_offset(current->mm, 0UL), low_ptr); + local_flush_tlb(); +#endif +} + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page in low memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16M pages, but not + * <1M pages. + */ +void __init acpi_reserve_bootmem(void) +{ +#ifndef CONFIG_ACPI_PV_SLEEP + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) + printk(KERN_CRIT + "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); +#endif +} + +#ifndef CONFIG_ACPI_PV_SLEEP +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_video_flags = 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_video_flags |= 2; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); +#endif /* CONFIG_ACPI_PV_SLEEP */ + +#endif /*CONFIG_ACPI_SLEEP */ + +void acpi_pci_link_exit(void) +{ +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/apic_64-xen.c 2007-06-12 13:13:01.000000000 +0200 @@ -0,0 +1,197 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/module.h> + +#include <asm/atomic.h> +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> +#include <asm/hpet.h> +#include <asm/idle.h> + +int apic_verbosity; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. 
-AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +void smp_local_timer_interrupt(struct pt_regs *regs) +{ + profile_tick(CPU_PROFILING, regs); +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP + update_process_times(user_mode(regs)); +#endif +#endif + /* + * We take the 'long' return path, and there every subsystem + * grabs the appropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); +} + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + exit_idle(); + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + +#if 0 + static unsigned long last_warning; + static unsigned long skipped; + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + if (time_before(last_warning+30*HZ,jiffies)) { + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", + smp_processor_id(), skipped); + last_warning = jiffies; + skipped = 0; + } else { + skipped++; + } +#endif + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ + +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + exit_idle(); + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +int disable_apic; + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. 
+ */ +int __init APIC_init_uniprocessor (void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + return 1; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/e820_64-xen.c 2009-12-04 08:45:56.000000000 +0100 @@ -0,0 +1,800 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/string.h> +#include <linux/kexec.h> +#include <linux/module.h> + +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/bootsetup.h> +#include <asm/sections.h> +#include <xen/interface/memory.h> + +/* + * PFN of last memory page. + */ +unsigned long end_pfn; +EXPORT_SYMBOL(end_pfn); + +#ifndef CONFIG_XEN +/* + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. + * The direct mapping extends to end_pfn_map, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long end_pfn_map; +#endif + +/* + * Last pfn which the user wants to use. + */ +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; + +extern struct resource code_resource, data_resource; + +#ifdef CONFIG_XEN +extern struct e820map machine_e820; +#endif + +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + unsigned long addr = *addrp, last = addr + size; + +#ifndef CONFIG_XEN + /* various gunk below that needed for SMP startup */ + if (addr < 0x8000) { + *addrp = 0x8000; + return 1; + } + + /* direct mapping tables of the kernel */ + if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { + *addrp = table_end << PAGE_SHIFT; + return 1; + } + + /* initrd */ +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START && last >= INITRD_START && + addr < INITRD_START+INITRD_SIZE) { + *addrp = INITRD_START + INITRD_SIZE; + return 1; + } +#endif + /* kernel code + 640k memory hole (later should not be needed, but + be paranoid for now) */ + if (last >= 640*1024 && addr < 1024*1024) { + *addrp = 1024*1024; + return 1; + } + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { + *addrp = __pa_symbol(&_end); + return 1; + } + + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { + *addrp = ebda_addr + ebda_size; + return 1; + } + + /* XXX ramdisk image here? */ +#else + if (last < (table_end<<PAGE_SHIFT)) { + *addrp = table_end << PAGE_SHIFT; + return 1; + } +#endif + return 0; +} + +/* + * This function checks if any part of the range <start,end> is mapped + * with type. 
+ */ +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; i++) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range <start,end> is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; i++) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + if (ei->type != E820_RAM) + continue; + if (addr < start) + addr = start; + if (addr > ei->addr + ei->size) + continue; + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) + ; + last = addr + size; + if (last > ei->addr + ei->size) + continue; + if (last > end) + continue; + return addr; + } + return -1UL; +} + +/* + * Free bootmem based on the e820 table for a node. 
+ */ +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr && last-addr >= PAGE_SIZE) + free_bootmem_node(pgdat, addr, last-addr); + } +} + +/* + * Find the highest page frame number we have available + */ +unsigned long __init e820_end_of_ram(void) +{ + int i; + unsigned long end_pfn = 0; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + + start = round_up(ei->addr, PAGE_SIZE); + end = round_down(ei->addr + ei->size, PAGE_SIZE); + if (start >= end) + continue; + if (ei->type == E820_RAM) { + if (end > end_pfn<<PAGE_SHIFT) + end_pfn = end>>PAGE_SHIFT; + } else { + if (end > end_pfn_map<<PAGE_SHIFT) + end_pfn_map = end>>PAGE_SHIFT; + } + } + + if (end_pfn > end_pfn_map) + end_pfn_map = end_pfn; + if (end_pfn_map > MAXMEM>>PAGE_SHIFT) + end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > end_user_pfn) + end_pfn = end_user_pfn; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; + + return end_pfn; +} + +/* + * Compute how much memory is missing in a range. + * Unlike the other functions in this file the arguments are in page numbers. + */ +unsigned long __init +e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long ram = 0; + unsigned long start = start_pfn << PAGE_SHIFT; + unsigned long end = end_pfn << PAGE_SHIFT; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram) >> PAGE_SHIFT; +} + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +void __init e820_reserve_resources(struct e820entry *e820, int nr_map) +{ + int i; + for (i = 0; i < nr_map; i++) { + struct resource *res; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (e820[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820[i].addr; + res->end = res->start + e820[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + if (e820[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ +#ifndef CONFIG_XEN + request_resource(res, &code_resource); + request_resource(res, &data_resource); +#endif +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); +#ifdef CONFIG_XEN + xen_machine_kexec_register_resources(res); +#endif +#endif + } + } +} + +/* + * Add a memory region to the kernel e820 map. 
+ */ +void __init add_memory_region(unsigned long start, unsigned long size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... 
+ + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if (overlap_list[i]->type > current_type) + 
current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + do { + unsigned long start = biosmap->addr; + unsigned long size = biosmap->size; + unsigned long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + +#ifndef CONFIG_XEN + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + * + * This should be removed on Hammer which is supposed to not + * have non e820 covered ISA mappings there, but I had some strange + * problems so it stays for now. -AK + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } +#endif + + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + struct xen_memory_map memmap; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } else + machine_e820 = e820; +#endif + + return 0; +} + +#ifndef CONFIG_XEN +void __init setup_memory_region(void) +{ + char *who = "BIOS-e820"; + + /* + * Try to copy the BIOS-supplied E820-map. 
+ * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + sanitize_e820_map(E820_MAP, &E820_MAP_NR); + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { + unsigned long mem_size; + + /* compare results from other methods and take the greater */ + if (ALT_MEM_K < EXT_MEM_K) { + mem_size = EXT_MEM_K; + who = "BIOS-88"; + } else { + mem_size = ALT_MEM_K; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + } + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map(who); +} + +#else /* CONFIG_XEN */ + +void __init setup_memory_region(void) +{ + int rc; + struct xen_memory_map memmap; + /* + * This is rather large for a stack variable but this early in + * the boot process we know we have plenty slack space. + */ + struct e820entry map[E820MAX]; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if ( rc == -ENOSYS ) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = xen_start_info->nr_pages << PAGE_SHIFT; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8 << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + sanitize_e820_map(map, (char *)&memmap.nr_entries); + + BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0); + + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map("Xen"); +} +#endif + +void __init parse_memopt(char *p, char **from) +{ + int i; + unsigned long current_end; + unsigned long end; + + end_user_pfn = memparse(p, from); + end_user_pfn >>= PAGE_SHIFT; + + end = end_user_pfn<<PAGE_SHIFT; + i = e820.nr_map-1; + current_end = e820.map[i].addr + e820.map[i].size; + + if (current_end < end) { + /* + * The e820 map ends before our requested size so + * extend the final entry to the requested address. + */ + if (e820.map[i].type == E820_RAM) + e820.map[i].size = end - e820.map[i].addr; + else + add_memory_region(current_end, end - current_end, E820_RAM); + } +} + +void __init parse_memmapopt(char *p, char **from) +{ + unsigned long long start_at, mem_size; + + mem_size = memparse(p, from); + p = *from; + if (*p == '@') { + start_at = memparse(p+1, from); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, from); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, from); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + end_user_pfn = (mem_size >> PAGE_SHIFT); + } + p = *from; +} + +unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start); + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. 
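+ *
+ * Worked example (hypothetical map): with a single RAM range ending at
+ * 0xd0000000 and nothing else below 4GB, the largest gap is 0x30000000
+ * bytes starting at 0xd0000000; the rounding loop below then picks
+ * round = 0x4000000, so pci_mem_start ends up at 0xd4000000.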
+ */ +__init void e820_setup_gap(struct e820entry *e820, int nr_map) +{ + unsigned long gapstart, gapsize, round; + unsigned long last; + int i; + int found = 0; + + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = nr_map; + while (--i >= 0) { + unsigned long long start = e820[i].addr; + unsigned long long end = start + e820[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + + if (!found) { + gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/early_printk-xen.c 2007-06-12 13:13:01.000000000 +0200 @@ -0,0 +1,302 @@ +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/screen_info.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/fcntl.h> + +/* Simple VGA output */ + +#ifdef __i386__ +#include <asm/setup.h> +#define VGABASE (__ISA_IO_base + 0xb8000) +#else +#include <asm/bootsetup.h> +#define VGABASE ((void __iomem *)0xffffffff800b8000UL) +#endif + +#ifndef CONFIG_XEN +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. 
Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s,"0x",2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static int bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s,"ttyS",4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +#else /* CONFIG_XEN */ + +static void +early_serial_write(struct console *con, const char *s, unsigned count) +{ + int n; + + while (count > 0) { + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); + if (n <= 0) + break; + count -= n; + s += n; + } +} + +static __init void early_serial_init(char *s) +{ +} + +/* + * No early VGA console on Xen, as we do not have convenient ISA-space + * mappings. Someone should fix this for domain 0. For now, use fake serial. + */ +#define early_vga_console early_serial_console + +#endif + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Console interface to a host file on AMD's SimNow! 
*/ + +static int simnow_fd; + +enum { + MAGIC1 = 0xBACCD00A, + MAGIC2 = 0xCA110000, + XOPEN = 5, + XWRITE = 4, +}; + +static noinline long simnow(long cmd, long a, long b, long c) +{ + long ret; + asm volatile("cpuid" : + "=a" (ret) : + "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); + return ret; +} + +void __init simnow_init(char *str) +{ + char *fn = "klog"; + if (*str == '=') + fn = ++str; + /* error ignored */ + simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); +} + +static void simnow_write(struct console *con, const char *s, unsigned n) +{ + simnow(XWRITE, simnow_fd, (unsigned long)s, n); +} + +static struct console simnow_console = { + .name = "simnow", + .write = simnow_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + + va_start(ap,fmt); + n = vscnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int __initdata keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space; + char buf[256]; + + if (early_console_initialized) + return 1; + + strlcpy(buf,opt,sizeof(buf)); + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3) +#ifndef CONFIG_XEN + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; + current_ypos = SCREEN_INFO.orig_y; +#else + || !strncmp(buf, "xen", 3)) { +#endif + early_console = &early_vga_console; + } else if (!strncmp(buf, "simnow", 6)) { + simnow_init(buf + 6); + early_console = &simnow_console; + keep_early = 1; + } + early_console_initialized = 1; + register_console(early_console); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console\n"); + } +} + +__setup("earlyprintk=", setup_early_printk); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/entry_64-xen.S 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,1317 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * + * $Id$ + * + * Jun Nakajima <jun.nakajima@intel.com> + * Asit Mallick <asit.k.mallick@intel.com> + * Modified for Xen + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers upto R11. 
+ * - full stack frame: Like partial stack frame, but all register saved. + * + * TODO: + * - schedule it carefully for the final hardware. + */ + +#define ASSEMBLY 1 +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/dwarf2.h> +#include <asm/calling.h> +#include <asm/asm-offsets.h> +#include <asm/msr.h> +#include <asm/unistd.h> +#include <asm/thread_info.h> +#include <asm/hw_irq.h> +#include <asm/page.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <xen/interface/arch-x86_64.h> +#include <xen/interface/features.h> + +#include "xen_entry.S" + + .code64 + +#ifndef CONFIG_PREEMPT +#define retint_kernel retint_restore_args +#endif + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +NMI_MASK = 0x80000000 + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp + movq $__USER_CS,CS(%rsp) + movq $-1,RCX(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp,offset=0 + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq %rax /* ss */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET ss,0*/ + pushq %rax /* rsp */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rsp,0 + pushq $(1<<9) /* eflags - interrupts on */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET rflags,0*/ + pushq $__KERNEL_CS /* cs */ + CFI_ADJUST_CFA_OFFSET 8 + /*CFI_REL_OFFSET cs,0*/ + pushq \child_rip /* rip */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip,0 + pushq %rax /* orig rax */ + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + + .macro CFI_DEFAULT_STACK start=1,adj=0 + .if \start + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8 - \adj*ARGOFFSET + .else + CFI_DEF_CFA_OFFSET SS+8 - \adj*ARGOFFSET + .endif + .if \adj == 0 + CFI_REL_OFFSET r15,R15 + CFI_REL_OFFSET r14,R14 + CFI_REL_OFFSET r13,R13 + CFI_REL_OFFSET r12,R12 + CFI_REL_OFFSET rbp,RBP + CFI_REL_OFFSET rbx,RBX + .endif + CFI_REL_OFFSET r11,R11 - \adj*ARGOFFSET + CFI_REL_OFFSET r10,R10 - \adj*ARGOFFSET + CFI_REL_OFFSET r9,R9 - \adj*ARGOFFSET + CFI_REL_OFFSET r8,R8 - \adj*ARGOFFSET + CFI_REL_OFFSET rax,RAX - \adj*ARGOFFSET + CFI_REL_OFFSET rcx,RCX - \adj*ARGOFFSET + CFI_REL_OFFSET rdx,RDX - \adj*ARGOFFSET + CFI_REL_OFFSET rsi,RSI - \adj*ARGOFFSET + CFI_REL_OFFSET rdi,RDI - \adj*ARGOFFSET + CFI_REL_OFFSET rip,RIP - \adj*ARGOFFSET + /*CFI_REL_OFFSET cs,CS - \adj*ARGOFFSET*/ + /*CFI_REL_OFFSET rflags,EFLAGS - \adj*ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP - \adj*ARGOFFSET + /*CFI_REL_OFFSET ss,SS - \adj*ARGOFFSET*/ + .endm + + /* + * Must be consistent with the definition in arch-x86/xen-x86_64.h: + * struct iret_context { + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + * }; + * with rax, r11, and rcx being taken care of in the hypercall stub. + */ + .macro HYPERVISOR_IRET flag + testb $3,1*8(%rsp) + jnz 2f + testl $NMI_MASK,2*8(%rsp) + jnz 2f + + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) + jne 1f + + /* Direct iret to kernel space. Correct CS and SS. 
*/ + orl $3,1*8(%rsp) + orl $3,4*8(%rsp) +1: iretq + +2: /* Slow iret via hypervisor. */ + andl $~NMI_MASK, 2*8(%rsp) + pushq $\flag + jmp hypercall_page + (__HYPERVISOR_iret * 32) + .endm + +/* + * A newly forked process directly context switches into this. + */ +/* rdi: prev */ +ENTRY(ret_from_fork) + CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 + call schedule_tail + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace +rff_action: + RESTORE_REST + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + je int_ret_from_sys_call + testl $_TIF_IA32,threadinfo_flags(%rcx) + jnz int_ret_from_sys_call + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET + jmp ret_from_sys_call +rff_trace: + movq %rsp,%rdi + call syscall_trace_leave + GET_THREAD_INFO(%rcx) + jmp rff_action + CFI_ENDPROC +END(ret_from_fork) + +/* + * initial frame state for interrupts and exceptions + */ + .macro _frame ref + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-\ref + /*CFI_REL_OFFSET ss,SS-\ref*/ + CFI_REL_OFFSET rsp,RSP-\ref + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ + /*CFI_REL_OFFSET cs,CS-\ref*/ + CFI_REL_OFFSET rip,RIP-\ref + .endm + +/* + * System call entry. Upto 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. + */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are enabled on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + _frame (RIP-0x10) + SAVE_ARGS -8,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + CFI_REMEMBER_STATE + jnz tracesys + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. 
+ */ + .globl ret_from_sys_call +ret_from_sys_call: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + GET_THREAD_INFO(%rcx) + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON + XEN_UNBLOCK_EVENTS(%rsi) + RESTORE_ARGS 0,8,0 + HYPERVISOR_IRET VGCF_IN_SYSCALL + + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + TRACE_IRQS_ON + XEN_UNBLOCK_EVENTS(%rsi) + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + TRACE_IRQS_ON +/* sti */ + XEN_UNBLOCK_EVENTS(%rsi) + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ + leaq do_notify_resume(%rip),%rax + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi + /* Use IRET because user could have changed frame. This + works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + jmp int_with_check + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + + /* Do syscall tracing */ +tracesys: + CFI_RESTORE_STATE + SAVE_REST + movq $-ENOSYS,RAX(%rsp) + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $__NR_syscall_max,%rax + ja 1f + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) +1: movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + jmp int_ret_from_sys_call + CFI_ENDPROC +END(system_call) + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +ENTRY(int_ret_from_sys_call) + CFI_STARTPROC simple + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET + /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/ + CFI_REL_OFFSET rip,RIP-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET r8,R8-ARGOFFSET + CFI_REL_OFFSET r9,R9-ARGOFFSET + CFI_REL_OFFSET r10,R10-ARGOFFSET + CFI_REL_OFFSET r11,R11-ARGOFFSET + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + testb $3,CS-ARGOFFSET(%rsp) + jnz 1f + /* Need to set the proper %ss (not NULL) for ring 3 iretq */ + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) + jmp retint_restore_args # retrun from ring3 kernel +1: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +int_with_check: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,threadinfo_status(%rcx) + jmp retint_restore_args + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. 
*/ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON +/* sti */ + XEN_UNBLOCK_EVENTS(%rsi) + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + TRACE_IRQS_ON +/* sti */ + XEN_UNBLOCK_EVENTS(%rsi) + SAVE_REST + /* Check for syscall exit trace */ + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + jz int_signal + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + jmp int_restore_rest + +int_signal: + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_NEED_RESCHED,%edi +int_restore_rest: + RESTORE_REST + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + jmp int_with_check + CFI_ENDPROC +END(int_ret_from_sys_call) + +/* + * Certain special system calls that need to save a complete full stack frame. + */ + + .macro PTREGSCALL label,func,arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ptregscall_common +END(\label) + .endm + + CFI_STARTPROC + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST + movq %r11, %r15 + CFI_REGISTER rip, r15 + FIXUP_TOP_OF_STACK %r11 + call *%rax + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + CFI_REGISTER rip, r11 + RESTORE_REST + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rip, 0 + ret + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +/* initial frame state for interrupts (and exceptions without error code) */ +#define INTR_FRAME _frame (RIP-0x10); \ + CFI_REL_OFFSET rcx,0; \ + CFI_REL_OFFSET r11,8 + +/* initial frame state for exceptions with error code (and interrupts with + vector already pushed) */ +#define XCPT_FRAME _frame (RIP-0x18); \ + CFI_REL_OFFSET rcx,0; \ + CFI_REL_OFFSET r11,8 + +/* + * Interrupt exit. 
+ * + */ + +retint_check: + CFI_DEFAULT_STACK adj=1 + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful +retint_restore_args: + movl EFLAGS-REST_SKIP(%rsp), %eax + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + XEN_GET_VCPU_INFO(%rsi) + andb evtchn_upcall_mask(%rsi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + jnz restore_all_enable_events # != 0 => enable event delivery + XEN_PUT_VCPU_INFO(%rsi) + + RESTORE_ARGS 0,8,0 + HYPERVISOR_IRET 0 + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + XEN_UNBLOCK_EVENTS(%rsi) +/* sti */ + pushq %rdi + CFI_ADJUST_CFA_OFFSET 8 + call schedule + popq %rdi + CFI_ADJUST_CFA_OFFSET -8 + GET_THREAD_INFO(%rcx) + XEN_BLOCK_EVENTS(%rsi) +/* cli */ + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_restore_args + TRACE_IRQS_ON + XEN_UNBLOCK_EVENTS(%rsi) + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + movl $_TIF_NEED_RESCHED,%edi + GET_THREAD_INFO(%rcx) + jmp retint_check + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ + .p2align +retint_kernel: + cmpl $0,threadinfo_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp retint_kernel /* check again */ +#endif + + CFI_ENDPROC +END(retint_check) + +#ifndef CONFIG_XEN +/* + * APIC interrupts. + */ + .macro apicinterrupt num,func + INTR_FRAME + pushq $~(\num) + CFI_ADJUST_CFA_OFFSET 8 + interrupt \func + jmp error_entry + CFI_ENDPROC + .endm + +ENTRY(thermal_interrupt) + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt) + +ENTRY(threshold_interrupt) + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt) + +#ifdef CONFIG_SMP +ENTRY(reschedule_interrupt) + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt) + + .macro INVALIDATE_ENTRY num +ENTRY(invalidate_interrupt\num) + apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt +END(invalidate_interrupt\num) + .endm + + INVALIDATE_ENTRY 0 + INVALIDATE_ENTRY 1 + INVALIDATE_ENTRY 2 + INVALIDATE_ENTRY 3 + INVALIDATE_ENTRY 4 + INVALIDATE_ENTRY 5 + INVALIDATE_ENTRY 6 + INVALIDATE_ENTRY 7 + +ENTRY(call_function_interrupt) + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt) +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +ENTRY(apic_timer_interrupt) + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt) + +ENTRY(error_interrupt) + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt) + +ENTRY(spurious_interrupt) + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +END(spurious_interrupt) +#endif +#endif /* !CONFIG_XEN */ + +/* + * Exception entry points. 
+ */ + .macro zeroentry sym + INTR_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x10,%rsp /* skip rcx and r11 */ + CFI_ADJUST_CFA_OFFSET -0x10 + pushq $0 /* push error code/oldrax */ + CFI_ADJUST_CFA_OFFSET 8 + pushq %rax /* push real oldrax to the rdi slot */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 + leaq \sym(%rip),%rax + jmp error_entry + CFI_ENDPROC + .endm + + .macro errorentry sym + XCPT_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x10,%rsp /* rsp points to the error code */ + CFI_ADJUST_CFA_OFFSET -0x10 + pushq %rax + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 + leaq \sym(%rip),%rax + jmp error_entry + CFI_ENDPROC + .endm + +#if 0 /* not XEN */ + /* error code is on the stack already */ + /* handle NMI like exceptions that can happen everywhere */ + .macro paranoidentry sym, ist=0, irqtrace=1 + movq (%rsp),%rcx + movq 8(%rsp),%r11 + addq $0x10,%rsp /* skip rcx and r11 */ + SAVE_ALL + cld +#if 0 /* not XEN */ + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f + swapgs + xorl %ebx,%ebx +1: +#endif + .if \ist + movq %gs:pda_data_offset, %rbp + .endif + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi + movq $-1,ORIG_RAX(%rsp) + .if \ist + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif + call \sym + .if \ist + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif +/* cli */ + XEN_BLOCK_EVENTS(%rsi) + .if \irqtrace + TRACE_IRQS_OFF + .endif + .endm + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + TRACE_IRQS_IRETQ 0 + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm +#endif + +/* + * Exception entry point. This expects an error code/orig_rax on the stack + * and the exception handler in %rax. 
+ */ +ENTRY(error_entry) + _frame RDI + CFI_REL_OFFSET rax,0 + /* rdi slot contains rax, oldrax contains error code */ + cld + subq $14*8,%rsp + CFI_ADJUST_CFA_OFFSET (14*8) + movq %rsi,13*8(%rsp) + CFI_REL_OFFSET rsi,RSI + movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + CFI_REGISTER rax,rsi + movq %rdx,12*8(%rsp) + CFI_REL_OFFSET rdx,RDX + movq %rcx,11*8(%rsp) + CFI_REL_OFFSET rcx,RCX + movq %rsi,10*8(%rsp) /* store rax */ + CFI_REL_OFFSET rax,RAX + movq %r8, 9*8(%rsp) + CFI_REL_OFFSET r8,R8 + movq %r9, 8*8(%rsp) + CFI_REL_OFFSET r9,R9 + movq %r10,7*8(%rsp) + CFI_REL_OFFSET r10,R10 + movq %r11,6*8(%rsp) + CFI_REL_OFFSET r11,R11 + movq %rbx,5*8(%rsp) + CFI_REL_OFFSET rbx,RBX + movq %rbp,4*8(%rsp) + CFI_REL_OFFSET rbp,RBP + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12,R12 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13,R13 + movq %r14,1*8(%rsp) + CFI_REL_OFFSET r14,R14 + movq %r15,(%rsp) + CFI_REL_OFFSET r15,R15 +#if 0 + cmpl $__KERNEL_CS,CS(%rsp) + CFI_REMEMBER_STATE + je error_kernelspace +#endif +error_call_handler: + movq %rdi, RDI(%rsp) + CFI_REL_OFFSET rdi,RDI + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi # get error code + movq $-1,ORIG_RAX(%rsp) + call *%rax +error_exit: + RESTORE_REST +/* cli */ + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + movl threadinfo_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ + jmp retint_restore_args + +#if 0 + /* + * We need to re-write the logic here because we don't do iretq to + * to return to user mode. It's still possible that we get trap/fault + * in the kernel (when accessing buffers pointed to by system calls, + * for example). + * + */ + CFI_RESTORE_STATE +error_kernelspace: + incl %ebx + /* There are two places in the kernel that can potentially fault with + usergs. Handle them here. The exception handlers after + iret run with kernel gs again, so don't set the user space flag. + B stepping K8s sometimes report an truncated RIP for IRET + exceptions returning to compat mode. Check for these here too. */ + leaq iret_label(%rip),%rbp + cmpq %rbp,RIP(%rsp) + je error_swapgs + movl %ebp,%ebp /* zero extend */ + cmpq %rbp,RIP(%rsp) + je error_swapgs + cmpq $gs_change,RIP(%rsp) + je error_swapgs + jmp error_sti +#endif + CFI_ENDPROC +END(error_entry) + +ENTRY(hypervisor_callback) + zeroentry do_hypervisor_callback +END(hypervisor_callback) + +/* + * Copied from arch/xen/i386/kernel/entry.S + */ +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. 
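The "#" comment block above describes how nested event-channel callbacks are avoided: delivery stays masked while events are handled, and the pending flag is re-checked before the final unmask and IRET (the restore_all_enable_events/scrit path in the code below). What follows is a minimal, self-contained C model of that masking loop. All names in it (vcpu_info_model, evtchn_do_upcall_model, pending_events) are made up for illustration and only stand in for the real shared-info fields and upcall handler; it sketches the policy rather than reproducing the actual implementation.

#include <stdio.h>

/* Hypothetical stand-ins for the shared-info fields the assembly touches. */
struct vcpu_info_model {
	unsigned char evtchn_upcall_pending;
	unsigned char evtchn_upcall_mask;
};

static int pending_events = 3;	/* pretend three event batches arrive */

/* Stand-in for evtchn_do_upcall(): runs with delivery masked and may find
 * that more events arrived while it was working. */
static void evtchn_do_upcall_model(struct vcpu_info_model *v)
{
	v->evtchn_upcall_pending = 0;
	if (pending_events > 0) {
		printf("handled one batch, %d left\n", --pending_events);
		if (pending_events > 0)
			v->evtchn_upcall_pending = 1;
	}
}

/* Model of the restore_all_enable_events/scrit exit path below: unmask
 * delivery, then re-check the pending flag; if anything is pending, mask
 * again and handle it in this activation instead of letting a nested
 * callback pile another frame onto the stack. */
static void return_with_events_enabled(struct vcpu_info_model *v)
{
	for (;;) {
		v->evtchn_upcall_mask = 0;	/* XEN_UNBLOCK_EVENTS */
		if (!v->evtchn_upcall_pending)	/* XEN_TEST_PENDING */
			break;			/* safe to return */
		v->evtchn_upcall_mask = 1;	/* XEN_LOCKED_BLOCK_EVENTS */
		evtchn_do_upcall_model(v);	/* like the "jmp 11b" below */
	}
	printf("nothing pending, would perform the final IRET here\n");
}

int main(void)
{
	struct vcpu_info_model v = { .evtchn_upcall_pending = 1,
				     .evtchn_upcall_mask = 1 };

	evtchn_do_upcall_model(&v);	/* initial callback activation */
	return_with_events_enabled(&v);	/* exit path keeps it to one activation */
	return 0;
}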
+ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +# Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will +# see the correct pointer to the pt_regs + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +#ifdef CONFIG_X86_LOCAL_APIC +KPROBE_ENTRY(nmi) + zeroentry do_nmi_callback +ENTRY(do_nmi_callback) + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + CFI_DEFAULT_STACK + call do_nmi + orl $NMI_MASK,EFLAGS(%rsp) + RESTORE_REST + XEN_BLOCK_EVENTS(%rsi) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC + .previous .text +END(nmi) +#endif + + ALIGN +restore_all_enable_events: + CFI_DEFAULT_STACK adj=1 + TRACE_IRQS_ON + XEN_UNBLOCK_EVENTS(%rsi) # %rsi is already set up... + +scrit: /**** START OF CRITICAL REGION ****/ + XEN_TEST_PENDING(%rsi) + CFI_REMEMBER_STATE + jnz 14f # process more events if necessary... + XEN_PUT_VCPU_INFO(%rsi) + RESTORE_ARGS 0,8,0 + HYPERVISOR_IRET 0 + + CFI_RESTORE_STATE +14: XEN_LOCKED_BLOCK_EVENTS(%rsi) + XEN_PUT_VCPU_INFO(%rsi) + SAVE_REST + movq %rsp,%rdi # set the argument again + jmp 11b + CFI_ENDPROC +ecrit: /**** END OF CRITICAL REGION ****/ +# At this point, unlike on x86-32, we don't do the fixup to simplify the +# code and the stack frame is more complex on x86-64. +# When the kernel is interrupted in the critical section, the kernel +# will do IRET in that case, and everything will be restored at that point, +# i.e. it just resumes from the next instruction interrupted with the same context. + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +ENTRY(failsafe_callback) + _frame (RIP-0x30) + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + movq $11,%rdi /* SIGSEGV */ + jmp do_exit + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +#if 0 + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: +/* swapgs */ /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous +#endif + +/* + * Create a kernel thread. + * + * C extern interface: + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + * + * asm input arguments: + * rdi: fn, rsi: arg, rdx: flags + */ +ENTRY(kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq kernel_thread_flags(%rip),%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + + # clone now + call do_fork + movq %rax,RAX(%rsp) + xorl %edi,%edi + + /* + * It isn't worth to check for reschedule here, + * so internally to the x86_64 port you can rely on kernel_thread() + * not to reschedule the child before returning, this avoids the need + * of hacks for example to fork off the per-CPU idle tasks. + * [Hopefully no generic code relies on the reschedule -AK] + */ + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(kernel_thread) + +child_rip: + pushq $0 # fake return address + CFI_STARTPROC + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + # exit + xorl %edi, %edi + call do_exit + CFI_ENDPROC +ENDPROC(child_rip) + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
+ * + * C extern interface: + * extern long execve(char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + */ +ENTRY(execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + jne 1f + jmp int_ret_from_sys_call +1: RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +ENDPROC(execve) + +KPROBE_ENTRY(page_fault) + errorentry do_page_fault +END(page_fault) + .previous .text + +ENTRY(coprocessor_error) + zeroentry do_coprocessor_error +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + zeroentry do_simd_coprocessor_error +END(simd_coprocessor_error) + +ENTRY(device_not_available) + zeroentry math_state_restore +END(device_not_available) + + /* runs on exception stack */ +KPROBE_ENTRY(debug) +/* INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 */ + zeroentry do_debug +/* paranoidexit + CFI_ENDPROC */ +END(debug) + .previous .text + +#if 0 + /* runs on exception stack */ +KPROBE_ENTRY(nmi) + INTR_FRAME + pushq $-1 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif +END(nmi) + .previous .text +#endif + +KPROBE_ENTRY(int3) +/* INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 */ + zeroentry do_int3 +/* jmp paranoid_exit1 + CFI_ENDPROC */ +END(int3) + .previous .text + +ENTRY(overflow) + zeroentry do_overflow +END(overflow) + +ENTRY(bounds) + zeroentry do_bounds +END(bounds) + +ENTRY(invalid_op) + zeroentry do_invalid_op +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun) + +ENTRY(reserved) + zeroentry do_reserved +END(reserved) + +#if 0 + /* runs on exception stack */ +ENTRY(double_fault) + XCPT_FRAME + paranoidentry do_double_fault + jmp paranoid_exit1 + CFI_ENDPROC +END(double_fault) +#endif + +ENTRY(invalid_TSS) + errorentry do_invalid_TSS +END(invalid_TSS) + +ENTRY(segment_not_present) + errorentry do_segment_not_present +END(segment_not_present) + + /* runs on exception stack */ +ENTRY(stack_segment) +/* XCPT_FRAME + paranoidentry do_stack_segment */ + errorentry do_stack_segment +/* jmp paranoid_exit1 + CFI_ENDPROC */ +END(stack_segment) + +KPROBE_ENTRY(general_protection) + errorentry do_general_protection +END(general_protection) + .previous .text + +ENTRY(alignment_check) + errorentry do_alignment_check +END(alignment_check) + +ENTRY(divide_error) + zeroentry do_divide_error +END(divide_error) + +ENTRY(spurious_interrupt_bug) + zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug) + +#ifdef CONFIG_X86_MCE + /* runs on exception stack */ +KPROBE_ENTRY(machine_check) + zeroentry do_machine_check +END(machine_check) +#endif + +/* Call softirq on interrupt stack. Interrupts are off. 
*/ +ENTRY(call_softirq) + CFI_STARTPROC + push %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + incl %gs:pda_irqcount + cmove %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + decl %gs:pda_irqcount + ret + CFI_ENDPROC +ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %rcx, RIP(%rdi) + leaq 8(%rsp), %rcx + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %rcx, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rdx + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/head_64-xen.S 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,211 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> + * + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ + * + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen + */ + + +#include <linux/linkage.h> +#include <linux/threads.h> +#include <linux/init.h> +#include <linux/elfnote.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/cache.h> +#include <asm/dwarf2.h> +#include <xen/interface/elfnote.h> + + .section .bootstrap.text, "ax", @progbits + .code64 + .globl startup_64 +startup_64: + movq $(init_thread_union+THREAD_SIZE-8),%rsp + + /* rsi is pointer to startup info structure. + pass it to C */ + movq %rsi,%rdi + pushq $0 # fake return address + jmp x86_64_start_kernel + +#ifdef CONFIG_ACPI_SLEEP +.org 0xf00 + .globl pGDT32 +pGDT32: + .word gdt_end-cpu_gdt_table-1 + .long cpu_gdt_table-__START_KERNEL_map +#endif +ENTRY(stext) +ENTRY(_stext) + + $page = 0 +#define NEXT_PAGE(name) \ + $page = $page + 1; \ + .org $page * 0x1000; \ + phys_##name = $page * 0x1000 + __PHYSICAL_START; \ +ENTRY(name) + +NEXT_PAGE(init_level4_pgt) + /* This gets initialized in x86_64_start_kernel */ + .fill 512,8,0 +NEXT_PAGE(init_level4_user_pgt) + /* + * We update two pgd entries to make kernel and user pgd consistent + * at pgd_populate(). It can be used for kernel modules. So we place + * this page here for those cases to avoid memory corruption. + * We also use this page to establish the initial mapping for the + * vsyscall area. + */ + .fill 512,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 512,8,0 + + /* + * This is used for vsyscall area mapping as we have a different + * level4 page table for user. 
+ */ +NEXT_PAGE(level3_user_pgt) + .fill 512,8,0 + +NEXT_PAGE(hypercall_page) + CFI_STARTPROC + .rept 0x1000 / 0x20 + .skip 1 /* push %rcx */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 2 /* push %r11 */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 5 /* mov $#,%eax */ + .skip 2 /* syscall */ + .skip 2 /* pop %r11 */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + .skip 1 /* pop %rcx */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + .align 0x20,0 /* ret */ + .endr + CFI_ENDPROC + +#undef NEXT_PAGE + + .data +/* Just dummy symbol to allow compilation. Not used in sleep path */ +#ifdef CONFIG_ACPI_SLEEP + .align PAGE_SIZE +ENTRY(wakeup_level4_pgt) + .fill 512,8,0 +#endif + + .data + + .align 16 + .globl cpu_gdt_descr +cpu_gdt_descr: + .word gdt_end-cpu_gdt_table-1 +gdt: + .quad cpu_gdt_table +#ifdef CONFIG_SMP + .rept NR_CPUS-1 + .word 0 + .quad 0 + .endr +#endif + +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ + + .section .data.page_aligned, "aw" + .align PAGE_SIZE + +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ + +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0 /* unused */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x00cffa000000ffff /* __USER32_CS */ + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affa000000ffff /* __USER_CS */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0,0 /* TSS */ + .quad 0,0 /* LDT */ + .quad 0,0,0 /* three TLS descriptors */ + .quad 0 /* unused */ +gdt_end: + /* asm/segment.h:GDT_ENTRIES must match this */ + /* This should be a multiple of the cache line size */ + /* GDTs of other CPUs are now dynamically allocated */ + + /* zero the remaining page */ + .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 + + .section .bss.page_aligned, "aw", @nobits + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoh value + .if (\value) < 0 || (\value) >= 0x10 + utoh (((\value)>>4)&0x0fffffffffffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoh __START_KERNEL_map + .ascii ",ELF_PADDR_OFFSET=0x" + utoh __START_KERNEL_map + .ascii ",VIRT_ENTRY=0x" + utoh (__START_KERNEL_map + __PHYSICAL_START) + .ascii ",HYPERCALL_PAGE=0x" + utoh (phys_hypercall_page >> PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|supervisor_mode_kernel" + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, __START_KERNEL_map) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, __START_KERNEL_map) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad, startup_64) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page) + ELFNOTE(Xen, 
XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/head64-xen.c 2007-06-12 13:13:01.000000000 +0200 @@ -0,0 +1,162 @@ +/* + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen. + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/percpu.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/bootsetup.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> + +unsigned long start_pfn; + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +#if 0 +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); +} +#endif + +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ +#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_BASE_ADDR 0x90000 +#define OLD_CL_OFFSET 0x90022 + +extern char saved_command_line[]; + +static void __init copy_bootdata(char *real_mode_data) +{ +#ifndef CONFIG_XEN + int new_data; + char * command_line; + + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + if (!new_data) { + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + printk("so old bootloader that it does not support commandline?!\n"); + return; + } + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + printk("old bootloader convention, maybe loadlin?\n"); + } + command_line = (char *) ((u64)(new_data)); + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); +#else + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline); + saved_command_line[max_cmdline-1] = '\0'; +#endif + printk("Bootdata ok (command line is %s)\n", saved_command_line); +} + +static void __init setup_boot_cpu_data(void) +{ + unsigned int dummy, eax; + + /* get vendor info */ + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, + (unsigned int *)&boot_cpu_data.x86_vendor_id[0], + (unsigned int *)&boot_cpu_data.x86_vendor_id[8], + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); + + /* get cpu type */ + cpuid(1, &eax, &dummy, &dummy, + (unsigned int *) &boot_cpu_data.x86_capability); + boot_cpu_data.x86 = (eax >> 8) & 0xf; + boot_cpu_data.x86_model = (eax >> 4) & 0xf; + boot_cpu_data.x86_mask = eax & 0xf; +} + +#include <xen/interface/memory.h> +unsigned long *machine_to_phys_mapping; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + +void __init x86_64_start_kernel(char * real_mode_data) +{ + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; + char *s; + int i; + + setup_xen_features(); + + xen_start_info = (struct start_info *)real_mode_data; + if 
(!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; + start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + + xen_start_info->nr_pt_frames; + + machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START; + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) + machine_to_phys_order++; + +#if 0 + for (i = 0; i < 256; i++) + set_intr_gate(i, early_idt_handler); + asm volatile("lidt %0" :: "m" (idt_descr)); +#endif + + /* + * This must be called really, really early: + */ + lockdep_init(); + + for (i = 0; i < NR_CPUS; i++) + cpu_pda(i) = &boot_cpu_pda[i]; + + pda_init(0); + copy_bootdata(real_mode_data); +#ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); +#endif + s = strstr(saved_command_line, "earlyprintk="); + if (s != NULL) + setup_early_printk(strchr(s, '=') + 1); +#ifdef CONFIG_NUMA + s = strstr(saved_command_line, "numa="); + if (s != NULL) + numa_setup(s+5); +#endif +#ifdef CONFIG_X86_IO_APIC + if (strstr(saved_command_line, "disableapic")) + disable_apic = 1; +#endif + /* You need early console to see that */ + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) + panic("Kernel too big for kernel mapping\n"); + + setup_boot_cpu_data(); + start_kernel(); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/io_apic_64-xen.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,2270 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi.h> +#include <linux/sysdev.h> +#ifdef CONFIG_ACPI +#include <acpi/acpi_bus.h> +#endif + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/mach_apic.h> +#include <asm/acpi.h> +#include <asm/dma.h> +#include <asm/nmi.h> + +#define __apicdebuginit __init + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static int no_timer_check; + +int disable_timer_pin_1 __initdata; + +#ifndef CONFIG_XEN +int timer_over_8254 __initdata = 0; + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; +#endif + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. 
+ */ +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + short apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +#ifdef CONFIG_XEN + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/evtchn.h> + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +} + +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct physdev_apic apic_op; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +} + +#define io_apic_read(a,r) xen_io_apic_read(a,r) +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) + +#define clear_IO_APIC() ((void)0) + +#else + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + dest = cpu_mask_to_apicid(mask); + + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __DO_ACTION(1, = dest, ) + set_irq_info(irq, mask); + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif + +#endif /* !CONFIG_XEN */ + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. 
+ */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: ran out of irq_2_pin entries!"); + } + entry->apic = apic; + entry->pin = pin; +} + +#ifndef CONFIG_XEN +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_pin_list *entry = irq_2_pin + irq; \ + \ + BUG_ON(irq >= NR_IRQS); \ + for (;;) { \ + unsigned int reg; \ + pin = entry->pin; \ + if (pin == -1) \ + break; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + if (!entry->next) \ + break; \ + entry = irq_2_pin + entry->next; \ + } \ + FINAL; \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) + /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) + /* mask = 0 */ + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#endif /* !CONFIG_XEN */ + +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. 
+ */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; +int ioapic_force; + +/* dummy parsing: see setup.c */ + +static int __init disable_ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +static int __init enable_ioapic_setup(char *str) +{ + ioapic_force = 1; + skip_ioapic_setup = 0; + return 1; +} + +__setup("noapic", disable_ioapic_setup); +__setup("apic", enable_ioapic_setup); + +#ifndef CONFIG_XEN +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); +#endif /* !CONFIG_XEN */ + +#include <asm/pci-direct.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> + + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC + off. Check for an Nvidia or VIA PCI bridge and turn it off. + Use pci direct infrastructure because this runs before the PCI subsystem. + + Can be overwritten with "apic" + + And another hack to disable the IOMMU on VIA chipsets. + + ... and others. Really should move this somewhere else. + + Kludge-O-Rama. */ +void __init check_ioapic(void) +{ + int num,slot,func; + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + switch (vendor) { + case PCI_VENDOR_ID_VIA: +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || + force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); + iommu_aperture_disabled = 1; + } +#endif + return; + case PCI_VENDOR_ID_NVIDIA: +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, + nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + return; + case PCI_VENDOR_ID_ATI: + + /* This should be actually default, but + for 2.6.16 let's do it for ATI only where + it's really needed. */ +#ifndef CONFIG_XEN + if (timer_over_8254 == 1) { + timer_over_8254 = 0; + printk(KERN_INFO + "ATI board detected. Disabling timer routing over 8254.\n"); + } +#endif + return; + } + + + /* No multi-function device? 
*/ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} +#endif + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. 
+ */ + if (best_guess < 0) + best_guess = irq; + } + } + BUG_ON(best_guess >= NR_IRQS); + return best_guess; +} + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. 
bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int next_irq = 16; + +/* + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number + * from ACPI, which can reach 800 in large boxen. + * + * Compact the sparse GSI space into a sequential IRQ series and reuse + * vectors if possible. + */ +int gsi_irq_sharing(int gsi) +{ + int i, tries, vector; + + BUG_ON(gsi >= NR_IRQ_VECTORS); + + if (platform_legacy_irq(gsi)) + return gsi; + + if (gsi_2_irq[gsi] != 0xFF) + return (int)gsi_2_irq[gsi]; + + tries = NR_IRQS; + try_again: + vector = assign_irq_vector(gsi); + + /* + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous + * use of vector and if found, return that IRQ. However, we never want + * to share legacy IRQs, which usually have a different trigger mode + * than PCI. + */ + for (i = 0; i < NR_IRQS; i++) + if (IO_APIC_VECTOR(i) == vector) + break; + if (platform_legacy_irq(i)) { + if (--tries >= 0) { + IO_APIC_VECTOR(i) = 0; + goto try_again; + } + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); + } + if (i < NR_IRQS) { + gsi_2_irq[gsi] = i; + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; + } + + i = next_irq++; + BUG_ON(i >= NR_IRQS); + gsi_2_irq[gsi] = i; + IO_APIC_VECTOR(i) = vector; + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + irq = gsi_irq_sharing(irq); + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + BUG_ON(irq >= NR_IRQS); + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. 
+ */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + BUG_ON(irq >= NR_IRQS); + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; + +int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + struct physdev_irq irq_op; + + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + + if (irq < PIRQ_BASE || irq - PIRQ_BASE >= NR_PIRQS) + return -EINVAL; + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); + return IO_APIC_VECTOR(irq); + } + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + spin_unlock_irqrestore(&vector_lock, flags); + return -ENOSPC; + } + + vector = irq_op.vector; + vector_irq[vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = vector; + + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; +} + +extern void (*interrupt[NR_IRQS])(void); +#ifndef CONFIG_XEN +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + unsigned idx; + + idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].chip = &ioapic_level_type; + else + irq_desc[idx].chip = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); +} +#else +#define ioapic_register_intr(irq, vector, trigger) evtchn_register_pirq(irq) +#endif /* !CONFIG_XEN */ + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + } + + irq = pin_2_irq(idx, apic, pin); + add_pin_to_irq(irq, apic, pin); + + if (/* !apic && */ !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE," not connected.\n"); +} + +#ifndef CONFIG_XEN +/* + * Set up the 8259A-master output pin as broadcast to all + * CPUs. + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... 
+ */ + irq_desc[0].chip = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +void __init UNEXPECTED_IO_APIC(void) +{ +} + +void __apicdebuginit print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) && + (reg_01.bits.entries != 0x03) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + if (reg_01.bits.version >= 0x10) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +static __apicdebuginit void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void __apicdebuginit print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... 
APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void __apicdebuginit print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} +#endif /* !CONFIG_XEN */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; +#ifndef CONFIG_XEN + int i8259_apic, i8259_pin; +#endif + int i, apic; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } +#ifndef CONFIG_XEN + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. 
+ */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } +#endif + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + +#ifndef CONFIG_XEN + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = + GET_APIC_ID(apic_read(APIC_ID)); + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, + *(((int *)&entry)+1)); + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, + *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +#endif +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +#ifndef CONFIG_XEN +static void __init setup_ioapic_ids_from_mpc (void) +{ + union IO_APIC_reg_00 reg_00; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. 
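+ * (The ID register is read back right afterwards as a sanity check.)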
+ */ + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE," ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +#ifndef CONFIG_XEN +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (jiffies - t1 > 4) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. 
Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs. + */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + move_irq(irq); + ack_APIC_irq(); +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_native_irq_info(vector, cpu_mask); + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif // CONFIG_SMP +#endif // CONFIG_PCI_MSI + +static int ioapic_retrigger(unsigned int irq) +{ + send_IPI_self(IO_APIC_VECTOR(irq)); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; + +static struct hw_interrupt_type ioapic_level_type __read_mostly = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. 
;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); +#ifndef CONFIG_XEN + else + /* Strange. Oh, well.. */ + irq_desc[irq].chip = &no_irq_type; +#endif + } + } +} + +#ifndef CONFIG_XEN +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type __read_mostly = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq, +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + printk(KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(NULL); + + printk(" done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. 
--macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + apic = find_isa_irq_apic(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +int timer_uses_ioapic_pin_0; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for modern platforms only. + */ +static inline void check_timer(void) +{ + int apic1, pin1, apic2, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + if (timer_over_8254 > 0) + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? 
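+ * If it does, optionally arm the NMI watchdog and return; otherwise
+ * fall back to the 8259A, virtual-wire and ExtINT attempts below.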
+ */ + unmask_IO_APIC_irq(0); + if (!no_timer_check && timer_irq_works()) { + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + return; + } + clear_IO_APIC_pin(apic1, pin1); + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " + "connected to IO-APIC\n"); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " + "through the 8259A ... "); + if (pin2 != -1) { + apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", + apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(apic2, pin2); + } + apic_printk(APIC_VERBOSE," failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].chip = &lapic_irq_type; + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_printk(APIC_VERBOSE," failed.\n"); + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + init_8259A(0); + make_8259A_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_printk(APIC_VERBOSE," failed :(.\n"); + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +} +#else +#define check_timer() ((void)0) +int timer_uses_ioapic_pin_0 = 0; +#endif /* !CONFIG_XEN */ + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1<<2) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + + /* + * Set up the IO-APIC IRQ routing table. 
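+ * (IO-APIC IDs are taken from the MP table only when ACPI did not
+ * already provide them.)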
+ */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#ifndef CONFIG_XEN + sync_Arb_IDs(); +#endif /* !CONFIG_XEN */ + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +#ifndef CONFIG_XEN + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +#endif /* CONFIG_XEN */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#define IO_APIC_MAX_ID 0xFE + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int 
io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; /* Disabled (masked) */ + + irq = gsi_irq_sharing(irq); + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /* CONFIG_ACPI */ + + +#ifndef CONFIG_XEN +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif +#endif /* !CONFIG_XEN */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ioport_64-xen.c 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,100 @@ +/* + * linux/arch/x86_64/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <xen/interface/physdev.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + int i; + + if (new_value) + for (i = base; i < base + extent; i++) + __set_bit(i, bitmap); + else + for (i = base; i < base + extent; i++) + clear_bit(i, bitmap); +} + +/* + * this changes the io permissions bitmap in the current task. 
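+ * Under Xen the io bitmap is registered with the hypervisor via a
+ * PHYSDEVOP_set_iobitmap operation the first time the thread calls
+ * ioperm().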
+ */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = ¤t->thread; + unsigned long *bitmap; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + */ + +asmlinkage long sys_iopl(unsigned int new_iopl, struct pt_regs *regs) +{ + unsigned int old_iopl = current->thread.iopl; + struct physdev_set_iopl set_iopl; + + if (new_iopl > 3) + return -EINVAL; + + /* Need "raw I/O" privileges for direct port access. */ + if ((new_iopl > old_iopl) && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* Change our version of the privilege levels. */ + current->thread.iopl = new_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (new_iopl == 0) ? 1 : new_iopl; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); + + return 0; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/irq_64-xen.c 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,197 @@ +/* + * linux/arch/x86_64/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <asm/uaccess.h> +#include <asm/io_apic.h> +#include <asm/idle.h> + +atomic_t irq_err_count; +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif + +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. 
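+ * The warning is rate-limited: 'warned' records when the last report
+ * was printed, and a new one is emitted at most once per minute.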
+ */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ + u64 curbase = (u64) current->thread_info; + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && + regs->rsp < curbase + sizeof(struct thread_info) + 128 && + time_after(jiffies, warned + 60*HZ)) { + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", + current->comm, curbase, regs->rsp); + show_stack(NULL,NULL); + warned = jiffies; + } +} +#endif + +/* + * Generic, controller-independent functions: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + + seq_printf(p, " %s", action->name); + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif +#endif + } + return 0; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bit used in ret_from_ code */ + unsigned irq = ~regs->orig_rax; + + if (unlikely(irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } + + /*exit_idle();*/ + /*irq_enter();*/ +#ifdef CONFIG_DEBUG_STACKOVERFLOW + stack_overflow_check(regs); +#endif + __do_IRQ(irq, regs); + /*irq_exit();*/ + + return 1; +} + +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_desc[irq].affinity, map); + if (any_online_cpu(mask) == NR_CPUS) { + /*printk("Breaking affinity for irq %i\n", irq);*/ + mask = map; + } + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. 
*/ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif + +extern void call_softirq(void); + +asmlinkage void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + pending = local_softirq_pending(); + /* Switch to interrupt stack */ + if (pending) { + call_softirq(); + WARN_ON_ONCE(softirq_count()); + } + local_irq_restore(flags); +} +EXPORT_SYMBOL(do_softirq); + +#ifndef CONFIG_X86_LOCAL_APIC +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/ldt_64-xen.c 2007-06-12 13:13:01.000000000 +0200 @@ -0,0 +1,282 @@ +/* + * linux/arch/x86_64/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/pgalloc.h> + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) +{ + void *oldldt; + void *newldt; + unsigned oldsize; + + if (mincount <= (unsigned)pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + wmb(); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); +#endif + make_pages_readonly( + pc->ldt, + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable( + oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + make_pages_readonly( + new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
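+ * What we do have to do is copy the parent's LDT, if it has one, and
+ * add the new mm to Xen's list of unpinned mms.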
+ */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + memset(&mm->context, 0, sizeof(mm->context)); + init_MUTEX(&mm->context.sem); + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + if (retval == 0) { + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + } + return retval; +} + +/* + * + * Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable( + mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } + if (!mm->context.pinned) { + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + /* Arbitrary number */ + /* x86-64 default LDT is all zeros */ + if (bytecount > 128) + bytecount = 128; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct task_struct *me = current; + struct mm_struct * mm = me->mm; + __u32 entry_1, entry_2, *lp; + unsigned long mach_lp; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, bytecount)) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= (unsigned)mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + mach_lp = arbitrary_virt_to_machine(lp); + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... 
*/ +install: + error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/mpparse_64-xen.c 2007-06-12 13:13:01.000000000 +0200 @@ -0,0 +1,1011 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/pgalloc.h> +#include <asm/io_apic.h> +#include <asm/proto.h> +#include <asm/acpi.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +int acpi_found_madt; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +unsigned char apic_version [MAX_APICS]; +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; + +static int mp_current_pci_id = 0; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; +int pic_mode; +unsigned long mp_lapic_addr = 0; + + + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_id = -1U; +/* Internal processor count */ +unsigned int num_processors __initdata = 0; + +unsigned disabled_cpus __initdata; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; + +/* ACPI MADT entry parsing functions */ +#ifdef CONFIG_ACPI +extern struct acpi_boot_flags acpi_boot; +#ifdef CONFIG_X86_LOCAL_APIC +extern int acpi_parse_lapic (acpi_table_entry_header *header); +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); +#endif /*CONFIG_X86_LOCAL_APIC*/ +#ifdef CONFIG_X86_IO_APIC +extern int acpi_parse_ioapic (acpi_table_entry_header *header); +#endif /*CONFIG_X86_IO_APIC*/ +#endif /*CONFIG_ACPI*/ + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + +/* + * Intel MP BIOS table parsing routines: + */ + +/* + * Checksum an MP configuration block. 
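+ * A block is valid only if the byte-wise sum of its contents is zero
+ * modulo 256.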
+ */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifndef CONFIG_XEN +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +{ + int cpu; + unsigned char ver; + cpumask_t tmp_map; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_id = m->mpc_apicid; + } + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + num_processors++; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + +#if MAX_APICS < 255 + if ((int)m->mpc_apicid > MAX_APICS) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } +#endif + ver = m->mpc_apicver; + + physid_set(m->mpc_apicid, phys_cpu_present_map); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + /* + * bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + */ + cpu = 0; + } + bios_cpu_apicid[cpu] = m->mpc_apicid; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); +} +#else +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +{ + num_processors++; +} +#endif /* CONFIG_XEN */ + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + + if (strncmp(str, "ISA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, "EISA", 4) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, "PCI", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, "MCA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else { + printk(KERN_ERR "Unknown bustype %s\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk("I/O APIC #%d Version %d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries >= MAX_IRQ_SOURCES) + 
panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk("SMP mptable: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], + mpc->mpc_signature[1], + mpc->mpc_signature[2], + mpc->mpc_signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk("SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(str,mpc->mpc_oem,8); + str[8]=0; + printk(KERN_INFO "OEM ID: %s ",str); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + printk("APIC at: 0x%X\n",mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + } + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. 
To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. 
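+ * (IRQ0 is routed to INTIN2, IRQ2 is skipped, and a final ExtINT entry
+ * for the 8259A is appended.)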
+ */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + extern void __bad_mpf_size(void); + unsigned int *bp = isa_bus_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + __bad_mpf_size(); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_intel_smp (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... 
+ * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + */ + + address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + if (smp_scan_config(address, 0x1000)) + return; + + /* If we have come this far, we did not find an MP table */ + printk(KERN_INFO "No mptable found.\n"); +} + +/* + * - Intel MP Configuration Table + */ +void __init find_smp_config (void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + find_intel_smp(); +#endif +} + + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +void __init mp_register_lapic_address ( + u64 address) +{ +#ifndef CONFIG_XEN + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +#endif +} + + +void __cpuinit mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (id >= MAX_APICS) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + +#ifndef CONFIG_XEN + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; +#endif + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_start; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. 
*/ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_start) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + mp_ioapics[idx].mpc_apicid = id; + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI IRQs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_start = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_start, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. 
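+ * (the inner scan below skips an IRQ whenever an override entry already
+ * claims either the ISA source IRQ or the target IO-APIC pin)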
+ */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } + + return; +} + +#define MAX_GSI_NUM 4096 + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, to the IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi_to_irq[gsi]; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_fadt.sci_int) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); + return gsi; +} + +#endif /*CONFIG_X86_IO_APIC*/ +#endif /*CONFIG_ACPI*/ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/process_64-xen.c 2008-08-07 12:44:36.000000000 +0200 @@ -0,0 +1,848 @@ +/* + * linux/arch/x86-64/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + * + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include <stdarg.h> + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/module.h> +#include <linux/a.out.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/ptrace.h> +#include <linux/utsname.h> +#include <linux/random.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/mmu_context.h> +#include <asm/pda.h> +#include <asm/prctl.h> +#include <asm/kdebug.h> +#include <xen/interface/platform.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/hardirq.h> +#include <asm/ia32.h> +#include <asm/idle.h> + +#include <xen/cpu_hotplug.h> + +asmlinkage extern void ret_from_fork(void); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL(idle_notifier_unregister); + +enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; +static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; + +void enter_idle(void) +{ + __get_cpu_var(idle_state) = CPU_IDLE; + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + __get_cpu_var(idle_state) = CPU_NOT_IDLE; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + if (current->pid | read_pda(irqcount)) + return; + __exit_idle(); +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. 
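+ * (note that cpu_idle() below always selects xen_idle(); idle=poll still
+ * assigns pm_idle, but pm_idle is not consulted in this Xen variant)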
+ */ +static void poll_idle (void) +{ + local_irq_enable(); + + asm volatile( + "2:" + "testl %0,%1;" + "rep; nop;" + "je 2b;" + : : + "i" (_TIF_NEED_RESCHED), + "m" (current_thread_info()->flags)); +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static inline void play_dead(void) +{ + idle_task_exit(); + local_irq_disable(); + cpu_clear(smp_processor_id(), cpu_initialized); + preempt_enable_no_resched(); + VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); + cpu_bringup(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle (void) +{ + current_thread_info()->status |= TS_POLLING; + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + rmb(); + idle = xen_idle; /* no alternatives */ + if (cpu_is_offline(smp_processor_id())) + play_dead(); + enter_idle(); + idle(); + __exit_idle(); + } + + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && + !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ +} + +static int __init idle_setup (char *str) +{ + if (!strncmp(str, "poll", 4)) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } + + boot_option_idle_override = 1; + return 1; +} + +__setup("idle=", idle_setup); + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs * regs) +{ + unsigned long fs, gs, shadowgs; + unsigned int fsindex,gsindex; + unsigned int ds,cs,es; + + printk("\n"); + print_modules(); + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + system_utsname.release, + (int)strcspn(system_utsname.version, " "), + system_utsname.version); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); + printk_address(regs->rip); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP: %016lx R08: %016lx R09: %016lx\n", + regs->rbp, regs->r8, regs->r9); + printk("R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk("R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("mov %%ds,%0" : "=r" (ds)); + asm("mov %%cs,%0" : "=r" (cs)); + asm("mov %%es,%0" : "=r" (es)); + 
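+ /* the fs/gs selectors read below carry little information on x86-64;
+ the 64-bit bases come from the FS_BASE/GS_BASE/KERNEL_GS_BASE MSRs */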
asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs,fsindex,gs,gsindex,shadowgs); + printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es); + +} + +void show_regs(struct pt_regs *regs) +{ + printk("CPU %d:", smp_processor_id()); + __show_regs(regs); + show_trace(NULL, regs, (void *)(regs + 1)); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + + if (me->thread.io_bitmap_ptr) { +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); +#endif +#ifdef CONFIG_XEN + struct physdev_set_iobitmap iobmp_op; + memset(&iobmp_op, 0, sizeof(iobmp_op)); +#endif + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + /* + * Careful, clear this in the TSS too: + */ +#ifndef CONFIG_X86_NO_TSS + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + put_cpu(); +#endif +#ifdef CONFIG_XEN + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &iobmp_op)); +#endif + t->io_bitmap_max = 0; + } +} + +void load_gs_index(unsigned gs) +{ + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + struct thread_info *t = current_thread_info(); + + if (t->flags & _TIF_ABI_PENDING) { + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + if (t->flags & _TIF_IA32) + current_thread_info()->status |= TS_COMPAT; + } + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct n_desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + desc->a = LDT_entry_a(&ud); + desc->b = LDT_entry_b(&ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + struct desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + return desc->base0 | + (((u32)desc->base1) << 16) | + (((u32)desc->base2) << 24); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. 
+ */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + int err; + struct pt_regs * childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) + (THREAD_SIZE + task_stack_page(p))) - 1; + *childregs = *regs; + + childregs->rax = 0; + childregs->rsp = rsp; + if (rsp == ~0UL) + childregs->rsp = (unsigned long)childregs; + + p->thread.rsp = (unsigned long) childregs; + p->thread.rsp0 = (unsigned long) (childregs+1); + p->thread.userrsp = me->thread.userrsp; + + set_tsk_thread_flag(p, TIF_FORK); + + p->thread.fs = me->thread.fs; + p->thread.gs = me->thread.gs; + + asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); + asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); + asm("mov %%es,%0" : "=m" (p->thread.es)); + asm("mov %%ds,%0" : "=m" (p->thread.ds)); + + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = ia32_child_tls(p, childregs); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + p->thread.iopl = current->thread.iopl; + + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +static inline void __save_init_fpu( struct task_struct *tsk ) +{ + asm volatile( "rex64 ; fxsave %0 ; fnclex" + : "=m" (tsk->thread.i387.fxsave)); + tsk->thread_info->status &= ~TS_USEDFPU; +} + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + */ +__kprobes struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + * The AMD workaround requires it to be after DS reload, or + * after DS has been cleared, which we do in __prepare_arch_switch. + */ + if (prev_p->thread_info->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } + + /* + * Reload esp0, LDT and the page table pointer: + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->rsp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. 
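+ * (only descriptors that differ between prev and next are queued, one
+ * HYPERVISOR_update_descriptor multicall entry per changed GDT slot)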
+ */ +#define C(i) do { \ + if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = virt_to_machine( \ + &cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]); \ + mcl->args[1] = next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : next->iopl; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + if (unlikely(next->es)) + loadsegment(es, next->es); + + if (unlikely(next->ds)) + loadsegment(ds, next->ds); + + /* + * Switch FS and GS. + */ + if (unlikely(next->fsindex)) + loadsegment(fs, next->fsindex); + + if (next->fs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); + + if (unlikely(next->gsindex)) + load_gs_index(next->gsindex); + + if (next->gs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); + + /* + * Switch the PDA context. + */ + prev->userrsp = read_pda(oldrsp); + write_pda(oldrsp, next->userrsp); + write_pda(pcurrent, next_p); + write_pda(kernelstack, + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(next->debugreg7)) { + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); + } + + return prev_p; +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs regs) +{ + long error; + char * filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } + putname(filename); + return error; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. 
*/ + current->personality &= ~READ_IMPLIES_EXEC; +} + +asmlinkage long sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); +} + +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->rsp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage long sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + NULL, NULL); +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp,rip; + int count = 0; + + if (!p || p == current || p->state==TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.rsp); + do { + if (fp < (unsigned long)stack || + fp > (unsigned long)stack+THREAD_SIZE) + return 0; + rip = *(u64 *)(fp+8); + if (!in_sched_functions(rip)) + return rip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. 
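+ (bases that fit in 32 bits are installed as a GDT TLS descriptor and
+ loaded through a selector; larger bases need the
+ HYPERVISOR_set_segment_base hypercall)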
*/ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = HYPERVISOR_set_segment_base(SEGBASE_FS, + addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) + rdmsrl(MSR_FS_BASE, base); + else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + unsigned gsindex; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + asm("movl %%gs,%0" : "=r" (gsindex)); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } + else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs *pp, ptregs; + + pp = task_pt_regs(tsk); + + ptregs = *pp; + ptregs.cs &= 0xffff; + ptregs.ss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + boot_option_idle_override = 1; + return 1; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup_64-xen.c 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,1656 @@ +/* + * linux/arch/x86-64/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Nov 2001 Dave Jones <davej@suse.de> + * Forked from i386 setup code. 
+ */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/screen_info.h> +#include <linux/ioport.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <linux/console.h> +#include <linux/seq_file.h> +#include <linux/crash_dump.h> +#include <linux/root_dev.h> +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/kallsyms.h> +#include <linux/edd.h> +#include <linux/mmzone.h> +#include <linux/kexec.h> +#include <linux/cpufreq.h> +#include <linux/dmi.h> +#include <linux/dma-mapping.h> +#include <linux/ctype.h> + +#include <asm/mtrr.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/msr.h> +#include <asm/desc.h> +#include <video/edid.h> +#include <asm/e820.h> +#include <asm/dma.h> +#include <asm/mpspec.h> +#include <asm/mmu_context.h> +#include <asm/bootsetup.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/mach_apic.h> +#include <asm/numa.h> +#include <asm/sections.h> +#include <asm/dmi.h> +#ifdef CONFIG_XEN +#include <linux/percpu.h> +#include <xen/interface/physdev.h> +#include "setup_arch_pre.h" +#include <asm/hypervisor.h> +#include <xen/interface/nmi.h> +#include <xen/features.h> +#include <xen/firmware.h> +#include <xen/xencons.h> +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) +#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#include <asm/mach-xen/setup_arch_post.h> +#include <xen/interface/memory.h> + +#ifdef CONFIG_XEN +#include <xen/interface/kexec.h> +#endif + +extern unsigned long start_pfn; +extern struct edid_info edid_info; + +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +extern char hypercall_page[PAGE_SIZE]; +EXPORT_SYMBOL(hypercall_page); + +static int xen_panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block xen_panic_block = { + xen_panic_event, NULL, 0 /* try to go last */ +}; + +unsigned long *phys_to_machine_mapping; +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[512]; + +EXPORT_SYMBOL(phys_to_machine_mapping); + +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); +DEFINE_PER_CPU(int, nr_multicall_ents); + +/* Raw start-of-day parameters from the hypervisor. */ +start_info_t *xen_start_info; +EXPORT_SYMBOL(xen_start_info); +#endif + +/* + * Machine setup.. 
+ */ + +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); + +unsigned long mmu_cr4_features; + +int acpi_disabled; +EXPORT_SYMBOL(acpi_disabled); +#ifdef CONFIG_ACPI +extern int __initdata acpi_ht; +extern acpi_interrupt_flags acpi_sci_flags; +int __initdata acpi_force = 0; +#endif + +int acpi_numa __initdata; + +/* Boot loader ID as an integer, for the benefit of proc_dointvec */ +int bootloader_type; + +unsigned long saved_video_mode; + +/* + * Early DMI memory + */ +int dmi_alloc_index; +char dmi_alloc_data[DMI_MAX_DATA]; + +/* + * Setup options + */ +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct sys_desc_table_struct { + unsigned short length; + unsigned char table[0]; +}; + +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); +struct e820map e820; +#ifdef CONFIG_XEN +struct e820map machine_e820; +#endif + +extern int root_mountflags; + +char command_line[COMMAND_LINE_SIZE]; + +struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x6f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +#define STANDARD_IO_RESOURCES \ + (sizeof standard_io_resources / sizeof standard_io_resources[0]) + +#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) + +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_RAM, +}; +struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_RAM, +}; + +#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_ROM, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_ROM, +}; + +static struct resource adapter_rom_resources[] = { + { .name = "Adapter ROM", .start = 0xc8000, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM }, + { .name = "Adapter ROM", .start = 0, .end = 0, + .flags = IORESOURCE_ROM } +}; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_ROM, +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_RAM, 
+}; + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + +#ifdef CONFIG_XEN + /* Nothing to do if not running in dom0. */ + if (!is_initial_xendomain()) + return; +#endif + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +/* Check for full argument with no trailing characters */ +static int fullarg(char *p, char *arg) +{ + int l = strlen(arg); + return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); +} + +static __init void parse_cmdline_early (char ** cmdline_p) +{ + char c = ' ', *to = command_line, *from = COMMAND_LINE; + int len = 0; + int userdef = 0; + + for (;;) { + if (c != ' ') + goto next_char; + +#ifdef CONFIG_SMP + /* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. 
+ */ + else if (!memcmp(from, "maxcpus=", 8)) { + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(from + 8, NULL, 0); + } +#endif +#ifdef CONFIG_ACPI + /* "acpi=off" disables both ACPI table parsing and interpreter init */ + if (fullarg(from,"acpi=off")) + disable_acpi(); + + if (fullarg(from, "acpi=force")) { + /* add later when we do DMI horrors: */ + acpi_force = 1; + acpi_disabled = 0; + } + + /* acpi=ht just means: do ACPI MADT parsing + at bootup, but don't enable the full ACPI interpreter */ + if (fullarg(from, "acpi=ht")) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + else if (fullarg(from, "pci=noacpi")) + acpi_disable_pci(); + else if (fullarg(from, "acpi=noirq")) + acpi_noirq_set(); + + else if (fullarg(from, "acpi_sci=edge")) + acpi_sci_flags.trigger = 1; + else if (fullarg(from, "acpi_sci=level")) + acpi_sci_flags.trigger = 3; + else if (fullarg(from, "acpi_sci=high")) + acpi_sci_flags.polarity = 1; + else if (fullarg(from, "acpi_sci=low")) + acpi_sci_flags.polarity = 3; + + /* acpi=strict disables out-of-spec workarounds */ + else if (fullarg(from, "acpi=strict")) { + acpi_strict = 1; + } +#ifdef CONFIG_X86_IO_APIC + else if (fullarg(from, "acpi_skip_timer_override")) + acpi_skip_timer_override = 1; +#endif +#endif + +#ifndef CONFIG_XEN + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + disable_apic = 1; + } + + if (fullarg(from, "noapic")) + skip_ioapic_setup = 1; + + if (fullarg(from,"apic")) { + skip_ioapic_setup = 0; + ioapic_force = 1; + } +#endif + + if (!memcmp(from, "mem=", 4)) + parse_memopt(from+4, &from); + + if (!memcmp(from, "memmap=", 7)) { + /* exactmap option is for used defined memory */ + if (!memcmp(from+7, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram(); +#endif + from += 8+7; + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + } + else { + parse_memmapopt(from+7, &from); + userdef = 1; + } + } + +#ifdef CONFIG_NUMA + if (!memcmp(from, "numa=", 5)) + numa_setup(from+5); +#endif + + if (!memcmp(from,"iommu=",6)) { + iommu_setup(from+6); + } + + if (fullarg(from,"oops=panic")) + panic_on_oops = 1; + + if (!memcmp(from, "noexec=", 7)) + nonx_setup(from + 7); + +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { +#ifndef CONFIG_XEN + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } +#else + printk("Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); +#endif + } +#endif + +#ifdef CONFIG_PROC_VMCORE + /* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. 
+ */ + else if(!memcmp(from, "elfcorehdr=", 11)) + elfcorehdr_addr = memparse(from+11, &from); +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) + else if (!memcmp(from, "additional_cpus=", 16)) + setup_additional_cpus(from+16); +#endif + + next_char: + c = *(from++); + if (!c) + break; + if (COMMAND_LINE_SIZE <= ++len) + break; + *(to++) = c; + } + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } + *to = '\0'; + *cmdline_p = command_line; +} + +#ifndef CONFIG_NUMA +static void __init +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long bootmap_size, bootmap; + + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n",bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); +#ifdef CONFIG_XEN + if (xen_start_info->nr_pages < end_pfn) + e820_bootmem_free(NODE_DATA(0), 0, + xen_start_info->nr_pages<<PAGE_SHIFT); + else +#endif + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); +} +#endif + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +#ifndef CONFIG_XEN +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); + edd.mbr_signature_nr = EDD_MBR_SIG_NR; + edd.edd_info_nr = EDD_NR; +} +#endif +#else +static inline void copy_edd(void) +{ +} +#endif + +#ifndef CONFIG_XEN +#define EBDA_ADDR_POINTER 0x40E + +unsigned __initdata ebda_addr; +unsigned __initdata ebda_size; + +static void discover_ebda(void) +{ + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr <<= 4; + + ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; +} +#else +#define discover_ebda() ((void)0) +#endif + +void __init setup_arch(char **cmdline_p) +{ +#ifdef CONFIG_XEN + /* Register a call for panic conditions. 
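+ (xen_panic_event() reports a crashed guest to the hypervisor via
+ HYPERVISOR_shutdown(SHUTDOWN_crash))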
*/ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + + ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); + screen_info = SCREEN_INFO; + + if (is_initial_xendomain()) { + const struct dom0_vga_console_info *info = + (void *)((char *)xen_start_info + + xen_start_info->console.dom0.info_off); + + dom0_init_screen_info(info, + xen_start_info->console.dom0.info_size); + xen_start_info->console.domU.mfn = 0; + xen_start_info->console.domU.evtchn = 0; + } else + screen_info.orig_video_isVGA = 0; + + copy_edid(); + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + ARCH_SETUP +#else + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); + screen_info = SCREEN_INFO; + edid_info = EDID_INFO; +#endif /* !CONFIG_XEN */ + saved_video_mode = SAVED_VIDEO_MODE; + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + setup_memory_region(); + copy_edd(); + + if (!MOUNT_ROOT_RDONLY) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) &_text; + init_mm.end_code = (unsigned long) &_etext; + init_mm.end_data = (unsigned long) &_edata; + init_mm.brk = (unsigned long) &_end; + + code_resource.start = virt_to_phys(&_text); + code_resource.end = virt_to_phys(&_etext)-1; + data_resource.start = virt_to_phys(&_etext); + data_resource.end = virt_to_phys(&_edata)-1; + + parse_cmdline_early(cmdline_p); + + early_identify_cpu(&boot_cpu_data); + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + end_pfn = e820_end_of_ram(); + num_physpages = end_pfn; /* for pfn_valid */ + + check_efer(); + + discover_ebda(); + + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + + if (is_initial_xendomain()) + dmi_scan_machine(); + +#ifdef CONFIG_ACPI_NUMA + /* + * Parse SRAT to discover nodes. + */ + acpi_numa_init(); +#endif + +#ifdef CONFIG_NUMA + numa_initmem_init(0, end_pfn); +#else + contig_initmem_init(0, end_pfn); +#endif + +#ifdef CONFIG_XEN + /* + * Reserve kernel, physmap, start info, initial page tables, and + * direct mapping. + */ + reserve_bootmem_generic(__pa_symbol(&_text), + (table_end << PAGE_SHIFT) - __pa_symbol(&_text)); +#else + /* Reserve direct mapping */ + reserve_bootmem_generic(table_start << PAGE_SHIFT, + (table_end - table_start) << PAGE_SHIFT); + + /* reserve kernel */ + reserve_bootmem_generic(__pa_symbol(&_text), + __pa_symbol(&_end) - __pa_symbol(&_text)); + + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, + * enabling clean reboots, SMP operation, laptop functions. + */ + reserve_bootmem_generic(0, PAGE_SIZE); + + /* reserve ebda region */ + if (ebda_addr) + reserve_bootmem_generic(ebda_addr, ebda_size); + +#ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); + + /* Reserve SMP trampoline */ + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); +#endif +#endif + +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. 
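+ * (the ACPI wakeup code runs in real mode, so it must sit below 1MB)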
+ */ + acpi_reserve_bootmem(); +#endif +#ifdef CONFIG_XEN +#ifdef CONFIG_BLK_DEV_INITRD + if (xen_start_info->mod_start) { + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { + /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/ + initrd_start = INITRD_START + PAGE_OFFSET; + initrd_end = initrd_start+INITRD_SIZE; + initrd_below_start_ok = 1; + } else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + (unsigned long)(INITRD_START + INITRD_SIZE), + (unsigned long)(end_pfn << PAGE_SHIFT)); + initrd_start = 0; + } + } +#endif +#else /* CONFIG_XEN */ +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START) { + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { + reserve_bootmem_generic(INITRD_START, INITRD_SIZE); + initrd_start = + INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_end = initrd_start+INITRD_SIZE; + } + else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + (unsigned long)(INITRD_START + INITRD_SIZE), + (unsigned long)(end_pfn << PAGE_SHIFT)); + initrd_start = 0; + } + } +#endif +#endif /* !CONFIG_XEN */ +#ifdef CONFIG_KEXEC +#ifdef CONFIG_XEN + xen_machine_kexec_setup_resources(); +#else + if (crashk_res.start != crashk_res.end) { + reserve_bootmem_generic(crashk_res.start, + crashk_res.end - crashk_res.start + 1); + } +#endif +#endif + + paging_init(); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif +#ifdef CONFIG_XEN + { + int i, j, k, fpp; + unsigned long p2m_pages; + + p2m_pages = end_pfn; + if (xen_start_info->nr_pages > end_pfn) { + /* + * the end_pfn was shrunk (probably by mem= or highmem= + * kernel parameter); shrink reservation with the HV + */ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned int difference; + int ret; + + difference = xen_start_info->nr_pages - end_pfn; + + set_xen_guest_handle(reservation.extent_start, + ((unsigned long *)xen_start_info->mfn_list) + end_pfn); + reservation.nr_extents = difference; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + BUG_ON (ret != difference); + } + else if (end_pfn > xen_start_info->nr_pages) + p2m_pages = xen_start_info->nr_pages; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make sure we have a large enough P->M table. */ + phys_to_machine_mapping = alloc_bootmem_pages( + end_pfn * sizeof(unsigned long)); + memset(phys_to_machine_mapping, ~0, + end_pfn * sizeof(unsigned long)); + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info->mfn_list, + p2m_pages * sizeof(unsigned long)); + free_bootmem( + __pa(xen_start_info->mfn_list), + PFN_PHYS(PFN_UP(xen_start_info->nr_pages * + sizeof(unsigned long)))); + + /* + * Initialise the list of the frames that specify the + * list of frames that make up the p2m table. Used by + * save/restore. 
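+ * (two levels: pfn_to_mfn_frame_list_list holds the MFNs of the
+ * pfn_to_mfn_frame_list pages, which in turn hold the MFNs of the pages
+ * backing phys_to_machine_mapping itself)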
+ */ + pfn_to_mfn_frame_list_list = alloc_bootmem_pages(PAGE_SIZE); + + fpp = PAGE_SIZE/sizeof(unsigned long); + for (i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++) { + if ((j % fpp) == 0) { + k++; + BUG_ON(k>=fpp); + pfn_to_mfn_frame_list[k] = + alloc_bootmem_pages(PAGE_SIZE); + pfn_to_mfn_frame_list_list[k] = + virt_to_mfn(pfn_to_mfn_frame_list[k]); + j=0; + } + pfn_to_mfn_frame_list[k][j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + HYPERVISOR_shared_info->arch.max_pfn = end_pfn; + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(pfn_to_mfn_frame_list_list); + } + + /* Mark all ISA DMA channels in-use - using them wouldn't work. */ + for (i = 0; i < MAX_DMA_CHANNELS; ++i) + if (i != 4 && request_dma(i, "xen") != 0) + BUG(); + } + + if (!is_initial_xendomain()) { + acpi_disabled = 1; +#ifdef CONFIG_ACPI + acpi_ht = 0; +#endif + } +#endif + +#ifndef CONFIG_XEN + check_ioapic(); +#endif + + zap_low_mappings(0); + + /* + * set this early, so we dont allocate cpu0 + * if MADT list doesnt list BSP first + * mpparse.c/MP_processor_info() allocates logical cpu numbers. + */ + cpu_set(0, cpu_present_map); +#ifdef CONFIG_ACPI + /* + * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). + * Call this early for SRAT node setup. + */ + acpi_boot_table_init(); + + /* + * Read APIC and some other early information from ACPI tables. + */ + acpi_boot_init(); +#endif + + init_cpu_to_node(); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + get_smp_config(); +#ifndef CONFIG_XEN + init_apic_mappings(); +#endif +#endif +#if defined(CONFIG_XEN) && defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) + prefill_possible_map(); +#endif + + /* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ + probe_roms(); +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + e820_reserve_resources(machine_e820.map, machine_e820.nr_map); +#else + e820_reserve_resources(e820.map, e820.nr_map); +#endif + + request_resource(&iomem_resource, &video_ram_resource); + + { + unsigned i; + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + } + +#ifdef CONFIG_XEN + if (is_initial_xendomain()) + e820_setup_gap(machine_e820.map, machine_e820.nr_map); +#else + e820_setup_gap(e820.map, e820.nr_map); +#endif + +#ifdef CONFIG_XEN + { + struct physdev_set_iopl set_iopl; + + set_iopl.iopl = 1; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); + + if (is_initial_xendomain()) { +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + } else { +#if defined(CONFIG_VT) && defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif + } + } +#else /* CONFIG_XEN */ + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + +#endif /* !CONFIG_XEN */ +} + +#ifdef CONFIG_XEN +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + HYPERVISOR_shutdown(SHUTDOWN_crash); + /* we're never actually going to get here... 
*/ + return NOTIFY_DONE; +} +#endif /* !CONFIG_XEN */ + + +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + + if (c->extended_cpuid_level < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + return 1; +} + + +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, eax, ebx, ecx, edx; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size=(ecx>>24)+(edx>>24); + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; + } + + if (n >= 0x80000006) { + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + ecx = cpuid_ecx(0x80000006); + c->x86_cache_size = ecx >> 16; + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + c->x86_cache_size, ecx & 0xFF); + } + + if (n >= 0x80000007) + cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); + if (n >= 0x80000008) { + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +} + +#ifdef CONFIG_NUMA +static int nearby_node(int apicid) +{ + int i; + for (i = apicid - 1; i >= 0; i--) { + int node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + int node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits; +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node = 0; + unsigned apicid = hard_smp_processor_id(); +#endif + unsigned ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + c->phys_proc_id = phys_pkg_id(bits); + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. 
*/ + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __init init_amd(struct cpuinfo_x86 *c) +{ + unsigned level; + +#ifdef CONFIG_SMP + unsigned long value; + + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 15) { + rdmsrl(MSR_K8_HWCR, value); + value |= 1 << 6; + wrmsrl(MSR_K8_HWCR, value); + } +#endif + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_bit(0*32+31, &c->x86_capability); + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + + level = get_model_name(c); + if (!level) { + switch (c->x86) { + case 15: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } + display_cacheinfo(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) + amd_detect_cmp(c); + + /* Fix cpuid4 emulation for more */ + num_cache_leaves = 3; +} + +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1 ) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id(index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); + } +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + } + +#endif +} + +/* + * find out the number of processor cores on the die + */ +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, t; + + if (c->cpuid_level < 4) + return 1; + + cpuid_count(4, 0, &eax, &t, &t, &t); + + if (eax & 0x1f) + return ((eax >> 26) + 1); + else + return 1; +} + +static void srat_detect_node(void) +{ +#ifdef CONFIG_NUMA + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for 
now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + if (acpi_numa > 0) + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + +static void __cpuinit init_intel(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned n; + + init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + } + + n = c->extended_cpuid_level; + if (n >= 0x80000008) { + unsigned eax = cpuid_eax(0x80000008); + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; + } + + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + c->x86_max_cores = intel_num_cpu_cores(c); + + srat_detect_node(); +} + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + + if (!strcmp(v, "AuthenticAMD")) + c->x86_vendor = X86_VENDOR_AMD; + else if (!strcmp(v, "GenuineIntel")) + c->x86_vendor = X86_VENDOR_INTEL; + else + c->x86_vendor = X86_VENDOR_UNKNOWN; +} + +struct cpu_model_info { + int vendor; + int family; + char *model_names[16]; +}; + +/* Do some early cpuid on the boot CPU to get some parameter that are + needed before check_bugs. Everything advanced is in identify_cpu + below. */ +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +{ + u32 tfms; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + c->x86_max_cores = 1; + c->extended_cpuid_level = 0; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c); + + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + __u32 misc; + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], + &c->x86_capability[0]); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + +#ifdef CONFIG_SMP + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; +#endif +} + +/* + * This does the hard work of actually picking apart the CPU stuff... 
+ */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + u32 xlvl; + + early_identify_cpu(c); + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if (xlvl >= 0x80000004) + get_model_name(c); /* Default name */ + } + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + /* Don't set x86_cpuid_level here for now to not confuse. */ + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } + +#ifndef CONFIG_XEN + c->apicid = phys_pkg_id(0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + init_amd(c); + break; + + case X86_VENDOR_INTEL: + init_intel(c); + break; + + case X86_VENDOR_UNKNOWN: + default: + display_cacheinfo(c); + break; + } + + select_idle_routine(c); + detect_ht(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0 ; i < NCAPINTS ; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + +#ifdef CONFIG_X86_MCE + mcheck_init(c); +#endif + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + if (c->x86_model_id[0]) + printk("%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(" stepping %02x\n", c->x86_mask); + else + printk("\n"); +} + +/* + * Get CPU information for use by the procfs. + */ + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + struct cpuinfo_x86 *c = v; + + /* + * These flag bits must match the definitions in <asm/cpufeature.h>. + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu/<cpu_nr>/cpuid instead. 
+ */ + static char *x86_cap_flags[] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, + + /* AMD-defined */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, + "constant_tsc", NULL, NULL, + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + }; + static char *x86_power_flags[] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + NULL, + /* nothing */ /* constant_tsc - moved to flags */ + }; + + +#ifdef CONFIG_SMP + if (!cpu_online(c-cpu_data)) + return 0; +#endif + + seq_printf(m,"processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)(c-cpu_data), + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); + + if (c->x86_mask || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_mask); + else + seq_printf(m, "stepping\t: unknown\n"); + + if (cpu_has(c,X86_FEATURE_TSC)) { + unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); + if (!freq) + freq = cpu_khz; + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", + freq / 1000, (freq % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size >= 0) + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + +#ifdef CONFIG_SMP + if (smp_num_siblings * c->x86_max_cores > 1) { + int cpu = c - cpu_data; + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + } +#endif + + seq_printf(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", + c->cpuid_level); + + { + int i; + for ( i = 0 ; i < 32*NCAPINTS ; i++ ) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + } + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); + seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + seq_printf(m, "power management:"); + { + unsigned i; + for (i = 0; i < 32; i++) + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0]?" ":"", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } + } + + seq_printf(m, "\n\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + return *pos < NR_CPUS ? cpu_data + *pos : NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations cpuinfo_op = { + .start =c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; + +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) +#include <linux/platform_device.h> +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + if (!is_initial_xendomain()) + return 0; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/setup64-xen.c 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,367 @@ +/* + * X86-64 specific CPU setup. + * Copyright (C) 1995 Linus Torvalds + * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. + * See setup.c for older changelog. 
+ * + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen + * + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <asm/bootsetup.h> +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/mmu_context.h> +#include <asm/smp.h> +#include <asm/i387.h> +#include <asm/percpu.h> +#include <asm/proto.h> +#include <asm/sections.h> +#ifdef CONFIG_XEN +#include <asm/hypervisor.h> +#endif + +char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); +struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; + +#ifndef CONFIG_X86_NO_IDT +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +#endif + +char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); + +unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL(__supported_pte_mask); +static int do_not_nx __cpuinitdata = 0; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +int __init nonx_setup(char *str) +{ + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 1; +} +__setup("noexec=", nonx_setup); /* parsed early actually */ + +int force_personality32 = 0; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes +off PROT_READ implies PROT_EXEC (default) +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +/* + * Great future plan: + * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
+ * Always point %gs to its beginning + */ +void __init setup_per_cpu_areas(void) +{ + int i; + unsigned long size; + +#ifdef CONFIG_HOTPLUG_CPU + prefill_possible_map(); +#endif + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); +#ifdef CONFIG_MODULES + if (size < PERCPU_ENOUGH_ROOM) + size = PERCPU_ENOUGH_ROOM; +#endif + + for_each_cpu_mask (i, cpu_possible_map) { + char *ptr; + + if (!NODE_DATA(cpu_to_node(i))) { + printk("cpu with no node %d, num_online_nodes %d\n", + i, num_online_nodes()); + ptr = alloc_bootmem(size); + } else { + ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + } + if (!ptr) + panic("Cannot allocate cpu data for CPU %d\n", i); + cpu_pda(i)->data_offset = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + } +} + +#ifdef CONFIG_XEN +static void switch_pt(void) +{ + xen_pt_switch(__pa_symbol(init_level4_pgt)); + xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); +} + +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) +{ + unsigned long frames[16]; + unsigned long va; + int f; + + for (va = gdt_descr->address, f = 0; + va < gdt_descr->address + gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / + sizeof (struct desc_struct))) + BUG(); +} +#else +static void switch_pt(void) +{ + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); +} + +static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) +{ + asm volatile("lgdt %0" :: "m" (*gdt_descr)); + asm volatile("lidt %0" :: "m" (idt_descr)); +} +#endif + +void pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); +#ifndef CONFIG_XEN + wrmsrl(MSR_GS_BASE, pda); +#else + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, + (unsigned long)pda)) + BUG(); +#endif + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { +#ifdef CONFIG_XEN + xen_init_pt(); +#endif + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + } + + switch_pt(); + + pda->irqstackptr += IRQSTACKSIZE-64; +} + +#ifndef CONFIG_X86_NO_TSS +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] +__attribute__((section(".bss.page_aligned"))); +#endif + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ +#ifndef CONFIG_XEN + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to set CS/DS + * but only a 32bit target. LSTAR sets the 64bit rip. 
+ */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); +#endif +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init (); +#endif +} + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) { + __supported_pte_mask &= ~_PAGE_NX; + } +} + +unsigned long kernel_eflags; + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. + */ +void __cpuinit cpu_init (void) +{ + int cpu = stack_smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + unsigned i; +#endif + struct task_struct *me; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) { + pda_init(cpu); + zap_low_mappings(cpu); + } +#ifndef CONFIG_X86_NO_TSS + else + estacks = boot_exception_stacks; +#endif + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ +#ifndef CONFIG_XEN + if (cpu) + memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); +#endif + + cpu_gdt_descr[cpu].size = GDT_SIZE; + cpu_gdt_init(&cpu_gdt_descr[cpu]); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + +#ifndef CONFIG_X86_NO_TSS + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + switch (v + 1) { +#if DEBUG_STKSZ > EXCEPTION_STKSZ + case DEBUG_STACK: + cpu_pda(cpu)->debugstack = (unsigned long)estacks; + estacks += DEBUG_STKSZ; + break; +#endif + default: + estacks += EXCEPTION_STKSZ; + break; + } + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + } + + t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. 
+ */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; +#endif + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + +#ifndef CONFIG_X86_NO_TSS + set_tss_desc(cpu, t); +#endif +#ifndef CONFIG_XEN + load_TR_desc(); +#endif + load_LDT(&init_mm.context); + + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); + + fpu_init(); + + raw_local_save_flags(kernel_eflags); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/smp_64-xen.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,575 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/smp_lock.h> +#include <linux/smp.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/interrupt.h> + +#include <asm/mtrr.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mach_apic.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/apicdef.h> +#include <asm/idle.h> +#ifdef CONFIG_XEN +#include <xen/evtchn.h> +#endif + +#ifndef CONFIG_XEN +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + cpumask_t flush_cpumask; + struct mm_struct *flush_mm; + unsigned long flush_va; +#define FLUSH_ALL -1ULL + spinlock_t tlbstate_lock; + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +static inline void leave_mm(unsigned long cpu) +{ + if (read_pda(mmu_state) == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. 
This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) +{ + int cpu; + int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); + + if (!cpu_isset(cpu, f->flush_cpumask)) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == read_pda(active_mm)) { + if (read_pda(mmu_state) == TLBSTATE_OK) { + if (f->flush_va == FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + cpu_clear(cpu, f->flush_cpumask); +} + +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* Could avoid this lock when + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + probably not worth checking this for a cache-hot lock. */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); + + /* + * We have to send the IPI only to + * CPUs affected. 
+ */ + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpus_empty(f->flush_cpumask)) + cpu_relax(); + + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} + +int __cpuinit init_smp_flush(void) +{ + int i; + for_each_cpu_mask(i, cpu_possible_map) { + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); + } + return 0; +} + +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + local_flush_tlb(); + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_current_task); + +void flush_tlb_mm (struct mm_struct * mm) +{ + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if(current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, va); + + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_page); + +static void do_flush_tlb_all(void* info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (read_pda(mmu_state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); +} +#endif /* Xen */ + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ + +void smp_send_reschedule(int cpu) +{ + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +} + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static struct call_data_struct * call_data; + +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + +/* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + * + * cpu is a standard Linux logical CPU number. 
+ */ +static void +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. + */ + +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} + +/* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. + */ +static void __smp_call_function (void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus()-1; + + if (!cpus) + return; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function - run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other + * CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute func or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + * Actually there are a few legal cases, like panic. 
+ */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + spin_lock(&call_lock); + __smp_call_function(func,info,nonatomic,wait); + spin_unlock(&call_lock); + return 0; +} +EXPORT_SYMBOL(smp_call_function); + +void smp_stop_cpu(void) +{ + unsigned long flags; + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + local_irq_save(flags); + disable_all_local_evtchn(); + local_irq_restore(flags); +} + +static void smp_really_stop_cpu(void *dummy) +{ + smp_stop_cpu(); + for (;;) + halt(); +} + +void smp_send_stop(void) +{ + int nolock = 0; +#ifndef CONFIG_XEN + if (reboot_force) + return; +#endif + /* Don't deadlock on the call lock in panic */ + if (!spin_trylock(&call_lock)) { + /* ignore locking because we have panicked anyways */ + nolock = 1; + } + __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); + + local_irq_disable(); + disable_all_local_evtchn(); + local_irq_enable(); +} + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +#ifndef CONFIG_XEN +asmlinkage void smp_reschedule_interrupt(void) +#else +asmlinkage irqreturn_t smp_reschedule_interrupt(void) +#endif +{ +#ifndef CONFIG_XEN + ack_APIC_irq(); +#else + return IRQ_HANDLED; +#endif +} + +#ifndef CONFIG_XEN +asmlinkage void smp_call_function_interrupt(void) +#else +asmlinkage irqreturn_t smp_call_function_interrupt(void) +#endif +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + +#ifndef CONFIG_XEN + ack_APIC_irq(); +#endif + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + exit_idle(); + irq_enter(); + (*func)(info); + irq_exit(); + if (wait) { + mb(); + atomic_inc(&call_data->finished); + } +#ifdef CONFIG_XEN + return IRQ_HANDLED; +#endif +} + +int safe_smp_processor_id(void) +{ +#ifdef CONFIG_XEN + return smp_processor_id(); +#else + unsigned apicid, i; + + if (disable_apic) + return 0; + + apicid = hard_smp_processor_id(); + if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) + return apicid; + + for (i = 0; i < NR_CPUS; ++i) { + if (x86_cpu_to_apicid[i] == apicid) + return i; + } + + /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, + * or called too early. Either way, we must be CPU 0. */ + if (x86_cpu_to_apicid[0] == BAD_APICID) + return 0; + + return 0; /* Should not happen */ +#endif +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/traps_64-xen.c 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,1173 @@ +/* + * linux/arch/x86-64/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * 'Traps.c' handles hardware traps and faults after we have saved some + * state in 'entry.S'. 
+ */ +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/nmi.h> +#include <linux/kprobes.h> +#include <linux/kexec.h> +#include <linux/unwind.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/kdebug.h> +#include <asm/processor.h> +#include <asm/unwind.h> +#include <asm/smp.h> +#include <asm/pgalloc.h> +#include <asm/pda.h> +#include <asm/proto.h> +#include <asm/nmi.h> + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void double_fault(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); +asmlinkage void reserved(void); +asmlinkage void alignment_check(void); +asmlinkage void machine_check(void); +asmlinkage void spurious_interrupt_bug(void); + +ATOMIC_NOTIFIER_HEAD(die_chain); +EXPORT_SYMBOL(die_chain); + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ + +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + preempt_disable(); + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_disable(); + /* Make sure to not schedule here because we could be running + on an exception stack. 
*/ + preempt_enable_no_resched(); +} + +static int kstack_depth_to_print = 12; +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif + +#ifdef CONFIG_KALLSYMS +# include <linux/kallsyms.h> +void printk_address(unsigned long address) +{ + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) + modname = delim = ""; + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} +#else +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} +#endif + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, const char **idp) +{ +#ifndef CONFIG_X86_NO_TSS + static char ids[][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif + }; + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end; + + /* + * set 'end' to the end of the exception stack. + */ + switch (k + 1) { + /* + * TODO: this block is not needed i think, because + * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] + * properly too. + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + case DEBUG_STACK: + end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; + break; +#endif + default: + end = per_cpu(orig_ist, cpu).ist[k]; + break; + } + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. 
Dont look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif + } +#endif + return NULL; +} + +static int show_trace_unwind(struct unwind_frame_info *info, void *context) +{ + int n = 0; + + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + printk_address(UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + } + return n; +} + +/* + * x86-64 can have upto three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) +{ + const unsigned cpu = safe_smp_processor_id(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned used = 0; + + printk("\nCall Trace:\n"); + + if (!tsk) + tsk = current; + + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) + unw_ret = show_trace_unwind(&info, NULL); + } else if (tsk == current) + unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + else { + if (unwind_init_blocked(&info, tsk) == 0) + unw_ret = show_trace_unwind(&info, NULL); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + print_symbol("DWARF2 unwinder stuck at %s\n", + UNW_PC(&info)); + if ((long)UNW_SP(&info) < 0) { + printk("Leftover inexact backtrace:\n"); + stack = (unsigned long *)UNW_SP(&info); + } else + printk("Full inexact backtrace again:\n"); + } else if (call_trace >= 1) + return; + else + printk("Full inexact backtrace again:\n"); + } else + printk("Inexact backtrace:\n"); + } + + /* + * Print function call entries within a stack. 'cond' is the + * "end of stackframe" condition, that the 'stack++' + * iteration will eventually trigger. + */ +#define HANDLE_STACK(cond) \ + do while (cond) { \ + unsigned long addr = *stack++; \ + if (kernel_text_address(addr)) { \ + /* \ + * If the address is either in the text segment of the \ + * kernel, or in the region which contains vmalloc'ed \ + * memory, it *may* be the address of a calling \ + * routine; if so, print it so that someone tracing \ + * down the cause of the crash will be able to figure \ + * out the call path that was taken. \ + */ \ + printk_address(addr); \ + } \ + } while (0) + + /* + * Print function call entries in all stacks, starting at the + * current stack address. 
If the stacks consist of nested + * exceptions + */ + for ( ; ; ) { + const char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + printk(" <%s>", id); + HANDLE_STACK (stack < estack_end); + printk(" <EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) estack_end[-2]; + continue; + } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + printk(" <IRQ>"); + HANDLE_STACK (stack < irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + printk(" <EOI>"); + continue; + } + } + break; + } + + /* + * This prints the process stack: + */ + HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); +#undef HANDLE_STACK + + printk("\n"); +} + +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) +{ + unsigned long *stack; + int i; + const int cpu = safe_smp_processor_id(); + unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + + // debugging aid: "show_stack(NULL, NULL);" prints the + // back trace for this cpu. + + if (rsp == NULL) { + if (tsk) + rsp = (unsigned long *)tsk->thread.rsp; + else + rsp = (unsigned long *)&rsp; + } + + stack = rsp; + for(i=0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" <EOI> "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % 4) == 0)) + printk("\n"); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); + } + show_trace(tsk, regs, rsp); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ + _show_stack(tsk, NULL, rsp); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long dummy; + show_trace(NULL, NULL, &dummy); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + int in_kernel = !user_mode(regs); + unsigned long rsp; + const int cpu = safe_smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + + rsp = regs->rsp; + + printk("CPU %d ", cpu); + __show_regs(regs); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
+ */ + if (in_kernel) { + + printk("Stack: "); + _show_stack(NULL, regs, (unsigned long*)rsp); + + printk("\nCode: "); + if (regs->rip < PAGE_OFFSET) + goto bad; + + for (i=0; i<20; i++) { + unsigned char c; + if (__get_user(c, &((unsigned char*)regs->rip)[i])) { +bad: + printk(" Bad RIP value."); + break; + } + printk("%02x ", c); + } + } + printk("\n"); +} + +void handle_BUG(struct pt_regs *regs) +{ + struct bug_frame f; + long len; + const char *prefix = ""; + + if (user_mode(regs)) + return; + if (__copy_from_user(&f, (const void __user *) regs->rip, + sizeof(struct bug_frame))) + return; + if (f.filename >= 0 || + f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) + return; + len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1; + if (len < 0 || len >= PATH_MAX) + f.filename = (int)(long)"unmapped filename"; + else if (len > 50) { + f.filename += len - 50; + prefix = "..."; + } + printk("----------- [cut here ] --------- [please bite here ] ---------\n"); + printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line); +} + +#ifdef CONFIG_BUG +void out_of_line_bug(void) +{ + BUG(); +} +EXPORT_SYMBOL(out_of_line_bug); +#endif + +static DEFINE_SPINLOCK(die_lock); +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu = safe_smp_processor_id(); + unsigned long flags; + + /* racy, but better than risking deadlock. */ + local_irq_save(flags); + if (!spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags) +{ + die_owner = -1; + bust_spinlocks(0); + die_nest_count--; + if (die_nest_count) + /* We still own the lock */ + local_irq_restore(flags); + else + /* Nest count reaches zero, release the lock. */ + spin_unlock_irqrestore(&die_lock, flags); + if (panic_on_oops) + panic("Fatal exception"); +} + +void __kprobes __die(const char * str, struct pt_regs * regs, long err) +{ + static int die_counter; + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->rip); + printk(" RSP <%016lx>\n", regs->rsp); + if (kexec_should_crash(current)) + crash_kexec(regs); +} + +void die(const char * str, struct pt_regs * regs, long err) +{ + unsigned long flags = oops_begin(); + + handle_BUG(regs); + __die(str, regs, err); + oops_end(flags); + do_exit(SIGSEGV); +} + +#ifdef CONFIG_X86_LOCAL_APIC +void __kprobes die_nmi(char *str, struct pt_regs *regs) +{ + unsigned long flags = oops_begin(); + + /* + * We are in trouble anyway, lets at least try + * to get a message out. 
+ */ + printk(str, safe_smp_processor_id()); + show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); + if (panic_on_timeout || panic_on_oops) + panic("nmi watchdog"); + printk("console shuts up ...\n"); + oops_end(flags); + nmi_exit(); + local_irq_enable(); + do_exit(SIGSEGV); +} +#endif + +static void __kprobes do_trap(int trapnr, int signr, char *str, + struct pt_regs * regs, long error_code, + siginfo_t *info) +{ + struct task_struct *tsk = current; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + + if (user_mode(regs)) { + if (exception_trace && unhandled_signal(tsk, signr)) + printk(KERN_INFO + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + tsk->comm, tsk->pid, str, + regs->rip, regs->rsp, error_code); + + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + } + + + /* kernel trap */ + { + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) + regs->rip = fixup->fixup; + else + die(str, regs, error_code); + return; + } +} + +#define DO_ERROR(trapnr, signr, str, name) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ +} + +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR( 4, SIGSEGV, "overflow", overflow) +DO_ERROR( 5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +DO_ERROR(18, SIGSEGV, "reserved", reserved) + +/* Runs on IST stack */ +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). 
*/ + for (;;) + die(str, regs, error_code); +} + +asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, + long error_code) +{ + struct task_struct *tsk = current; + + conditional_sti(regs); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + + if (user_mode(regs)) { + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + printk(KERN_INFO + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + tsk->comm, tsk->pid, + regs->rip, regs->rsp, error_code); + + force_sig(SIGSEGV, tsk); + return; + } + + /* kernel gp */ + { + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return; + } + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); + } +} + +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) +{ + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); + printk("You probably have a hardware problem with your RAM chips\n"); + +#if 0 /* XEN */ + /* Clear and disable the memory parity error line. */ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); +#endif /* XEN */ +} + +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) +{ + printk("NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + +#if 0 /* XEN */ + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + mdelay(2000); + reason &= ~8; + outb(reason, 0x61); +#endif /* XEN */ +} + +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +/* Runs on IST stack. This code must keep interrupts off all the time. + Nested NMIs are prevented by the CPU. */ +asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); + + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog > 0) { + nmi_watchdog_tick(regs,reason); + return; + } +#endif + unknown_nmi_error(reason, regs); + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ + + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +} + +/* runs on IST stack. */ +asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) +{ + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { + return; + } + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. 
The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->rsp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->eflags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} + +/* runs on IST stack. */ +asmlinkage void __kprobes do_debug(struct pt_regs * regs, + unsigned long error_code) +{ + unsigned long condition; + struct task_struct *tsk = current; + siginfo_t info; + + get_debugreg(condition, 6); + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + + preempt_conditional_sti(regs); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg7) { + goto clear_dr7; + } + } + + tsk->thread.debugreg6 = condition; + + /* Mask out spurious TF errors due to lazy TF clearing */ + if (condition & DR_STEP) { + /* + * The TF error should be masked out only if the current + * process is not traced and if the TRAP flag has been set + * previously by a tracing process (condition detected by + * the PT_DTRACE flag); remember that the i386 TRAP flag + * can be modified by the process itself in user mode, + * allowing programs to debug themselves without the ptrace() + * interface. + */ + if (!user_mode(regs)) + goto clear_TF_reenable; + /* + * Was the TF flag set by a debugger? If so, clear it now, + * so that register information is correct. + */ + if (tsk->ptrace & PT_DTRACE) { + regs->eflags &= ~TF_MASK; + tsk->ptrace &= ~PT_DTRACE; + } + } + + /* Ok, finally something we can handle */ + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_BRKPT; + info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + force_sig_info(SIGTRAP, &info, tsk); + +clear_dr7: + set_debugreg(0UL, 7); + preempt_conditional_cli(regs); + return; + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->eflags &= ~TF_MASK; + preempt_conditional_cli(regs); +} + +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + const struct exception_table_entry *fixup; + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return 1; + } + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +asmlinkage void do_coprocessor_error(struct pt_regs *regs) +{ + void __user *rip = (void __user *)(regs->rip); + struct task_struct * task; + siginfo_t info; + unsigned short cwd, swd; + + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; + + /* + * Save the info for the exception handler and clear the error. 
+ */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = rip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (swd & ~cwd & 0x3f) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +asmlinkage void bad_intr(void) +{ + printk("bad interrupt"); +} + +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) +{ + void __user *rip = (void __user *)(regs->rip); + struct task_struct * task; + siginfo_t info; + unsigned short mxcsr; + + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = rip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) +{ +} + +#if 0 +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} +#endif + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. 
+ */ +asmlinkage void math_state_restore(void) +{ + struct task_struct *me = current; + /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. */ + + if (!used_math()) + init_fpu(me); + restore_fpu_checking(&me->thread.i387.fxsave); + task_thread_info(me)->status |= TS_USEDFPU; +} + + +/* + * NB. All these are "interrupt gates" (i.e. events_mask is set) because we + * specify <dpl>|4 in the second field. + */ +static trap_info_t __cpuinitdata trap_table[] = { + { 0, 0|4, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0|4, __KERNEL_CS, (unsigned long)debug }, + { 3, 3|4, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3|4, __KERNEL_CS, (unsigned long)overflow }, + { 5, 0|4, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0|4, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0|4, __KERNEL_CS, (unsigned long)device_not_available }, + { 9, 0|4, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun}, + { 10, 0|4, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0|4, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0|4, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0|4, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0|4, __KERNEL_CS, (unsigned long)spurious_interrupt_bug }, + { 16, 0|4, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0|4, __KERNEL_CS, (unsigned long)alignment_check }, +#ifdef CONFIG_X86_MCE + { 18, 0|4, __KERNEL_CS, (unsigned long)machine_check }, +#endif + { 19, 0|4, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, +#ifdef CONFIG_IA32_EMULATION + { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall}, +#endif + { 0, 0, 0, 0 } +}; + +void __init trap_init(void) +{ + int ret; + + ret = HYPERVISOR_set_trap_table(trap_table); + if (ret) + printk("HYPERVISOR_set_trap_table failed: error %d\n", ret); + + /* + * Should be a barrier for any external CPU state. + */ + cpu_init(); +} + +void __cpuinit smp_trap_init(trap_info_t *trap_ctxt) +{ + const trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + + +/* Actual parsing is done early in setup.c. */ +static int __init oops_dummy(char *s) +{ + panic_on_oops = 1; + return 1; +} +__setup("oops=", oops_dummy); + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s,NULL,0); + return 1; +} +__setup("kstack=", kstack_setup); + +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; + return 1; +} +__setup("call_trace=", call_trace_setup); +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/vsyscall_64-xen.c 2007-06-18 08:38:13.000000000 +0200 @@ -0,0 +1,227 @@ +/* + * linux/arch/x86_64/kernel/vsyscall.c + * + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 + * vsyscalls. 
One vsyscall can reserve more than 1 slot to avoid + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * If we want more than four we need a vDSO. + * + * Note: the concept clashes with user mode linux. If you use UML and + * want per guest time just set the kernel.vsyscall64 sysctl to 0. + */ + +#include <linux/time.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/seqlock.h> +#include <linux/jiffies.h> +#include <linux/sysctl.h> + +#include <asm/vsyscall.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/fixmap.h> +#include <asm/errno.h> +#include <asm/io.h> + +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) + +int __sysctl_vsyscall __section_sysctl_vsyscall = 1; +seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; + +#include <asm/unistd.h> + +static __always_inline void timeval_normalize(struct timeval * tv) +{ + time_t __sec; + + __sec = tv->tv_usec / 1000000; + if (__sec) { + tv->tv_usec %= 1000000; + tv->tv_sec += __sec; + } +} + +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + long sequence, t; + unsigned long sec, usec; + + do { + sequence = read_seqbegin(&__xtime_lock); + + sec = __xtime.tv_sec; + usec = (__xtime.tv_nsec / 1000) + + (__jiffies - __wall_jiffies) * (1000000 / HZ); + + if (__vxtime.mode != VXTIME_HPET) { + t = get_cycles_sync(); + if (t < __vxtime.last_tsc) + t = __vxtime.last_tsc; + usec += ((t - __vxtime.last_tsc) * + __vxtime.tsc_quot) >> 32; + /* See comment in x86_64 do_gettimeofday. */ + } else { + usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + __vxtime.last) * __vxtime.quot) >> 32; + } + } while (read_seqretry(&__xtime_lock, sequence)); + + tv->tv_sec = sec + usec / 1000000; + tv->tv_usec = usec % 1000000; +} + +/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */ +static __always_inline void do_get_tz(struct timezone * tz) +{ + *tz = __sys_tz; +} + +static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret; + asm volatile("vsysc2: syscall" + : "=a" (ret) + : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber ); + return ret; +} + +static __always_inline long time_syscall(long *t) +{ + long secs; + asm volatile("vsysc1: syscall" + : "=a" (secs) + : "0" (__NR_time),"D" (t) : __syscall_clobber); + return secs; +} + +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +{ + if (!__sysctl_vsyscall) + return gettimeofday(tv,tz); + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + return 0; +} + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ +time_t __vsyscall(1) vtime(time_t *t) +{ + if (!__sysctl_vsyscall) + return time_syscall(t); + else if (t) + *t = __xtime.tv_sec; + return __xtime.tv_sec; +} + +long __vsyscall(2) venosys_0(void) +{ + return -ENOSYS; +} + +long __vsyscall(3) venosys_1(void) +{ + return -ENOSYS; +} + +#ifdef CONFIG_SYSCTL + +#define SYSCALL 0x050f +#define NOP2 0x9090 + +/* + * NOP out syscall in vsyscall page when not needed. + */ +static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + extern u16 vsysc1, vsysc2; + u16 *map1, *map2; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + if (!write) + return ret; + /* gcc has some trouble with __va(__pa()), so just do it this + way. 
*/ + map1 = ioremap(__pa_symbol(&vsysc1), 2); + if (!map1) + return -ENOMEM; + map2 = ioremap(__pa_symbol(&vsysc2), 2); + if (!map2) { + ret = -ENOMEM; + goto out; + } + if (!sysctl_vsyscall) { + *map1 = SYSCALL; + *map2 = SYSCALL; + } else { + *map1 = NOP2; + *map2 = NOP2; + } + iounmap(map2); +out: + iounmap(map1); + return ret; +} + +static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + return -ENOSYS; +} + +static ctl_table kernel_table2[] = { + { .ctl_name = 99, .procname = "vsyscall64", + .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .strategy = vsyscall_sysctl_nostrat, + .proc_handler = vsyscall_sysctl_change }, + { 0, } +}; + +static ctl_table kernel_root_table2[] = { + { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + .child = kernel_table2 }, + { 0 }, +}; + +#endif + +static void __init map_vsyscall(void) +{ + extern char __vsyscall_0; + unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + + __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); +} + +static int __init vsyscall_init(void) +{ + BUG_ON(((unsigned long) &vgettimeofday != + VSYSCALL_ADDR(__NR_vgettimeofday))); + BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); + BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + map_vsyscall(); +#ifdef CONFIG_XEN + sysctl_vsyscall = 0; /* disable vgettimeofay() */ +#endif +#ifdef CONFIG_SYSCTL + register_sysctl_table(kernel_root_table2, 0); +#endif + return 0; +} + +__initcall(vsyscall_init); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/kernel/xen_entry_64.S 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,36 @@ +/* + * Copied from arch/xen/i386/kernel/entry.S + */ +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ + movq %gs:pda_cpunumber,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ + addq HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff +#else +#define XEN_GET_VCPU_INFO(reg) movq HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) +#define XEN_PUT_VCPU_INFO_fixup +#endif + +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_BLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/fault_64-xen.c 2007-11-02 17:34:23.000000000 +0100 @@ -0,0 +1,724 @@ +/* + * linux/arch/x86-64/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 
+ */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/kprobes.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm/kdebug.h> +#include <asm-generic/sections.h> + +/* Page fault error code bits */ +#define PF_PROT (1<<0) /* or no page found */ +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + return NOTIFY_DONE; +} +#endif + +void bust_spinlocks(int yes) +{ + int loglevel_save = console_loglevel; + if (yes) { + oops_in_progress = 1; + } else { +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; + } +} + +/* Sometimes the CPU reports invalid exceptions on prefetch. + Check that here and ignore. + Opcode checker based on code by Richard Brunner */ +static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + + /* If it was a exec fault ignore */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char *)convert_rip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (__get_user(opcode, instr)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 + prefixes. 
In long mode, the CPU will signal + invalid opcode if some of these prefixes are + present so we will never get here anyway */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x40: + /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes + Need to figure out under what instruction mode the + instruction was issued ... */ + /* Could check the LDT for lm, but for now it's good + enough to assume that long mode only uses well known + segments or kernel. */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (__get_user(opcode, instr)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static int bad_address(void *p) +{ + unsigned long dummy; + return __get_user(dummy, (unsigned long *)p); +} + +void dump_pagetable(unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. 
*/ + +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ + static int warned; + if (address != regs->rip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->rip = address; + return 1; + } + return 0; +} + +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (tsk->pid == 1) + return 1; + if (tsk->ptrace & PT_PTRACED) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + __die("Bad pagetable", regs, error_code); + oops_end(flags); + do_exit(SIGKILL); +} + +/* + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + /* On Xen the line below does not always work. Needs investigating! */ + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ + pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +} + +int page_fault_trace = 0; +int exception_trace = 1; + + +#define MEM_VERBOSE 1 + +#ifdef MEM_VERBOSE +#define MEM_LOG(_f, _a...) \ + printk("fault.c:[%d]-> " _f "\n", \ + __LINE__ , ## _a ) +#else +#define MEM_LOG(_f, _a...) ((void)0) +#endif + +static int spurious_fault(struct pt_regs *regs, + unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + +#ifdef CONFIG_XEN + /* Faults in hypervisor area are never spurious. */ + if ((address >= HYPERVISOR_VIRT_START) && + (address < HYPERVISOR_VIRT_END)) + return 0; +#endif + + /* Reserved-bit violation or user access to kernel space? 
*/ + if (error_code & (PF_RSVD|PF_USER)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && (__pte_val(*pte) & _PAGE_NX)) + return 0; + + return 1; +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long address; + const struct exception_table_entry *fixup; + int write; + unsigned long flags; + siginfo_t info; + + if (!user_mode(regs)) + error_code &= ~PF_USER; /* means kernel */ + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = current_vcpu_info()->arch.cr2; + + info.si_code = SEGV_MAPERR; + + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(address >= TASK_SIZE64)) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { + if (vmalloc_fault(address) >= 0) + return; + } + /* Can take a spurious fault if mapping changes R/O -> R/W. */ + if (spurious_fault(regs, address, error_code)) + return; + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + + if (likely(regs->eflags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(page_fault_trace)) + printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt or have no user + * context, we must not take the fault.. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + again: + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. 
Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibilty of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->rip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (likely(vma->vm_start <= address)) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & 4) { + /* Allow userspace just enough access below the stack pointer + * to let the 'enter' instruction work. + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + info.si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + switch (handle_mm_fault(mm, vma, address, write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + goto do_sigbus; + default: + goto out_of_memory; + } + + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + if (is_prefetch(regs, address, error_code)) + return; + + /* Work around K8 erratum #100 K8 in compat mode + occasionally jumps to illegal addresses >4GB. We + catch this here in the page fault handler because + these addresses are not reachable. Just detect this + case and return. Any code segment in LDT is + compatibility mode. */ + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return; + + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { + printk( + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->rip, + regs->rsp, error_code); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code has been set above */ + info.si_addr = (void __user *)address; + force_sig_info(SIGSEGV, &info, tsk); + return; + } + +no_context: + + /* Are we prepared to handle this kernel fault? */ + fixup = search_exception_tables(regs->rip); + if (fixup) { + regs->rip = fixup->fixup; + return; + } + + /* + * Hall of shame of CPU/BIOS bugs. 
+ */ + + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + flags = oops_begin(); + + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at %016lx RIP: \n" KERN_ALERT,address); + printk_address(regs->rip); + dump_pagetable(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + __die("Oops", regs, error_code); + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (current->pid == 1) { + yield(); + goto again; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, tsk); + return; +} + +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +void vmalloc_sync_all(void) +{ + /* Note that races in the updates of insync and start aren't + problematic: + insync can only get set bits added, and updates to start are only + improving performance (without affecting correctness if undone). */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + for (page = pgd_list; page; + page = (struct page *)page->index) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +} + +static int __init enable_pagefaulttrace(char *str) +{ + page_fault_trace = 1; + return 1; +} +__setup("pagefaulttrace", enable_pagefaulttrace); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/init_64-xen.c 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,1231 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> + * + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen. 
+ */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/proc_fs.h> +#include <linux/pci.h> +#include <linux/poison.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> +#include <linux/memory_hotplug.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/sections.h> + +#include <xen/features.h> + +#ifndef Dprintk +#define Dprintk(x...) +#endif + +struct dma_mapping_ops* dma_ops; +EXPORT_SYMBOL(dma_ops); + +#if CONFIG_XEN_COMPAT <= 0x030002 +unsigned int __kernel_page_user; +EXPORT_SYMBOL(__kernel_page_user); +#endif + +int after_bootmem; + +static unsigned long dma_reserve __initdata; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +extern unsigned long start_pfn; + +/* + * Use this until direct mapping is established, i.e. before __va() is + * available in init_memory_mapping(). + */ + +#define addr_to_page(addr, page) \ + (addr) &= PHYSICAL_PAGE_MASK; \ + (page) = ((unsigned long *) ((unsigned long) \ + (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \ + __START_KERNEL_map))) + +static void __meminit early_make_page_readonly(void *va, unsigned int feature) +{ + unsigned long addr, _va = (unsigned long)va; + pte_t pte, *ptep; + unsigned long *page = (unsigned long *) init_level4_pgt; + + BUG_ON(after_bootmem); + + if (xen_feature(feature)) + return; + + addr = (unsigned long) page[pgd_index(_va)]; + addr_to_page(addr, page); + + addr = page[pud_index(_va)]; + addr_to_page(addr, page); + + addr = page[pmd_index(_va)]; + addr_to_page(addr, page); + + ptep = (pte_t *) &page[pte_index(_va)]; + + pte.pte = ptep->pte & ~_PAGE_RW; + if (HYPERVISOR_update_va_mapping(_va, pte, 0)) + BUG(); +} + +static void __make_page_readonly(void *va) +{ + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; + unsigned long addr = (unsigned long) va; + + pgd = pgd_offset_k(addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + ptep = pte_offset_kernel(pmd, addr); + + pte.pte = ptep->pte & ~_PAGE_RW; + if (HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_l1_entry_update(ptep, pte); /* fallback */ + + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) + __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT)); +} + +static void __make_page_writable(void *va) +{ + pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep; + unsigned long addr = (unsigned long) va; + + pgd = pgd_offset_k(addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + ptep = pte_offset_kernel(pmd, addr); + + pte.pte = ptep->pte | _PAGE_RW; + if (HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_l1_entry_update(ptep, pte); /* fallback */ + + if ((addr >= VMALLOC_START) && (addr < VMALLOC_END)) + __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT)); +} + +void make_page_readonly(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_readonly(va); +} + +void make_page_writable(void *va, unsigned int feature) +{ + if (!xen_feature(feature)) + __make_page_writable(va); 
+} + +void make_pages_readonly(void *va, unsigned nr, unsigned int feature) +{ + if (xen_feature(feature)) + return; + + while (nr-- != 0) { + __make_page_readonly(va); + va = (void*)((unsigned long)va + PAGE_SIZE); + } +} + +void make_pages_writable(void *va, unsigned nr, unsigned int feature) +{ + if (xen_feature(feature)) + return; + + while (nr-- != 0) { + __make_page_writable(va); + va = (void*)((unsigned long)va + PAGE_SIZE); + } +} + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +void show_mem(void) +{ + long i, total = 0, reserved = 0; + long shared = 0, cached = 0; + pg_data_t *pgdat; + struct page *page; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + + for_each_online_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pfn_to_page(pgdat->node_start_pfn + i); + total++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n",reserved); + printk(KERN_INFO "%lu pages shared\n",shared); + printk(KERN_INFO "%lu pages swap cached\n",cached); +} + + +static __init void *spp_getpage(void) +{ + void *ptr; + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC); + else if (start_pfn < table_end) { + ptr = __va(start_pfn << PAGE_SHIFT); + start_pfn++; + memset(ptr, 0, PAGE_SIZE); + } else + ptr = alloc_bootmem_pages(PAGE_SIZE); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) + panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); + + Dprintk("spp_getpage %p\n", ptr); + return ptr; +} + +#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address)) +#define pud_offset_u(address) (level3_user_pgt + pud_index(address)) + +static __init void set_pte_phys(unsigned long vaddr, + unsigned long phys, pgprot_t prot, int user_mode) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + + pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr)); + if (pgd_none(*pgd)) { + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + make_page_readonly(pmd, XENFEAT_writable_page_tables); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + if (pmd != pmd_offset(pud, 0)) { + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + return; + } + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + pte = (pte_t *) spp_getpage(); + make_page_readonly(pte, XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + if (pte != pte_offset_kernel(pmd, 0)) { + printk("PAGETABLE BUG #02!\n"); + return; + } + } + if (pgprot_val(prot)) + new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); + else + new_pte = __pte(0); + + pte = pte_offset_kernel(pmd, vaddr); + if (!pte_none(*pte) && __pte_val(new_pte) && + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask)) + pte_ERROR(*pte); + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +static __init void set_pte_phys_ma(unsigned long vaddr, + unsigned long phys, pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, new_pte; + + Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + + pmd = (pmd_t *) spp_getpage(); + make_page_readonly(pmd, XENFEAT_writable_page_tables); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); + if (pmd != pmd_offset(pud, 0)) { + printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + return; + } + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + pte = (pte_t *) spp_getpage(); + make_page_readonly(pte, XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); + if (pte != pte_offset_kernel(pmd, 0)) { + printk("PAGETABLE BUG #02!\n"); + return; + } + } + new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot); + + pte = pte_offset_kernel(pmd, vaddr); + if (!pte_none(*pte) && __pte_val(new_pte) && +#ifdef CONFIG_ACPI + /* __acpi_map_table() fails to properly call clear_fixmap() */ + (vaddr < __fix_to_virt(FIX_ACPI_END) || + vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) && +#endif + __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask)) + pte_ERROR(*pte); + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* NOTE: this is meant to be run only at boot */ +void __init +__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + printk("Invalid __set_fixmap\n"); + return; + } + switch (idx) { + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: + set_pte_phys(address, phys, prot, 0); + set_pte_phys(address, phys, prot, 1); + break; + default: + set_pte_phys_ma(address, phys, prot); + break; + } +} + +unsigned long __initdata table_start, table_end; + +static __meminit void *alloc_static_page(unsigned long *phys) +{ + unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map; + + if (after_bootmem) { + void *adr = (void *)get_zeroed_page(GFP_ATOMIC); + + *phys = __pa(adr); + return adr; + } + + *phys = start_pfn << PAGE_SHIFT; + start_pfn++; + memset((void *)va, 0, PAGE_SIZE); + return (void *)va; +} + +#define PTE_SIZE PAGE_SIZE + +static inline int make_readonly(unsigned long paddr) +{ + extern char __vsyscall_0; + int readonly = 0; + + /* Make new page tables read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && (paddr >= (table_start << PAGE_SHIFT)) + && (paddr < (table_end << PAGE_SHIFT))) + readonly = 1; + /* Make old page tables read-only. */ + if (!xen_feature(XENFEAT_writable_page_tables) + && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map)) + && (paddr < (start_pfn << PAGE_SHIFT))) + readonly = 1; + + /* + * No need for writable mapping of kernel image. This also ensures that + * page and descriptor tables embedded inside don't have writable + * mappings. Exclude the vsyscall area here, allowing alternative + * instruction patching to work. 
+ */ + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)) + && !(paddr >= __pa_symbol(&__vsyscall_0) + && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE)) + readonly = 1; + + return readonly; +} + +#ifndef CONFIG_XEN +/* Must run before zap_low_mappings */ +__init void *early_ioremap(unsigned long addr, unsigned long size) +{ + unsigned long map = round_down(addr, LARGE_PAGE_SIZE); + + /* actually usually some more */ + if (size >= LARGE_PAGE_SIZE) { + printk("SMBIOS area too long %lu\n", size); + return NULL; + } + set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); + map += LARGE_PAGE_SIZE; + set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); +} + +/* To avoid virtual aliases later */ +__init void early_iounmap(void *addr, unsigned long size) +{ + if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) + printk("early_iounmap: bad address %p\n", addr); + set_pmd(temp_mappings[0].pmd, __pmd(0)); + set_pmd(temp_mappings[1].pmd, __pmd(0)); + __flush_tlb(); +} +#endif + +static void __meminit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i, k; + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++) { + unsigned long pte_phys; + pte_t *pte, *pte_save; + + if (address >= end) + break; + pte = alloc_static_page(&pte_phys); + pte_save = pte; + for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) { + unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE; + + if (address >= end || + (!after_bootmem && + (address >> PAGE_SHIFT) >= xen_start_info->nr_pages)) + pteval = 0; + else if (make_readonly(address)) + pteval &= ~_PAGE_RW; + set_pte(pte, __pte(pteval & __supported_pte_mask)); + } + if (!after_bootmem) { + early_make_page_readonly(pte_save, XENFEAT_writable_page_tables); + *pmd = __pmd(pte_phys | _KERNPG_TABLE); + } else { + make_page_readonly(pte_save, XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE)); + } + } +} + +static void __meminit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i = pud_index(address); + + pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr, pmd_phys; + pmd_t *pmd; + + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = alloc_static_page(&pmd_phys); + + spin_lock(&init_mm.page_table_lock); + *pud = __pud(pmd_phys | _KERNPG_TABLE); + phys_pmd_init(pmd, paddr, end); + spin_unlock(&init_mm.page_table_lock); + + early_make_page_readonly(pmd, XENFEAT_writable_page_tables); + } + __flush_tlb(); +} + +void __init xen_init_pt(void) +{ + unsigned long addr, *page; + + /* Find the initial pte page that was built for us. */ + page = (unsigned long *)xen_start_info->pt_base; + addr = page[pgd_index(__START_KERNEL_map)]; + addr_to_page(addr, page); + +#if CONFIG_XEN_COMPAT <= 0x030002 + /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER + in kernel PTEs. We check that here. 
*/ + if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) { + unsigned long *pg; + pte_t pte; + + /* Mess with the initial mapping of page 0. It's not needed. */ + BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map); + addr = page[pud_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + addr = pg[pmd_index(__START_KERNEL_map)]; + addr_to_page(addr, pg); + pte.pte = pg[pte_index(__START_KERNEL_map)]; + BUG_ON(!(pte.pte & _PAGE_PRESENT)); + + /* If _PAGE_USER isn't set, we obviously do not need it. */ + if (pte.pte & _PAGE_USER) { + /* _PAGE_USER is needed, but is it set implicitly? */ + pte.pte &= ~_PAGE_USER; + if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map, + pte, 0) != 0) || + !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER)) + /* We need to explicitly specify _PAGE_USER. */ + __kernel_page_user = _PAGE_USER; + } + } +#endif + + /* Construct mapping of initial pte page in our own directories. */ + init_level4_pgt[pgd_index(__START_KERNEL_map)] = + __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE); + memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map), + page + pud_index(__START_KERNEL_map), + (PTRS_PER_PUD - pud_index(__START_KERNEL_map)) + * sizeof(*level3_kernel_pgt)); + + __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] = + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); + + early_make_page_readonly(init_level4_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(__user_pgd(init_level4_pgt), + XENFEAT_writable_page_tables); + early_make_page_readonly(level3_kernel_pgt, + XENFEAT_writable_page_tables); + early_make_page_readonly(level3_user_pgt, + XENFEAT_writable_page_tables); + + if (!xen_feature(XENFEAT_writable_page_tables)) { + xen_pgd_pin(__pa_symbol(init_level4_pgt)); + xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt))); + } +} + +static void __init extend_init_mapping(unsigned long tables_space) +{ + unsigned long va = __START_KERNEL_map; + unsigned long phys, addr, *pte_page; + pmd_t *pmd; + pte_t *pte, new_pte; + unsigned long *page = (unsigned long *)init_level4_pgt; + + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + addr = page[pud_index(va)]; + addr_to_page(addr, page); + + /* Kill mapping of low 1MB. */ + while (va < (unsigned long)&_text) { + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } + + /* Ensure init mappings cover kernel text/data and initial tables. */ + while (va < (__START_KERNEL_map + + (start_pfn << PAGE_SHIFT) + + tables_space)) { + if (!(pmd_index(va) | pte_index(va))) { + pud_t *pud; + + page = (unsigned long *)init_level4_pgt; + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + pud = (pud_t *)&page[pud_index(va)]; + if (pud_none(*pud)) { + page = alloc_static_page(&phys); + early_make_page_readonly( + page, XENFEAT_writable_page_tables); + set_pud(pud, __pud(phys | _KERNPG_TABLE)); + } else { + addr = page[pud_index(va)]; + addr_to_page(addr, page); + } + } + pmd = (pmd_t *)&page[pmd_index(va)]; + if (pmd_none(*pmd)) { + pte_page = alloc_static_page(&phys); + early_make_page_readonly( + pte_page, XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE)); + } else { + addr = page[pmd_index(va)]; + addr_to_page(addr, pte_page); + } + pte = (pte_t *)&pte_page[pte_index(va)]; + if (pte_none(*pte)) { + new_pte = pfn_pte( + (va - __START_KERNEL_map) >> PAGE_SHIFT, + __pgprot(_KERNPG_TABLE)); + xen_l1_entry_update(pte, new_pte); + } + va += PAGE_SIZE; + } + + /* Finally, blow away any spurious initial mappings. 
*/ + while (1) { + if (!(pmd_index(va) | pte_index(va))) { + page = (unsigned long *)init_level4_pgt; + addr = page[pgd_index(va)]; + addr_to_page(addr, page); + if (pud_none(((pud_t *)page)[pud_index(va)])) + break; + addr = page[pud_index(va)]; + addr_to_page(addr, page); + } + pmd = (pmd_t *)&page[pmd_index(va)]; + if (pmd_none(*pmd)) + break; + if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0)) + BUG(); + va += PAGE_SIZE; + } +} + +static void __init find_early_table_space(unsigned long end) +{ + unsigned long puds, pmds, ptes, tables; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT; + + tables = round_up(puds * 8, PAGE_SIZE) + + round_up(pmds * 8, PAGE_SIZE) + + round_up(ptes * 8, PAGE_SIZE); + + extend_init_mapping(tables); + + table_start = start_pfn; + table_end = table_start + (tables>>PAGE_SHIFT); + + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); +} + +static void xen_finish_init_mapping(void) +{ + unsigned long i, start, end; + + /* Re-vector virtual addresses pointing into the initial + mapping to the just-established permanent ones. */ + xen_start_info = __va(__pa(xen_start_info)); + xen_start_info->pt_base = (unsigned long) + __va(__pa(xen_start_info->pt_base)); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + phys_to_machine_mapping = + __va(__pa(xen_start_info->mfn_list)); + xen_start_info->mfn_list = (unsigned long) + phys_to_machine_mapping; + } + if (xen_start_info->mod_start) + xen_start_info->mod_start = (unsigned long) + __va(__pa(xen_start_info->mod_start)); + + /* Destroy the Xen-created mappings beyond the kernel image as + * well as the temporary mappings created above. Prevents + * overlap with modules area (if init mapping is very big). + */ + start = PAGE_ALIGN((unsigned long)_end); + end = __START_KERNEL_map + (table_end << PAGE_SHIFT); + for (; start < end; start += PAGE_SIZE) + if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0)) + BUG(); + + /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */ + table_end = ~0UL; + + /* + * Prefetch pte's for the bt_ioremap() area. It gets used before the + * boot-time allocator is online, so allocate-on-demand would fail. + */ + for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++) + __set_fixmap(i, 0, __pgprot(0)); + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, sizeof(empty_zero_page)); + + /* Set up mapping of lowest 1MB of physical memory. */ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_mfn(empty_zero_page) + << PAGE_SHIFT, + PAGE_KERNEL_RO); + + /* Disable the 'start_pfn' allocator. */ + table_end = start_pfn; +} + +/* Setup the direct mapping of the physical memory at PAGE_OFFSET. + This runs before bootmem is initialized and gets pages directly from the + physical memory. To access them they are temporarily mapped. */ +void __meminit init_memory_mapping(unsigned long start, unsigned long end) +{ + unsigned long next; + + Dprintk("init_memory_mapping\n"); + + /* + * Find space for the kernel direct mapping tables. + * Later we should allocate these tables in the local node of the memory + * mapped. 
Unfortunately this is done currently before the nodes are + * discovered. + */ + if (!after_bootmem) + find_early_table_space(end); + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + for (; start < end; start = next) { + unsigned long pud_phys; + pgd_t *pgd = pgd_offset_k(start); + pud_t *pud; + + if (after_bootmem) + pud = pud_offset(pgd, start & PGDIR_MASK); + else + pud = alloc_static_page(&pud_phys); + next = start + PGDIR_SIZE; + if (next > end) + next = end; + phys_pud_init(pud, __pa(start), __pa(next)); + if (!after_bootmem) { + early_make_page_readonly(pud, XENFEAT_writable_page_tables); + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + } + } + + if (!after_bootmem) { + BUG_ON(start_pfn != table_end); + xen_finish_init_mapping(); + } + + __flush_tlb_all(); +} + +void __cpuinit zap_low_mappings(int cpu) +{ + /* this is not required for Xen */ +#if 0 + swap_low_mappings(); +#endif +} + +/* Compute zone sizes for the DMA and DMA32 zones in a node. */ +__init void +size_zones(unsigned long *z, unsigned long *h, + unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + unsigned long w; + + for (i = 0; i < MAX_NR_ZONES; i++) + z[i] = 0; + + if (start_pfn < MAX_DMA_PFN) + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; + if (start_pfn < MAX_DMA32_PFN) { + unsigned long dma32_pfn = MAX_DMA32_PFN; + if (dma32_pfn > end_pfn) + dma32_pfn = end_pfn; + z[ZONE_DMA32] = dma32_pfn - start_pfn; + } + z[ZONE_NORMAL] = end_pfn - start_pfn; + + /* Remove lower zones from higher ones. */ + w = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + if (z[i]) + z[i] -= w; + w += z[i]; + } + + /* Compute holes */ + w = start_pfn; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long s = w; + w += z[i]; + h[i] = e820_hole_size(s, w); + } + + /* Add the space pace needed for mem_map to the holes too. */ + for (i = 0; i < MAX_NR_ZONES; i++) + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; + + /* The 16MB DMA zone has the kernel and other misc mappings. + Account them too */ + if (h[ZONE_DMA]) { + h[ZONE_DMA] += dma_reserve; + if (h[ZONE_DMA] >= z[ZONE_DMA]) { + printk(KERN_WARNING + "Kernel too large and filling up ZONE_DMA?\n"); + h[ZONE_DMA] = z[ZONE_DMA]; + } + } +} + +#ifndef CONFIG_NUMA +void __init paging_init(void) +{ + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; + + memory_present(0, 0, end_pfn); + sparse_init(); + size_zones(zones, holes, 0, end_pfn); + free_area_init_node(0, NODE_DATA(0), zones, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); + + init_mm.context.pinned = 1; +} +#endif + +/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches + from the CPU leading to inconsistent cache lines. address and size + must be aligned to 2MB boundaries. + Does nothing when the mapping doesn't exist. */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) +{ + unsigned long end = address + size; + + BUG_ON(address & ~LARGE_PAGE_MASK); + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, address); + if (!pmd || pmd_none(*pmd)) + continue; + if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) { + /* Could handle this, but it should not happen currently. */ + printk(KERN_ERR + "clear_kernel_mapping: mapping has been split. 
will leak memory\n"); + pmd_ERROR(*pmd); + } + set_pmd(pmd, __pmd(0)); + } + __flush_tlb_all(); +} + +/* + * Memory hotplug specific functions + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * XXX: memory_add_physaddr_to_nid() is to find node id from physical address + * via probe interface of sysfs. If acpi notifies hot-add event, then it + * can tell node id by searching dsdt. But, probe interface doesn't have + * node id. So, return 0 as node id at this time. + */ +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +#endif + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages); + if (ret) + goto error; + + init_memory_mapping(start, (start + size -1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL_GPL(arch_add_memory); + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(remove_memory); + +#else /* CONFIG_MEMORY_HOTPLUG */ +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. + */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, + kcore_vsyscall; + +void __init mem_init(void) +{ + long codesize, reservedpages, datasize, initsize; + unsigned long pfn; + + pci_iommu_alloc(); + + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + num_physpages = end_pfn; + high_memory = (void *) __va(end_pfn * PAGE_SIZE); + + /* clear the zero-page */ + memset(empty_zero_page, 0, PAGE_SIZE); + + reservedpages = 0; + + /* this will put all low memory onto the freelists */ +#ifdef CONFIG_NUMA + totalram_pages = numa_free_all_bootmem(); +#else + totalram_pages = free_all_bootmem(); +#endif + /* XEN: init and count pages outside initial allocation. 
*/ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + ClearPageReserved(pfn_to_page(pfn)); + init_page_count(pfn_to_page(pfn)); + totalram_pages++; + } + reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); + + after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + kclist_add(&kcore_kernel, &_stext, _end - _stext); + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + VSYSCALL_END - VSYSCALL_START); + + printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + end_pfn << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); + +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP + /* + * Sync boot_level4_pgt mappings with the init_level4_pgt + * except for the low identity mappings which are already zapped + * in init_level4_pgt. This sync-up is essential for AP's bringup + */ + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); +#endif +#endif +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr; + + if (begin >= end) + return; + + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) { + /* make_readonly() reports all kernel addresses. */ + __make_page_writable(__va(__pa(addr))); + if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + pte_t *pte = pte_offset_kernel(pmd, addr); + + xen_l1_entry_update(pte, __pte(0)); /* fallback */ + } + } + free_page(addr); + totalram_pages++; + } +} + +void free_initmem(void) +{ + memset(__initdata_begin, POISON_FREE_INITDATA, + __initdata_end - __initdata_begin); + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_DEBUG_RODATA + +void mark_rodata_ro(void) +{ + unsigned long addr = (unsigned long)__start_rodata; + + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); + + printk ("Write protecting the kernel read-only data: %luk\n", + (__end_rodata - __start_rodata) >> 10); + + /* + * change_page_attr_addr() requires a global_flush_tlb() call after it. + * We do this after the printk so that if something went wrong in the + * change, the printk gets out at least to give a better debug hint + * of who is the culprit. 
+ */ + global_flush_tlb(); +} +#endif + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif + +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA + int nid = phys_to_nid(phys); + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); +#endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) + dma_reserve += len / PAGE_SIZE; +} + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return 0; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return 0; + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + return pfn_valid(pte_pfn(*pte)); +} + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> + +extern int exception_trace, page_fault_trace; + +static ctl_table debug_table2[] = { + { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, + proc_dointvec }, + { 0, } +}; + +static ctl_table debug_root_table2[] = { + { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, + .child = debug_table2 }, + { 0 }, +}; + +static __init int x8664_sysctl_init(void) +{ + register_sysctl_table(debug_root_table2, 1); + return 0; +} +__initcall(x8664_sysctl_init); +#endif + +/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only + covers the 64bit vsyscall page now. 32bit has a real VMA now and does + not need special handling anymore. */ + +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_END, + .vm_page_prot = PAGE_READONLY +}; + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(tsk, TIF_IA32)) + return NULL; +#endif + return &gate_vma; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) + return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives. + */ +int in_gate_area_no_task(unsigned long addr) +{ + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/mm/pageattr_64-xen.c 2009-03-18 10:39:31.000000000 +0100 @@ -0,0 +1,508 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. 
+ */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +#ifdef CONFIG_XEN +#include <asm/pgalloc.h> +#include <asm/mmu_context.h> + +LIST_HEAD(mm_unpinned); +DEFINE_SPINLOCK(mm_unpinned_lock); + +static void _pin_lock(struct mm_struct *mm, int lock) { + if (lock) + spin_lock(&mm->page_table_lock); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + /* While mm->page_table_lock protects us against insertions and + * removals of higher level page table pages, it doesn't protect + * against updates of pte-s. Such updates, however, require the + * pte pages to be in consistent state (unpinned+writable or + * pinned+readonly). The pinning and attribute changes, however + * cannot be done atomically, which is why such updates must be + * prevented from happening concurrently. + * Note that no pte lock can ever elsewhere be acquired nesting + * with an already acquired one in the same mm, or with the mm's + * page_table_lock already acquired, as that would break in the + * non-split case (where all these are actually resolving to the + * one page_table_lock). Thus acquiring all of them here is not + * going to result in dead locks, and the order of acquires + * doesn't matter. + */ + { + pgd_t *pgd = mm->pgd; + unsigned g; + + for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + pud_t *pud; + unsigned u; + + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + pmd_t *pmd; + unsigned m; + + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + spinlock_t *ptl; + + if (pmd_none(*pmd)) + continue; + ptl = pte_lockptr(0, pmd); + if (lock) + spin_lock(ptl); + else + spin_unlock(ptl); + } + } + } + } +#endif + if (!lock) + spin_unlock(&mm->page_table_lock); +} +#define pin_lock(mm) _pin_lock(mm, 1) +#define pin_unlock(mm) _pin_lock(mm, 0) + +#define PIN_BATCH 8 +static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl); + +static inline unsigned int mm_walk_set_prot(void *pt, pgprot_t flags, + unsigned int cpu, unsigned int seq) +{ + struct page *page = virt_to_page(pt); + unsigned long pfn = page_to_pfn(page); + + MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq, + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); + if (unlikely(++seq == PIN_BATCH)) { + if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu), + PIN_BATCH, NULL))) + BUG(); + seq = 0; + } + + return seq; +} + +static void mm_walk(struct mm_struct *mm, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g,u,m; + unsigned int cpu, seq; + multicall_entry_t *mcl; + + pgd = mm->pgd; + cpu = get_cpu(); + + /* + * Cannot iterate up to USER_PTRS_PER_PGD as these pagetables may not + * be the 'current' task's pagetables (e.g., current may be 32-bit, + * but the pagetables may be for a 64-bit task). + * Subtracting 1 from TASK_SIZE64 means the loop limit is correct + * regardless of whether TASK_SIZE64 is a multiple of PGDIR_SIZE. 
+ */ + for (g = 0, seq = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + seq = mm_walk_set_prot(pud,flags,cpu,seq); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + seq = mm_walk_set_prot(pmd,flags,cpu,seq); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + pte = pte_offset_kernel(pmd,0); + seq = mm_walk_set_prot(pte,flags,cpu,seq); + } + } + } + + mcl = per_cpu(pb_mcl, cpu); + if (unlikely(seq > PIN_BATCH - 2)) { + if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL))) + BUG(); + seq = 0; + } + MULTI_update_va_mapping(mcl + seq, + (unsigned long)__user_pgd(mm->pgd), + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, flags), + 0); + MULTI_update_va_mapping(mcl + seq + 1, + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, flags), + UVMF_TLB_FLUSH); + if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL))) + BUG(); + + put_cpu(); +} + +void mm_pin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + + mm_walk(mm, PAGE_KERNEL_RO); + xen_pgd_pin(__pa(mm->pgd)); /* kernel */ + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */ + mm->context.pinned = 1; + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + + pin_unlock(mm); +} + +void mm_unpin(struct mm_struct *mm) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + pin_lock(mm); + + xen_pgd_unpin(__pa(mm->pgd)); + xen_pgd_unpin(__pa(__user_pgd(mm->pgd))); + mm_walk(mm, PAGE_KERNEL); + mm->context.pinned = 0; + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + + pin_unlock(mm); +} + +void mm_pin_all(void) +{ + if (xen_feature(XENFEAT_writable_page_tables)) + return; + + /* + * Allow uninterrupted access to the mm_unpinned list. We don't + * actually take the mm_unpinned_lock as it is taken inside mm_pin(). + * All other CPUs must be at a safe point (e.g., in stop_machine + * or offlined entirely). + */ + preempt_disable(); + while (!list_empty(&mm_unpinned)) + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, + context.unpinned)); + preempt_enable(); +} + +void _arch_dup_mmap(struct mm_struct *mm) +{ + if (!mm->context.pinned) + mm_pin(mm); +} + +void _arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
+ */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) && + !mm->context.has_foreign_mappings ) + mm_unpin(mm); +} + +static void _pte_free(struct page *page, unsigned int order) +{ + BUG_ON(order); + pte_free(page); +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + if (pte) { + SetPageForeign(pte, _pte_free); + init_page_count(pte); + } + return pte; +} + +void pte_free(struct page *pte) +{ + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); + + if (!pte_write(*virt_to_ptep(va))) + if (HYPERVISOR_update_va_mapping( + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)) + BUG(); + + ClearPageForeign(pte); + init_page_count(pte); + + __free_page(pte); +} +#endif /* CONFIG_XEN */ + +pte_t *lookup_address(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, address); + if (pte && !pte_present(*pte)) + pte = NULL; + return pte; +} + +static struct page *split_large_page(unsigned long address, pgprot_t prot, + pgprot_t ref_prot) +{ + int i; + unsigned long addr; + struct page *base = alloc_pages(GFP_KERNEL, 0); + pte_t *pbase; + if (!base) + return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : ref_prot); + } + return base; +} + + +static void flush_kernel_map(void *address) +{ + if (0 && address && cpu_has_clflush) { + /* is this worth it? */ + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (address + i)); + } else + asm volatile("wbinvd":::"memory"); + if (address) + __flush_tlb_one(address); + else + __flush_tlb_all(); +} + + +static inline void flush_map(unsigned long address) +{ + on_each_cpu(flush_kernel_map, (void *)address, 1, 1); +} + +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ + +static inline void save_page(struct page *fpage) +{ + fpage->lru.next = (struct list_head *)deferred_pages; + deferred_pages = fpage; +} + +/* + * No more special protections in this 2/4MB area - revert to a + * large page again. 
+ */ +static void revert_page(unsigned long address, pgprot_t ref_prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t large_pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd,address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + BUG_ON(__pmd_val(*pmd) & _PAGE_PSE); + pgprot_val(ref_prot) |= _PAGE_PSE; + large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + set_pte((pte_t *)pmd, large_pte); +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, + pgprot_t ref_prot) +{ + pte_t *kpte; + struct page *kpte_page; + unsigned kpte_flags; + pgprot_t ref_prot2; + kpte = lookup_address(address); + if (!kpte) return 0; + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + kpte_flags = pte_val(*kpte); + if (pgprot_val(prot) != pgprot_val(ref_prot)) { + if ((kpte_flags & _PAGE_PSE) == 0) { + set_pte(kpte, pfn_pte(pfn, prot)); + } else { + /* + * split_large_page will take the reference for this + * change_page_attr on the split page. + */ + + struct page *split; + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); + + split = split_large_page(address, prot, ref_prot2); + if (!split) + return -ENOMEM; + set_pte(kpte,mk_pte(split, ref_prot2)); + kpte_page = split; + } + page_private(kpte_page)++; + } else if ((kpte_flags & _PAGE_PSE) == 0) { + set_pte(kpte, pfn_pte(pfn, ref_prot)); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; + } else + BUG(); + + /* on x86-64 the direct mapping set at boot is not using 4k pages */ + /* + * ..., but the XEN guest kernels (currently) do: + * If the pte was reserved, it means it was created at boot + * time (not via split_large_page) and in turn we must not + * replace it with a large page. + */ +#ifndef CONFIG_XEN + BUG_ON(PageReserved(kpte_page)); +#else + if (PageReserved(kpte_page)) + return 0; +#endif + + if (page_private(kpte_page) == 0) { + save_page(kpte_page); + revert_page(address, ref_prot); + } + return 0; +} + +/* + * Change the page attributes of an page in the linear mapping. + * + * This should be used when a page is mapped with a different caching policy + * than write-back somewhere - some CPUs do not like it when mappings with + * different caching policies exist. This changes the page attributes of the + * in kernel linear mapping too. + * + * The caller needs to ensure that there are no conflicting mappings elsewhere. + * This function only deals with the kernel linear map. + * + * Caller must call global_flush_tlb() after this. 
+ */ +int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) +{ + int err = 0; + int i; + + down_write(&init_mm.mmap_sem); + for (i = 0; i < numpages; i++, address += PAGE_SIZE) { + unsigned long pfn = __pa(address) >> PAGE_SHIFT; + + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); + if (err) + break; + /* Handle kernel mapping too which aliases part of the + * lowmem */ + if (__pa(address) < KERNEL_TEXT_SIZE) { + unsigned long addr2; + pgprot_t prot2 = prot; + addr2 = __START_KERNEL_map + __pa(address); + pgprot_val(prot2) &= ~_PAGE_NX; + err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + } + } + up_write(&init_mm.mmap_sem); + return err; +} + +/* Don't call this for MMIO areas that may not have a mem_map entry */ +int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + unsigned long addr = (unsigned long)page_address(page); + return change_page_attr_addr(addr, numpages, prot); +} + +void global_flush_tlb(void) +{ + struct page *dpage; + + down_read(&init_mm.mmap_sem); + dpage = xchg(&deferred_pages, NULL); + up_read(&init_mm.mmap_sem); + + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); + while (dpage) { + struct page *tmp = dpage; + dpage = (struct page *)dpage->lru.next; + ClearPagePrivate(tmp); + __free_page(tmp); + } +} + +EXPORT_SYMBOL(change_page_attr); +EXPORT_SYMBOL(global_flush_tlb); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/drivers/pci/msi-xen.c 2009-12-04 08:45:56.000000000 +0100 @@ -0,0 +1,910 @@ +/* + * File: msi.c + * Purpose: PCI Message Signaled Interrupt (MSI) + * + * Copyright (C) 2003-2004 Intel + * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/smp_lock.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> + +#include <xen/evtchn.h> + +#include <asm/errno.h> +#include <asm/io.h> +#include <asm/smp.h> + +#include "pci.h" +#include "msi.h" + +static int pci_msi_enable = 1; + +static struct msi_ops *msi_ops; + +int msi_register(struct msi_ops *ops) +{ + msi_ops = ops; + return 0; +} + +static LIST_HEAD(msi_dev_head); +DEFINE_SPINLOCK(msi_dev_lock); + +struct msi_dev_list { + struct pci_dev *dev; + struct list_head list; + spinlock_t pirq_list_lock; + struct list_head pirq_list_head; + /* Used for saving/restoring MSI-X tables */ + void __iomem *mask_base; + /* Store default pre-assigned irq */ + unsigned int default_irq; +}; + +struct msi_pirq_entry { + struct list_head list; + int pirq; + int entry_nr; +#ifdef CONFIG_PM + /* PM save area for MSIX address/data */ + u32 address_hi_save; + u32 address_lo_save; + u32 data_save; +#endif +}; + +static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev) +{ + struct msi_dev_list *msi_dev_list, *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&msi_dev_lock, flags); + + list_for_each_entry(msi_dev_list, &msi_dev_head, list) + if ( msi_dev_list->dev == dev ) + ret = msi_dev_list; + + if ( ret ) { + spin_unlock_irqrestore(&msi_dev_lock, flags); + return ret; + } + + /* Has not allocate msi_dev until now. 
*/ + ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC); + + /* Failed to allocate msi_dev structure */ + if ( !ret ) { + spin_unlock_irqrestore(&msi_dev_lock, flags); + return NULL; + } + + ret->dev = dev; + spin_lock_init(&ret->pirq_list_lock); + INIT_LIST_HEAD(&ret->pirq_list_head); + list_add_tail(&ret->list, &msi_dev_head); + spin_unlock_irqrestore(&msi_dev_lock, flags); + return ret; +} + +static int attach_pirq_entry(int pirq, int entry_nr, + struct msi_dev_list *msi_dev_entry) +{ + struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + unsigned long flags; + + if (!entry) + return -ENOMEM; + entry->pirq = pirq; + entry->entry_nr = entry_nr; + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); + list_add_tail(&entry->list, &msi_dev_entry->pirq_list_head); + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); + return 0; +} + +static void detach_pirq_entry(int entry_nr, + struct msi_dev_list *msi_dev_entry) +{ + unsigned long flags; + struct msi_pirq_entry *pirq_entry; + + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { + if (pirq_entry->entry_nr == entry_nr) { + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); + list_del(&pirq_entry->list); + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); + kfree(pirq_entry); + return; + } + } +} + +/* + * pciback will provide device's owner + */ +static int (*get_owner)(struct pci_dev *dev); + +int register_msi_get_owner(int (*func)(struct pci_dev *dev)) +{ + if (get_owner) { + printk(KERN_WARNING "register msi_get_owner again\n"); + return -EEXIST; + } + get_owner = func; + return 0; +} + +int unregister_msi_get_owner(int (*func)(struct pci_dev *dev)) +{ + if (get_owner != func) + return -EINVAL; + get_owner = NULL; + return 0; +} + +static int msi_get_dev_owner(struct pci_dev *dev) +{ + int owner; + + BUG_ON(!is_initial_xendomain()); + if (get_owner && (owner = get_owner(dev)) >= 0) { + printk(KERN_INFO "get owner for dev %x get %x \n", + dev->devfn, owner); + return owner; + } + + return DOMID_SELF; +} + +static int msi_unmap_pirq(struct pci_dev *dev, int pirq) +{ + struct physdev_unmap_pirq unmap; + int rc; + + unmap.domid = msi_get_dev_owner(dev); + /* See comments in msi_map_pirq_to_vector, input parameter pirq + * mean irq number only if the device belongs to dom0 itself. + */ + unmap.pirq = (unmap.domid != DOMID_SELF) + ? pirq : evtchn_get_xen_pirq(pirq); + + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap))) + printk(KERN_WARNING "unmap irq %x failed\n", pirq); + + if (rc < 0) + return rc; + + if (unmap.domid == DOMID_SELF) + evtchn_map_pirq(pirq, 0); + + return 0; +} + +static u64 find_table_base(struct pci_dev *dev, int pos) +{ + u8 bar; + u32 reg; + unsigned long flags; + + pci_read_config_dword(dev, msix_table_offset_reg(pos), ®); + bar = reg & PCI_MSIX_FLAGS_BIRMASK; + + flags = pci_resource_flags(dev, bar); + if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY)) + return 0; + + return pci_resource_start(dev, bar); +} + +/* + * Protected by msi_lock + */ +static int msi_map_pirq_to_vector(struct pci_dev *dev, int pirq, + int entry_nr, u64 table_base) +{ + struct physdev_map_pirq map_irq; + int rc; + domid_t domid = DOMID_SELF; + + domid = msi_get_dev_owner(dev); + + map_irq.domid = domid; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = pirq < 0 ? 
-1 : evtchn_get_xen_pirq(pirq); + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; + map_irq.entry_nr = entry_nr; + map_irq.table_base = table_base; + + if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq))) + printk(KERN_WARNING "map irq failed\n"); + + if (rc < 0) + return rc; + /* This happens when MSI support is not enabled in Xen. */ + if (rc == 0 && map_irq.pirq < 0) + return -ENOSYS; + + BUG_ON(map_irq.pirq <= 0); + + /* If mapping of this particular MSI is on behalf of another domain, + * we do not need to get an irq in dom0. This also implies: + * dev->irq in dom0 will be 'Xen pirq' if this device belongs to + * to another domain, and will be 'Linux irq' if it belongs to dom0. + */ + return ((domid != DOMID_SELF) ? + map_irq.pirq : evtchn_map_pirq(pirq, map_irq.pirq)); +} + +static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base) +{ + return msi_map_pirq_to_vector(dev, -1, entry_nr, table_base); +} + +static int msi_init(void) +{ + static int status = 0; + + if (pci_msi_quirk) { + pci_msi_enable = 0; + printk(KERN_WARNING "PCI: MSI quirk detected. MSI disabled.\n"); + status = -EINVAL; + } + + return status; +} + +void pci_scan_msi_device(struct pci_dev *dev) { } + +void disable_msi_mode(struct pci_dev *dev, int pos, int type) +{ + u16 control; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (type == PCI_CAP_ID_MSI) { + /* Set enabled bits to single MSI & enable MSI_enable bit */ + msi_disable(control); + pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msi_enabled = 0; + } else { + msix_disable(control); + pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msix_enabled = 0; + } + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { + /* PCI Express Endpoint device detected */ + pci_intx(dev, 1); /* enable intx */ + } +} + +static void enable_msi_mode(struct pci_dev *dev, int pos, int type) +{ + u16 control; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (type == PCI_CAP_ID_MSI) { + /* Set enabled bits to single MSI & enable MSI_enable bit */ + msi_enable(control, 1); + pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msi_enabled = 1; + } else { + msix_enable(control); + pci_write_config_word(dev, msi_control_reg(pos), control); + dev->msix_enabled = 1; + } + if (pci_find_capability(dev, PCI_CAP_ID_EXP)) { + /* PCI Express Endpoint device detected */ + pci_intx(dev, 0); /* disable intx */ + } +} + +#ifdef CONFIG_PM +int pci_save_msi_state(struct pci_dev *dev) +{ + int pos, i = 0; + u16 control; + struct pci_cap_saved_state *save_state; + u32 *cap; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (pos <= 0 || dev->no_msi) + return 0; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (!(control & PCI_MSI_FLAGS_ENABLE)) + return 0; + + save_state = kzalloc(sizeof(struct pci_cap_saved_state) + sizeof(u32) * 5, + GFP_KERNEL); + if (!save_state) { + printk(KERN_ERR "Out of memory in pci_save_msi_state\n"); + return -ENOMEM; + } + cap = &save_state->data[0]; + + pci_read_config_dword(dev, pos, &cap[i++]); + control = cap[0] >> 16; + pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_LO, &cap[i++]); + if (control & PCI_MSI_FLAGS_64BIT) { + pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_HI, &cap[i++]); + pci_read_config_dword(dev, pos + PCI_MSI_DATA_64, &cap[i++]); + } else + pci_read_config_dword(dev, pos + PCI_MSI_DATA_32, &cap[i++]); + if (control & PCI_MSI_FLAGS_MASKBIT) + pci_read_config_dword(dev, pos + PCI_MSI_MASK_BIT, &cap[i++]); 
+ save_state->cap_nr = PCI_CAP_ID_MSI; + pci_add_saved_cap(dev, save_state); + return 0; +} + +void pci_restore_msi_state(struct pci_dev *dev) +{ + int i = 0, pos; + u16 control; + struct pci_cap_saved_state *save_state; + u32 *cap; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_MSI); + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!save_state || pos <= 0) + return; + cap = &save_state->data[0]; + + control = cap[i++] >> 16; + pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO, cap[i++]); + if (control & PCI_MSI_FLAGS_64BIT) { + pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI, cap[i++]); + pci_write_config_dword(dev, pos + PCI_MSI_DATA_64, cap[i++]); + } else + pci_write_config_dword(dev, pos + PCI_MSI_DATA_32, cap[i++]); + if (control & PCI_MSI_FLAGS_MASKBIT) + pci_write_config_dword(dev, pos + PCI_MSI_MASK_BIT, cap[i++]); + pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control); + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); + pci_remove_saved_cap(save_state); + kfree(save_state); +} + +int pci_save_msix_state(struct pci_dev *dev) +{ + int pos; + u16 control; + struct pci_cap_saved_state *save_state; + unsigned long flags; + struct msi_dev_list *msi_dev_entry; + struct msi_pirq_entry *pirq_entry; + void __iomem *base; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (pos <= 0 || dev->no_msi) + return 0; + + /* save the capability */ + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (!(control & PCI_MSIX_FLAGS_ENABLE)) + return 0; + + msi_dev_entry = get_msi_dev_pirq_list(dev); + /* If we failed to map the MSI-X table at pci_enable_msix, + * We could not support saving them here. + */ + if (!(base = msi_dev_entry->mask_base)) + return -ENOMEM; + + save_state = kzalloc(sizeof(struct pci_cap_saved_state) + sizeof(u16), + GFP_KERNEL); + if (!save_state) { + printk(KERN_ERR "Out of memory in pci_save_msix_state\n"); + return -ENOMEM; + } + *((u16 *)&save_state->data[0]) = control; + + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { + int j; + + /* save the table */ + j = pirq_entry->entry_nr; + pirq_entry->address_lo_save = + readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + pirq_entry->address_hi_save = + readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + pirq_entry->data_save = + readl(base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_DATA_OFFSET); + } + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); + + save_state->cap_nr = PCI_CAP_ID_MSIX; + pci_add_saved_cap(dev, save_state); + return 0; +} + +void pci_restore_msix_state(struct pci_dev *dev) +{ + u16 save; + int pos, j; + void __iomem *base; + struct pci_cap_saved_state *save_state; + unsigned long flags; + struct msi_dev_list *msi_dev_entry; + struct msi_pirq_entry *pirq_entry; + + save_state = pci_find_saved_cap(dev, PCI_CAP_ID_MSIX); + if (!save_state) + return; + + save = *((u16 *)&save_state->data[0]); + pci_remove_saved_cap(save_state); + kfree(save_state); + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (pos <= 0) + return; + + msi_dev_entry = get_msi_dev_pirq_list(dev); + base = msi_dev_entry->mask_base; + + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { + /* route the table */ + j = pirq_entry->entry_nr; + writel(pirq_entry->address_lo_save, + base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + 
writel(pirq_entry->address_hi_save, + base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(pirq_entry->data_save, + base + j * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_DATA_OFFSET); + } + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); + + pci_write_config_word(dev, msi_control_reg(pos), save); + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); +} +#endif + +/** + * msi_capability_init - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * + * Setup the MSI capability structure of device function with a single + * MSI vector, regardless of device function is capable of handling + * multiple messages. A return of zero indicates the successful setup + * of an entry zero with the new MSI vector or non-zero for otherwise. + **/ +static int msi_capability_init(struct pci_dev *dev) +{ + int pos, pirq; + u16 control; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pci_read_config_word(dev, msi_control_reg(pos), &control); + + pirq = msi_map_vector(dev, 0, 0); + if (pirq < 0) + return -EBUSY; + + dev->irq = pirq; + /* Set MSI enabled bits */ + enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); + dev->msi_enabled = 1; + + return 0; +} + +/** + * msix_capability_init - configure device's MSI-X capability + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of struct msix_entry entries + * @nvec: number of @entries + * + * Setup the MSI-X capability structure of device function with a + * single MSI-X vector. A return of zero indicates the successful setup of + * requested MSI-X entries with allocated vectors or non-zero for otherwise. + **/ +static int msix_capability_init(struct pci_dev *dev, + struct msix_entry *entries, int nvec) +{ + u64 table_base; + u16 control; + int pirq, i, j, mapped, pos, nr_entries; + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); + struct msi_pirq_entry *pirq_entry; + + if (!msi_dev_entry) + return -ENOMEM; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + table_base = find_table_base(dev, pos); + if (!table_base) + return -ENODEV; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + nr_entries = multi_msix_capable(control); + if (!msi_dev_entry->mask_base) + msi_dev_entry->mask_base = + ioremap_nocache(table_base, nr_entries * PCI_MSIX_ENTRY_SIZE); + + /* MSI-X Table Initialization */ + for (i = 0; i < nvec; i++) { + mapped = 0; + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { + if (pirq_entry->entry_nr == entries[i].entry) { + printk(KERN_WARNING "msix entry %d for dev %02x:%02x:%01x are \ + not freed before acquire again.\n", entries[i].entry, + dev->bus->number, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + (entries + i)->vector = pirq_entry->pirq; + mapped = 1; + break; + } + } + if (mapped) + continue; + pirq = msi_map_vector(dev, entries[i].entry, table_base); + if (pirq < 0) + break; + attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry); + (entries + i)->vector = pirq; + } + + if (i != nvec) { + for (j = --i; j >= 0; j--) { + msi_unmap_pirq(dev, entries[j].vector); + detach_pirq_entry(entries[j].entry, msi_dev_entry); + entries[j].vector = 0; + } + return -EBUSY; + } + + enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); + dev->msix_enabled = 1; + + return 0; +} + +/** + * pci_enable_msi - configure device's MSI capability structure + * @dev: pointer to the pci_dev data structure of MSI device function + * + * Setup the MSI capability structure of device 
function with + * a single MSI vector upon its software driver call to request for + * MSI mode enabled on its hardware device function. A return of zero + * indicates the successful setup of an entry zero with the new MSI + * vector or non-zero for otherwise. + **/ +extern int pci_frontend_enable_msi(struct pci_dev *dev); +int pci_enable_msi(struct pci_dev* dev) +{ + struct pci_bus *bus; + int pos, temp, status = -EINVAL; + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); + + if (!pci_msi_enable || !dev) + return status; + + if (dev->no_msi) + return status; + + for (bus = dev->bus; bus; bus = bus->parent) + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) + return -EINVAL; + + status = msi_init(); + if (status < 0) + return status; + +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + if (!is_initial_xendomain()) + { + int ret; + + temp = dev->irq; + ret = pci_frontend_enable_msi(dev); + if (ret) + return ret; + + dev->irq = evtchn_map_pirq(-1, dev->irq); + dev->msi_enabled = 1; + msi_dev_entry->default_irq = temp; + + return ret; + } +#endif + + temp = dev->irq; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return -EINVAL; + + /* Check whether driver already requested for MSI-X vectors */ + if (dev->msix_enabled) { + printk(KERN_INFO "PCI: %s: Can't enable MSI. " + "Device already has MSI-X vectors assigned\n", + pci_name(dev)); + dev->irq = temp; + return -EINVAL; + } + + status = msi_capability_init(dev); + if ( !status ) + msi_dev_entry->default_irq = temp; + else + dev->irq = temp; + + return status; +} + +extern void pci_frontend_disable_msi(struct pci_dev* dev); +void pci_disable_msi(struct pci_dev* dev) +{ + int pos; + int pirq; + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); + + if (!pci_msi_enable) + return; + if (!dev) + return; + +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + if (!is_initial_xendomain()) { + if (!(dev->msi_enabled)) { + printk(KERN_INFO "PCI: %s: Device did not enabled MSI.\n", + pci_name(dev)); + return; + } + evtchn_map_pirq(dev->irq, 0); + pci_frontend_disable_msi(dev); + dev->irq = msi_dev_entry->default_irq; + dev->msi_enabled = 0; + return; + } +#endif + + pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + if (!pos) + return; + + if (!(dev->msi_enabled)) { + printk(KERN_INFO "PCI: %s: Device did not enabled MSI.\n", + pci_name(dev)); + return; + } + + pirq = dev->irq; + /* Restore dev->irq to its default pin-assertion vector */ + dev->irq = msi_dev_entry->default_irq; + msi_unmap_pirq(dev, pirq); + + /* Disable MSI mode */ + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); +} + +/** + * pci_enable_msix - configure device's MSI-X capability structure + * @dev: pointer to the pci_dev data structure of MSI-X device function + * @entries: pointer to an array of MSI-X entries + * @nvec: number of MSI-X vectors requested for allocation by device driver + * + * Setup the MSI-X capability structure of device function with the number + * of requested vectors upon its software driver call to request for + * MSI-X mode enabled on its hardware device function. A return of zero + * indicates the successful configuration of MSI-X capability structure + * with new allocated MSI-X vectors. A return of < 0 indicates a failure. + * Or a return of > 0 indicates that driver request is exceeding the number + * of vectors available. Driver should use the returned value to re-send + * its request. 
+ **/ +extern int pci_frontend_enable_msix(struct pci_dev *dev, + struct msix_entry *entries, int nvec); +int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) +{ + struct pci_bus *bus; + int status, pos, nr_entries; + int i, j, temp; + u16 control; + struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev); + + if (!pci_msi_enable || !dev || !entries) + return -EINVAL; + + if (dev->no_msi) + return -EINVAL; + + for (bus = dev->bus; bus; bus = bus->parent) + if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) + return -EINVAL; + +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + if (!is_initial_xendomain()) { + struct msi_pirq_entry *pirq_entry; + int ret, irq; + + temp = dev->irq; + ret = pci_frontend_enable_msix(dev, entries, nvec); + if (ret) { + printk("get %x from pci_frontend_enable_msix\n", ret); + return ret; + } + dev->msix_enabled = 1; + msi_dev_entry->default_irq = temp; + + for (i = 0; i < nvec; i++) { + int mapped = 0; + + list_for_each_entry(pirq_entry, &msi_dev_entry->pirq_list_head, list) { + if (pirq_entry->entry_nr == entries[i].entry) { + irq = pirq_entry->pirq; + BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq)); + entries[i].vector = irq; + mapped = 1; + break; + } + } + if (mapped) + continue; + irq = evtchn_map_pirq(-1, entries[i].vector); + attach_pirq_entry(irq, entries[i].entry, msi_dev_entry); + entries[i].vector = irq; + } + return 0; + } +#endif + + status = msi_init(); + if (status < 0) + return status; + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return -EINVAL; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + nr_entries = multi_msix_capable(control); + if (nvec > nr_entries) + return -EINVAL; + + /* Check for any invalid entries */ + for (i = 0; i < nvec; i++) { + if (entries[i].entry >= nr_entries) + return -EINVAL; /* invalid entry */ + for (j = i + 1; j < nvec; j++) { + if (entries[i].entry == entries[j].entry) + return -EINVAL; /* duplicate entry */ + } + } + + temp = dev->irq; + /* Check whether driver already requested for MSI vector */ + if (dev->msi_enabled) { + printk(KERN_INFO "PCI: %s: Can't enable MSI-X. 
" + "Device already has an MSI vector assigned\n", + pci_name(dev)); + dev->irq = temp; + return -EINVAL; + } + + status = msix_capability_init(dev, entries, nvec); + + if ( !status ) + msi_dev_entry->default_irq = temp; + else + dev->irq = temp; + + return status; +} + +extern void pci_frontend_disable_msix(struct pci_dev* dev); +void pci_disable_msix(struct pci_dev* dev) +{ + int pos; + u16 control; + + if (!pci_msi_enable) + return; + if (!dev) + return; + if (!dev->msix_enabled) { + printk(KERN_INFO "PCI: %s: Device did not enabled MSI-X.\n", + pci_name(dev)); + return; + } + +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + if (!is_initial_xendomain()) { + struct msi_dev_list *msi_dev_entry; + struct msi_pirq_entry *pirq_entry, *tmp; + + pci_frontend_disable_msix(dev); + + msi_dev_entry = get_msi_dev_pirq_list(dev); + list_for_each_entry_safe(pirq_entry, tmp, + &msi_dev_entry->pirq_list_head, list) { + evtchn_map_pirq(pirq_entry->pirq, 0); + list_del(&pirq_entry->list); + kfree(pirq_entry); + } + + dev->irq = msi_dev_entry->default_irq; + dev->msix_enabled = 0; + return; + } +#endif + + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + if (!pos) + return; + + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (!(control & PCI_MSIX_FLAGS_ENABLE)) + return; + + msi_remove_pci_irq_vectors(dev); + + /* Disable MSI mode */ + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); +} + +/** + * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state + * @dev: pointer to the pci_dev data structure of MSI(X) device function + * + * Being called during hotplug remove, from which the device function + * is hot-removed. All previous assigned MSI/MSI-X vectors, if + * allocated for this device function, are reclaimed to unused state, + * which may be used later on. + **/ +void msi_remove_pci_irq_vectors(struct pci_dev* dev) +{ + unsigned long flags; + struct msi_dev_list *msi_dev_entry; + struct msi_pirq_entry *pirq_entry, *tmp; + + if (!pci_msi_enable || !dev) + return; + + msi_dev_entry = get_msi_dev_pirq_list(dev); + + spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags); + if (!list_empty(&msi_dev_entry->pirq_list_head)) + list_for_each_entry_safe(pirq_entry, tmp, + &msi_dev_entry->pirq_list_head, list) { + msi_unmap_pirq(dev, pirq_entry->pirq); + list_del(&pirq_entry->list); + kfree(pirq_entry); + } + spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags); + iounmap(msi_dev_entry->mask_base); + msi_dev_entry->mask_base = NULL; + dev->irq = msi_dev_entry->default_irq; +} + +void pci_no_msi(void) +{ + pci_msi_enable = 0; +} + +EXPORT_SYMBOL(pci_enable_msi); +EXPORT_SYMBOL(pci_disable_msi); +EXPORT_SYMBOL(pci_enable_msix); +EXPORT_SYMBOL(pci_disable_msix); +#ifdef CONFIG_XEN +EXPORT_SYMBOL(register_msi_get_owner); +EXPORT_SYMBOL(unregister_msi_get_owner); +#endif + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/agp.h 2007-06-22 09:08:06.000000000 +0200 @@ -0,0 +1,44 @@ +#ifndef AGP_H +#define AGP_H 1 + +#include <asm/pgtable.h> +#include <asm/cacheflush.h> +#include <asm/system.h> + +/* + * Functions to keep the agpgart mappings coherent with the MMU. + * The GART gives the CPU a physical alias of pages in memory. The alias region is + * mapped uncacheable. Make sure there are no conflicting mappings + * with different cachability attributes for the same page. This avoids + * data corruption on some CPUs. 
+ */ + +/* Caller's responsibility to call global_flush_tlb() for + * performance reasons */ +#define map_page_into_agp(page) ( \ + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ + ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) +#define unmap_page_from_agp(page) ( \ + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + change_page_attr(page, 1, PAGE_KERNEL)) +#define flush_agp_mappings() global_flush_tlb() + +/* Could use CLFLUSH here if the cpu supports it. But then it would + need to be called for each cacheline of the whole page so it may not be + worth it. Would need a page for it. */ +#define flush_agp_cache() wbinvd() + +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) phys_to_machine(x) +#define gart_to_phys(x) machine_to_phys(x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) ({ \ + char *_t; dma_addr_t _d; \ + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ + _t; }) +#define free_gatt_pages(table, order) \ + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/desc_32.h 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,166 @@ +#ifndef __ARCH_DESC_H +#define __ARCH_DESC_H + +#include <asm/ldt.h> +#include <asm/segment.h> + +#define CPU_16BIT_STACK_SIZE 1024 + +#ifndef __ASSEMBLY__ + +#include <linux/preempt.h> +#include <linux/smp.h> + +#include <asm/mmu.h> + +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; + +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); + +struct Xgt_desc_struct { + unsigned short size; + unsigned long address __attribute__((packed)); + unsigned short pad; +} __attribute__ ((packed)); + +extern struct Xgt_desc_struct idt_descr; +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); + + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; +} + +#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) +#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)) + +#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) +#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr)) +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt)) + +#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) +#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) +#define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) +#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) + +/* + * This is the ldt that every process will get unless we need + * something other than this. 
+ */ +extern struct desc_struct default_ldt[]; +extern void set_intr_gate(unsigned int irq, void * addr); + +#define _set_tssldt_desc(n,addr,limit,type) \ +__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ + "movw %w1,2(%2)\n\t" \ + "rorl $16,%1\n\t" \ + "movb %b1,4(%2)\n\t" \ + "movb %4,5(%2)\n\t" \ + "movb $0,6(%2)\n\t" \ + "movb %h1,7(%2)\n\t" \ + "rorl $16,%1" \ + : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type)) + +#ifndef CONFIG_X86_NO_TSS +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) +{ + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr, + offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89); +} + +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) +#endif + +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) +{ + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); +} + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); + +#if TLS_SIZE != 24 +# error update this code. +#endif + +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \ + *(u64 *)&t->tls_array[i])) \ + BUG(); + C(0); C(1); C(2); +#undef C +} + +static inline void clear_LDT(void) +{ + int cpu = get_cpu(); + + /* + * NB. We load the default_ldt for lcall7/27 handling on demand, as + * it slows down context switching. Noone uses it anyway. + */ + cpu = cpu; /* XXX avoid compiler warning */ + xen_set_ldt(NULL, 0); + put_cpu(); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + void *segments = pc->ldt; + int count = pc->size; + + if (likely(!count)) + segments = NULL; + + xen_set_ldt(segments, count); +} + +static inline void load_LDT(mm_context_t *pc) +{ + int cpu = get_cpu(); + load_LDT_nolock(pc, cpu); + put_cpu(); +} + +static inline unsigned long get_desc_base(unsigned long *desc) +{ + unsigned long base; + base = ((desc[0] >> 16) & 0x0000ffff) | + ((desc[1] << 16) & 0x00ff0000) | + (desc[1] & 0xff000000); + return base; +} + +#endif /* !__ASSEMBLY__ */ + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/fixmap_32.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,155 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#ifndef _ASM_FIXMAP_H +#define _ASM_FIXMAP_H + + +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; + +#ifndef __ASSEMBLY__ +#include <linux/kernel.h> +#include <asm/acpi.h> +#include <asm/apicdef.h> +#include <asm/page.h> +#ifdef CONFIG_HIGHMEM +#include <linux/threads.h> +#include <asm/kmap_types.h> +#endif + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * these 'compile-time allocated' memory buffers are + * fixed-size 4k pages. (or larger if used with an increment + * highger than 1) use fixmap_set(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { + FIX_HOLE, + FIX_VDSO, +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#endif +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, + __end_of_permanent_fixed_addresses, + /* temporary boot-time mappings, used before ioremap() is functional */ +#define NR_FIX_BTMAPS 16 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, + FIX_WP_TEST, + __end_of_fixed_addresses +}; + +extern void set_fixaddr_top(unsigned long top); + +extern void __set_fixmap(enum fixed_addresses idx, + maddr_t phys, pgprot_t flags); + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) +/* + * Some hardware wants to get fixmapped without caching. 
+ */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-dereference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} + +#endif /* !__ASSEMBLY__ */ +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/gnttab_dma.h 2007-08-06 15:10:49.000000000 +0200 @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> + * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_I386_GNTTAB_DMA_H +#define _ASM_I386_GNTTAB_DMA_H + +static inline int gnttab_dma_local_pfn(struct page *page) +{ + /* Has it become a local MFN?
*/ + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); +} + +static inline maddr_t gnttab_dma_map_page(struct page *page) +{ + __gnttab_dma_map_page(page); + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); +} + +static inline void gnttab_dma_unmap_page(maddr_t maddr) +{ + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); +} + +#endif /* _ASM_I386_GNTTAB_DMA_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/highmem.h 2008-10-29 09:55:56.000000000 +0100 @@ -0,0 +1,97 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#ifndef _ASM_HIGHMEM_H +#define _ASM_HIGHMEM_H + +#ifdef __KERNEL__ + +#include <linux/interrupt.h> +#include <linux/threads.h> +#include <asm/kmap_types.h> +#include <asm/tlbflush.h> + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void * FASTCALL(kmap_high(struct page *page)); +extern void FASTCALL(kunmap_high(struct page *page)); + +void *kmap(struct page *page); +void kunmap(struct page *page); +void *kmap_atomic(struct page *page, enum km_type type); +void *kmap_atomic_pte(struct page *page, enum km_type type); +void kunmap_atomic(void *kvaddr, enum km_type type); +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); +struct page *kmap_atomic_to_page(void *ptr); + +#define flush_cache_kmaps() do { } while (0) + +void clear_highpage(struct page *); +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +{ + clear_highpage(page); +} +#define __HAVE_ARCH_CLEAR_HIGHPAGE +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +static inline void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr) +{ + copy_highpage(to, from); +} +#define __HAVE_ARCH_COPY_HIGHPAGE +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +#endif /* __KERNEL__ */ + +#endif /* _ASM_HIGHMEM_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/hypercall_32.h 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,415 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific 
hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#include <linux/string.h> /* memcpy() */ +#include <linux/stringify.h> + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_STR(name) \ + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)" +#else +#define HYPERCALL_STR(name) \ + "mov hypercall_stubs,%%eax; " \ + "add $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\ + "call *%%eax" +#endif + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res) \ + : \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + type __res; \ + long __ign1; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1) \ + : "1" ((long)(a1)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + long __ign1, __ign2; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ + : "1" ((long)(a1)), "2" ((long)(a2)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3, __ign4; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3, __ign4, __ign5; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ + : "1" ((long)(a1)), "2" 
((long)(a2)), \ + "3" ((long)(a3)), "4" ((long)(a4)), \ + "5" ((long)(a5)) \ + : "memory" ); \ + __res; \ +}) + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); +} + +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd 
= cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + + +#endif /* __HYPERCALL_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/hypervisor.h 2009-07-13 14:25:35.000000000 +0200 @@ -0,0 +1,263 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. 
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/version.h> +#include <linux/errno.h> +#include <xen/interface/xen.h> +#include <xen/interface/platform.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/nmi.h> +#include <xen/interface/tmem.h> +#include <asm/ptrace.h> +#include <asm/page.h> +#if defined(__i386__) +# ifdef CONFIG_X86_PAE +# include <asm-generic/pgtable-nopud.h> +# else +# include <asm-generic/pgtable-nopmd.h> +# endif +#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) +# include <asm-generic/pgtable-nopud.h> +#endif + +extern shared_info_t *HYPERVISOR_shared_info; + +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu)) +#ifdef CONFIG_SMP +#define current_vcpu_info() vcpu_info(smp_processor_id()) +#else +#define current_vcpu_info() vcpu_info(0) +#endif + +#ifdef CONFIG_X86_32 +extern unsigned long hypervisor_virt_start; +#endif + +/* arch/xen/i386/kernel/setup.c */ +extern start_info_t *xen_start_info; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) +#else +#define is_initial_xendomain() 0 +#endif + +/* arch/xen/kernel/evtchn.c */ +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +/* arch/xen/kernel/process.c */ +void xen_cpu_idle (void); + +/* arch/xen/i386/kernel/hypervisor.c */ +void do_hypervisor_callback(struct pt_regs *regs); + +/* arch/xen/i386/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already + * be MACHINE addresses. 
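+ *
+ * Example (illustrative only): an L1 update through these helpers takes
+ * the new entry as a machine-address pte, e.g.
+ *
+ *     xen_l1_entry_update(ptep, pfn_pte_ma(mfn, PAGE_KERNEL));
+ *
+ * where ptep and mfn are assumed to be a valid PTE pointer and machine
+ * frame number, and pfn_pte_ma() is the helper from the maddr header in
+ * this patch.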
+ */ + +void xen_pt_switch(unsigned long ptr); +void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ +void xen_load_gs(unsigned int selector); /* x86_64 only */ +void xen_tlb_flush(void); +void xen_invlpg(unsigned long ptr); + +void xen_l1_entry_update(pte_t *ptr, pte_t val); +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ +void xen_pgd_pin(unsigned long ptr); +void xen_pgd_unpin(unsigned long ptr); + +void xen_set_ldt(const void *ptr, unsigned int ents); + +#ifdef CONFIG_SMP +#include <linux/cpumask.h> +void xen_tlb_flush_all(void); +void xen_invlpg_all(unsigned long ptr); +void xen_tlb_flush_mask(cpumask_t *mask); +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr); +#else +#define xen_tlb_flush_all xen_tlb_flush +#define xen_invlpg_all xen_invlpg +#endif + +/* Returns zero on success else negative errno. */ +int xen_create_contiguous_region( + unsigned long vstart, unsigned int order, unsigned int address_bits); +void xen_destroy_contiguous_region( + unsigned long vstart, unsigned int order); + +struct page; + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits); + +/* Turn jiffies into Xen system time. */ +u64 jiffies_to_st(unsigned long jiffies); + +#ifdef CONFIG_XEN_SCRUB_PAGES +void scrub_pages(void *, unsigned int); +#else +#define scrub_pages(_p,_n) ((void)0) +#endif + +#include <xen/hypercall.h> + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN +#define is_running_on_xen() 1 +#else +extern char *hypercall_stubs; +#define is_running_on_xen() (!!hypercall_stubs) +#endif + +static inline int +HYPERVISOR_yield( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + + return rc; +} + +static inline void /*__noreturn*/ +HYPERVISOR_shutdown( + unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); +#if CONFIG_XEN_COMPAT <= 0x030002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); +#endif + /* Don't recurse needlessly. 
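+ * Reaching this point means the hypercall returned, i.e. the domain was
+ * not actually shut down. That is only tolerated for SHUTDOWN_crash,
+ * where calling BUG() would merely re-enter the crash path; hence the
+ * infinite loop instead.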
*/ + BUG_ON(reason != SHUTDOWN_crash); + for(;;); +} + +static inline int __must_check +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = jiffies_to_st(timeout) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +#ifdef CONFIG_XEN + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +static inline void +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +#else /* !defined(CONFIG_XEN) */ + +/* Multicalls not supported for HVM guests. */ +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) +#define MULTI_grant_table_op(a,b,c,d) ((void)0) + +#endif + +#endif /* __HYPERVISOR_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/irqflags_32.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,127 @@ +/* + * include/asm-i386/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on the + * raw_local_irq_*() functions from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +#ifndef __ASSEMBLY__ + +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. 
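+ *
+ * Illustrative sketch of how the macros below fit together: the virtual
+ * "interrupt flag" is the per-vcpu evtchn_upcall_mask byte rather than
+ * EFLAGS.IF, so a save/restore pair amounts to
+ *
+ *     unsigned long flags = __raw_local_irq_save();   (mask event delivery)
+ *     ... critical section ...
+ *     raw_local_irq_restore(flags);   (unmask, then run any pending upcall)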
+ */ + +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) + +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_restore(x) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define raw_local_irq_disable() \ +do { \ + current_vcpu_info()->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define raw_local_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +void raw_safe_halt(void); + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +void halt(void); + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) + +/* + * For spinlocks, etc: + */ +#define __raw_local_irq_save() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_local_irq_disable(); \ + \ + flags; \ +}) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +#endif /* __ASSEMBLY__ */ + +/* + * Do the CPU's IRQ-state tracing from assembly code. We call a + * C function, so save all the C-clobbered registers: + */ +#ifdef CONFIG_TRACE_IRQFLAGS + +# define TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +# define TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/maddr_32.h 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,193 @@ +#ifndef _I386_MADDR_H +#define _I386_MADDR_H + +#include <xen/features.h> +#include <xen/interface/xen.h> + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
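+ * paddr_t is a guest pseudo-physical address and maddr_t a machine
+ * address; under PAE both must be 64 bits wide, since machine addresses
+ * can exceed 4GiB even when the guest's own pseudo-physical range does not.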
*/ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned int machine_to_phys_order; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + BUG_ON(max_mapnr && pfn >= max_mapnr); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + BUG_ON(max_mapnr && pfn >= max_mapnr); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: movl %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %2,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,3b\n" + ".previous" + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
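+ *
+ * Illustrative sketch of the check done below: a frame is local only when
+ * the round trip through both tables is the identity, roughly
+ *
+ *     pfn = mfn_to_pfn(mfn);
+ *     local = (pfn < max_mapnr) && (phys_to_machine_mapping[pfn] == mfn);
+ *
+ * anything else is reported as an out-of-range pfn so that pfn_valid()
+ * fails on it.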
+ */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if ((pfn < max_mapnr) + && !xen_feature(XENFEAT_auto_translated_physmap) + && (phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + BUG_ON(max_mapnr && pfn >= max_mapnr); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +#ifdef CONFIG_X86_PAE +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} +#endif + +#ifdef CONFIG_X86_PAE +#define __pte_ma(x) ((pte_t) { (x), (maddr_t)(x) >> 32 } ) +static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot) +{ + pte_t pte; + + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \ + (pgprot_val(pgprot) >> 32); + pte.pte_high &= (__supported_pte_mask >> 32); + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \ + __supported_pte_mask; + return pte; +} +#else +#define __pte_ma(x) ((pte_t) { (x) } ) +#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#endif + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _I386_MADDR_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/mmu_context_32.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,108 @@ +#ifndef __I386_SCHED_H +#define __I386_SCHED_H + +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> + +/* + * Used for LDT copy/destruction. 
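+ * init_new_context() gives a forked task its own copy of the parent's
+ * LDT (if it has one); destroy_context() releases that copy when the mm
+ * is torn down.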
+ */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if 0 /* XEN: no lazy tlb */ + unsigned cpu = smp_processor_id(); + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ + /* + * Save away %fs and %gs. No need to save %es and %ds, as those + * are always kernel segments while inside the kernel. Must + * happen before reload of cr3/ldt (i.e., not in __switch_to). + */ + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1" + : "=m" (current->thread.fs), + "=m" (current->thread.gs)); + asm volatile ( "movl %0,%%fs ; movl %0,%%gs" + : : "r" (0) ); +} + +extern void mm_pin(struct mm_struct *mm); +extern void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + int cpu = smp_processor_id(); + struct mmuext_op _op[2], *op = _op; + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)); + + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +#if 0 /* XEN: no lazy tlb */ + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; + per_cpu(cpu_tlbstate, cpu).active_mm = next; +#endif + cpu_set(cpu, next->cpu_vm_mask); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); + op++; + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context, cpu) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + } +#if 0 /* XEN: no lazy tlb */ + else { + per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; + BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload %cr3. 
+ */ + load_cr3(next->pgd); + load_LDT_nolock(&next->context, cpu); + } + } +#endif +} + +#define deactivate_mm(tsk, mm) \ + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) + +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) + mm_pin(next); + switch_mm(prev, next, NULL); +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgalloc_32.h 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,59 @@ +#ifndef _I386_PGALLOC_H +#define _I386_PGALLOC_H + +#include <asm/fixmap.h> +#include <linux/threads.h> +#include <linux/mm.h> /* for struct page */ +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ + +#define pmd_populate_kernel(mm, pmd, pte) \ + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) + +#define pmd_populate(mm, pmd, pte) \ +do { \ + unsigned long pfn = page_to_pfn(pte); \ + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) { \ + if (!PageHighMem(pte)) \ + BUG_ON(HYPERVISOR_update_va_mapping( \ + (unsigned long)__va(pfn << PAGE_SHIFT), \ + pfn_pte(pfn, PAGE_KERNEL_RO), 0)); \ + else if (!test_and_set_bit(PG_pinned, &pte->flags)) \ + kmap_flush_unused(); \ + set_pmd(pmd, \ + __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT))); \ + } else \ + *(pmd) = __pmd(_PAGE_TABLE + ((paddr_t)pfn << PAGE_SHIFT)); \ +} while (0) + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); + +static inline void pte_free_kernel(pte_t *pte) +{ + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void pte_free(struct page *pte); + +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) + +#ifdef CONFIG_X86_PAE +/* + * In the PAE case we free the pmds as part of the pgd. + */ +#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_free(x) do { } while (0) +#define __pmd_free_tlb(tlb,x) do { } while (0) +#define pud_populate(mm, pmd, pte) BUG() +#endif + +#define check_pgt_cache() do { } while (0) + +#endif /* _I386_PGALLOC_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable-3level-defs.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,24 @@ +#ifndef _I386_PGTABLE_3LEVEL_DEFS_H +#define _I386_PGTABLE_3LEVEL_DEFS_H + +#define HAVE_SHARED_KERNEL_PMD 0 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable-3level.h 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,211 @@ +#ifndef _I386_PGTABLE_3LEVEL_H +#define _I386_PGTABLE_3LEVEL_H + +#include <asm-generic/pgtable-nopud.h> + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. 
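+ *
+ * With the values from pgtable-3level-defs.h (PGDIR_SHIFT 30, PMD_SHIFT 21)
+ * a 32-bit virtual address splits into 2 bits of pgd index, 9 bits of pmd
+ * index, 9 bits of pte index and a 12-bit page offset.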
+ * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", __FILE__, __LINE__, \ + &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ + &(e), __pmd_val(e), (pmd_val(e) & PTE_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", __FILE__, __LINE__, \ + &(e), __pgd_val(e), (pgd_val(e) & PTE_MASK) >> PAGE_SHIFT) + +#define pud_none(pud) 0 +#define pud_bad(pud) 0 +#define pud_present(pud) 1 + +/* + * Is the pte executable? + */ +static inline int pte_x(pte_t pte) +{ + return !(__pte_val(pte) & _PAGE_NX); +} + +/* + * All present user-pages with !NX bit are user-executable: + */ +static inline int pte_exec(pte_t pte) +{ + return pte_user(pte) && pte_x(pte); +} +/* + * All present pages with !NX bit are kernel-executable: + */ +static inline int pte_exec_kernel(pte_t pte) +{ + return pte_x(pte); +} + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ +#define __HAVE_ARCH_SET_PTE_ATOMIC + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} +#define set_pte_atomic(pteptr,pteval) \ + set_64bit((unsigned long long *)(pteptr),__pte_val(pteval)) + +#define set_pte_at(_mm,addr,ptep,pteval) do { \ + if (((_mm) != current->mm && (_mm) != &init_mm) || \ + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ + set_pte((ptep), (pteval)); \ +} while (0) + +#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \ + if (((_mm) != current->mm && (_mm) != &init_mm) || \ + HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \ + set_pte((ptep), (pteval)); \ + xen_invlpg((addr)); \ + } \ +} while (0) + +#define set_pmd(pmdptr,pmdval) \ + xen_l2_entry_update((pmdptr), (pmdval)) +#define set_pud(pudptr,pudval) \ + xen_l3_entry_update((pudptr), (pudval)) + +/* + * Pentium-II erratum A13: in PAE mode we explicitly have to flush + * the TLB via cr3 if the top-level pgd is changed... + * We do not let the generic code free and clear pgd entries due to + * this erratum. + */ +static inline void pud_clear (pud_t * pud) { } + +#define pud_page(pud) \ +((struct page *) __va(pud_val(pud) & PAGE_MASK)) + +#define pud_page_kernel(pud) \ +((unsigned long) __va(pud_val(pud) & PAGE_MASK)) + + +/* Find an entry in the second-level page table.. */ +#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ + pmd_index(address)) + +static inline int pte_none(pte_t pte) +{ + return !(pte.pte_low | pte.pte_high); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. 
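+ * This is the mirror image of set_pte() above, which writes the high word
+ * first so that an entry only becomes present once it is complete; here
+ * the low word, which holds the present bit, is cleared first.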
+ */ +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; + } +} + +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) + +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte)) { + if ((mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { + uint64_t val = __pte_val(pte); + if (__cmpxchg64(ptep, val, 0) != val) { + /* xchg acts as a barrier before the setting of the high bits */ + pte.pte_low = xchg(&ptep->pte_low, 0); + pte.pte_high = ptep->pte_high; + ptep->pte_high = 0; + } + } + } + return pte; +} + +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ + UVMF_INVLPG|UVMF_MULTI))) { \ + __ptep->pte_low = 0; \ + smp_wmb(); \ + __ptep->pte_high = 0; \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte_low == b.pte_low && a.pte_high == b.pte_high; +} + +#define pte_page(x) pfn_to_page(pte_pfn(x)) + +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ + ((_pte).pte_high << (32-PAGE_SHIFT))) +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IO ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? \ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +extern unsigned long long __supported_pte_mask; + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | + pgprot_val(pgprot)) & __supported_pte_mask); +} + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. + */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) + +#define __pmd_free_tlb(tlb, x) do { } while (0) + +void vmalloc_sync_all(void); + +#endif /* _I386_PGTABLE_3LEVEL_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable_32.h 2009-03-18 10:39:32.000000000 +0100 @@ -0,0 +1,550 @@ +#ifndef _I386_PGTABLE_H +#define _I386_PGTABLE_H + +#include <asm/hypervisor.h> + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. 
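+ *
+ * Under Xen, pinned page tables are mapped read-only in the guest, so many
+ * of the helpers below update entries via HYPERVISOR_update_va_mapping()
+ * or xen_l1_entry_update() rather than with plain stores.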
+ */ +#ifndef __ASSEMBLY__ +#include <asm/processor.h> +#include <asm/fixmap.h> +#include <linux/threads.h> + +#ifndef _I386_BITOPS_H +#include <asm/bitops.h> +#endif + +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +/* Is this pagetable pinned? */ +#define PG_pinned PG_arch_1 + +struct mm_struct; +struct vm_area_struct; + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +extern unsigned long empty_zero_page[1024]; +extern pgd_t *swapper_pg_dir; +extern kmem_cache_t *pgd_cache; +extern kmem_cache_t *pmd_cache; +extern spinlock_t pgd_lock; +extern struct page *pgd_list; + +void pmd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_ctor(void *, kmem_cache_t *, unsigned long); +void pgd_dtor(void *, kmem_cache_t *, unsigned long); +void pgtable_cache_init(void); +void paging_init(void); + +/* + * The Linux x86 paging architecture is 'compile-time dual-mode', it + * implements both the traditional 2-level x86 page tables and the + * newer 3-level PAE-mode page tables. + */ +#ifdef CONFIG_X86_PAE +# include <asm/pgtable-3level-defs.h> +# define PMD_SIZE (1UL << PMD_SHIFT) +# define PMD_MASK (~(PMD_SIZE-1)) +#else +# include <asm/pgtable-2level-defs.h> +#endif + +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define FIRST_USER_ADDRESS 0 + +#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) + +#define TWOLEVEL_PGDIR_SHIFT 22 +#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) +#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) + +/* Just any arbitrary offset to the start of the vmalloc VM area: the + * current 8MB value just means that there will be a 8MB "hole" after the + * physical memory until the kernel virtual memory starts. That means that + * any out-of-bounds memory accesses will hopefully be caught. + * The vmalloc() routines leaves a hole of 4kB between each vmalloced + * area for the same reason. ;) + */ +#define VMALLOC_OFFSET (8*1024*1024) +#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \ + 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) +#ifdef CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#endif + +/* + * _PAGE_PSE set in the page directory entry just means that + * the page directory entry points directly to a 4MB-aligned block of + * memory. + */ +#define _PAGE_BIT_PRESENT 0 +#define _PAGE_BIT_RW 1 +#define _PAGE_BIT_USER 2 +#define _PAGE_BIT_PWT 3 +#define _PAGE_BIT_PCD 4 +#define _PAGE_BIT_ACCESSED 5 +#define _PAGE_BIT_DIRTY 6 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +/*#define _PAGE_BIT_UNUSED1 9*/ /* available for programmer */ +#define _PAGE_BIT_UNUSED2 10 +#define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_NX 63 + +#define _PAGE_PRESENT 0x001 +#define _PAGE_RW 0x002 +#define _PAGE_USER 0x004 +#define _PAGE_PWT 0x008 +#define _PAGE_PCD 0x010 +#define _PAGE_ACCESSED 0x020 +#define _PAGE_DIRTY 0x040 +#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. 
*/ +#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ +/*#define _PAGE_UNUSED1 0x200*/ /* available for programmer */ +#define _PAGE_UNUSED2 0x400 +#define _PAGE_UNUSED3 0x800 + +/* If _PAGE_PRESENT is clear, we use these: */ +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_PROTNONE 0x080 /* if the user mapped it with PROT_NONE; + pte_present gives true */ +#ifdef CONFIG_X86_PAE +#define _PAGE_NX (1ULL<<_PAGE_BIT_NX) +#else +#define _PAGE_NX 0 +#endif + +/* Mapped page is I/O or foreign and has no associated page struct. */ +#define _PAGE_IO 0x200 + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) + +#define PAGE_NONE \ + __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED \ + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) + +#define PAGE_SHARED_EXEC \ + __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC \ + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC \ + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY \ + PAGE_COPY_NOEXEC +#define PAGE_READONLY \ + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC \ + __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) + +#define _PAGE_KERNEL \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) +#define _PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) + +extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) + +/* + * The i386 can't do page protection for execute, and considers that + * the same are read. Also, write permissions imply read permissions. + * This is the closest we can get.. 
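+ * The __Pxxx values below are the protections used for private
+ * (copy-on-write) mappings and the __Sxxx values those for shared
+ * mappings, indexed by the read/write/execute bits requested via
+ * mmap()/mprotect().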
+ */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +/* The boot page tables (all created as a single array) */ +extern unsigned long pg0[]; + +#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) + +/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ +#define pmd_none(x) (!(unsigned long)__pmd_val(x)) +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ +#define pmd_present(x) (__pmd_val(x)) +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) +#else +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) +#define pmd_bad(x) ((__pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#endif + + +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } +static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } +static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } +static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } +static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } +static inline int pte_huge(pte_t pte) { return (pte).pte_low & _PAGE_PSE; } + +/* + * The following only works if pte_present() is not true. 
+ */ +static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } + +static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } +static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } +static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } +static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } +static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } +static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } +static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } +static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } +static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } +static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } + +#ifdef CONFIG_X86_PAE +# include <asm/pgtable-3level.h> +#else +# include <asm/pgtable-2level.h> +#endif + +#define ptep_test_and_clear_dirty(vma, addr, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __ret = pte_dirty(__pte); \ + if (__ret) { \ + __pte = pte_mkclean(__pte); \ + if ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ + (ptep)->pte_low = __pte.pte_low; \ + } \ + __ret; \ +}) + +#define ptep_test_and_clear_young(vma, addr, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __ret = pte_young(__pte); \ + if (__ret) \ + __pte = pte_mkold(__pte); \ + if ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte, 0)) \ + (ptep)->pte_low = __pte.pte_low; \ + __ret; \ +}) + +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ + ((full) ? ({ \ + pte_t __res = *(ptep); \ + if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) \ + xen_l1_entry_update(ptep, __pte(0)); \ + else \ + *(ptep) = __pte(0); \ + __res; \ + }) : \ + ptep_get_and_clear(mm, addr, ptep)) + +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. + */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + +/* + * Macro to mark a page protection value as "uncacheable". On processors which do not support + * it, this is a no-op. + */ +#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* + * Since this might change the present bit (which controls whether + * a pte_t object has undergone p2m translation), we must use + * pte_val() on the input pte and __pte() for the return value. 
+ */ + paddr_t pteval = pte_val(pte); + + pteval &= _PAGE_CHG_MASK; + pteval |= pgprot_val(newprot); +#ifdef CONFIG_X86_PAE + pteval &= __supported_pte_mask; +#endif + return __pte(pteval); +} + +#define pmd_large(pmd) \ +((__pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) + +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) +#define pgd_index_k(addr) pgd_index(addr) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) + +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, address) + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +#define pmd_index(address) \ + (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this macro returns the index of the entry in the pte page which would + * control the given virtual address + */ +#define pte_index(address) \ + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) + +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) + +#define pmd_page_kernel(pmd) \ + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. + */ +extern pte_t *lookup_address(unsigned long address); + +/* + * Make a given kernel text page executable/non-executable. + * Returns the previous executability setting of that page (which + * is used to restore the previous state). Used by the SMP bootup code. + * NOTE: this is an __init function for security reasons. 
+ */ +#ifdef CONFIG_X86_PAE + extern int set_kernel_exec(unsigned long vaddr, int enable); +#else + static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} +#endif + +extern void noexec_setup(const char *str); + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \ + pte_index(address)) +#define pte_offset_map_nested(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \ + pte_index(address)) +#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) +#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) +#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) +#define pte_unmap(pte) do { } while (0) +#define pte_unmap_nested(pte) do { } while (0) +#endif + +#define __HAVE_ARCH_PTEP_ESTABLISH +#define ptep_establish(vma, address, ptep, pteval) \ + do { \ + if ( likely((vma)->vm_mm == current->mm) ) { \ + BUG_ON(HYPERVISOR_update_va_mapping(address, \ + pteval, \ + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ + UVMF_INVLPG|UVMF_MULTI)); \ + } else { \ + xen_l1_entry_update(ptep, pteval); \ + flush_tlb_page(vma, address); \ + } \ + } while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + * + * Also, we only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. + */ +#define update_mmu_cache(vma,address,pte) do { } while (0) +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ + do { \ + if (dirty) \ + ptep_establish(vma, address, ptep, entry); \ + } while (0) + +#include <xen/features.h> +void make_lowmem_page_readonly(void *va, unsigned int feature); +void make_lowmem_page_writable(void *va, unsigned int feature); +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +#define virt_to_ptep(va) \ +({ \ + pte_t *__ptep = lookup_address((unsigned long)(va)); \ + BUG_ON(!__ptep || !pte_present(*__ptep)); \ + __ptep; \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#ifdef CONFIG_HIGHPTE +#include <asm/io.h> +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#endif /* !__ASSEMBLY__ */ + +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#endif /* CONFIG_FLATMEM */ + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); +int touch_pte_range(struct mm_struct *mm, 
+ unsigned long address, + unsigned long size); + +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot); + +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ + xen_change_pte_range(mm, pmd, addr, end, newprot) + +#define io_remap_pfn_range(vma,from,pfn,size,prot) \ +direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) + +#define MK_IOSPACE_PFN(space, pfn) (pfn) +#define GET_IOSPACE(pfn) 0 +#define GET_PFN(pfn) (pfn) + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define __HAVE_ARCH_PTE_SAME +#include <asm-generic/pgtable.h> + +#endif /* _I386_PGTABLE_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/processor_32.h 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,743 @@ +/* + * include/asm-i386/processor.h + * + * Copyright (C) 1994 Linus Torvalds + */ + +#ifndef __ASM_I386_PROCESSOR_H +#define __ASM_I386_PROCESSOR_H + +#include <asm/vm86.h> +#include <asm/math_emu.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/sigcontext.h> +#include <asm/cpufeature.h> +#include <asm/msr.h> +#include <asm/system.h> +#include <linux/cache.h> +#include <linux/threads.h> +#include <asm/percpu.h> +#include <linux/cpumask.h> +#include <xen/interface/physdev.h> + +/* flag for disabling the tsc */ +extern int tsc_disable; + +struct desc_struct { + unsigned long a,b; +}; + +#define desc_empty(desc) \ + (!((desc)->a | (desc)->b)) + +#define desc_equal(desc1, desc2) \ + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. [mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; + char wp_works_ok; /* It doesn't on 386's */ + char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ + char hard_math; + char rfu; + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ + unsigned long x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + int x86_cache_size; /* in KB - valid for CPUS which support this + call */ + int x86_cache_alignment; /* In bytes */ + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; + int x86_power; + unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif + unsigned char x86_max_cores; /* cpuid returned max cores value */ + unsigned char apicid; +#ifdef CONFIG_SMP + unsigned char booted_cores; /* number of cores as seen by OS */ + __u8 phys_proc_id; /* Physical processor id. 
*/ + __u8 cpu_core_id; /* Core id */ +#endif +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_NEXGEN 4 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_RISE 6 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 +#define X86_VENDOR_UNKNOWN 0xff + +/* + * capabilities of CPUs + */ + +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; +#ifndef CONFIG_X86_NO_TSS +extern struct tss_struct doublefault_tss; +DECLARE_PER_CPU(struct tss_struct, init_tss); +#endif + +#ifdef CONFIG_SMP +extern struct cpuinfo_x86 cpu_data[]; +#define current_cpu_data cpu_data[smp_processor_id()] +#else +#define cpu_data (&boot_cpu_data) +#define current_cpu_data boot_cpu_data +#endif + +extern int cpu_llc_id[NR_CPUS]; +extern char ignore_fpu_irq; + +extern void identify_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +#ifdef CONFIG_X86_HT +extern void detect_ht(struct cpuinfo_x86 *c); +#else +static inline void detect_ht(struct cpuinfo_x86 *c) {} +#endif + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. 
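+ *
+ * Example use (sketch only, not from the original patch):
+ *
+ *	unsigned int eax, ebx, ecx, edx;
+ *	int has_sse;
+ *
+ *	cpuid(1, &eax, &ebx, &ecx, &edx);
+ *	has_sse = !!(edx & (1 << 25));		(CPUID.1:EDX bit 25)
+ *
+ * Since XEN_CPUID is the hypervisor-intercepted form of the instruction,
+ * the returned feature bits are those Xen chooses to expose to the guest,
+ * which may be a subset of the raw hardware bits.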
+ */ +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + __asm__(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c"(0)); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, + int *edx) +{ + __asm__(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c" (count)); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax; + + __asm__(XEN_CPUID + : "=a" (eax) + : "0" (op) + : "bx", "cx", "dx"); + return eax; +} +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx; + + __asm__(XEN_CPUID + : "=a" (eax), "=b" (ebx) + : "0" (op) + : "cx", "dx" ); + return ebx; +} +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ecx; + + __asm__(XEN_CPUID + : "=a" (eax), "=c" (ecx) + : "0" (op) + : "bx", "dx" ); + return ecx; +} +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, edx; + + __asm__(XEN_CPUID + : "=a" (eax), "=d" (edx) + : "0" (op) + : "bx", "cx"); + return edx; +} + +#define load_cr3(pgdir) write_cr3(__pa(pgdir)) + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
+ */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4 (unsigned long mask) +{ + unsigned cr4; + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4 (unsigned long mask) +{ + unsigned cr4; + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +/* + * NSC/Cyrix CPU configuration register indexes + */ + +#define CX86_PCR0 0x20 +#define CX86_GCR 0xb8 +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_PCR1 0xf0 +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + +/* + * NSC/Cyrix CPU indexed register access macros + */ + +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) + +#define setCx86(reg, data) do { \ + outb((reg), 0x22); \ + outb((data), 0x23); \ +} while (0) + +/* Stop speculative execution */ +static inline void sync_core(void) +{ + int tmp; + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax,%ecx,%edx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc8;" + : :"a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax,%ecx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc9;" + : :"a" (eax), "c" (ecx)); +} + +/* from system description table in BIOS. Mostly for MCA use, but +others may find it useful. */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; +extern unsigned int mca_pentium_flag; + +/* Boot loader type from the setup header */ +extern int bootloader_type; + +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE (PAGE_OFFSET) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define HAVE_ARCH_PICK_MMAP_LAYOUT + +/* + * Size of io_bitmap. 
+ */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#ifndef CONFIG_X86_NO_TSS +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) +#endif +#define INVALID_IO_BITMAP_OFFSET 0x8000 +#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 + +struct i387_fsave_struct { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + long status; /* software status information */ +}; + +struct i387_fxsave_struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; + unsigned short fop; + long fip; + long fcs; + long foo; + long fos; + long mxcsr; + long mxcsr_mask; + long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ + long padding[56]; +} __attribute__ ((aligned (16))); + +struct i387_soft_struct { + long cwd; + long swd; + long twd; + long fip; + long fcs; + long foo; + long fos; + long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ + unsigned char ftop, changed, lookahead, no_update, rm, alimit; + struct info *info; + unsigned long entry_eip; +}; + +union i387_union { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; +}; + +typedef struct { + unsigned long seg; +} mm_segment_t; + +struct thread_struct; + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + unsigned short back_link,__blh; + unsigned long esp0; + unsigned short ss0,__ss0h; + unsigned long esp1; + unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ + unsigned long esp2; + unsigned short ss2,__ss2h; + unsigned long __cr3; + unsigned long eip; + unsigned long eflags; + unsigned long eax,ecx,edx,ebx; + unsigned long esp; + unsigned long ebp; + unsigned long esi; + unsigned long edi; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace, io_bitmap_base; + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + /* + * Cache the current maximum and the last task that used the bitmap: + */ + unsigned long io_bitmap_max; + struct thread_struct *io_bitmap_owner; + /* + * pads the TSS to be cacheline-aligned (size is 0x100) + */ + unsigned long __cacheline_filler[35]; + /* + * .. and then another 0x100 bytes for emergency kernel stack + */ + unsigned long stack[64]; +} __attribute__((packed)); +#endif + +#define ARCH_MIN_TASKALIGN 16 + +struct thread_struct { +/* cached TLS descriptors. 
*/ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long esp0; + unsigned long sysenter_cs; + unsigned long eip; + unsigned long esp; + unsigned long fs; + unsigned long gs; +/* Hardware debugging registers */ + unsigned long debugreg[8]; /* %%db0-7 debug registers */ +/* fault info */ + unsigned long cr2, trap_no, error_code; +/* floating point info */ + union i387_union i387; +/* virtual 86 mode info */ + struct vm86_struct __user * vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_esp0; + unsigned int saved_fs, saved_gs; +/* IO permissions */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; +/* max allowed port in the bitmap, in bytes: */ + unsigned long io_bitmap_max; +}; + +#define INIT_THREAD { \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +} + +#ifndef CONFIG_X86_NO_TSS +/* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .esp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +} +#define load_esp0(tss, thread) \ + __load_esp0(tss, thread) +#else +#define load_esp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \ + BUG(); \ +} while (0) +#endif + +#define start_thread(regs, new_eip, new_esp) do { \ + __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + set_fs(USER_DS); \ + regs->xds = __USER_DS; \ + regs->xes = __USER_DS; \ + regs->xss = __USER_DS; \ + regs->xcs = __USER_CS; \ + regs->eip = new_eip; \ + regs->esp = new_esp; \ +} while (0) + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = HYPERVISOR_get_debugreg((register)) +#define set_debugreg(value, register) \ + WARN_ON(HYPERVISOR_set_debugreg((register), (value))) + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +/* Free all resources held by a thread. 
*/ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy status */ +extern void prepare_to_copy(struct task_struct *tsk); + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + +extern unsigned long thread_saved_pc(struct task_struct *tsk); +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack); + +unsigned long get_wchan(struct task_struct *p); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessable even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the xss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#define KSTK_EIP(task) (task_pt_regs(task)->eip) +#define KSTK_ESP(task) (task_pt_regs(task)->esp) + + +struct microcode_header { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode { + struct microcode_header hdr; + unsigned int bits[0]; +}; + +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
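+ * A typical (illustrative) spin-wait on a hypothetical flag "done":
+ *
+ *	while (!done)
+ *		cpu_relax();
+ *
+ * cpu_relax() expands to this rep;nop, hinting to the CPU that the loop
+ * is a busy-wait so it can save power and leave the loop more cheaply
+ * when the flag finally changes.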
*/ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + +/* generic versions from gas */ +#define GENERIC_NOP1 ".byte 0x90\n" +#define GENERIC_NOP2 ".byte 0x89,0xf6\n" +#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" +#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" +#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 +#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" +#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 + +/* Opteron nops */ +#define K8_NOP1 GENERIC_NOP1 +#define K8_NOP2 ".byte 0x66,0x90\n" +#define K8_NOP3 ".byte 0x66,0x66,0x90\n" +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" +#define K8_NOP5 K8_NOP3 K8_NOP2 +#define K8_NOP6 K8_NOP3 K8_NOP3 +#define K8_NOP7 K8_NOP4 K8_NOP3 +#define K8_NOP8 K8_NOP4 K8_NOP4 + +/* K7 nops */ +/* uses eax dependencies (arbitary choice) */ +#define K7_NOP1 GENERIC_NOP1 +#define K7_NOP2 ".byte 0x8b,0xc0\n" +#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" +#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" +#define K7_NOP5 K7_NOP4 ASM_NOP1 +#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" +#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" +#define K7_NOP8 K7_NOP7 ASM_NOP1 + +#ifdef CONFIG_MK8 +#define ASM_NOP1 K8_NOP1 +#define ASM_NOP2 K8_NOP2 +#define ASM_NOP3 K8_NOP3 +#define ASM_NOP4 K8_NOP4 +#define ASM_NOP5 K8_NOP5 +#define ASM_NOP6 K8_NOP6 +#define ASM_NOP7 K8_NOP7 +#define ASM_NOP8 K8_NOP8 +#elif defined(CONFIG_MK7) +#define ASM_NOP1 K7_NOP1 +#define ASM_NOP2 K7_NOP2 +#define ASM_NOP3 K7_NOP3 +#define ASM_NOP4 K7_NOP4 +#define ASM_NOP5 K7_NOP5 +#define ASM_NOP6 K7_NOP6 +#define ASM_NOP7 K7_NOP7 +#define ASM_NOP8 K7_NOP8 +#else +#define ASM_NOP1 GENERIC_NOP1 +#define ASM_NOP2 GENERIC_NOP2 +#define ASM_NOP3 GENERIC_NOP3 +#define ASM_NOP4 GENERIC_NOP4 +#define ASM_NOP5 GENERIC_NOP5 +#define ASM_NOP6 GENERIC_NOP6 +#define ASM_NOP7 GENERIC_NOP7 +#define ASM_NOP8 GENERIC_NOP8 +#endif + +#define ASM_NOP_MAX 8 + +/* Prefetch instructions for Pentium III and AMD Athlon */ +/* It's not worth to care about 3dnow! prefetches for the K6 + because they are microcoded there and very slow. + However we don't do prefetches for pre XP Athlons currently + That should be fixed. */ +#define ARCH_HAS_PREFETCH +static inline void prefetch(const void *x) +{ + alternative_input(ASM_NOP4, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +#define ARCH_HAS_PREFETCH +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +/* 3dnow! prefetch to get an exclusive cache line. Useful for + spinlocks to avoid one state transition in the cache coherency protocol. 
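+   For instance (sketch only, with a hypothetical "next" structure), code
+   that already knows which lock it will take can start the ownership
+   transfer early:
+
+	spin_lock_prefetch(&next->lock);
+	... unrelated work ...
+	spin_lock(&next->lock);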
*/ +static inline void prefetchw(const void *x) +{ + alternative_input(ASM_NOP4, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} +#define spin_lock_prefetch(x) prefetchw(x) + +extern void select_idle_routine(const struct cpuinfo_x86 *c); + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern unsigned long boot_option_idle_override; +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +#endif /* __ASM_I386_PROCESSOR_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/smp_32.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,103 @@ +#ifndef __ASM_SMP_H +#define __ASM_SMP_H + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifndef __ASSEMBLY__ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +#ifndef __ASSEMBLY__ +#include <asm/fixmap.h> +#include <asm/bitops.h> +#include <asm/mpspec.h> +#ifdef CONFIG_X86_IO_APIC +#include <asm/io_apic.h> +#endif +#include <asm/apic.h> +#endif +#endif + +#define BAD_APICID 0xFFu +#ifdef CONFIG_SMP +#ifndef __ASSEMBLY__ + +/* + * Private routines/data + */ + +extern void smp_alloc_memory(void); +extern int pic_mode; +extern int smp_num_siblings; +extern cpumask_t cpu_sibling_map[]; +extern cpumask_t cpu_core_map[]; + +extern void (*mtrr_hook) (void); +extern void zap_low_mappings (void); +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); + +#define MAX_APICID 256 +extern u8 x86_cpu_to_apicid[]; + +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] + +#ifdef CONFIG_HOTPLUG_CPU +extern void cpu_exit_clear(void); +extern void cpu_uninit(void); +#endif + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. + */ +#define raw_smp_processor_id() (current_thread_info()->cpu) + +extern cpumask_t cpu_possible_map; +#define cpu_callin_map cpu_possible_map + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpus_weight(cpu_possible_map); +} + +#ifdef CONFIG_X86_LOCAL_APIC + +#ifdef APIC_DEFINITION +extern int hard_smp_processor_id(void); +#else +#include <mach_apicdef.h> +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); +} +#endif + +static __inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); +} + +#endif + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +extern void prefill_possible_map(void); +#endif /* !__ASSEMBLY__ */ + +#else /* CONFIG_SMP */ + +#define cpu_physical_id(cpu) boot_cpu_physical_apicid + +#define NO_PROC_ID 0xFF /* No processor magic marker */ + +#endif +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/synch_bitops.h 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,126 @@ +#ifndef __XEN_SYNCH_BITOPS_H__ +#define __XEN_SYNCH_BITOPS_H__ + +/* + * Copyright 1992, Linus Torvalds. + * Heavily modified to provide guaranteed strong synchronisation + * when communicating with Xen or other guest OSes running on other CPUs. 
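+ *
+ * Illustrative (hypothetical) use on memory shared with the hypervisor,
+ * e.g. consuming an event-channel pending bit:
+ *
+ *	if (synch_test_and_clear_bit(port, &shared_info->evtchn_pending[0]))
+ *		process_port(port);
+ *
+ * The explicit "lock" prefix keeps the read-modify-write atomic even on
+ * a uniprocessor guest, where the ordinary bitops may omit the prefix
+ * although Xen or another guest can still write the same word.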
+ */ + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include <xen/platform-compat.h> +#endif + +#define ADDR (*(volatile long *) addr) + +static __inline__ void synch_set_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btsl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btrl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ void synch_change_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__ ( + "lock btcl %1,%0" + : "+m" (ADDR) : "Ir" (nr) : "memory" ); +} + +static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btsl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) +{ + int oldbit; + __asm__ __volatile__ ( + "lock btrl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) +{ + int oldbit; + + __asm__ __volatile__ ( + "lock btcl %2,%1\n\tsbbl %0,%0" + : "=r" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); + return oldbit; +} + +struct __synch_xchg_dummy { unsigned long a[100]; }; +#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) + +#define synch_cmpxchg(ptr, old, new) \ +((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ + (unsigned long)(old), \ + (unsigned long)(new), \ + sizeof(*(ptr)))) + +static inline unsigned long __synch_cmpxchg(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__("lock; cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__("lock; cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#ifdef CONFIG_X86_64 + case 4: + __asm__ __volatile__("lock; cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#else + case 4: + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__synch_xg(ptr)), + "0"(old) + : "memory"); + return prev; +#endif + } + return old; +} + +#define synch_test_bit test_bit + +#define synch_cmpxchg_subword synch_cmpxchg + +#endif /* __XEN_SYNCH_BITOPS_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/system_32.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,488 @@ +#ifndef __ASM_SYSTEM_H +#define __ASM_SYSTEM_H + +#include <linux/kernel.h> +#include <asm/segment.h> +#include <asm/cpufeature.h> +#include <linux/bitops.h> /* for LOCK_PREFIX */ +#include <asm/synch_bitops.h> +#include <asm/hypervisor.h> + +#ifdef __KERNEL__ + +struct task_struct; /* one of the stranger aspects of C forward declarations.. */ +extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. 
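+ * (The pushfl at the top of the asm below saves the outgoing task's
+ * EFLAGS on its own stack; the matching popfl runs only when that task
+ * is switched back in, so per-task IOPL/NT state is restored rather than
+ * leaking into the next task.)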
+ */ +#define switch_to(prev,next,last) do { \ + unsigned long esi,edi; \ + asm volatile("pushfl\n\t" /* Save flags */ \ + "pushl %%ebp\n\t" \ + "movl %%esp,%0\n\t" /* save ESP */ \ + "movl %5,%%esp\n\t" /* restore ESP */ \ + "movl $1f,%1\n\t" /* save EIP */ \ + "pushl %6\n\t" /* restore EIP */ \ + "jmp __switch_to\n" \ + "1:\t" \ + "popl %%ebp\n\t" \ + "popfl" \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ + "=a" (last),"=S" (esi),"=D" (edi) \ + :"m" (next->thread.esp),"m" (next->thread.eip), \ + "2" (prev), "d" (next)); \ +} while (0) + +#define _set_base(addr,base) do { unsigned long __pr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %%dl,%2\n\t" \ + "movb %%dh,%3" \ + :"=&d" (__pr) \ + :"m" (*((addr)+2)), \ + "m" (*((addr)+4)), \ + "m" (*((addr)+7)), \ + "0" (base) \ + ); } while(0) + +#define _set_limit(addr,limit) do { unsigned long __lr; \ +__asm__ __volatile__ ("movw %%dx,%1\n\t" \ + "rorl $16,%%edx\n\t" \ + "movb %2,%%dh\n\t" \ + "andb $0xf0,%%dh\n\t" \ + "orb %%dh,%%dl\n\t" \ + "movb %%dl,%2" \ + :"=&d" (__lr) \ + :"m" (*(addr)), \ + "m" (*((addr)+6)), \ + "0" (limit) \ + ); } while(0) + +#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) +#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) ) + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg,value) \ + asm volatile("\n" \ + "1:\t" \ + "mov %0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "pushl $0\n\t" \ + "popl %%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n\t" \ + ".align 4\n\t" \ + ".long 1b,3b\n" \ + ".previous" \ + : :"rm" (value)) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm volatile("mov %%" #seg ",%0":"=rm" (value)) + +#define read_cr0() ({ \ + unsigned int __dummy; \ + __asm__ __volatile__( \ + "movl %%cr0,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define write_cr0(x) \ + __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) + +#define read_cr2() (current_vcpu_info()->arch.cr2) +#define write_cr2(x) \ + __asm__ __volatile__("movl %0,%%cr2": :"r" (x)) + +#define read_cr3() ({ \ + unsigned int __dummy; \ + __asm__ ( \ + "movl %%cr3,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy = xen_cr3_to_pfn(__dummy); \ + mfn_to_pfn(__dummy) << PAGE_SHIFT; \ +}) +#define write_cr3(x) ({ \ + unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \ + __dummy = xen_pfn_to_cr3(__dummy); \ + __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \ +}) +#define read_cr4() ({ \ + unsigned int __dummy; \ + __asm__( \ + "movl %%cr4,%0\n\t" \ + :"=r" (__dummy)); \ + __dummy; \ +}) +#define read_cr4_safe() ({ \ + unsigned int __dummy; \ + /* This could fault if %cr4 does not exist */ \ + __asm__("1: movl %%cr4, %0 \n" \ + "2: \n" \ + ".section __ex_table,\"a\" \n" \ + ".long 1b,2b \n" \ + ".previous \n" \ + : "=r" (__dummy): "0" (0)); \ + __dummy; \ +}) + +#define write_cr4(x) \ + __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) + +/* + * Clear and set 'TS' bit respectively + */ +#define clts() (HYPERVISOR_fpu_taskswitch(0)) +#define stts() (HYPERVISOR_fpu_taskswitch(1)) + +#endif /* __KERNEL__ */ + +#define wbinvd() \ + __asm__ __volatile__ ("wbinvd": : :"memory") + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + __asm__("lsll %1,%0" + :"=r" (__limit):"r" (segment)); + return __limit+1; +} + +#define nop() __asm__ __volatile__ ("nop") + +#define xchg(ptr,v) 
((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) + +#define tas(ptr) (xchg((ptr),1)) + +struct __xchg_dummy { unsigned long a[100]; }; +#define __xg(x) ((struct __xchg_dummy *)(x)) + + +#ifdef CONFIG_X86_CMPXCHG64 + +/* + * The semantics of XCHGCMP8B are a bit strange, this is why + * there is a loop and the loading of %%eax and %%edx has to + * be inside. This inlines well in most cases, the cached + * cost is around ~38 cycles. (in the future we might want + * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that + * might have an implicit FPU-save as a cost, so it's not + * clear which path to go.) + * + * cmpxchg8b must be used with the lock prefix here to allow + * the instruction to be executed atomically, see page 3-102 + * of the instruction set reference 24319102.pdf. We need + * the reader side to see the coherent 64bit value. + */ +static inline void __set_64bit (unsigned long long * ptr, + unsigned int low, unsigned int high) +{ + __asm__ __volatile__ ( + "\n1:\t" + "movl (%0), %%eax\n\t" + "movl 4(%0), %%edx\n\t" + "lock cmpxchg8b (%0)\n\t" + "jnz 1b" + : /* no outputs */ + : "D"(ptr), + "b"(low), + "c"(high) + : "ax","dx","memory"); +} + +static inline void __set_64bit_constant (unsigned long long *ptr, + unsigned long long value) +{ + __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); +} +#define ll_low(x) *(((unsigned int*)&(x))+0) +#define ll_high(x) *(((unsigned int*)&(x))+1) + +static inline void __set_64bit_var (unsigned long long *ptr, + unsigned long long value) +{ + __set_64bit(ptr,ll_low(value), ll_high(value)); +} + +#define set_64bit(ptr,value) \ +(__builtin_constant_p(value) ? \ + __set_64bit_constant(ptr, value) : \ + __set_64bit_var(ptr, value) ) + +#define _set_64bit(ptr,value) \ +(__builtin_constant_p(value) ? \ + __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ + __set_64bit(ptr, ll_low(value), ll_high(value)) ) + +#endif + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. --ANK + */ +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. 
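+ *
+ * Typical (illustrative) lock-free update of a hypothetical shared
+ * counter:
+ *
+ *	unsigned long old, new;
+ *
+ *	do {
+ *		old = counter;
+ *		new = old + 1;
+ *	} while (cmpxchg(&counter, old, new) != old);
+ *
+ * The loop simply retries if another CPU modified the value between the
+ * plain read and the locked cmpxchg.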
+ */ + +#ifdef CONFIG_X86_CMPXCHG +#define __HAVE_ARCH_CMPXCHG 1 +#define cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) +#endif + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +#ifndef CONFIG_X86_CMPXCHG +/* + * Building a kernel capable running on 80386. It may be necessary to + * simulate the cmpxchg on the 80386 CPU. For that purpose we define + * a function for each of the sizes we support. + */ + +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); + +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: + return cmpxchg_386_u8(ptr, old, new); + case 2: + return cmpxchg_386_u16(ptr, old, new); + case 4: + return cmpxchg_386_u32(ptr, old, new); + } + return old; +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 3)) \ + __ret = __cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + else \ + __ret = cmpxchg_386((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) +#endif + +#ifdef CONFIG_X86_CMPXCHG64 + +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, + unsigned long long new) +{ + unsigned long long prev; + __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), + "m"(*__xg(ptr)), + "0"(old) + : "memory"); + return prev; +} + +#define cmpxchg64(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ + (unsigned long long)(n))) + +#endif + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + * + * For now, "wmb()" doesn't actually do anything, as all + * Intel CPU's follow what Intel calls a *Processor Order*, + * in which all writes are seen in the program order even + * outside the CPU. + * + * I expect future Intel CPU's to have a weaker ordering, + * but I'd also expect them to finally get their act together + * and add some real memory barriers if so. + * + * Some non intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ + + +/* + * Actually only lfence would be needed for mb() because all stores done + * by the kernel should be already ordered. But keep a full barrier for now. + */ + +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. 
All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * <programlisting> + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * </programlisting> + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * <programlisting> + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * </programlisting> + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while(0) + +#ifdef CONFIG_X86_OOSTORE +/* Actually there are no OOO store capable CPUs for now that do SSE, + but make it already an possibility. */ +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define wmb() __asm__ __volatile__ ("": : :"memory") +#endif + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while(0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +#include <linux/irqflags.h> + +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +void disable_hlt(void); +void enable_hlt(void); + +extern int es7000_plat; +void cpu_idle_wait(void); + +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible: + */ +static inline void sched_cacheflush(void) +{ + wbinvd(); +} + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/tlbflush_32.h 2007-11-26 16:59:25.000000000 +0100 @@ -0,0 +1,101 @@ +#ifndef _I386_TLBFLUSH_H +#define _I386_TLBFLUSH_H + +#include <linux/mm.h> +#include <asm/processor.h> + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_all() xen_tlb_flush() + +extern unsigned long pgkern_mask; + +#define cpu_has_invlpg (boot_cpu_data.x86 > 3) + +#define __flush_tlb_single(addr) xen_invlpg(addr) + +#define __flush_tlb_one(addr) 
__flush_tlb_single(addr) + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. + */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +#else + +#include <asm/smp.h> + +#define local_flush_tlb() \ + __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +struct tlb_state +{ + struct mm_struct *active_mm; + int state; + char __cacheline_padding[L1_CACHE_BYTES-8]; +}; +DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); + + +#endif + +#define flush_tlb_kernel_range(start, end) flush_tlb_all() + +static inline void flush_tlb_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + /* i386 does not keep any page table caches in TLB */ +} + +#endif /* _I386_TLBFLUSH_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/vga.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares <mj@ucw.cz> + */ + +#ifndef _LINUX_ASM_VGA_H_ +#define _LINUX_ASM_VGA_H_ + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x,s) (unsigned long)isa_bus_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x,y) (*(y) = (x)) + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/xenoprof.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/irq_vectors.h 2008-09-25 13:55:32.000000000 +0200 @@ -0,0 +1,125 @@ +/* + * This file should contain #defines for all of the interrupt vector + * numbers used by this architecture. + * + * In addition, there are some standard defines: + * + * FIRST_EXTERNAL_VECTOR: + * The first free place for external interrupts + * + * SYSCALL_VECTOR: + * The IRQ vector a syscall makes the user to kernel transition + * under. + * + * TIMER_IRQ: + * The IRQ number the timer interrupt comes in at. + * + * NR_IRQS: + * The total number of interrupt vectors (including all the + * architecture specific interrupts) needed. + * + */ +#ifndef _ASM_IRQ_VECTORS_H +#define _ASM_IRQ_VECTORS_H + +/* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#define SYSCALL_VECTOR 0x80 + +/* + * Vectors 0x20-0x2f are used for ISA interrupts. + */ + +#if 0 +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + * + * Vectors 0xf0-0xfa are free (reserved for future Linux use). + */ +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define INVALIDATE_TLB_VECTOR 0xfd +#define RESCHEDULE_VECTOR 0xfc +#define CALL_FUNCTION_VECTOR 0xfb + +#define THERMAL_APIC_VECTOR 0xf0 +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef +#endif + +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe + +/* + * First APIC vector available to drivers: (vectors 0x30-0xee) + * we start at 0x31 to spread out vectors evenly between priority + * levels. (0x80 is the syscall vector) + */ +#define FIRST_DEVICE_VECTOR 0x31 +#define FIRST_SYSTEM_VECTOR 0xef + +/* + * 16 8259A IRQ's, 208 potential APIC interrupt sources. 
+ * Right now the APIC is mostly only used for SMP. + * 256 vectors is an architectural limit. (we can have + * more than 256 devices theoretically, but they will + * have to use shared interrupts) + * Since vectors 0x00-0x1f are used/reserved for the CPU, + * the usable vector space is 0x20-0xff (224 vectors) + */ + +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define NR_IPIS 2 + +/* + * The maximum number of vectors supported by i386 processors + * is limited to 256. For processors other than i386, NR_VECTORS + * should be changed accordingly. + */ +#define NR_VECTORS 256 + +#define FPU_IRQ 13 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 +#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) + +/* + * The flat IRQ space is divided into two regions: + * 1. A one-to-one mapping of real physical IRQs. This space is only used + * if we have physical device-access privilege. This region is at the + * start of the IRQ space so that existing device drivers do not need + * to be modified to translate physical IRQ numbers into our IRQ space. + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These + * are bound using the provided bind/unbind functions. + */ + +#define PIRQ_BASE 0 +#if !defined(MAX_IO_APICS) +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) +#elif NR_CPUS < MAX_IO_APICS +# define NR_PIRQS (NR_VECTORS + 32 * NR_CPUS) +#else +# define NR_PIRQS (NR_VECTORS + 32 * MAX_IO_APICS) +#endif + +#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) +#define NR_DYNIRQS 256 + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) +#define NR_IRQ_VECTORS NR_IRQS + +#endif /* _ASM_IRQ_VECTORS_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/mach_traps.h 2007-06-12 13:14:02.000000000 +0200 @@ -0,0 +1,33 @@ +/* + * include/asm-xen/asm-i386/mach-xen/mach_traps.h + * + * Machine specific NMI handling for Xen + */ +#ifndef _MACH_TRAPS_H +#define _MACH_TRAPS_H + +#include <linux/bitops.h> +#include <xen/interface/nmi.h> + +static inline void clear_mem_error(unsigned char reason) {} +static inline void clear_io_check_error(unsigned char reason) {} + +static inline unsigned char get_nmi_reason(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned char reason = 0; + + /* construct a value which looks like it came from + * port 0x61. 
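+ * On native hardware, bit 6 of that port signals an I/O channel check
+ * and bit 7 a memory parity/SERR error; the 0x40 and 0x80 values below
+ * mirror that layout, driven by the NMI reason bits that Xen publishes
+ * in the shared info page.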
+ */ + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) + reason |= 0x40; + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) + reason |= 0x80; + + return reason; +} + +static inline void reassert_nmi(void) {} + +#endif /* !_MACH_TRAPS_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/desc_64.h 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,265 @@ +/* Written 2000 by Andi Kleen */ +#ifndef __ARCH_DESC_H +#define __ARCH_DESC_H + +#include <linux/threads.h> +#include <asm/ldt.h> + +#ifndef __ASSEMBLY__ + +#include <linux/string.h> +#include <linux/smp.h> + +#include <asm/segment.h> +#include <asm/mmu.h> + +// 8 byte segment descriptor +struct desc_struct { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1; + unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8; +} __attribute__((packed)); + +struct n_desc_struct { + unsigned int a,b; +}; + +enum { + GATE_INTERRUPT = 0xE, + GATE_TRAP = 0xF, + GATE_CALL = 0xC, +}; + +// 16byte gate +struct gate_struct { + u16 offset_low; + u16 segment; + unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; + u16 offset_middle; + u32 offset_high; + u32 zero1; +} __attribute__((packed)); + +#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) +#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF) +#define PTR_HIGH(x) ((unsigned long)(x) >> 32) + +enum { + DESC_TSS = 0x9, + DESC_LDT = 0x2, +}; + +// LDT or TSS descriptor in the GDT. 16 bytes. +struct ldttss_desc { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +struct desc_ptr { + unsigned short size; + unsigned long address; +} __attribute__((packed)) ; + +extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS]; + +extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; + +#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8)) +#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8)) + +static inline void clear_LDT(void) +{ + int cpu = get_cpu(); + + /* + * NB. We load the default_ldt for lcall7/27 handling on demand, as + * it slows down context switching. Noone uses it anyway. + */ + cpu = cpu; /* XXX avoid compiler warning */ + xen_set_ldt(NULL, 0); + put_cpu(); +} + +/* + * This is the ldt that every process will get unless we need + * something other than this. 
+ */ +extern struct desc_struct default_ldt[]; +#ifndef CONFIG_X86_NO_IDT +extern struct gate_struct idt_table[]; +#endif +extern struct desc_ptr cpu_gdt_descr[]; + +/* the cpu gdt accessor */ +#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) + +static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) +{ + struct gate_struct s; + s.offset_low = PTR_LOW(func); + s.segment = __KERNEL_CS; + s.ist = ist; + s.p = 1; + s.dpl = dpl; + s.zero0 = 0; + s.zero1 = 0; + s.type = type; + s.offset_middle = PTR_MIDDLE(func); + s.offset_high = PTR_HIGH(func); + /* does not need to be atomic because it is only done once at setup time */ + memcpy(adr, &s, 16); +} + +#ifndef CONFIG_X86_NO_IDT +static inline void set_intr_gate(int nr, void *func) +{ + BUG_ON((unsigned)nr > 0xFF); + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); +} + +static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) +{ + BUG_ON((unsigned)nr > 0xFF); + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); +} + +static inline void set_system_gate(int nr, void *func) +{ + BUG_ON((unsigned)nr > 0xFF); + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); +} + +static inline void set_system_gate_ist(int nr, void *func, unsigned ist) +{ + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); +} +#endif + +static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, + unsigned size) +{ + struct ldttss_desc d; + memset(&d,0,sizeof(d)); + d.limit0 = size & 0xFFFF; + d.base0 = PTR_LOW(tss); + d.base1 = PTR_MIDDLE(tss) & 0xFF; + d.type = type; + d.p = 1; + d.limit1 = (size >> 16) & 0xF; + d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; + d.base3 = PTR_HIGH(tss); + memcpy(ptr, &d, 16); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void set_tss_desc(unsigned cpu, void *addr) +{ + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_TSS], + (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); +} +#endif + +static inline void set_ldt_desc(unsigned cpu, void *addr, int size) +{ + set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT], (unsigned long)addr, + DESC_LDT, size * 8 - 1); +} + +static inline void set_seg_base(unsigned cpu, int entry, void *base) +{ + struct desc_struct *d = &cpu_gdt(cpu)[entry]; + u32 addr = (u32)(u64)base; + BUG_ON((u64)base >> 32); + d->base0 = addr & 0xffff; + d->base1 = (addr >> 16) & 0xff; + d->base2 = (addr >> 24) & 0xff; +} + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) +/* Don't allow setting of the lm bit. It is useless anyways because + 64bit system calls require __USER_CS. 
*/ +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + /* ((info)->lm << 21) | */ \ + 0x7000) + +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 && \ + (info)->lm == 0) + +#if TLS_SIZE != 24 +# error update this code. +#endif + +static inline void load_TLS(struct thread_struct *t, unsigned int cpu) +{ +#if 0 + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); + gdt[0] = t->tls_array[0]; + gdt[1] = t->tls_array[1]; + gdt[2] = t->tls_array[2]; +#endif +#define C(i) \ + if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \ + t->tls_array[i])) \ + BUG(); + + C(0); C(1); C(2); +#undef C +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock (mm_context_t *pc, int cpu) +{ + void *segments = pc->ldt; + int count = pc->size; + + if (likely(!count)) + segments = NULL; + + xen_set_ldt(segments, count); +} + +static inline void load_LDT(mm_context_t *pc) +{ + int cpu = get_cpu(); + load_LDT_nolock(pc, cpu); + put_cpu(); +} + +extern struct desc_ptr idt_descr; + +#endif /* !__ASSEMBLY__ */ + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/fixmap_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,112 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + */ + +#ifndef _ASM_FIXMAP_H +#define _ASM_FIXMAP_H + +#include <linux/kernel.h> +#include <asm/apicdef.h> +#include <asm/page.h> +#include <asm/vsyscall.h> +#include <asm/vsyscall32.h> +#include <asm/acpi.h> + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * + * these 'compile-time allocated' memory buffers are + * fixed-size 4k pages. (or larger if used with an increment + * highger than 1) use fixmap_set(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. 
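+ * + * The Xen-specific entries below, FIX_SHARED_INFO and the FIX_ISAMAP_* range, map the hypervisor's shared info page and the ISA memory area.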
+ */ + +enum fixed_addresses { + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VSYSCALL_HPET, + FIX_HPET_BASE, +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, +#endif +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, + __end_of_permanent_fixed_addresses, + /* temporary boot-time mappings, used before ioremap() is functional */ +#define NR_FIX_BTMAPS 16 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, + __end_of_fixed_addresses +}; + +extern void __set_fixmap (enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/hypercall_64.h 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,422 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. 
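+ * + * Hypercall arguments are passed in registers %rdi, %rsi, %rdx, %r10 and %r8, as set up by the _hypercall1() through _hypercall5() macros below.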
+ * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu <benjamin.liu@intel.com> + * Jun Nakajima <jun.nakajima@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#include <linux/string.h> /* memcpy() */ +#include <linux/stringify.h> +#include <xen/interface/arch-x86/xen-mca.h> + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_STR(name) \ + "call hypercall_page + ("__stringify(__HYPERVISOR_##name)" * 32)" +#else +#define HYPERCALL_STR(name) \ + "mov $("__stringify(__HYPERVISOR_##name)" * 32),%%eax; "\ + "add hypercall_stubs(%%rip),%%rax; " \ + "call *%%rax" +#endif + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res) \ + : \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + type __res; \ + long __ign1; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1) \ + : "1" ((long)(a1)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + long __ign1, __ign2; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \ + : "1" ((long)(a1)), "2" ((long)(a2)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + register long __arg4 asm("r10") = (long)(a4); \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3), "+r" (__arg4) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + long __ign1, __ign2, __ign3; \ + register long __arg4 
asm("r10") = (long)(a4); \ + register long __arg5 asm("r8") = (long)(a5); \ + asm volatile ( \ + HYPERCALL_STR(name) \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3), "+r" (__arg4), "+r" (__arg5) \ + : "1" ((long)(a1)), "2" ((long)(a2)), \ + "3" ((long)(a3)) \ + : "memory" ); \ + __res; \ +}) + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} +static inline int __must_check +HYPERVISOR_mca( + struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + 
+#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + +#endif /* __HYPERCALL_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/irqflags_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,139 @@ +/* + * include/asm-x86_64/irqflags.h + * + * IRQ flags handling + * + * This file gets included from lowlevel asm headers too, to provide + * wrapped versions of the local_irq_*() APIs, based on 
the + * raw_local_irq_*() functions from the lowlevel headers. + */ +#ifndef _ASM_IRQFLAGS_H +#define _ASM_IRQFLAGS_H + +#ifndef __ASSEMBLY__ +/* + * Interrupt control: + */ + +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. + */ + +#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) + +#define raw_local_save_flags(flags) \ + do { (flags) = __raw_local_save_flags(); } while (0) + +#define raw_local_irq_restore(x) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#ifdef CONFIG_X86_VSMP + +/* + * Interrupt control for the VSMP architecture: + */ + +static inline void raw_local_irq_disable(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); +} + +static inline void raw_local_irq_enable(void) +{ + unsigned long flags = __raw_local_save_flags(); + + raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); +} + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return !(flags & (1<<9)) || (flags & (1 << 18)); +} + +#else /* CONFIG_X86_VSMP */ + +#define raw_local_irq_disable() \ +do { \ + current_vcpu_info()->evtchn_upcall_mask = 1; \ + barrier(); \ +} while (0) + +#define raw_local_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ + force_evtchn_callback(); \ +} while (0) + +static inline int raw_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#endif + +/* + * For spinlocks, etc.: + */ + +#define __raw_local_irq_save() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_local_irq_disable(); \ + \ + flags; \ +}) + +#define raw_local_irq_save(flags) \ + do { (flags) = __raw_local_irq_save(); } while (0) + +#define raw_irqs_disabled() \ +({ \ + unsigned long flags = __raw_local_save_flags(); \ + \ + raw_irqs_disabled_flags(flags); \ +}) + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +void raw_safe_halt(void); + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +void halt(void); + +#else /* __ASSEMBLY__: */ +# ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk +# else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +# endif +#endif + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/maddr_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,161 @@ +#ifndef _X86_64_MADDR_H +#define _X86_64_MADDR_H + +#include <xen/features.h> +#include <xen/interface/xen.h> + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<63) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
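+ A machine address (maddr_t) refers to a real host frame, while a pseudophysical address (paddr_t) is the guest-local view; the p2m/m2p tables below convert between the two.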
*/ +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned int machine_to_phys_order; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + BUG_ON(end_pfn && pfn >= end_pfn); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + BUG_ON(end_pfn && pfn >= end_pfn); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return end_pfn; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: movq %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movq %2,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 1b,3b\n" + ".previous" + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (end_pfn) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
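+ * + * mfn_to_local_pfn() below implements this check: an MFN that is not backed by one of our own p2m entries yields end_pfn, which fails pfn_valid().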
+ */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if ((pfn < end_pfn) + && !xen_feature(XENFEAT_auto_translated_physmap) + && (phys_to_machine_mapping[pfn] != mfn)) + return end_pfn; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + BUG_ON(end_pfn && pfn >= end_pfn); + if (xen_feature(XENFEAT_auto_translated_physmap)) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + maddr_t machine; + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#define __pte_ma(x) ((pte_t) { (x) } ) +#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask) + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) (1) +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#endif /* _X86_64_MADDR_H */ + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/mmu_context_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,136 @@ +#ifndef __X86_64_MMU_CONTEXT_H +#define __X86_64_MMU_CONTEXT_H + +#include <asm/desc.h> +#include <asm/atomic.h> +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +/* + * possibly do the LDT unload here? + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) + if (read_pda(mmu_state) == TLBSTATE_OK) + write_pda(mmu_state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). 
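+ * The selectors are cleared again right after being saved, so that no stale values stay loaded across the switch.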
+ */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +} + +extern void mm_pin(struct mm_struct *mm); +extern void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void load_cr3(pgd_t *pgd) +{ + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) : + "memory"); +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[3], *op = _op; + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !next->context.pinned); + + /* stop flush ipis for the previous mm */ + cpu_clear(cpu, prev->cpu_vm_mask); +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) + write_pda(mmu_state, TLBSTATE_OK); + write_pda(active_mm, next); +#endif + cpu_set(cpu, next->cpu_vm_mask); + + /* load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); + op++; + + /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ + op->cmd = MMUEXT_NEW_USER_BASEPTR; + op->arg1.mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT); + op++; + + if (unlikely(next->context.ldt != prev->context.ldt)) { + /* load_LDT_nolock(&next->context, cpu) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) + else { + write_pda(mmu_state, TLBSTATE_OK); + if (read_pda(active_mm) != next) + out_of_line_bug(); + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. 
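+ * The user pagetable base and the LDT are reloaded here as well.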
+ */ + load_cr3(next->pgd); + xen_new_user_pt(__pa(__user_pgd(next->pgd))); + load_LDT_nolock(&next->context, cpu); + } + } +#endif +} + +#define deactivate_mm(tsk,mm) do { \ + load_gs_index(0); \ + asm volatile("movl %0,%%fs"::"r"(0)); \ +} while(0) + +static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + if (!next->context.pinned) + mm_pin(next); + switch_mm(prev, next, NULL); +} + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgalloc_64.h 2007-06-18 08:38:13.000000000 +0200 @@ -0,0 +1,204 @@ +#ifndef _X86_64_PGALLOC_H +#define _X86_64_PGALLOC_H + +#include <asm/fixmap.h> +#include <asm/pda.h> +#include <linux/threads.h> +#include <linux/mm.h> +#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ + +#include <xen/features.h> +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) +{ + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) +{ + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + } else { + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); + } +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + } else { + *(pud) = __pud(_PAGE_TABLE | __pa(pmd)); + } +} + +/* + * We need to use the batch mode here, but pgd_pupulate() won't be + * be called frequently. + */ +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); + } else { + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); + *(__user_pgd(pgd)) = *(pgd); + } +} + +extern struct page *pte_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void pte_free(struct page *pte); + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + struct page *pg; + + pg = pte_alloc_one(mm, addr); + return pg ? page_address(pg) : NULL; +} + +static inline void pmd_free(pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + pte_free(virt_to_page(pmd)); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + struct page *pg; + + pg = pte_alloc_one(mm, addr); + return pg ? 
page_address(pg) : NULL; +} + +static inline void pud_free(pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + pte_free(virt_to_page(pud)); +} + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + spin_lock(&pgd_lock); + page->index = (pgoff_t)pgd_list; + if (pgd_list) + pgd_list->private = (unsigned long)&page->index; + pgd_list = page; + page->private = (unsigned long)&pgd_list; + spin_unlock(&pgd_lock); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + + spin_lock(&pgd_lock); + next = (struct page *)page->index; + pprev = (struct page **)page->private; + *pprev = next; + if (next) + next->private = (unsigned long)pprev; + spin_unlock(&pgd_lock); +} + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + /* + * We allocate two contiguous pages for kernel and user. + */ + unsigned boundary; + pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); + if (!pgd) + return NULL; + pgd_list_add(pgd); + /* + * Copy kernel pointers in from init. + * Could keep a freelist or slab cache of those because the kernel + * part never changes. + */ + boundary = pgd_index(__PAGE_OFFSET); + memset(pgd, 0, boundary * sizeof(pgd_t)); + memcpy(pgd + boundary, + init_level4_pgt + boundary, + (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); + + memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ + /* + * Set level3_user_pgt for vsyscall area + */ + __user_pgd(pgd)[pgd_index(VSYSCALL_START)] = + __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE); + return pgd; +} + +static inline void pgd_free(pgd_t *pgd) +{ + pte_t *ptep = virt_to_ptep(pgd); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(pgd)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } + + ptep = virt_to_ptep(__user_pgd(pgd)); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(__user_pgd(pgd))); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(pgd), + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, + PAGE_KERNEL), + 0)); + } + + pgd_list_del(pgd); + free_pages((unsigned long)pgd, 1); +} + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + if (pte) + make_page_readonly(pte, XENFEAT_writable_page_tables); + + return pte; +} + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. */ + +static inline void pte_free_kernel(pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) +#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) +#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) + +#endif /* _X86_64_PGALLOC_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/pgtable_64.h 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,585 @@ +#ifndef _X86_64_PGTABLE_H +#define _X86_64_PGTABLE_H + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. 
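+ * + * In this Xen variant most page-table updates are issued as hypercalls: set_pmd()/set_pud()/set_pgd() map to xen_l2/l3/l4_entry_update(), and set_pte_at() goes through HYPERVISOR_update_va_mapping().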
+ */ +#include <asm/processor.h> +#include <asm/fixmap.h> +#include <asm/bitops.h> +#include <linux/threads.h> +#include <linux/sched.h> +#include <asm/pda.h> +#ifdef CONFIG_XEN +#include <asm/hypervisor.h> + +extern pud_t level3_user_pgt[512]; + +extern void xen_init_pt(void); + +extern pte_t *lookup_address(unsigned long address); + +#define virt_to_ptep(va) \ +({ \ + pte_t *__ptep = lookup_address((unsigned long)(va)); \ + BUG_ON(!__ptep || !pte_present(*__ptep)); \ + __ptep; \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)pte_mfn(*virt_to_ptep(va)) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_physmem_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pgd_t init_level4_pgt[]; +extern pgd_t boot_level4_pgt[]; +extern unsigned long __supported_pte_mask; + +#define swapper_pg_dir init_level4_pgt + +extern int nonx_setup(char *str); +extern void paging_init(void); +extern void clear_kernel_mapping(unsigned long addr, unsigned long size); + +extern unsigned long pgkern_mask; + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. + */ +extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ + &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ + &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ + &(e), __pud_val(e), (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \ + &(e), __pgd_val(e), (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +#define pgd_none(x) (!__pgd_val(x)) +#define pud_none(x) (!__pud_val(x)) + +static inline void set_pte(pte_t *dst, pte_t val) +{ + *dst = val; +} + +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) +#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) +#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval)) + +static inline void pud_clear (pud_t * pud) +{ + set_pud(pud, __pud(0)); +} + +#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) + +static inline void pgd_clear (pgd_t * pgd) +{ + set_pgd(pgd, __pgd(0)); + set_pgd(__user_pgd(pgd), __pgd(0)); +} + +#define pud_page(pud) \ + ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) + +#define pte_same(a, b) ((a).pte == (b).pte) + +#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) + +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) +#define 
FIRST_USER_ADDRESS 0 + +#ifndef __ASSEMBLY__ +#define MAXMEM 0x6fffffffffUL +#define VMALLOC_START 0xffffc20000000000UL +#define VMALLOC_END 0xffffe1ffffffffffUL +#define MODULES_VADDR 0xffffffff88000000UL +#define MODULES_END 0xffffffffff000000UL +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#define _PAGE_BIT_PRESENT 0 +#define _PAGE_BIT_RW 1 +#define _PAGE_BIT_USER 2 +#define _PAGE_BIT_PWT 3 +#define _PAGE_BIT_PCD 4 +#define _PAGE_BIT_ACCESSED 5 +#define _PAGE_BIT_DIRTY 6 +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +#define _PAGE_PRESENT 0x001 +#define _PAGE_RW 0x002 +#define _PAGE_USER 0x004 +#define _PAGE_PWT 0x008 +#define _PAGE_PCD 0x010 +#define _PAGE_ACCESSED 0x020 +#define _PAGE_DIRTY 0x040 +#define _PAGE_PSE 0x080 /* 2MB page */ +#define _PAGE_FILE 0x040 /* nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_GLOBAL 0x100 /* Global TLB entry */ + +#define _PAGE_PROTNONE 0x080 /* If not present */ +#define _PAGE_NX (1UL<<_PAGE_BIT_NX) + +/* Mapped page is I/O or foreign and has no associated page struct. */ +#define _PAGE_IO 0x200 + +#if CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user) + +#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_IO) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define __PAGE_KERNEL \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL_NOCACHE \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) +#define __PAGE_KERNEL_RO \ + (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | __kernel_page_user) +#define __PAGE_KERNEL_VSYSCALL \ + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) +#define __PAGE_KERNEL_VSYSCALL_NOCACHE \ + (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD) +#define __PAGE_KERNEL_LARGE \ + (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC \ + (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +/* + * We don't support GLOBAL page in xenolinux64 + */ +#define MAKE_GLOBAL(x) __pgprot((x)) + +#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) +#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) +#define 
PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +static inline unsigned long pgd_bad(pgd_t pgd) +{ + unsigned long val = __pgd_val(pgd); + val &= ~PTE_MASK; + val &= ~(_PAGE_USER | _PAGE_DIRTY); + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); +} + +static inline unsigned long pud_bad(pud_t pud) +{ + unsigned long val = __pud_val(pud); + val &= ~PTE_MASK; + val &= ~(_PAGE_USER | _PAGE_DIRTY); + return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); +} + +#define set_pte_at(_mm,addr,ptep,pteval) do { \ + if (((_mm) != current->mm && (_mm) != &init_mm) || \ + HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ + set_pte((ptep), (pteval)); \ +} while (0) + +#define pte_none(x) (!(x).pte) +#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) + +#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) + +#define __pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) +#define pte_mfn(_pte) ((_pte).pte & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte & _PAGE_IO ? end_pfn : \ + (_pte).pte & _PAGE_PRESENT ? \ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(x) pfn_to_page(pte_pfn(x)) + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + unsigned long pte = page_nr << PAGE_SHIFT; + pte |= pgprot_val(pgprot); + pte &= __supported_pte_mask; + return __pte(pte); +} + +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte)) { + if ((mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + pte = __pte_ma(xchg(&ptep->pte, 0)); + } + return pte; +} + +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) +{ + if (full) { + pte_t pte = *ptep; + if (mm->context.pinned) + xen_l1_entry_update(ptep, __pte(0)); + else + *ptep = __pte(0); + return pte; + } + return ptep_get_and_clear(mm, addr, ptep); +} + +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ + UVMF_INVLPG|UVMF_MULTI))) { \ + __ptep->pte = 0; \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. 
+ */ +#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) +static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } +static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } +static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } +static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } +static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } +static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } +static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } +static inline int pte_huge(pte_t pte) { return __pte_val(pte) & _PAGE_PSE; } + +static inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } +static inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } +static inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; } +static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } +static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } +static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } +static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } +static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } +static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } +static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } + +#define ptep_test_and_clear_dirty(vma, addr, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __ret = pte_dirty(__pte); \ + if (__ret) \ + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \ + __ret; \ +}) + +#define ptep_test_and_clear_young(vma, addr, ptep) \ +({ \ + pte_t __pte = *(ptep); \ + int __ret = pte_young(__pte); \ + if (__ret) \ + set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \ + __ret; \ +}) + +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +/* + * Macro to mark a page protection value as "uncacheable". + */ +#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) + +static inline int pmd_large(pmd_t pte) { + return (__pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; +} + + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. + * Never use these in the common code. + */ +#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) +#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) +#define pgd_offset_k(address) (init_level4_pgt + pgd_index(address)) +#define pgd_present(pgd) (__pgd_val(pgd) & _PAGE_PRESENT) +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ +/* to find an entry in a page-table-directory. 
*/ +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) +#define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT) + +/* PMD - Level 2 access */ +#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) + +#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ + pmd_index(address)) +#define pmd_none(x) (!__pmd_val(x)) +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ +#define pmd_present(x) (__pmd_val(x)) +#else +#define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT) +#endif +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) +#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \ + != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT))) +#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. */ + +/* page, protection -> pte */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) +#define mk_pte_huge(entry) (__pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) + +/* physical address -> PTE */ +static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) +{ + unsigned long pteval; + pteval = physpage | pgprot_val(pgprot); + return __pte(pteval); +} + +/* Change flags of a PTE */ +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* + * Since this might change the present bit (which controls whether + * a pte_t object has undergone p2m translation), we must use + * pte_val() on the input pte and __pte() for the return value. + */ + unsigned long pteval = pte_val(pte); + + pteval &= _PAGE_CHG_MASK; + pteval |= pgprot_val(newprot); + pteval &= __supported_pte_mask; + return __pte(pteval); +} + +#define pte_index(address) \ + (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ + pte_index(address)) + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir,address) pte_offset_kernel(dir,address) +#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) +#define pte_unmap(pte) /* NOP */ +#define pte_unmap_nested(pte) /* NOP */ + +#define update_mmu_cache(vma,address,pte) do { } while (0) + +/* + * Rules for using ptep_establish: the pte MUST be a user pte, and + * must be a present->present transition. + */ +#define __HAVE_ARCH_PTEP_ESTABLISH +#define ptep_establish(vma, address, ptep, pteval) \ + do { \ + if ( likely((vma)->vm_mm == current->mm) ) { \ + BUG_ON(HYPERVISOR_update_va_mapping(address, \ + pteval, \ + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \ + UVMF_INVLPG|UVMF_MULTI)); \ + } else { \ + xen_l1_entry_update(ptep, pteval); \ + flush_tlb_page(vma, address); \ + } \ + } while (0) + +/* We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. 
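+ * ptep_set_access_flags() below therefore only writes the entry when the dirty bit is being set.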
*/ +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ + do { \ + if (dirty) \ + ptep_establish(vma, address, ptep, entry); \ + } while (0) + +/* Encode and de-code a swap entry */ +#define __swp_type(x) (((x).val >> 1) & 0x3f) +#define __swp_offset(x) ((x).val >> 8) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +extern spinlock_t pgd_lock; +extern struct page *pgd_list; +void vmalloc_sync_all(void); + +#endif /* !__ASSEMBLY__ */ + +extern int kern_addr_valid(unsigned long addr); + +#define DOMID_LOCAL (0xFFFFU) + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); + +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); + +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); + +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); + +int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot); + +#define arch_change_pte_range(mm, pmd, addr, end, newprot) \ + xen_change_pte_range(mm, pmd, addr, end, newprot) + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) + +#define MK_IOSPACE_PFN(space, pfn) (pfn) +#define GET_IOSPACE(pfn) 0 +#define GET_PFN(pfn) (pfn) + +#define HAVE_ARCH_UNMAPPED_AREA + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) \ + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? 
((o) | (~__VIRTUAL_MASK)) : (o)) + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define __HAVE_ARCH_PTE_SAME +#include <asm-generic/pgtable.h> + +#endif /* _X86_64_PGTABLE_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/processor_64.h 2008-03-06 08:54:32.000000000 +0100 @@ -0,0 +1,502 @@ +/* + * include/asm-x86_64/processor.h + * + * Copyright (C) 1994 Linus Torvalds + */ + +#ifndef __ASM_X86_64_PROCESSOR_H +#define __ASM_X86_64_PROCESSOR_H + +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/sigcontext.h> +#include <asm/cpufeature.h> +#include <linux/threads.h> +#include <asm/msr.h> +#include <asm/current.h> +#include <asm/system.h> +#include <asm/mmsegment.h> +#include <asm/percpu.h> +#include <linux/personality.h> +#include <linux/cpumask.h> + +#define TF_MASK 0x00000100 +#define IF_MASK 0x00000200 +#define IOPL_MASK 0x00003000 +#define NT_MASK 0x00004000 +#define VM_MASK 0x00020000 +#define AC_MASK 0x00040000 +#define VIF_MASK 0x00080000 /* virtual interrupt flag */ +#define VIP_MASK 0x00100000 /* virtual interrupt pending */ +#define ID_MASK 0x00200000 + +#define desc_empty(desc) \ + (!((desc)->a | (desc)->b)) + +#define desc_equal(desc1, desc2) \ + (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; }) + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; + int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + int x86_cache_size; /* in KB */ + int x86_clflush_size; + int x86_cache_alignment; + int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ + __u8 x86_virt_bits, x86_phys_bits; + __u8 x86_max_cores; /* cpuid returned max cores value */ + __u32 x86_power; + __u32 extended_cpuid_level; /* Max extended CPUID function supported */ + unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif + __u8 apicid; +#ifdef CONFIG_SMP + __u8 booted_cores; /* number of cores as seen by OS */ + __u8 phys_proc_id; /* Physical Processor id. */ + __u8 cpu_core_id; /* Core id. 
*/ +#endif +} ____cacheline_aligned; + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_NEXGEN 4 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_RISE 6 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NUM 8 +#define X86_VENDOR_UNKNOWN 0xff + +#ifdef CONFIG_SMP +extern struct cpuinfo_x86 cpu_data[]; +#define current_cpu_data cpu_data[smp_processor_id()] +#else +#define cpu_data (&boot_cpu_data) +#define current_cpu_data boot_cpu_data +#endif + +extern char ignore_irq13; + +extern void identify_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +/* + * Intel CPU features in CR4 + */ +#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +#define X86_CR4_MCE 0x0040 /* Machine check enable */ +#define X86_CR4_PGE 0x0080 /* enable global pages */ +#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ +#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. + */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4 (unsigned long mask) +{ + mmu_cr4_features |= mask; + __asm__("movq %%cr4,%%rax\n\t" + "orq %0,%%rax\n\t" + "movq %%rax,%%cr4\n" + : : "irg" (mask) + :"ax"); +} + +static inline void clear_in_cr4 (unsigned long mask) +{ + mmu_cr4_features &= ~mask; + __asm__("movq %%cr4,%%rax\n\t" + "andq %0,%%rax\n\t" + "movq %%rax,%%cr4\n" + : : "irg" (~mask) + :"ax"); +} + + +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE64 (0x800000000000UL - 4096) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? 
IA32_PAGE_OFFSET : TASK_SIZE64) + +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE/3) + +/* + * Size of io_bitmap. + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#ifndef CONFIG_X86_NO_TSS +#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) +#endif +#define INVALID_IO_BITMAP_OFFSET 0x8000 + +struct i387_fxsave_struct { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ + u32 padding[24]; +} __attribute__ ((aligned (16))); + +union i387_union { + struct i387_fxsave_struct fxsave; +}; + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + u32 reserved1; + u64 rsp0; + u64 rsp1; + u64 rsp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. Thus we have: + * + * 128 bytes, the bitmap itself, for ports 0..0x3ff + * 8 bytes, for an extra "long" of ~0UL + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +} __attribute__((packed)) ____cacheline_aligned; + +DECLARE_PER_CPU(struct tss_struct,init_tss); +#endif + + +extern struct cpuinfo_x86 boot_cpu_data; +#ifndef CONFIG_X86_NO_TSS +/* Save the original ist values for checking stack pointers during debugging */ +struct orig_ist { + unsigned long ist[7]; +}; +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +#ifdef CONFIG_X86_VSMP +#define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +#define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +#define ARCH_MIN_TASKALIGN 16 +#define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +struct thread_struct { + unsigned long rsp0; + unsigned long rsp; + unsigned long userrsp; /* Copy from PDA */ + unsigned long fs; + unsigned long gs; + unsigned short es, ds, fsindex, gsindex; +/* Hardware debugging registers */ + unsigned long debugreg0; + unsigned long debugreg1; + unsigned long debugreg2; + unsigned long debugreg3; + unsigned long debugreg6; + unsigned long debugreg7; +/* fault info */ + unsigned long cr2, trap_no, error_code; +/* floating point info */ + union i387_union i387 __attribute__((aligned(16))); +/* IO permissions. the bitmap could be moved into the GDT, that would make + switch faster for a limited number of ioperm using tasks. -AK */ + int ioperm; + unsigned long *io_bitmap_ptr; + unsigned io_bitmap_max; +/* cached TLS descriptors. 
*/ + u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned int iopl; +} __attribute__((aligned(16))); + +#define INIT_THREAD { \ + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#ifndef CONFIG_X86_NO_TSS +#define INIT_TSS { \ + .rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} +#endif + +#define INIT_MMAP \ +{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } + +#define start_thread(regs,new_rip,new_rsp) do { \ + asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ + load_gs_index(0); \ + (regs)->rip = (new_rip); \ + (regs)->rsp = (new_rsp); \ + write_pda(oldrsp, (new_rsp)); \ + (regs)->cs = __USER_CS; \ + (regs)->ss = __USER_DS; \ + (regs)->eflags = 0x200; \ + set_fs(USER_DS); \ +} while(0) + +#define get_debugreg(var, register) \ + var = HYPERVISOR_get_debugreg(register) +#define set_debugreg(value, register) do { \ + if (HYPERVISOR_set_debugreg(register, value)) \ + BUG(); \ +} while (0) + +struct task_struct; +struct mm_struct; + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy status */ +extern void prepare_to_copy(struct task_struct *tsk); + +/* + * create a kernel thread without removing it from tasklists + */ +extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8)) + +extern unsigned long get_wchan(struct task_struct *p); +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.rsp0 - 1) +#define KSTK_EIP(tsk) (task_pt_regs(tsk)->rip) +#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ + + +struct microcode_header { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode { + struct microcode_header hdr; + unsigned int bits[0]; +}; + +typedef struct microcode microcode_t; +typedef struct microcode_header microcode_header_t; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + + +#define ASM_NOP1 K8_NOP1 +#define ASM_NOP2 K8_NOP2 +#define ASM_NOP3 K8_NOP3 +#define ASM_NOP4 K8_NOP4 +#define ASM_NOP5 K8_NOP5 +#define ASM_NOP6 K8_NOP6 +#define ASM_NOP7 K8_NOP7 +#define ASM_NOP8 K8_NOP8 + +/* Opteron nops */ +#define K8_NOP1 ".byte 0x90\n" +#define K8_NOP2 ".byte 0x66,0x90\n" +#define K8_NOP3 ".byte 0x66,0x66,0x90\n" +#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" +#define K8_NOP5 K8_NOP3 K8_NOP2 +#define K8_NOP6 K8_NOP3 K8_NOP3 +#define K8_NOP7 K8_NOP4 K8_NOP3 +#define K8_NOP8 K8_NOP4 K8_NOP4 + +#define ASM_NOP_MAX 8 + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
*/ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +/* Stop speculative execution */ +static inline void sync_core(void) +{ + int tmp; + asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); +} + +#define cpu_has_fpu 1 + +#define ARCH_HAS_PREFETCH +static inline void prefetch(void *x) +{ + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); +} + +#define ARCH_HAS_PREFETCHW 1 +static inline void prefetchw(void *x) +{ + alternative_input("prefetcht0 (%1)", + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +#define ARCH_HAS_SPINLOCK_PREFETCH 1 + +#define spin_lock_prefetch(x) prefetchw(x) + +#define cpu_relax() rep_nop() + +/* + * NSC/Cyrix CPU configuration register indexes + */ +#define CX86_CCR0 0xc0 +#define CX86_CCR1 0xc1 +#define CX86_CCR2 0xc2 +#define CX86_CCR3 0xc3 +#define CX86_CCR4 0xe8 +#define CX86_CCR5 0xe9 +#define CX86_CCR6 0xea +#define CX86_CCR7 0xeb +#define CX86_DIR0 0xfe +#define CX86_DIR1 0xff +#define CX86_ARR_BASE 0xc4 +#define CX86_RCR_BASE 0xdc + +/* + * NSC/Cyrix CPU indexed register access macros + */ + +#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) + +#define setCx86(reg, data) do { \ + outb((reg), 0x22); \ + outb((data), 0x23); \ +} while (0) + +static inline void serialize_cpu(void) +{ + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax,%ecx,%edx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc8;" + : :"a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax,%ecx;" */ + asm volatile( + ".byte 0x0f,0x01,0xc9;" + : :"a" (eax), "c" (ecx)); +} + +#define stack_current() \ +({ \ + struct thread_info *ti; \ + asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->task; \ +}) + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern unsigned long boot_option_idle_override; +/* Boot loader type from the setup header */ +extern int bootloader_type; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 + +#endif /* __ASM_X86_64_PROCESSOR_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/smp_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,150 @@ +#ifndef __ASM_SMP_H +#define __ASM_SMP_H + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifndef __ASSEMBLY__ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/bitops.h> +extern int disable_apic; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +#ifndef __ASSEMBLY__ +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#ifdef CONFIG_X86_IO_APIC +#include <asm/io_apic.h> +#endif +#include <asm/apic.h> +#include <asm/thread_info.h> +#endif +#endif + +#ifdef CONFIG_SMP +#ifndef ASSEMBLY + +#include <asm/pda.h> + +struct pt_regs; + +extern cpumask_t cpu_present_mask; +extern cpumask_t cpu_possible_map; +extern cpumask_t cpu_online_map; +extern cpumask_t cpu_initialized; + +/* + * Private routines/data + */ + +extern void smp_alloc_memory(void); +extern volatile unsigned long smp_invalidate_needed; +extern int pic_mode; +extern void lock_ipi_call_lock(void); +extern void unlock_ipi_call_lock(void); +extern int smp_num_siblings; +extern void smp_send_reschedule(int cpu); +void smp_stop_cpu(void); +extern int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait); + +extern cpumask_t cpu_sibling_map[NR_CPUS]; 
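For reference, the smp_call_function_single() declaration above still uses the older five-argument form with the (unused) 'retry' parameter. A minimal caller that runs a function on one specific CPU and waits for it to complete might look like the following sketch; the callback and the counter array are hypothetical, and this is illustrative only, not part of the patch:

static void reset_local_counter(void *info)
{
	/* Runs on the target CPU, from the IPI handler. */
	unsigned long *counters = info;

	counters[smp_processor_id()] = 0;
}

static void reset_counter_on_cpu(int cpu, unsigned long *counters)
{
	/* retry = 0, wait = 1: block until the remote handler has finished. */
	smp_call_function_single(cpu, reset_local_counter, counters, 0, 1);
}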
+extern cpumask_t cpu_core_map[NR_CPUS]; +extern u8 cpu_llc_id[NR_CPUS]; + +#define SMP_TRAMPOLINE_BASE 0x6000 + +/* + * On x86 all CPUs are mapped 1:1 to the APIC space. + * This simplifies scheduling and IPI sending and + * compresses data structures. + */ + +static inline int num_booting_cpus(void) +{ + return cpus_weight(cpu_possible_map); +} + +#define raw_smp_processor_id() read_pda(cpunumber) + +#ifdef CONFIG_X86_LOCAL_APIC +static inline int hard_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); +} +#endif + +extern int safe_smp_processor_id(void); +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +extern void prefill_possible_map(void); +extern unsigned num_processors; +extern unsigned disabled_cpus; + +#endif /* !ASSEMBLY */ + +#define NO_PROC_ID 0xFF /* No processor magic marker */ + +#endif + +#ifndef ASSEMBLY +/* + * Some lowlevel functions might want to know about + * the real APIC ID <-> CPU # mapping. + */ +extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ +extern u8 x86_cpu_to_log_apicid[NR_CPUS]; +extern u8 bios_cpu_apicid[]; + +#ifdef CONFIG_X86_LOCAL_APIC +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + return cpus_addr(cpumask)[0]; +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int)bios_cpu_apicid[mps_cpu]; + else + return BAD_APICID; +} +#endif + +#endif /* !ASSEMBLY */ + +#ifndef CONFIG_SMP +#define stack_smp_processor_id() 0 +#define safe_smp_processor_id() 0 +#define cpu_logical_map(x) (x) +#else +#include <asm/thread_info.h> +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_LOCAL_APIC +static __inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); +} +#endif +#endif + +#ifdef CONFIG_SMP +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] +#else +#define cpu_physical_id(cpu) boot_cpu_id +#endif + +#endif + --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/system_64.h 2007-11-26 16:59:25.000000000 +0100 @@ -0,0 +1,256 @@ +#ifndef __ASM_SYSTEM_H +#define __ASM_SYSTEM_H + +#include <linux/kernel.h> +#include <asm/segment.h> +#include <asm/alternative.h> + +#include <asm/synch_bitops.h> +#include <asm/hypervisor.h> +#include <xen/interface/arch-x86_64.h> + +#ifdef __KERNEL__ + +#define __STR(x) #x +#define STR(x) __STR(x) + +#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" +#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\n\t" + +#define __EXTRA_CLOBBER \ + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" + +#define switch_to(prev,next,last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + ".globl thread_return\n" \ + "thread_return:\n\t" \ + "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + LOCK_PREFIX "btr 
%[tif_fork],%P[ti_flags](%%r8)\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "jc ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)),\ + [tif_fork] "i" (TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ + : "memory", "cc" __EXTRA_CLOBBER) + +extern void load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg,value) \ + asm volatile("\n" \ + "1:\t" \ + "movl %k0,%%" #seg "\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3:\t" \ + "movl %1,%%" #seg "\n\t" \ + "jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n\t" \ + ".align 8\n\t" \ + ".quad 1b,3b\n" \ + ".previous" \ + : :"r" (value), "r" (0)) + +/* + * Clear and set 'TS' bit respectively + */ +#define clts() (HYPERVISOR_fpu_taskswitch(0)) + +static inline unsigned long read_cr0(void) +{ + unsigned long cr0; + asm volatile("movq %%cr0,%0" : "=r" (cr0)); + return cr0; +} + +static inline void write_cr0(unsigned long val) +{ + asm volatile("movq %0,%%cr0" :: "r" (val)); +} + +#define read_cr3() ({ \ + unsigned long __dummy; \ + asm("movq %%cr3,%0" : "=r" (__dummy)); \ + machine_to_phys(__dummy); \ +}) + +static inline unsigned long read_cr4(void) +{ + unsigned long cr4; + asm("movq %%cr4,%0" : "=r" (cr4)); + return cr4; +} + +static inline void write_cr4(unsigned long val) +{ + asm volatile("movq %0,%%cr4" :: "r" (val)); +} + +#define stts() (HYPERVISOR_fpu_taskswitch(1)) + +#define wbinvd() \ + __asm__ __volatile__ ("wbinvd": : :"memory"); + +/* + * On SMP systems, when the scheduler does migration-cost autodetection, + * it needs a way to flush as much of the CPU's caches as possible. + */ +static inline void sched_cacheflush(void) +{ + wbinvd(); +} + +#endif /* __KERNEL__ */ + +#define nop() __asm__ __volatile__ ("nop") + +#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) + +#define tas(ptr) (xchg((ptr),1)) + +#define __xg(x) ((volatile long *)(x)) + +static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +{ + *ptr = val; +} + +#define _set_64bit set_64bit + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway + * Note 2: xchg has side effect, so that attribute volatile is necessary, + * but generally the primitive is invalid, *ptr is output argument. --ANK + */ +static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) +{ + switch (size) { + case 1: + __asm__ __volatile__("xchgb %b0,%1" + :"=q" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 2: + __asm__ __volatile__("xchgw %w0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 4: + __asm__ __volatile__("xchgl %k0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + case 8: + __asm__ __volatile__("xchgq %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + break; + } + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. 
+ */ + +#define __HAVE_ARCH_CMPXCHG 1 + +static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + unsigned long prev; + switch (size) { + case 1: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 8: + __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + } + return old; +} + +#define cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#define smp_read_barrier_depends() do {} while(0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do {} while(0) +#endif + + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") + +#ifdef CONFIG_UNORDERED_IO +#define wmb() asm volatile("sfence" ::: "memory") +#else +#define wmb() asm volatile("" ::: "memory") +#endif +#define read_barrier_depends() do {} while(0) +#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) + +#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) + +#include <linux/irqflags.h> + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +#endif --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/tlbflush_64.h 2007-11-26 16:59:25.000000000 +0100 @@ -0,0 +1,103 @@ +#ifndef _X8664_TLBFLUSH_H +#define _X8664_TLBFLUSH_H + +#include <linux/mm.h> +#include <asm/processor.h> + +#define __flush_tlb() xen_tlb_flush() + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often. + */ +#define __flush_tlb_global() xen_tlb_flush() + + +extern unsigned long pgkern_mask; + +#define __flush_tlb_all() __flush_tlb_global() + +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) + + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. 
+ */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +#else + +#include <asm/smp.h> + +#define local_flush_tlb() \ + __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(&current->mm->cpu_vm_mask) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(&(mm)->cpu_vm_mask) +#define flush_tlb_page(vma, va) xen_invlpg_mask(&(vma)->vm_mm->cpu_vm_mask, va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +/* Roughly an IPI every 20MB with 4k pages for freeing page table + ranges. Cost is about 42k of memory for each CPU. */ +#define ARCH_FREE_PTE_NR 5350 + +#endif + +#define flush_tlb_kernel_range(start, end) flush_tlb_all() + +static inline void flush_tlb_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + /* x86_64 does not keep any page table caches in a software TLB. + The CPUs do in their hardware TLBs, but they are handled + by the normal TLB flushing algorithms. */ +} + +#endif /* _X8664_TLBFLUSH_H */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/asm/xor_64.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,328 @@ +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving.
*/ +#define XMMS_SAVE do { \ + preempt_disable(); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + clts(); \ + __asm__ __volatile__ ( \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while(0) + +#define XMMS_RESTORE do { \ + asm volatile ( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + if (!(current_thread_info()->status & TS_USEDFPU)) \ + stts(); \ + preempt_enable(); \ +} while(0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + 
PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ + do { \ + xor_speed(&xor_block_sse); \ + } while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. */ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/setup_arch_post.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,63 @@ +/** + * machine_specific_* - Hooks for machine specific setup. + * + * Description: + * This is included late in kernel/setup.c so that it can make + * use of all of the static functions. 
+ **/ + +#include <xen/interface/callback.h> + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +static void __init machine_specific_arch_setup(void) +{ + int ret; + static struct callback_register __initdata event = { + .type = CALLBACKTYPE_event, + .address = (unsigned long) hypervisor_callback, + }; + static struct callback_register __initdata failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = (unsigned long)failsafe_callback, + }; + static struct callback_register __initdata syscall = { + .type = CALLBACKTYPE_syscall, + .address = (unsigned long)system_call, + }; +#ifdef CONFIG_X86_LOCAL_APIC + static struct callback_register __initdata nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = (unsigned long)nmi, + }; +#endif + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address, + failsafe.address, + syscall.address); +#endif + BUG_ON(ret); + +#ifdef CONFIG_X86_LOCAL_APIC + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif +#endif +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/arch/x86/include/mach-xen/setup_arch_pre.h 2007-06-12 13:14:13.000000000 +0200 @@ -0,0 +1,5 @@ +/* Hook to call BIOS initialisation function */ + +#define ARCH_SETUP machine_specific_arch_setup(); + +static void __init machine_specific_arch_setup(void); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/balloon.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,57 @@ +/****************************************************************************** + * balloon.h + * + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __ASM_BALLOON_H__ +#define __ASM_BALLOON_H__ + +/* + * Inform the balloon driver that it should allow some slop for device-driver + * memory activities. + */ +void balloon_update_driver_allowance(long delta); + +/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */ +struct page **alloc_empty_pages_and_pagevec(int nr_pages); +void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); + +void balloon_release_driver_page(struct page *page); + +/* + * Prevent the balloon driver from changing the memory reservation during + * a driver critical region. + */ +extern spinlock_t balloon_lock; +#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags) +#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags) + +#endif /* __ASM_BALLOON_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/blkif.h 2008-07-21 11:00:33.000000000 +0200 @@ -0,0 +1,123 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __XEN_BLKIF_H__ +#define __XEN_BLKIF_H__ + +#include <xen/interface/io/ring.h> +#include <xen/interface/io/blkif.h> +#include <xen/interface/io/protocols.h> + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? 
*/ +}; +typedef struct blkif_x86_32_request blkif_x86_32_request_t; +typedef struct blkif_x86_32_response blkif_x86_32_response_t; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +typedef struct blkif_x86_64_request blkif_x86_64_request_t; +typedef struct blkif_x86_64_response blkif_x86_64_response_t; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); + +union blkif_back_rings { + blkif_back_ring_t native; + blkif_common_back_ring_t common; + blkif_x86_32_back_ring_t x86_32; + blkif_x86_64_back_ring_t x86_64; +}; +typedef union blkif_back_rings blkif_back_rings_t; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->seg[i] = src->seg[i]; +} + +#endif /* __XEN_BLKIF_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/compat_ioctl.h 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,75 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 
2007 + * + * Authors: Jimi Xenidis <jimix@watson.ibm.com> + * Hollis Blanchard <hollisb@us.ibm.com> + */ + +#ifndef __LINUX_XEN_COMPAT_H__ +#define __LINUX_XEN_COMPAT_H__ + +#include <linux/compat.h> +#include <linux/compiler.h> + +#if defined(CONFIG_X86) || defined(CONFIG_IA64) +#define xen_pfn32_t __u32 +#endif + +extern int privcmd_ioctl_32(int fd, unsigned int cmd, unsigned long arg); +struct privcmd_mmap_32 { + int num; + domid_t dom; + compat_uptr_t entry; +}; + +struct privcmd_mmapbatch_32 { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ +#if defined(CONFIG_X86) || defined(CONFIG_IA64) + union { /* virtual address */ + __u64 addr __attribute__((packed)); + __u32 va; /* ensures union is 4-byte aligned */ + }; +#else + __u64 addr; /* virtual address */ +#endif + compat_uptr_t arr; /* array of mfns - top nibble set on err */ +}; + +struct privcmd_mmapbatch_v2_32 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ +#if defined(CONFIG_X86) || defined(CONFIG_IA64) + union { /* virtual address */ + __u64 addr __attribute__((packed)); + __u32 va; /* ensures union is 4-byte aligned */ + }; +#else + __u64 addr; /* virtual address */ +#endif + compat_uptr_t arr; /* array of mfns */ + compat_uptr_t err; /* array of error codes */ +}; + +#define IOCTL_PRIVCMD_MMAP_32 \ + _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32)) +#define IOCTL_PRIVCMD_MMAPBATCH_32 \ + _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2_32 \ + _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2_32)) + +#endif /* __LINUX_XEN_COMPAT_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/cpu_hotplug.h 2007-08-16 18:07:01.000000000 +0200 @@ -0,0 +1,41 @@ +#ifndef __XEN_CPU_HOTPLUG_H__ +#define __XEN_CPU_HOTPLUG_H__ + +#include <linux/kernel.h> +#include <linux/cpumask.h> + +#if defined(CONFIG_X86) && defined(CONFIG_SMP) +extern cpumask_t cpu_initialized_map; +#endif + +#if defined(CONFIG_HOTPLUG_CPU) + +int cpu_up_check(unsigned int cpu); +void init_xenbus_allowed_cpumask(void); +int smp_suspend(void); +void smp_resume(void); + +void cpu_bringup(void); + +#else /* !defined(CONFIG_HOTPLUG_CPU) */ + +#define cpu_up_check(cpu) (0) +#define init_xenbus_allowed_cpumask() ((void)0) + +static inline int smp_suspend(void) +{ + if (num_online_cpus() > 1) { + printk(KERN_WARNING "Can't suspend SMP guests " + "without CONFIG_HOTPLUG_CPU\n"); + return -EOPNOTSUPP; + } + return 0; +} + +static inline void smp_resume(void) +{ +} + +#endif /* !defined(CONFIG_HOTPLUG_CPU) */ + +#endif /* __XEN_CPU_HOTPLUG_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/driver_util.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,14 @@ + +#ifndef __ASM_XEN_DRIVER_UTIL_H__ +#define __ASM_XEN_DRIVER_UTIL_H__ + +#include <linux/vmalloc.h> +#include <linux/device.h> + +/* Allocate/destroy a 'vmalloc' VM area. 
*/ +extern struct vm_struct *alloc_vm_area(unsigned long size); +extern void free_vm_area(struct vm_struct *area); + +extern struct class *get_xen_class(void); + +#endif /* __ASM_XEN_DRIVER_UTIL_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/firmware.h 2007-07-02 08:16:19.000000000 +0200 @@ -0,0 +1,10 @@ +#ifndef __XEN_FIRMWARE_H__ +#define __XEN_FIRMWARE_H__ + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +void copy_edd(void); +#endif + +void copy_edid(void); + +#endif /* __XEN_FIRMWARE_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/gnttab.h 2008-11-04 11:13:10.000000000 +0100 @@ -0,0 +1,164 @@ +/****************************************************************************** + * gnttab.h + * + * Two sets of functionality: + * 1. Granting foreign access to our memory reservation. + * 2. Accessing others' memory reservations via grant references. + * (i.e., mechanisms for both sender and recipient of grant references) + * + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __ASM_GNTTAB_H__ +#define __ASM_GNTTAB_H__ + +#include <asm/hypervisor.h> +#include <asm/maddr.h> /* maddr_t */ +#include <linux/mm.h> +#include <xen/interface/grant_table.h> +#include <xen/features.h> + +struct gnttab_free_callback { + struct gnttab_free_callback *next; + void (*fn)(void *); + void *arg; + u16 count; + u8 queued; +}; + +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int flags); + +/* + * End access through the given grant reference, iff the grant entry is no + * longer in use. Return 1 if the grant entry was freed, 0 if it is still in + * use. + */ +int gnttab_end_foreign_access_ref(grant_ref_t ref); + +/* + * Eventually end access through the given grant reference, and once that + * access has been ended, free the given page too. Access will be ended + * immediately iff the grant entry is not in use, otherwise it will happen + * some time later. page may be 0, in which case no freeing will occur. 
+ */ +void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page); + +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); + +int gnttab_query_foreign_access(grant_ref_t ref); + +/* + * operations on reserved batches of grant references + */ +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); + +void gnttab_free_grant_reference(grant_ref_t ref); + +void gnttab_free_grant_references(grant_ref_t head); + +int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); + +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); + +void gnttab_release_grant_reference(grant_ref_t *private_head, + grant_ref_t release); + +void gnttab_request_free_callback(struct gnttab_free_callback *callback, + void (*fn)(void *), void *arg, u16 count); +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); + +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int flags); + +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); + +int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep); +void __gnttab_dma_map_page(struct page *page); +static inline void __gnttab_dma_unmap_page(struct page *page) +{ +} + +void gnttab_reset_grant_page(struct page *page); + +int gnttab_suspend(void); +int gnttab_resume(void); + +void *arch_gnttab_alloc_shared(unsigned long *frames); + +static inline void +gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr, + uint32_t flags, grant_ref_t ref, domid_t domid) +{ + if (flags & GNTMAP_contains_pte) + map->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + map->host_addr = __pa(addr); + else + map->host_addr = addr; + + map->flags = flags; + map->ref = ref; + map->dom = domid; +} + +static inline void +gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr, + uint32_t flags, grant_handle_t handle) +{ + if (flags & GNTMAP_contains_pte) + unmap->host_addr = addr; + else if (xen_feature(XENFEAT_auto_translated_physmap)) + unmap->host_addr = __pa(addr); + else + unmap->host_addr = addr; + + unmap->handle = handle; + unmap->dev_bus_addr = 0; +} + +static inline void +gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr, + maddr_t new_addr, grant_handle_t handle) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + unmap->host_addr = __pa(addr); + unmap->new_addr = __pa(new_addr); + } else { + unmap->host_addr = addr; + unmap->new_addr = new_addr; + } + + unmap->handle = handle; +} + +#endif /* __ASM_GNTTAB_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/hvm.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,23 @@ +/* Simple wrappers around HVM functions */ +#ifndef XEN_HVM_H__ +#define XEN_HVM_H__ + +#include <xen/interface/hvm/params.h> + +static inline unsigned long hvm_get_parameter(int idx) +{ + struct xen_hvm_param xhv; + int r; + + xhv.domid = DOMID_SELF; + xhv.index = idx; + r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); + if (r < 0) { + printk(KERN_ERR "cannot get hvm parameter %d: %d.\n", + idx, r); + return 0; + } + return xhv.value; +} + +#endif /* XEN_HVM_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/hypercall.h 2008-01-28 12:24:19.000000000 +0100 @@ -0,0 +1,30 @@ +#ifndef __XEN_HYPERCALL_H__ +#define __XEN_HYPERCALL_H__ + 
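The grant-table interfaces declared above (gnttab_grant_foreign_access(), gnttab_end_foreign_access() and the reference-batching helpers) are what a frontend typically uses to share a ring page with its backend. A hedged sketch of that pattern, assuming virt_to_mfn() from the accompanying mach-xen headers and a page previously allocated by the caller; illustrative only, not part of the patch:

static int share_ring_with_backend(domid_t backend_domid, void *ring_page)
{
	/* Grant the backend read/write access to the frame backing the ring. */
	return gnttab_grant_foreign_access(backend_domid,
					   virt_to_mfn(ring_page),
					   0 /* read/write */);
}

static void unshare_ring(grant_ref_t ref, void *ring_page)
{
	/*
	 * End foreign access; because a non-zero page is passed, the page is
	 * freed once the backend no longer has it mapped.
	 */
	gnttab_end_foreign_access(ref, (unsigned long)ring_page);
}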
+#include <asm/hypercall.h> + +static inline int __must_check +HYPERVISOR_multicall_check( + multicall_entry_t *call_list, unsigned int nr_calls, + const unsigned long *rc_list) +{ + int rc = HYPERVISOR_multicall(call_list, nr_calls); + + if (unlikely(rc < 0)) + return rc; + BUG_ON(rc); + BUG_ON((int)nr_calls < 0); + + for ( ; nr_calls > 0; --nr_calls, ++call_list) + if (unlikely(call_list->result != (rc_list ? *rc_list++ : 0))) + return nr_calls; + + return 0; +} + +/* A construct to ignore the return value of hypercall wrappers in a few + * exceptional cases (simply casting the function result to void doesn't + * avoid the compiler warning): */ +#define VOID(expr) ((void)((expr)?:0)) + +#endif /* __XEN_HYPERCALL_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/hypervisor_sysfs.h 2007-06-22 09:08:06.000000000 +0200 @@ -0,0 +1,30 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day <ncmike@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _HYP_SYSFS_H_ +#define _HYP_SYSFS_H_ + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +#endif /* _HYP_SYSFS_H_ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/pcifront.h 2007-06-18 08:38:13.000000000 +0200 @@ -0,0 +1,83 @@ +/* + * PCI Frontend - arch-dependendent declarations + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_ASM_PCIFRONT_H__ +#define __XEN_ASM_PCIFRONT_H__ + +#include <linux/spinlock.h> + +#ifdef __KERNEL__ + +#ifndef __ia64__ + +struct pcifront_device; +struct pci_bus; + +struct pcifront_sd { + int domain; + struct pcifront_device *pdev; +}; + +static inline struct pcifront_device * +pcifront_get_pdev(struct pcifront_sd *sd) +{ + return sd->pdev; +} + +static inline void pcifront_init_sd(struct pcifront_sd *sd, + unsigned int domain, unsigned int bus, + struct pcifront_device *pdev) +{ + sd->domain = domain; + sd->pdev = pdev; +} + +#if defined(CONFIG_PCI_DOMAINS) +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pcifront_sd *sd = bus->sysdata; + return sd->domain; +} +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} +#endif /* CONFIG_PCI_DOMAINS */ + +static inline void pcifront_setup_root_resources(struct pci_bus *bus, + struct pcifront_sd *sd) +{ +} + +#else /* __ia64__ */ + +#include <linux/acpi.h> +#include <asm/pci.h> +#define pcifront_sd pci_controller + +extern void xen_add_resource(struct pci_controller *, unsigned int, + unsigned int, struct acpi_resource *); +extern void xen_pcibios_setup_root_windows(struct pci_bus *, + struct pci_controller *); + +static inline struct pcifront_device * +pcifront_get_pdev(struct pcifront_sd *sd) +{ + return (struct pcifront_device *)sd->platform_data; +} + +static inline void pcifront_setup_root_resources(struct pci_bus *bus, + struct pcifront_sd *sd) +{ + xen_pcibios_setup_root_windows(bus, sd); +} + 
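HYPERVISOR_multicall_check() above batches several hypercalls into a single one and returns non-zero as soon as an entry's result differs from the expected value (negative if the multicall itself failed). A hedged sketch of batching two update_va_mapping operations this way, with multicall_entry_t and __HYPERVISOR_update_va_mapping taken from the Xen interface headers and the addresses and PTE values as placeholders:

static int remap_two_pages(unsigned long va0, unsigned long pte0,
			   unsigned long va1, unsigned long pte1)
{
	multicall_entry_t mc[2];

	mc[0].op      = __HYPERVISOR_update_va_mapping;
	mc[0].args[0] = va0;
	mc[0].args[1] = pte0;
	mc[0].args[2] = 0;	/* no flush flags */

	mc[1].op      = __HYPERVISOR_update_va_mapping;
	mc[1].args[0] = va1;
	mc[1].args[1] = pte1;
	mc[1].args[2] = 0;

	/* rc_list == NULL: every entry is expected to return 0. */
	return HYPERVISOR_multicall_check(mc, 2, NULL);
}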
+#endif /* __ia64__ */ + +extern struct rw_semaphore pci_bus_sem; + +#endif /* __KERNEL__ */ + +#endif /* __XEN_ASM_PCIFRONT_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/public/evtchn.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,88 @@ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_EVTCHN_H__ +#define __LINUX_PUBLIC_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) +struct ioctl_evtchn_bind_virq { + unsigned int virq; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. 
*/ +#define IOCTL_EVTCHN_RESET \ + _IOC(_IOC_NONE, 'E', 5, 0) + +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/public/gntdev.h 2008-04-02 12:34:02.000000000 +0200 @@ -0,0 +1,119 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr.
This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may be mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/public/privcmd.h 2010-01-18 15:23:12.000000000 +0100 @@ -0,0 +1,89 @@ +/****************************************************************************** + * privcmd.h + * + * Interface to /proc/xen/privcmd. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE.
+ */ + +#ifndef __LINUX_PUBLIC_PRIVCMD_H__ +#define __LINUX_PUBLIC_PRIVCMD_H__ + +#include <linux/types.h> + +#ifndef __user +#define __user +#endif + +typedef struct privcmd_hypercall +{ + __u64 op; + __u64 arg[5]; +} privcmd_hypercall_t; + +typedef struct privcmd_mmap_entry { + __u64 va; + __u64 mfn; + __u64 npages; +} privcmd_mmap_entry_t; + +typedef struct privcmd_mmap { + int num; + domid_t dom; /* target domain */ + privcmd_mmap_entry_t __user *entry; +} privcmd_mmap_t; + +typedef struct privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ +} privcmd_mmapbatch_t; + +typedef struct privcmd_mmapbatch_v2 { + unsigned int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + __u64 addr; /* virtual address */ + const xen_pfn_t __user *arr; /* array of mfns */ + int __user *err; /* array of error codes */ +} privcmd_mmapbatch_v2_t; + +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) +#define IOCTL_PRIVCMD_MMAP \ + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) +#define IOCTL_PRIVCMD_MMAPBATCH_V2 \ + _IOC(_IOC_NONE, 'P', 4, sizeof(privcmd_mmapbatch_v2_t)) + +#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/public/xenbus.h 2009-05-29 10:25:53.000000000 +0200 @@ -0,0 +1,56 @@ +/****************************************************************************** + * xenbus.h + * + * Interface to /proc/xen/xenbus. + * + * Copyright (c) 2008, Diego Ongaro <diego.ongaro@citrix.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __LINUX_PUBLIC_XENBUS_H__ +#define __LINUX_PUBLIC_XENBUS_H__ + +#include <linux/types.h> + +#ifndef __user +#define __user +#endif + +typedef struct xenbus_alloc { + domid_t dom; + __u32 port; + __u32 grant_ref; +} xenbus_alloc_t; + +/* + * @cmd: IOCTL_XENBUS_ALLOC + * @arg: &xenbus_alloc_t + * Return: 0, or -1 for error + */ +#define IOCTL_XENBUS_ALLOC \ + _IOC(_IOC_NONE, 'X', 0, sizeof(xenbus_alloc_t)) + +#endif /* __LINUX_PUBLIC_XENBUS_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/xen_proc.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,12 @@ + +#ifndef __ASM_XEN_PROC_H__ +#define __ASM_XEN_PROC_H__ + +#include <linux/proc_fs.h> + +extern struct proc_dir_entry *create_xen_proc_entry( + const char *name, mode_t mode); +extern void remove_xen_proc_entry( + const char *name); + +#endif /* __ASM_XEN_PROC_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/xencons.h 2007-10-15 09:39:38.000000000 +0200 @@ -0,0 +1,17 @@ +#ifndef __ASM_XENCONS_H__ +#define __ASM_XENCONS_H__ + +struct dom0_vga_console_info; +void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t); + +void xencons_force_flush(void); +void xencons_resume(void); + +/* Interrupt work hooks. Receive data, or kick data out. */ +void xencons_rx(char *buf, unsigned len, struct pt_regs *regs); +void xencons_tx(void); + +int xencons_ring_init(void); +int xencons_ring_send(const char *data, unsigned len); + +#endif /* __ASM_XENCONS_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/include/xen/xenoprof.h 2007-06-12 13:14:19.000000000 +0200 @@ -0,0 +1,42 @@ +/****************************************************************************** + * xen/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __XEN_XENOPROF_H__ +#define __XEN_XENOPROF_H__ +#ifdef CONFIG_XEN + +#include <asm/xenoprof.h> + +struct oprofile_operations; +int xenoprofile_init(struct oprofile_operations * ops); +void xenoprofile_exit(void); + +struct xenoprof_shared_buffer { + char *buffer; + struct xenoprof_arch_shared_buffer arch; +}; +#else +#define xenoprofile_init(ops) (-ENOSYS) +#define xenoprofile_exit() do { } while (0) + +#endif /* CONFIG_XEN */ +#endif /* __XEN_XENOPROF_H__ */ --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/lib/swiotlb-xen.c 2009-04-07 13:58:49.000000000 +0200 @@ -0,0 +1,739 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is a fallback for platforms that do not support + * I/O TLBs (aka DMA address translation hardware). 
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2005 Keir Fraser <keir@xensource.com> + */ + +#include <linux/cache.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <asm/io.h> +#include <asm/pci.h> +#include <asm/dma.h> +#include <asm/uaccess.h> +#include <xen/gnttab.h> +#include <xen/interface/memory.h> +#include <asm-i386/mach-xen/asm/gnttab_dma.h> + +int swiotlb; +EXPORT_SYMBOL(swiotlb); + +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) + +/* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? + * The complexity of {map,unmap}_single is linearly dependent on this value. + */ +#define IO_TLB_SEGSIZE 128 + +/* + * log of the size of each IO TLB slab. The number of slabs is command line + * controllable. + */ +#define IO_TLB_SHIFT 11 + +int swiotlb_force; + +static char *iotlb_virt_start; +static unsigned long iotlb_nslabs; + +/* + * Used to do a quick range check in swiotlb_unmap_single and + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static unsigned long iotlb_pfn_start, iotlb_pfn_end; + +/* Does the given dma address reside within the swiotlb aperture? */ +static inline int in_swiotlb_aperture(dma_addr_t dev_addr) +{ + unsigned long pfn = mfn_to_local_pfn(dev_addr >> PAGE_SHIFT); + return (pfn_valid(pfn) + && (pfn >= iotlb_pfn_start) + && (pfn < iotlb_pfn_end)); +} + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +void *io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations. + */ +static struct phys_addr { + struct page *page; + unsigned int offset; +} *io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static unsigned int dma_bits; +static unsigned int __initdata max_dma_bits = 32; +static int __init +setup_dma_bits(char *str) +{ + max_dma_bits = simple_strtoul(str, NULL, 0); + return 0; +} +__setup("dma_bits=", setup_dma_bits); + +static int __init +setup_io_tlb_npages(char *str) +{ + /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */ + if (isdigit(*str)) { + iotlb_nslabs = simple_strtoul(str, &str, 0) << + (20 - IO_TLB_SHIFT); + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + } + if (*str == ',') + ++str; + /* + * NB. 'force' enables the swiotlb, but doesn't force its use for + * every DMA like it does on native Linux. 'off' forcibly disables + * use of the swiotlb. + */ + if (!strcmp(str, "force")) + swiotlb_force = 1; + else if (!strcmp(str, "off")) + swiotlb_force = -1; + return 1; +} +__setup("swiotlb=", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? 
*/ + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the PCI DMA API. + */ +void +swiotlb_init_with_default_size (size_t default_size) +{ + unsigned long i, bytes; + int rc; + + if (!iotlb_nslabs) { + iotlb_nslabs = (default_size >> IO_TLB_SHIFT); + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT); + + /* + * Get IO TLB memory from the low pages + */ + iotlb_virt_start = alloc_bootmem_pages(bytes); + if (!iotlb_virt_start) + panic("Cannot allocate SWIOTLB buffer!\n"); + + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + for (i = 0; i < iotlb_nslabs; i += IO_TLB_SEGSIZE) { + do { + rc = xen_create_contiguous_region( + (unsigned long)iotlb_virt_start + (i << IO_TLB_SHIFT), + get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT), + dma_bits); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) { + if (i == 0) + panic("No suitable physical memory available for SWIOTLB buffer!\n" + "Use dom0_mem Xen boot parameter to reserve\n" + "some DMA memory (e.g., dom0_mem=-128M).\n"); + iotlb_nslabs = i; + i <<= IO_TLB_SHIFT; + free_bootmem(__pa(iotlb_virt_start + i), bytes - i); + bytes = i; + for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) { + unsigned int bits = fls64(virt_to_bus(iotlb_virt_start + i - 1)); + + if (bits > dma_bits) + dma_bits = bits; + } + break; + } + } + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. + */ + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int)); + for (i = 0; i < iotlb_nslabs; i++) + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_index = 0; + io_tlb_orig_addr = alloc_bootmem( + iotlb_nslabs * sizeof(*io_tlb_orig_addr)); + + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = alloc_bootmem(io_tlb_overflow); + if (!io_tlb_overflow_buffer) + panic("Cannot allocate SWIOTLB overflow buffer!\n"); + + do { + rc = xen_create_contiguous_region( + (unsigned long)io_tlb_overflow_buffer, + get_order(io_tlb_overflow), + dma_bits); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) + panic("No suitable physical memory available for SWIOTLB overflow buffer!\n"); + + iotlb_pfn_start = __pa(iotlb_virt_start) >> PAGE_SHIFT; + iotlb_pfn_end = iotlb_pfn_start + (bytes >> PAGE_SHIFT); + + printk(KERN_INFO "Software IO TLB enabled: \n" + " Aperture: %lu megabytes\n" + " Kernel range: %p - %p\n" + " Address size: %u bits\n", + bytes >> 20, + iotlb_virt_start, iotlb_virt_start + bytes, + dma_bits); +} + +void +swiotlb_init(void) +{ + long ram_end; + size_t defsz = 64 * (1 << 20); /* 64MB default size */ + + if (swiotlb_force == 1) { + swiotlb = 1; + } else if ((swiotlb_force != -1) && + is_running_on_xen() && + is_initial_xendomain()) { + /* Domain 0 always has a swiotlb. */ + ram_end = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); + if (ram_end <= 0x7ffff) + defsz = 2 * (1 << 20); /* 2MB on <2GB on systems. */ + swiotlb = 1; + } + + if (swiotlb) + swiotlb_init_with_default_size(defsz); + else + printk(KERN_INFO "Software IO TLB disabled\n"); +} + +/* + * We use __copy_to_user_inatomic to transfer to the host buffer because the + * buffer may be mapped read-only (e.g, in blkback driver) but lower-level + * drivers map the buffer for DMA_BIDIRECTIONAL access. This causes an + * unnecessary copy from the aperture to the host buffer, and a page fault. 
+ */ +static void +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir) +{ + if (PageHighMem(buffer.page)) { + size_t len, bytes; + char *dev, *host, *kmp; + len = size; + while (len != 0) { + unsigned long flags; + + if (((bytes = len) + buffer.offset) > PAGE_SIZE) + bytes = PAGE_SIZE - buffer.offset; + local_irq_save(flags); /* protects KM_BOUNCE_READ */ + kmp = kmap_atomic(buffer.page, KM_BOUNCE_READ); + dev = dma_addr + size - len; + host = kmp + buffer.offset; + if (dir == DMA_FROM_DEVICE) { + if (__copy_to_user_inatomic(host, dev, bytes)) + /* inaccessible */; + } else + memcpy(dev, host, bytes); + kunmap_atomic(kmp, KM_BOUNCE_READ); + local_irq_restore(flags); + len -= bytes; + buffer.page++; + buffer.offset = 0; + } + } else { + char *host = (char *)phys_to_virt( + page_to_pseudophys(buffer.page)) + buffer.offset; + if (dir == DMA_FROM_DEVICE) { + if (__copy_to_user_inatomic(host, dma_addr, size)) + /* inaccessible */; + } else if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, host, size); + } +} + +/* + * Allocates bounce buffer and returns its kernel virtual address. + */ +static void * +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir) +{ + unsigned long flags; + char *dma_addr; + unsigned int nslots, stride, index, wrap; + struct phys_addr slot_buf; + int i; + + /* + * For mappings greater than a page, we limit the stride (and + * hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size > PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + wrap = index = ALIGN(io_tlb_index, stride); + + if (index >= iotlb_nslabs) + wrap = index = 0; + + do { + /* + * If we find a slot that indicates we have 'nslots' + * number of contiguous buffers, we allocate the + * buffers from that slot and mark the entries as '0' + * indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int)(index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; + (OFFSET(i, IO_TLB_SEGSIZE) != + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + dma_addr = iotlb_virt_start + + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in + * the next round. + */ + io_tlb_index = + ((index + nslots) < iotlb_nslabs + ? (index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= iotlb_nslabs) + index = 0; + } while (index != wrap); + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return NULL; + } + found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. + * This is needed when we sync the memory. Then we sync the buffer if + * needed. 
+ */ + slot_buf = buffer; + for (i = 0; i < nslots; i++) { + slot_buf.page += slot_buf.offset >> PAGE_SHIFT; + slot_buf.offset &= PAGE_SIZE - 1; + io_tlb_orig_addr[index+i] = slot_buf; + slot_buf.offset += 1 << IO_TLB_SHIFT; + } + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL)) + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE); + + return dma_addr; +} + +static struct phys_addr dma_addr_to_phys_addr(char *dma_addr) +{ + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + struct phys_addr buffer = io_tlb_orig_addr[index]; + buffer.offset += (long)dma_addr & ((1 << IO_TLB_SHIFT) - 1); + buffer.page += buffer.offset >> PAGE_SHIFT; + buffer.offset &= PAGE_SIZE - 1; + return buffer; +} + +/* + * dma_addr is the kernel virtual address of the bounce buffer to unmap. + */ +static void +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + + /* + * First, sync the memory before unmapping the entry + */ + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)) + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contigous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? + io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) + io_tlb_list[i] = ++count; + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; + (OFFSET(i, IO_TLB_SEGSIZE) != + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +static void +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +{ + struct phys_addr buffer = dma_addr_to_phys_addr(dma_addr); + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE)); + __sync_single(buffer, dma_addr, size, dir); +} + +static void +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for pci_dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. + */ + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at " + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?"); + + if (size > io_tlb_overflow && do_panic) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Random memory would be DMAed\n"); + } +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * PCI address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. 
+ */ +dma_addr_t +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) +{ + dma_addr_t dev_addr = gnttab_dma_map_page(virt_to_page(ptr)) + + offset_in_page(ptr); + void *map; + struct phys_addr buffer; + + BUG_ON(dir == DMA_NONE); + + /* + * If the pointer passed in happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (!range_straddles_page_boundary(__pa(ptr), size) && + !address_needs_mapping(hwdev, dev_addr)) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + gnttab_dma_unmap_page(dev_addr); + buffer.page = virt_to_page(ptr); + buffer.offset = (unsigned long)ptr & ~PAGE_MASK; + map = map_single(hwdev, buffer, size, dir); + if (!map) { + swiotlb_full(hwdev, size, dir, 1); + map = io_tlb_overflow_buffer; + } + + dev_addr = virt_to_bus(map); + return dev_addr; +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_single call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +void +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, + int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir); + else + gnttab_dma_unmap_page(dev_addr); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the PCI dma mapping, you must + * call this function before doing so. At the next point you give the PCI dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_single + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_single are the + * same here. 
+ */ +int +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + struct phys_addr buffer; + dma_addr_t dev_addr; + char *map; + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) { + dev_addr = gnttab_dma_map_page(sg->page) + sg->offset; + + if (range_straddles_page_boundary(page_to_pseudophys(sg->page) + + sg->offset, sg->length) + || address_needs_mapping(hwdev, dev_addr)) { + gnttab_dma_unmap_page(dev_addr); + buffer.page = sg->page; + buffer.offset = sg->offset; + map = map_single(hwdev, buffer, sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + swiotlb_full(hwdev, sg->length, dir, 0); + swiotlb_unmap_sg(hwdev, sg - i, i, dir); + sg[0].dma_length = 0; + return 0; + } + sg->dma_address = (dma_addr_t)virt_to_bus(map); + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_single() above. + */ +void +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (in_swiotlb_aperture(sg->dma_address)) + unmap_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); + else + gnttab_dma_unmap_page(sg->dma_address); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (in_swiotlb_aperture(sg->dma_address)) + sync_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (in_swiotlb_aperture(sg->dma_address)) + sync_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); +} + +#ifdef CONFIG_HIGHMEM + +dma_addr_t +swiotlb_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + struct phys_addr buffer; + dma_addr_t dev_addr; + char *map; + + dev_addr = gnttab_dma_map_page(page) + offset; + if (address_needs_mapping(hwdev, dev_addr)) { + gnttab_dma_unmap_page(dev_addr); + buffer.page = page; + buffer.offset = offset; + map = map_single(hwdev, buffer, size, direction); + if (!map) { + swiotlb_full(hwdev, size, direction, 1); + map = io_tlb_overflow_buffer; + } + dev_addr = (dma_addr_t)virt_to_bus(map); + } + + return dev_addr; +} + +void +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (in_swiotlb_aperture(dma_address)) + unmap_single(hwdev, bus_to_virt(dma_address), size, direction); + else + gnttab_dma_unmap_page(dma_address); +} + +#endif + +int +swiotlb_dma_mapping_error(dma_addr_t dma_addr) +{ + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); +} + +/* + * Return whether the given PCI device DMA address mask can be supported + * properly. 
For example, if your device can only drive the low 24-bits + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported (struct device *hwdev, u64 mask) +{ + return (mask >= ((1UL << dma_bits) - 1)); +} + +EXPORT_SYMBOL(swiotlb_init); +EXPORT_SYMBOL(swiotlb_map_single); +EXPORT_SYMBOL(swiotlb_unmap_single); +EXPORT_SYMBOL(swiotlb_map_sg); +EXPORT_SYMBOL(swiotlb_unmap_sg); +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_single_for_device); +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); +EXPORT_SYMBOL(swiotlb_dma_mapping_error); +EXPORT_SYMBOL(swiotlb_dma_supported); --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/mm/tmem-xen.c 2009-06-23 09:28:21.000000000 +0200 @@ -0,0 +1,41 @@ +/* + * Xen implementation for transcendent memory (tmem) + * + * Dan Magenheimer <dan.magenheimer@oracle.com> 2009 + */ + +#include <linux/types.h> +#include <xen/interface/xen.h> +#include <asm/hypervisor.h> + +int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index, + unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) +{ + struct tmem_op op; + int rc = 0; + + op.cmd = tmem_cmd; + op.pool_id = tmem_pool; + op.u.gen.object = object; + op.u.gen.index = index; + op.u.gen.tmem_offset = tmem_offset; + op.u.gen.pfn_offset = pfn_offset; + op.u.gen.len = len; + op.u.gen.cmfn = gmfn; + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +int xen_tmem_new_pool(uint32_t tmem_cmd, uint64_t uuid_lo, + uint64_t uuid_hi, uint32_t flags) +{ + struct tmem_op op; + int rc = 0; + + op.cmd = tmem_cmd; + op.u.new.uuid[0] = uuid_lo; + op.u.new.uuid[1] = uuid_hi; + op.u.new.flags = flags; + rc = HYPERVISOR_tmem_op(&op); + return rc; +} --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ sle11sp1-2010-03-22/scripts/Makefile.xen.awk 2007-08-06 15:10:49.000000000 +0200 @@ -0,0 +1,34 @@ +BEGIN { + is_rule = 0 +} + +/^[[:space:]]*#/ { + next +} + +/^[[:space:]]*$/ { + if (is_rule) + print("") + is_rule = 0 + next +} + +/:[[:space:]]*%\.[cS][[:space:]]/ { + line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0) + line = gensub(/(single-used-m)/, "xen-\\1", "g", line) + print line + is_rule = 1 + next +} + +/^[^\t]$/ { + if (is_rule) + print("") + is_rule = 0 + next +} + +is_rule { + print $0 + next +}
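
For reference only, and not part of the patch itself: a minimal user-space sketch of how the ioctls declared in the include/xen/public/evtchn.h hunk above are typically driven. The /dev/xen/evtchn node path and the read()/write() behaviour assumed here (reads return pending port numbers, writes re-unmask ports) come from the classic evtchn driver and are assumptions, not something the header itself guarantees; the remote domain ID is a placeholder.

/*
 * Illustrative sketch, not part of the patch.  Uses only the ioctls
 * defined in the evtchn.h header added above plus standard POSIX calls.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/public/evtchn.h>	/* header added above; install path may vary */

int main(void)
{
	struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 1 /* example domid */ };
	uint32_t pending;
	int fd, port;

	fd = open("/dev/xen/evtchn", O_RDWR);
	if (fd < 0)
		return 1;

	/* Allocate a fresh local port that the remote domain may bind to. */
	port = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
	if (port < 0) {
		close(fd);
		return 1;
	}

	/* Wait for an event; assumed driver behaviour: read() yields pending ports. */
	if (read(fd, &pending, sizeof(pending)) == sizeof(pending)) {
		struct ioctl_evtchn_notify notify = { .port = pending };

		/* Re-enable delivery on the port, then kick the remote end. */
		write(fd, &pending, sizeof(pending));
		ioctl(fd, IOCTL_EVTCHN_NOTIFY, &notify);
	}

	/* Release the port when done. */
	{
		struct ioctl_evtchn_unbind unbind = { .port = port };

		ioctl(fd, IOCTL_EVTCHN_UNBIND, &unbind);
	}

	close(fd);
	return 0;
}

Note that the BIND_* ioctls report the allocated local port as their return value, so no separate output structure is needed; UNBIND and NOTIFY simply take that port back in.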